In [None]:
import numpy as np 
import pandas as pd 

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import plotly.express as px

import warnings
warnings.filterwarnings('ignore')

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _, files in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in files):
        print(path)

In [None]:
# Load the collected dataset from the processed-data CSV.
data = pd.read_csv(filepath_or_buffer="./data/processed/collected_data.csv")

In [None]:
# Preview the first five rows (the default for head) as the cell output.
data.head(n=5)

### Check missing values

In [None]:
# Count missing entries per column (isnull is the pandas alias of isna).
data.isnull().sum()

In [None]:
import tensorflow as tf
from transformers import AutoTokenizer

# Configuration
IMG_SIZE = (224, 224)  # target size passed to tf.image.resize in preprocess_image
MAX_LEN = 64           # max_length used for padding/truncation in tokenize_text
BATCH_SIZE = 32        # batch size

# Load Tokenizer (author's rationale: RoBERTa is generally more robust for memes).
# force_download is deliberately left at its default: forcing it re-fetches the
# tokenizer files on every run, which bypasses the local cache, slows down
# Restart & Run All, and fails when the machine is offline even though the files
# are already cached.
tokenizer = AutoTokenizer.from_pretrained(
    "roberta-base",
    use_fast=True,
)

## Text Preprocessing

### Tokenization

In [None]:
def tokenize_text(texts):
    """Tokenize a batch of texts into fixed-length id and attention-mask arrays.

    Uses the module-level ``tokenizer`` and ``MAX_LEN``.

    Args:
        texts: A pandas Series (or any other sequence pandas can wrap) of text
            values. Missing entries (NaN/None) are tokenized as the empty
            string; any other non-string value is converted with ``str``.

    Returns:
        A tuple ``(input_ids, attention_mask)`` taken from the tokenizer output
        requested with ``return_tensors='np'``. Every sequence is padded or
        truncated to ``MAX_LEN`` tokens.
    """
    # The tokenizer only accepts str entries, so a single NaN coming from a
    # missing CSV cell would otherwise abort the whole batch. Forcing object
    # dtype first lets fillna("") work regardless of the inferred dtype.
    cleaned_texts = pd.Series(texts, dtype="object").fillna("").astype(str).tolist()

    encoded = tokenizer(
        text=cleaned_texts,
        add_special_tokens=True,
        max_length=MAX_LEN,
        padding='max_length',
        truncation=True,
        return_tensors='np'
    )
    return encoded['input_ids'], encoded['attention_mask']

# Tokenize the 'text' column of the dataset once, keeping ids and masks.
input_ids, attention_masks = tokenize_text(texts=data['text'])

## Image Preprocessing

In [None]:
def preprocess_image(path):
    """Load the image file at ``path`` and prepare it as ResNet50 input.

    The file is read, decoded with the JPEG decoder into 3 channels, resized
    to ``IMG_SIZE`` and passed through
    ``tf.keras.applications.resnet50.preprocess_input``.

    Args:
        path: Location of the image file, as accepted by ``tf.io.read_file``.

    Returns:
        The preprocessed image tensor.
    """
    raw_bytes = tf.io.read_file(path)
    decoded = tf.image.decode_jpeg(raw_bytes, channels=3)
    resized = tf.image.resize(decoded, IMG_SIZE)
    return tf.keras.applications.resnet50.preprocess_input(resized)

In [None]:
# Augmentation pipeline: the three random layers are applied in this order.
data_augmentation = tf.keras.Sequential()
data_augmentation.add(tf.keras.layers.RandomFlip("horizontal"))  # horizontal flips only
data_augmentation.add(tf.keras.layers.RandomRotation(0.1))       # rotation factor 0.1
data_augmentation.add(tf.keras.layers.RandomZoom(0.1))           # zoom factor 0.1