In [1]:
import numpy as np 
import pandas as pd 

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import plotly.express as px

import warnings
# Silence every warning for the rest of the session (this also hides deprecation notices).
warnings.filterwarnings('ignore')

import os
# List every file below the Kaggle input directory, one full path per line.
# os.walk yields nothing when the directory does not exist, so this prints nothing off-Kaggle.
for folder, _, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(folder, name) for name in names):
        print(full_path)

In [2]:
# Load the processed metadata tables for the train and dev splits.
train_df, dev_df = map(
    pd.read_csv,
    ("./data/processed/train_metadata.csv", "./data/processed/dev_metadata.csv"),
)

In [3]:
# Preview the first rows of the training metadata.
train_df.head()

Unnamed: 0,id,img,label,text,img_path
0,42953,img/42953.png,0,its their character not their color that matters,./data/raw/HM Dataset/img/42953.png
1,23058,img/23058.png,0,don't be afraid to love again everyone is not ...,./data/raw/HM Dataset/img/23058.png
2,13894,img/13894.png,0,putting bows on your pet,./data/raw/HM Dataset/img/13894.png
3,37408,img/37408.png,0,i love everything and everybody! except for sq...,./data/raw/HM Dataset/img/37408.png
4,82403,img/82403.png,0,"everybody loves chocolate chip cookies, even h...",./data/raw/HM Dataset/img/82403.png


In [4]:
# Preview the first rows of the dev metadata.
dev_df.head()

Unnamed: 0,id,img,label,text,img_path
0,8291,img/08291.png,1,white people is this a shooting range,./data/raw/HM Dataset/img/08291.png
1,46971,img/46971.png,1,bravery at its finest,./data/raw/HM Dataset/img/46971.png
2,3745,img/03745.png,1,your order comes to $37.50 and your white priv...,./data/raw/HM Dataset/img/03745.png
3,83745,img/83745.png,1,it is time.. to send these parasites back to t...,./data/raw/HM Dataset/img/83745.png
4,80243,img/80243.png,1,mississippi wind chime,./data/raw/HM Dataset/img/80243.png


In [5]:
# Root directory of the raw "HM Dataset" folder (the one that contains img/).
# Set the HM_DATA_ROOT environment variable to run the notebook on another machine;
# when it is unset, the original local path is used unchanged.
DATA_ROOT = os.environ.get(
    "HM_DATA_ROOT",
    r"E:\Machine Learning\_Projects\A Multimodal Framework for Detecting Harmful Memes\notebooks\data\raw\HM Dataset",
)


def fix_image_paths(df, root_path):
    """Return a copy of ``df`` whose ``img`` column is rebased onto ``root_path``.

    Each value keeps only its immediate parent folder and its file name
    (``img/42953.png`` -> ``<root_path>/img/42953.png``). A bare file name
    with no folder component is placed under ``<root_path>/img``.

    Args:
        df: DataFrame with a string ``img`` column.
        root_path: Directory that contains the image folder.

    Returns:
        A new DataFrame with the rewritten ``img`` column; the frame passed
        in is left unmodified, so re-running the calling cell stays safe.
    """
    def _rebase(path):
        # os.path does not treat "\\" as a separator on POSIX, so Windows-style
        # values would not be split there. Forward slashes are accepted by
        # os.path on both Windows and POSIX, hence the normalisation.
        normalized = path.replace('\\', '/')
        if '/' not in normalized:
            return os.path.join(root_path, 'img', normalized)
        parent = os.path.basename(os.path.dirname(normalized))
        return os.path.join(root_path, parent, os.path.basename(normalized))

    # assign() builds a new frame instead of writing into the caller's object.
    return df.assign(img=df['img'].apply(_rebase))

# Rebase the image column of both splits onto the local dataset root.
train_df, dev_df = (fix_image_paths(frame, DATA_ROOT) for frame in (train_df, dev_df))

# Sanity check: the first training image should now resolve to a file on disk.
sample_path = train_df['img'].iloc[0]
print(
    f"Checking fixed path: {sample_path}",
    f"File exists? {os.path.exists(sample_path)}",
    sep="\n",
)

Checking fixed path: E:\Machine Learning\_Projects\A Multimodal Framework for Detecting Harmful Memes\notebooks\data\raw\HM Dataset\img\42953.png
File exists? True


## Image Preprocessing

In [6]:
import os
import tensorflow as tf

# Configuration
IMG_SIZE = (224, 224)  # (height, width) passed to tf.image.resize in preprocess_image
MAX_LEN = 64  # max_length used for tokenizer padding/truncation in tokenize_text
BATCH_SIZE = 32  # batch size applied at the end of create_multimodal_dataset

In [7]:
def preprocess_image(image_path):
    """Load one image file and prepare it as ResNet50 input.

    Args:
        image_path: Path of the image file (Python string or scalar string tensor).

    Returns:
        A float32 tensor of shape ``(*IMG_SIZE, 3)`` that has been passed
        through the ResNet50 ``preprocess_input`` function.
    """
    raw_bytes = tf.io.read_file(image_path)
    # The dataset files are .png, so use the format-agnostic decoder rather than
    # relying on decode_jpeg also accepting PNG content. expand_animations=False
    # keeps the result 3-D (H, W, C), which tf.image.resize needs to infer the shape.
    img = tf.io.decode_image(raw_bytes, channels=3, expand_animations=False)
    img = tf.image.resize(img, IMG_SIZE)
    # Standard ResNet preprocessing (converts RGB to BGR and centers pixels)
    img = tf.keras.applications.resnet50.preprocess_input(img)
    return img

In [8]:
# Data Augmentation (Only for training)
# NOTE(review): none of the visible cells applies this Sequential model;
# create_multimodal_dataset augments with tf.image.random_* ops instead.
# Confirm whether it is used further down or can be removed.
data_augmentation = tf.keras.Sequential([
    tf.keras.layers.RandomFlip("horizontal"),
    tf.keras.layers.RandomRotation(0.1),
    tf.keras.layers.RandomZoom(0.1),
])

## Text Preprocessing

In [9]:
from transformers import AutoTokenizer
# Initialize the tokenizer from the pretrained "roberta-base" checkpoint.
# tokenize_text reads this module-level object, so this cell must run first.
tokenizer = AutoTokenizer.from_pretrained("roberta-base")

In [10]:
def tokenize_text(texts):
    """Tokenize a collection of strings with the module-level ``tokenizer``.

    Args:
        texts: Object exposing ``.tolist()`` (e.g. a pandas Series) that yields strings.

    Returns:
        Tuple ``(input_ids, attention_mask)`` of NumPy arrays; every row is
        padded or truncated to ``MAX_LEN`` tokens.
    """
    # Pad/truncate every sequence to the same fixed length and request NumPy output.
    tokenizer_options = dict(
        add_special_tokens=True,
        max_length=MAX_LEN,
        padding='max_length',
        truncation=True,
        return_tensors='np',
    )
    batch = tokenizer(text=texts.tolist(), **tokenizer_options)
    return batch['input_ids'], batch['attention_mask']

## Multimodal Dataset

In [11]:
def create_multimodal_dataset(df, training=True):
    """Build a batched ``tf.data`` pipeline of ``(inputs, label)`` pairs.

    Args:
        df: DataFrame with ``img`` (image file path), ``text`` and ``label`` columns.
        training: When True, the examples are shuffled and the images are
            randomly augmented; when False, the split is read in order, unaugmented.

    Returns:
        A batched, prefetched ``tf.data.Dataset``. Each element is a dict with
        keys ``image_input``, ``input_ids`` and ``attention_mask`` plus the label.
    """
    # Perform text preprocessing upfront for the entire split
    input_ids, attention_masks = tokenize_text(df['text'])

    # Create a TF Dataset from the components
    dataset = tf.data.Dataset.from_tensor_slices((
        {
            "image_input": df['img'].values,
            "input_ids": input_ids,
            "attention_mask": attention_masks
        },
        df['label'].values
    ))

    if training:
        # Shuffle BEFORE decoding: at this point an element is only a path string
        # plus token ids, so the buffer can cover the whole split cheaply. A buffer
        # smaller than the dataset only mixes locally, which is a problem if the
        # rows are ordered (e.g. grouped by label), and shuffling after map()
        # would keep that many decoded images in memory.
        dataset = dataset.shuffle(buffer_size=max(len(df), 1))

    def map_fn(inputs, label):
        # Apply image preprocessing
        image = preprocess_image(inputs["image_input"])

        # Data Augmentation (Only if training=True)
        if training:
            # NOTE(review): a horizontal flip also mirrors any text rendered in the
            # image; kept as in the original pipeline - confirm it is desired.
            image = tf.image.random_flip_left_right(image)
            # ResNet50 preprocess_input centers channels without rescaling, so the
            # values stay on the 0-255 scale; a +/-10% brightness shift is therefore
            # 0.1 * 255 (a raw delta of 0.1 would be practically invisible).
            image = tf.image.random_brightness(image, max_delta=0.1 * 255.0)

        # Build a new dict rather than mutating the incoming one.
        return {**inputs, "image_input": image}, label

    dataset = dataset.map(map_fn, num_parallel_calls=tf.data.AUTOTUNE)

    return dataset.batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

In [12]:
# Initialize official splits: the train pipeline is built with training=True
# (augmentation + shuffling), the dev pipeline with training=False.

train_ds = create_multimodal_dataset(train_df, training=True)
val_ds = create_multimodal_dataset(dev_df, training=False)

In [13]:
# Display the dataset repr to check its element_spec (shape and dtype of each input and of the label).
train_ds

<_PrefetchDataset element_spec=({'image_input': TensorSpec(shape=(None, 224, 224, 3), dtype=tf.float32, name=None), 'input_ids': TensorSpec(shape=(None, 64), dtype=tf.int32, name=None), 'attention_mask': TensorSpec(shape=(None, 64), dtype=tf.int32, name=None)}, TensorSpec(shape=(None,), dtype=tf.int64, name=None))>

In [14]:
# Display the dataset repr to check its element_spec (shape and dtype of each input and of the label).
val_ds

<_PrefetchDataset element_spec=({'image_input': TensorSpec(shape=(None, 224, 224, 3), dtype=tf.float32, name=None), 'input_ids': TensorSpec(shape=(None, 64), dtype=tf.int32, name=None), 'attention_mask': TensorSpec(shape=(None, 64), dtype=tf.int32, name=None)}, TensorSpec(shape=(None,), dtype=tf.int64, name=None))>