# [Task 2C: Multimodal Propagandistic Memes Classification](https://araieval.gitlab.io/task2/) at [ArabicNLP 2024](https://arabicnlp2024.sigarab.org/) @ACL 2024

@Author: Md. Arid Hasan

Given multimodal content (text extracted from meme and the meme itself) the task is to detect whether the content is propagandistic.



### installing required libraries.
 - transformers
 - datasets
 - evaluate
 - accelerate

In [5]:
!pip install transformers
!pip install datasets
!pip install evaluate
!pip install --upgrade accelerate
!pip install -U datasets

Collecting evaluate
  Downloading evaluate-0.4.5-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.5-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.5
Collecting accelerate
  Downloading accelerate-1.9.0-py3-none-any.whl.metadata (19 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torc

Collecting datasets
  Downloading datasets-4.0.0-py3-none-any.whl.metadata (19 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-4.0.0-py3-none-any.whl (494 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m494.8/494.8 kB[0m [31m15.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl (193 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m17.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fsspec, datasets
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2025.3.2
    Uninstalling fsspec-2025.3.2:
      Successfully uninstalled fsspec-2025.3.2
  Attempting uninstall: datasets
    Found existing installation: datasets 2.14.4
    Uninstalling datasets-2.14.4:
      Successfully uninstalled datasets-2.14.4
[31mERROR: pip's dependency r

### Setting up the training parameters

In [1]:
learning_rate=2e-5
num_train_epochs=2
train_max_seq_len = 512
max_train_samples = None
max_eval_samples=None
max_predict_samples=None
batch_size = 16

#### Define custom dataset Class

In [61]:
import csv
import numpy as np
import torch
from PIL import Image
from torchvision import transforms
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, AutoTokenizer

class MultimodalDataset(Dataset):
    def __init__(self, ids, text_data, image_data, labels, is_test=False):
        self.text_data = text_data
        self.image_data = image_data
        self.ids = ids
        self.is_test = is_test
        #if not self.is_test:
        self.labels = labels
        self.tokenizer = AutoTokenizer.from_pretrained('distilbert-base-multilingual-cased') #bert-base-multilingual-uncased
        self.transform = transforms.Compose([transforms.Resize(256),
                                             transforms.CenterCrop(224),
                                             transforms.ToTensor(),
                                             transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
                                             ])

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, index):
        id = self.ids[index]
        text = self.text_data[index]
        image = self.image_data[index]
        #if not self.is_test:
        label = self.labels[index]

        # tokenize text data
        text = self.tokenizer.encode_plus(text, add_special_tokens=True,
                                           max_length=train_max_seq_len, padding='max_length',
                                           return_attention_mask=True, return_tensors='pt')

        #transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        image = self.transform(Image.open(image).convert("RGB"))

        fdata = {
            'id': id,
            'text': text['input_ids'].squeeze(0),
            'text_mask': text['attention_mask'].squeeze(0),
            'img_path': image,
        }
        if not self.is_test:
            fdata['label'] = torch.tensor(label, dtype=torch.long)
            return fdata
        else:
            return fdata


## Download data from HF: https://huggingface.co/datasets/QCRI/Prop2Hate-Meme
### Defining the training, validation, and test data

In [27]:
from datasets import load_dataset

dataset = load_dataset("QCRI/Prop2Hate-Meme")

# Specify the directory where you want to save the dataset

output_dir="./Prop2Hate-Meme"

# Save the dataset to the specified directory. This will save all splits to the output directory.
dataset.save_to_disk(output_dir)

# If you want to get the raw images from HF dataset format

from PIL import Image
import os
import json

# Directory to save the images
output_dir="./Prop2Hate-Meme/"
os.makedirs(output_dir, exist_ok=True)

# Iterate over the dataset and save each image
for split in ['train','dev','test']:
    jsonl_path = os.path.join(output_dir, f"arabic_hateful_meme_{split}.jsonl")
    with open(jsonl_path, 'w', encoding='utf-8') as f:
        for idx, item in enumerate(dataset[split]):
            # Access the image directly as it's already a PIL.Image object
            image = item['image']
            image_path = os.path.join(output_dir, item['img_path'])
            # Ensure the directory exists
            os.makedirs(os.path.dirname(image_path), exist_ok=True)
            image.save(image_path)
            del item['image']
            del item['prop_label']
            del item['hate_fine_grained_label']
            item['label'] = item.pop('hate_label')
            f.write(json.dumps(item, ensure_ascii=False) + '\n')



Saving the dataset (0/1 shards):   0%|          | 0/2143 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/606 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/312 [00:00<?, ? examples/s]

In [28]:
import os
os.chdir("Prop2Hate-Meme")

train_file = './arabic_hateful_meme_train.jsonl'
validation_file = './arabic_hateful_meme_dev.jsonl'
test_file = './arabic_hateful_meme_test.jsonl'



In [29]:
!ls

arabic_hateful_meme_dev.jsonl	 data		    test
arabic_hateful_meme_test.jsonl	 dataset_dict.json  train
arabic_hateful_meme_train.jsonl  dev


In [30]:
jsonl_path = "./arabic_hateful_meme_train.jsonl" # Example path, modify as needed
data = []
with open(jsonl_path, 'r', encoding='utf-8') as f:
  for line in f:
    data.append(json.loads(line))

# data is now a list of dictionaries, where each dictionary is a parsed JSON object from a line in the file.
print(f"Loaded {len(data)} entries from {jsonl_path}")
if data:
    print("First entry:")
data[0]

Loaded 2143 entries from ./arabic_hateful_meme_train.jsonl
First entry:


{'id': 'data/arabic_memes_fb_insta_pinterest/Pinterest/images/pinterest_images_part2/www.pinterest.com_pin_374924737743995066/7485ad3c9c4cd8159ce93997a18a53a8.jpg',
 'text': 'زوجة ماكرون تصرح أن الحجاب يرعب ويخيف الأطفال..😅😂😂',
 'img_path': './data/arabic_memes_fb_insta_pinterest/Pinterest/images/pinterest_images_part2/www.pinterest.com_pin_374924737743995066/7485ad3c9c4cd8159ce93997a18a53a8.jpg',
 'label': 0}

#### Defining the Model

In [31]:
text_model_name = 'distilbert-base-multilingual-cased'

#### Loading data files

In [65]:
import json
import pandas as pd
from tqdm import tqdm

import pandas as pd
from datasets import Dataset, DatasetDict

def read_jsonl_to_df(filename):
    return pd.read_json(filename, lines=True)

l2id = {'not-hateful': 0, 'hateful': 1}

# Assume all splits use "img_path" as the image column
def prepare_dataset(file):
    df = read_jsonl_to_df(file)
    # df['label'] = df['label'].map(l2id)
    # Cast "img_path" column as Image
    return Dataset.from_pandas(df) #.cast_column("img_path", Image())

train_df = prepare_dataset(train_file)
train_dataset = MultimodalDataset(train_df['id'], train_df['text'], train_df['img_path'], train_df['label'])

validation_dataset = prepare_dataset(validation_file)
validation_dataset = MultimodalDataset(validation_dataset['id'], validation_dataset['text'], validation_dataset['img_path'], validation_dataset['label'])

test_dataset = prepare_dataset(test_file)
test_dataset = MultimodalDataset(test_dataset['id'], test_dataset['text'], test_dataset['img_path'], test_dataset['label'])

raw_datasets = DatasetDict(
    {"train": train_dataset, "validation": validation_dataset, "test": test_dataset}
)



### Finalize the train data for training

In [67]:
if max_train_samples is not None:
    max_train_samples_n = min(len(train_dataset), max_train_samples)
    train_dataset = train_dataset.select(range(max_train_samples_n))

#### Finalize the development/evaluation data for evaluating the model

In [69]:

if max_eval_samples is not None:
    max_eval_samples_n = min(len(validation_dataset), max_eval_samples)
    validation_dataset = validation_dataset.select(range(max_eval_samples_n))

#### Finalize the test data for predicting the unseen test data using the model

In [70]:

if max_predict_samples is not None:
    max_predict_samples_n = min(len(test_dataset), max_predict_samples)
    test_dataset = test_dataset.select(range(max_predict_samples_n))

#### Log a few random samples from the training set

In [71]:
import random
for index in random.sample(range(len(train_dataset)), 2):
    print(f"Sample {index} of the training set: {train_dataset[index]}.")

Sample 1567 of the training set: {'id': 'data/arabic_memes_fb_insta_pinterest/Facebook/images/islamic.stolen.memes/261673765_434120414908873_906898882982388272_n.jpg', 'text': tensor([   101,  12441,  38901,  68269,    766,  31330,  11086,  75139,  44324,
           785,  10388,  30373,  11797,    777,  18914,  11852,  12497,  59901,
         75047,  22973,  10461,    791,  10502,  40194,  10765,    787,  62347,
         25741,  39274,  10673,  82225,  59901,  33061,  38058,    791,  16506,
         11832,  63742,    793,  14495,  11509,  12616,    788,  26341,  40926,
           791,  55438,  29732,  10765,  54069,  10560,  39274,  10673,  82225,
        105532,  10461,    102,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0, 

### Batchify

In [83]:
train_df = torch.utils.data.DataLoader(train_dataset, batch_size=8, shuffle=True, drop_last=True)
validation_df = torch.utils.data.DataLoader(validation_dataset, batch_size=8, shuffle=True, drop_last=True)
test_df = torch.utils.data.DataLoader(test_dataset, batch_size=8, shuffle=True, drop_last=True)

## MultiModal

In [79]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.models as models
from transformers import BertModel, AutoModel

# Define the multimodal classification model
class MultimodalClassifier(nn.Module):
    def __init__(self, num_classes):
        super(MultimodalClassifier, self).__init__()

        # BERT model for text input
        #config = AutoConfig.from_pretrained('xlm-roberta-xlarge', num_labels=2,use_auth_token=None)
        self.bert = AutoModel.from_pretrained(text_model_name)

        self.bert_drop = nn.Dropout(0.3)
        self.bert_fc = nn.Linear(768, 512) #for BERT=768

        # ResNet model for image input
        self.resnet = models.resnet50(pretrained=True)
        self.resnet_fc = nn.Linear(1000, 512)

        # Fusion layer
        self.fusion_fc = nn.Linear(1024, 512)
        # Output layer
        self.output_fc = nn.Linear(512, num_classes)

    def forward(self, text, image, mask):
        #image = image.unsqueeze(0)
        # Text input through BERT model
        bert_output = self.bert(text, attention_mask=mask, return_dict=False) #attention_mask=mask,
        #bert_output = self.bert(text, attention_mask=mask, return_dict=False) #attention_mask=mask,
        #print(bert_output)
        bert_output = self.bert_drop(bert_output[0][:, -1, :])
        bert_output = self.bert_fc(bert_output)


        # Image input through ResNet model
        resnet_output = self.resnet(image)
        resnet_output = self.resnet_fc(resnet_output)

        # Concatenate the text and image features
        # bert_output = bert_output.squeeze(2)
        # print(bert_output.shape)
        # print(resnet_output.shape)
        features = torch.cat((bert_output, resnet_output), dim=1)

        # Fusion layer
        features = self.fusion_fc(features)
        # Output layer
        output = self.output_fc(features)

        return output

# Define the training and testing functions
def train(model, train_loader, criterion, optimizer, device):
    model.train()
    train_loss = 0.0
    correct = 0
    for data in tqdm(train_loader):
        optimizer.zero_grad()
        text = data["text"].to(device)
        #print(text.shape)
        image = data["img_path"].to(device)
        mask = data["text_mask"].to(device)
        #print(mask.shape)
        labels = data['label'].to(device)
        output = model(text, image, mask)
        #print(output)
        loss = criterion(output, labels)
        #print(loss)
        loss.backward()
        optimizer.step()
        train_loss += loss.item() * labels.size(0)
        _, predicted = torch.max(output, 1)
        correct += (predicted == labels).sum().item()
    train_loss /= len(train_loader.dataset)
    accuracy = correct / len(train_loader.dataset)
    return train_loss, accuracy

def test(model, test_loader, criterion, device):
    model.eval()
    test_loss = 0.0
    correct = 0
    with torch.no_grad():
        for data in tqdm(test_loader):
            text = data["text"].to(device)
            image = data["img_path"].to(device)
            mask = data["text_mask"].to(device)
            labels = data['label'].to(device)
            output = model(text, image, mask)
            loss = criterion(output, labels)
            test_loss += loss.item() * labels.size(0)
            _, predicted = torch.max(output, 1)
            correct += (predicted == labels).sum().item()
    test_loss /= len(test_loader.dataset)
    accuracy = correct / len(test_loader.dataset)
    return test_loss, accuracy


In [80]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = MultimodalClassifier(num_classes=2)
model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=2e-5)

# Train the model
num_epochs = 1
for epoch in range(num_epochs):
    train_loss, acc = train(model, train_df, criterion, optimizer, device)
    #dev_loss, accuracy = test(model, eval_dataset, criterion, device)
    print('Epoch {}/{}: Train Loss = {:.4f}, Accuracy = {:.4f}'.format(epoch+1, num_epochs, train_loss, acc))


100%|██████████| 267/267 [02:25<00:00,  1.83it/s]

Epoch 1/1: Train Loss = 0.3130, Accuracy = 0.8922





### Evaluate

In [82]:
def evaluate(model, test_loader, device):
    model.eval()
    predictions = []
    y_test_pred = []
    ids = []
    with torch.no_grad():
        for data in tqdm(test_loader):
            text = data["text"].to(device)
            image = data["img_path"].to(device)
            mask = data["text_mask"].to(device)
            output = model(text, image, mask)
            _, predicted = torch.max(output, 1)
            predictions.append(predicted)
            ids.append(data["id"])

    with open(f'task2_TeamName.tsv', 'w') as f:
      f.write("id\tlabel\n")
      indx = 0
      id2l = {0:'not-hateful', 1:'hateful'}
      for i, line in enumerate(predictions):
        for indx, l in enumerate(line.tolist()):
          f.write(f"{ids[i][indx]}\t{id2l[l]}\n")

evaluate(model, validation_df, device)

100%|██████████| 39/39 [00:05<00:00,  7.56it/s]
