# Environment setup

In [None]:
# Mount Google Drive at /content/drive; every project path used in this notebook
# lives under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!pip install torch torchvision torchaudio -f https://download.pytorch.org/whl/torch_stable.html

In [None]:
!pip install -r /content/drive/MyDrive/BC/meme/requirements.txt

In [None]:
!pip install git+https://github.com/openai/CLIP.git

# Weights & Biases setup

In [None]:
import wandb

# Do not paste the API key into the notebook. Called without a key, wandb.login()
# picks up existing credentials (e.g. the WANDB_API_KEY environment variable) and
# otherwise asks for the key interactively, so the secret is never saved in a cell.
wandb.login()

# Reproducing the results on the HMC and HARMEME datasets

To train the model instead of reproducing the results, change the `--reproduce t` flag to `--reproduce f` in the commands below.

## HMC dataset

In [None]:
# Runs main.py on the HMC dataset with run name 'text-inv-comb' and the checkpoint
# 'hmc_text-inv-comb_best.ckpt'. Change '--reproduce t' to '--reproduce f' to train the model.
!python3 /content/drive/MyDrive/BC/meme/src/main.py --dataset 'hmc' --num_mapping_layers 1 --map_dim 1024 --fusion align --num_pre_output_layers 1 --drop_probs 0.2 0.4 0.1 --gpus '0' --batch_size 64 --lr 0.0001 --max_epochs 50 --name 'text-inv-comb' --pretrained_model 'hmc_text-inv-comb_best.ckpt' --reproduce t --pretrained_proj_weights t --freeze_proj_layers t --comb_proj t --comb_fusion align --convex_tensor f --phi_inv_proj t --text_inv_proj t --post_inv_proj t --enh_text t --phi_freeze t --fast_process t

## HARMEME dataset

In [None]:
# Runs main.py on the HARMEME dataset with run name 'text-inv-comb' and the checkpoint
# 'harmeme_text-inv-comb_best.ckpt'. Change '--reproduce t' to '--reproduce f' to train the model.
!python3 /content/drive/MyDrive/BC/meme/src/main.py --dataset 'harmeme' --num_mapping_layers 1 --map_dim 768 --fusion align --num_pre_output_layers 3 --drop_probs 0.2 0.4 0.1 --gpus '0' --batch_size 64 --lr 0.000013 --max_epochs 60 --name 'text-inv-comb' --pretrained_model 'harmeme_text-inv-comb_best.ckpt' --reproduce t --pretrained_proj_weights t --freeze_proj_layers t --comb_proj t --comb_fusion align --convex_tensor f --phi_inv_proj t --text_inv_proj t --post_inv_proj t --enh_text t --phi_freeze t --fast_process t

# Visualization

In [None]:
# Switch the working directory to the project's src folder. The `datasets`, `engine`
# and `utils` imports in the cells below presumably resolve to modules there — TODO confirm.
%cd /content/drive/MyDrive/BC/meme/src

## Visualization of a random `test_unseen` HMC meme

In [None]:
%matplotlib inline
import os
import argparse
import random
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch

from torchvision import transforms

from pytorch_lightning import Trainer, seed_everything

from torch.utils.data import Dataset
from torch.utils.data import DataLoader

from datasets import MemesCollator, load_dataset
from engine import create_model, HateClassifier
from utils import str2bool, generate_name

from PIL import Image
from IPython.display import Image as IPImage, display


def get_arg_parser():
    """Create the argument parser for the hateful-memes training / evaluation code.

    Only registers the options; nothing is parsed here.

    Returns:
        argparse.ArgumentParser: the configured parser.
    """
    cli = argparse.ArgumentParser(description='Training and evaluation script for hateful memes classification')
    add = cli.add_argument

    def add_switches(*flags):
        # Boolean options: values are converted with str2bool, False when omitted.
        for flag in flags:
            add(flag, default=False, type=str2bool)

    # --- data ---
    add('--dataset', default='hmc', choices=['hmc', 'harmeme'])
    add('--image_size', type=int, default=224)

    # --- network shape ---
    add('--num_mapping_layers', default=1, type=int)
    add('--map_dim', default=768, type=int)
    add('--fusion', default='align', choices=['align', 'concat'])
    add('--num_pre_output_layers', default=1, type=int)
    add('--drop_probs', type=float, nargs=3, default=[0.1, 0.4, 0.2],
        help="Set drop probabilities for map, fusion, pre_output")

    # --- devices and training loop ---
    # limit_train_batches, limit_val_batches and val_check_interval declare no type,
    # so values given on the command line stay strings (the defaults are floats).
    add('--gpus', default='0', help='GPU ids concatenated with space')
    add('--limit_train_batches', default=1.0)
    add('--limit_val_batches', default=1.0)
    add('--max_steps', type=int, default=-1)
    add('--max_epochs', type=int, default=-1)
    add('--log_every_n_steps', type=int, default=25)
    add('--val_check_interval', default=1.0)
    add('--batch_size', type=int, default=16, help='Batch size')
    add('--lr', type=float, default=1e-4)
    add('--weight_decay', type=float, default=1e-4)
    add('--gradient_clip_val', type=float, default=0.1)

    # --- boolean feature switches (plus the comb_fusion mode) ---
    add_switches('--proj_map', '--pretrained_proj_weights', '--freeze_proj_layers', '--comb_proj')
    add('--comb_fusion', default='align', choices=['concat', 'align'])
    add_switches('--convex_tensor', '--text_inv_proj', '--phi_inv_proj', '--post_inv_proj',
                 '--enh_text', '--phi_freeze')

    # --- run identity and control ---
    add('--name', type=str, default='adaptation',
        choices=['adaptation', 'hate-clipper', 'image-only', 'text-only', 'sum', 'combiner', 'text-inv',
                 'text-inv-fusion', 'text-inv-comb'])
    add('--pretrained_model', type=str, default='')
    add_switches('--reproduce', '--print_model', '--fast_process')

    return cli


import shlex

# setting the arguments as a string
# NOTE(review): '--name adaptation' here differs from the '--name text-inv-comb' used by the
# main.py command for the same checkpoint — confirm which value create_model expects.
args_str = "--dataset hmc --image_size 224 --num_mapping_layers 1 --map_dim 1024 --fusion align --num_pre_output_layers 1 --drop_probs 0.2 0.4 0.1 --gpus 0 --limit_train_batches 1.0 --limit_val_batches 1.0 --max_steps -1 --max_epochs -1 --log_every_n_steps 25 --val_check_interval 1.0 --batch_size 16 --lr 0.000013 --weight_decay 1e-4 --gradient_clip_val 0.1 --proj_map False --pretrained_proj_weights t --freeze_proj_layers t --comb_proj t --comb_fusion align --convex_tensor False --text_inv_proj t --phi_inv_proj t --post_inv_proj t --enh_text t --phi_freeze t --name adaptation --pretrained_model 'hmc_text-inv-comb_best.ckpt' --reproduce t --print_model False --fast_process t"

# parsing the arguments
# shlex.split applies shell-style quoting rules, so the quotes around the checkpoint
# name are stripped; a plain str.split() would leave them inside the parsed value.
arguments = get_arg_parser().parse_args(shlex.split(args_str))

# setting the GPU ids (space-separated string -> list of ints)
arguments.gpus = [int(id_) for id_ in arguments.gpus.split()]

# loading dataset and creating dataloader
dataset_test = load_dataset(args=arguments, split='test_unseen')
collator = MemesCollator(arguments)
dataloader_test = DataLoader(dataset_test, batch_size=arguments.batch_size, collate_fn=collator, num_workers=0)
print("Number of samples in the dataset:", len(dataloader_test.dataset))

# loading or reproducing of trained model
model = create_model(arguments)

In [None]:
class MemesDataset(Dataset):
    """Meme samples of one split, described by an info CSV and loaded as PIL images.

    Each item is a dict with the keys 'image', 'text', 'label', 'idx_meme'
    and 'origin_text'.
    """

    # Info CSV that is read when the constructor gets no ``info_file``.
    DEFAULT_INFO_FILE = '/content/drive/MyDrive/BC/meme/resources/datasets/hmc/labels/hmc_info.csv'

    def __init__(self, root_folder, dataset, split='train', image_size=224, info_file=None):
        """Read the info CSV and keep the rows of the requested split.

        Args:
            root_folder: directory onto which the image paths from the CSV are joined.
            dataset: dataset tag; only the value 'hmc' changes how the image
                file name is derived in ``__getitem__``.
            split: value of the CSV 'split' column to keep.
            image_size: side length in pixels of the (square) resized images.
            info_file: path of the info CSV; ``DEFAULT_INFO_FILE`` when None.
        """
        super(MemesDataset, self).__init__()
        self.root_folder = root_folder
        self.dataset = dataset
        self.split = split
        self.image_size = image_size

        # The CSV location is an absolute path of its own; it is not derived
        # from root_folder or dataset.
        self.info_file = self.DEFAULT_INFO_FILE if info_file is None else info_file
        self.df = pd.read_csv(self.info_file)
        self.df = self.df[self.df['split'] == self.split].reset_index(drop=True)
        # Float columns are turned into nullable integers, with -1 standing in
        # for missing values.
        float_cols = self.df.select_dtypes(float).columns
        self.df[float_cols] = self.df[float_cols].fillna(-1).astype('Int64')

    def __len__(self):
        """Return the number of rows in the selected split."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return the sample at position ``idx`` as a dict (see class docstring)."""
        row = self.df.iloc[idx]

        # The literal text 'nothing' is replaced by 'null'.
        txt = 'null' if row['text'] == 'nothing' else row['text']

        # For 'hmc' only the second '/'-separated component of the 'img' value
        # is used; otherwise the value is taken as is.
        if self.dataset == 'hmc':
            image_fn = row['img'].split('/')[1]
        else:
            image_fn = row['img']

        image_path = os.path.join(self.root_folder, image_fn)
        # The context manager closes the image file once the converted and
        # resized copy has been produced.
        with Image.open(image_path) as raw_image:
            image = raw_image.convert('RGB').resize((self.image_size, self.image_size))

        return {
            'image': image,
            'text': txt,
            'label': row['label'],
            'idx_meme': row['id'],
            'origin_text': txt
        }


# loading of the dataset
# dataset is 'img' rather than 'hmc', so __getitem__ joins the CSV's full 'img'
# value onto root_folder. This rebinds dataset_test, replacing the object that
# load_dataset returned in the previous cell.
dataset_test = MemesDataset(root_folder='/content/drive/MyDrive/BC/meme/resources/datasets/hmc', dataset='img', split='test_unseen')

In [None]:
def show_random_image_with_prediction(dataset, model, idx=None):
    """Plot one sample with its true label and the label predicted by ``model``.

    Args:
        dataset: sized, indexable collection whose items are dicts with the
            keys 'image' (a PIL image, as produced by MemesDataset), 'label'
            and 'text'.
        model: object with ``eval()`` that is called with a dict holding
            'images', 'texts' and 'labels' and returns a mapping with 'logits'.
        idx: index of the sample to show; a random index is drawn when None.

    Raises:
        ValueError: if ``dataset`` is empty.
    """
    if len(dataset) == 0:
        raise ValueError("cannot visualize a sample from an empty dataset")

    # setting the model to evaluation mode (it is left in that mode afterwards)
    model.eval()

    # selecting a random index unless a specific sample was requested
    if idx is None:
        idx = random.randint(0, len(dataset) - 1)

    # retrieving the sample from the dataset
    sample = dataset[idx]

    # getting the image, label, and text from the sample
    image = sample['image']
    label = sample['label']
    text = sample['text']

    # preparation of the image for model prediction
    # NOTE(review): the image is only resized and scaled to [0, 1] and stays on the
    # CPU; confirm that the model expects un-normalized CPU tensors and a bare string as text.
    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
    ])

    # adding batch dimension
    input_image = transform(image).unsqueeze(0)

    # making predictions using the model
    with torch.no_grad():
        model_output = model({'images': input_image, 'texts': text, 'labels': label})

    # extracting the predicted label: sigmoid of the logit rounded to 0 or 1
    # (.item() requires the logits to hold exactly one value)
    predicted_label = int(torch.round(torch.sigmoid(model_output['logits'])).item())

    # converting the PIL image to a NumPy array in [0, 1] for display
    image_np = np.clip(np.array(image) / 255.0, 0, 1)

    # displaying the image with the true label and predicted label
    plt.imshow(image_np)
    plt.title(f"True Label: {label}, Predicted Label: {predicted_label}")
    plt.show()


# usage
# The sample index is drawn with random.randint, so each run of this cell may show a different meme.
show_random_image_with_prediction(dataset_test, model)

## Visualization of a random `test` HARMEME meme

In [None]:
%matplotlib inline
import os
import argparse
import random
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch

from torchvision import transforms

from pytorch_lightning import Trainer, seed_everything

from torch.utils.data import Dataset
from torch.utils.data import DataLoader

from datasets import MemesCollator, load_dataset
from engine import create_model, HateClassifier
from utils import str2bool, generate_name

from PIL import Image
from IPython.display import Image as IPImage, display


def get_arg_parser():
    """Create the argument parser for the hateful-memes training / evaluation code.

    Only registers the options; nothing is parsed here.

    Returns:
        argparse.ArgumentParser: the configured parser.
    """
    cli = argparse.ArgumentParser(description='Training and evaluation script for hateful memes classification')
    add = cli.add_argument

    def add_switches(*flags):
        # Boolean options: values are converted with str2bool, False when omitted.
        for flag in flags:
            add(flag, default=False, type=str2bool)

    # --- data ---
    add('--dataset', default='hmc', choices=['hmc', 'harmeme'])
    add('--image_size', type=int, default=224)

    # --- network shape ---
    add('--num_mapping_layers', default=1, type=int)
    add('--map_dim', default=768, type=int)
    add('--fusion', default='align', choices=['align', 'concat'])
    add('--num_pre_output_layers', default=1, type=int)
    add('--drop_probs', type=float, nargs=3, default=[0.1, 0.4, 0.2],
        help="Set drop probabilities for map, fusion, pre_output")

    # --- devices and training loop ---
    # limit_train_batches, limit_val_batches and val_check_interval declare no type,
    # so values given on the command line stay strings (the defaults are floats).
    add('--gpus', default='0', help='GPU ids concatenated with space')
    add('--limit_train_batches', default=1.0)
    add('--limit_val_batches', default=1.0)
    add('--max_steps', type=int, default=-1)
    add('--max_epochs', type=int, default=-1)
    add('--log_every_n_steps', type=int, default=25)
    add('--val_check_interval', default=1.0)
    add('--batch_size', type=int, default=16, help='Batch size')
    add('--lr', type=float, default=1e-4)
    add('--weight_decay', type=float, default=1e-4)
    add('--gradient_clip_val', type=float, default=0.1)

    # --- boolean feature switches (plus the comb_fusion mode) ---
    add_switches('--proj_map', '--pretrained_proj_weights', '--freeze_proj_layers', '--comb_proj')
    add('--comb_fusion', default='align', choices=['concat', 'align'])
    add_switches('--convex_tensor', '--text_inv_proj', '--phi_inv_proj', '--post_inv_proj',
                 '--enh_text', '--phi_freeze')

    # --- run identity and control ---
    add('--name', type=str, default='adaptation',
        choices=['adaptation', 'hate-clipper', 'image-only', 'text-only', 'sum', 'combiner', 'text-inv',
                 'text-inv-fusion', 'text-inv-comb'])
    add('--pretrained_model', type=str, default='')
    add_switches('--reproduce', '--print_model', '--fast_process')

    return cli


import shlex

# setting the arguments as a string
# NOTE(review): '--name adaptation' here differs from the '--name text-inv-comb' used by the
# main.py command for the same checkpoint — confirm which value create_model expects.
args_str = "--dataset harmeme --image_size 224 --num_mapping_layers 1 --map_dim 768 --fusion align --num_pre_output_layers 3 --drop_probs 0.2 0.4 0.1 --gpus 0 --limit_train_batches 1.0 --limit_val_batches 1.0 --max_steps -1 --max_epochs -1 --log_every_n_steps 25 --val_check_interval 1.0 --batch_size 16 --lr 0.000013 --weight_decay 1e-4 --gradient_clip_val 0.1 --proj_map False --pretrained_proj_weights t --freeze_proj_layers t --comb_proj t --comb_fusion align --convex_tensor False --text_inv_proj t --phi_inv_proj t --post_inv_proj t --enh_text t --phi_freeze t --name adaptation --pretrained_model 'harmeme_text-inv-comb_best.ckpt' --reproduce t --print_model False --fast_process t"

# parsing the arguments
# shlex.split applies shell-style quoting rules, so the quotes around the checkpoint
# name are stripped; a plain str.split() would leave them inside the parsed value.
arguments = get_arg_parser().parse_args(shlex.split(args_str))

# setting the GPU ids (space-separated string -> list of ints)
arguments.gpus = [int(id_) for id_ in arguments.gpus.split()]

# loading dataset and creating dataloader
dataset_test = load_dataset(args=arguments, split='test')
collator = MemesCollator(arguments)
dataloader_test = DataLoader(dataset_test, batch_size=arguments.batch_size, collate_fn=collator, num_workers=0)
print("Number of samples in the dataset:", len(dataloader_test.dataset))

# loading or reproducing of trained model
model = create_model(arguments)

In [None]:
class MemesDataset(Dataset):
    """Meme samples of one split, described by an info CSV and loaded as PIL images.

    Each item is a dict with the keys 'image', 'text', 'label', 'idx_meme'
    and 'origin_text'.
    """

    # Info CSV that is read when the constructor gets no ``info_file``.
    DEFAULT_INFO_FILE = '/content/drive/MyDrive/BC/meme/resources/datasets/harmeme/labels/harmeme_info.csv'

    def __init__(self, root_folder, dataset, split='train', image_size=224, info_file=None):
        """Read the info CSV and keep the rows of the requested split.

        Args:
            root_folder: directory onto which the image paths from the CSV are joined.
            dataset: dataset tag; only the value 'hmc' changes which CSV column
                the image file name is taken from in ``__getitem__``.
            split: value of the CSV 'split' column to keep.
            image_size: side length in pixels of the (square) resized images.
            info_file: path of the info CSV; ``DEFAULT_INFO_FILE`` when None.
        """
        super(MemesDataset, self).__init__()
        self.root_folder = root_folder
        self.dataset = dataset
        self.split = split
        self.image_size = image_size

        # The CSV location is an absolute path of its own; it is not derived
        # from root_folder or dataset.
        self.info_file = self.DEFAULT_INFO_FILE if info_file is None else info_file
        self.df = pd.read_csv(self.info_file)
        self.df = self.df[self.df['split'] == self.split].reset_index(drop=True)
        # Float columns are turned into nullable integers, with -1 standing in
        # for missing values.
        float_cols = self.df.select_dtypes(float).columns
        self.df[float_cols] = self.df[float_cols].fillna(-1).astype('Int64')

    def __len__(self):
        """Return the number of rows in the selected split."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return the sample at position ``idx`` as a dict (see class docstring)."""
        row = self.df.iloc[idx]

        # The literal text 'nothing' is replaced by 'null'.
        txt = 'null' if row['text'] == 'nothing' else row['text']

        # For 'hmc' the second '/'-separated component of the 'img' column is
        # used; otherwise the file name comes from the 'image' column.
        if self.dataset == 'hmc':
            image_fn = row['img'].split('/')[1]
        else:
            image_fn = row['image']

        image_path = os.path.join(self.root_folder, image_fn)
        # The context manager closes the image file once the converted and
        # resized copy has been produced.
        with Image.open(image_path) as raw_image:
            image = raw_image.convert('RGB').resize((self.image_size, self.image_size))

        return {
            'image': image,
            'text': txt,
            'label': row['label'],
            'idx_meme': row['id'],
            'origin_text': txt
        }


# loading of the dataset
# dataset is 'img' rather than 'hmc', so __getitem__ reads the file name from the
# CSV's 'image' column and joins it onto root_folder. This rebinds dataset_test,
# replacing the object that load_dataset returned in the previous cell.
dataset_test = MemesDataset(root_folder='/content/drive/MyDrive/BC/meme/resources/datasets/harmeme/img', dataset='img', split='test')

In [None]:
def show_random_image_with_prediction(dataset, model, idx=None):
    """Plot one sample with its true label and the label predicted by ``model``.

    Args:
        dataset: sized, indexable collection whose items are dicts with the
            keys 'image' (a PIL image, as produced by MemesDataset), 'label'
            and 'text'.
        model: object with ``eval()`` that is called with a dict holding
            'images', 'texts' and 'labels' and returns a mapping with 'logits'.
        idx: index of the sample to show; a random index is drawn when None.

    Raises:
        ValueError: if ``dataset`` is empty.
    """
    if len(dataset) == 0:
        raise ValueError("cannot visualize a sample from an empty dataset")

    # setting the model to evaluation mode (it is left in that mode afterwards)
    model.eval()

    # selecting a random index unless a specific sample was requested
    if idx is None:
        idx = random.randint(0, len(dataset) - 1)

    # retrieving the sample from the dataset
    sample = dataset[idx]

    # getting the image, label, and text from the sample
    image = sample['image']
    label = sample['label']
    text = sample['text']

    # preparation of the image for model prediction
    # NOTE(review): the image is only resized and scaled to [0, 1] and stays on the
    # CPU; confirm that the model expects un-normalized CPU tensors and a bare string as text.
    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
    ])

    # adding batch dimension
    input_image = transform(image).unsqueeze(0)

    # making predictions using the model
    with torch.no_grad():
        model_output = model({'images': input_image, 'texts': text, 'labels': label})

    # extracting the predicted label: sigmoid of the logit rounded to 0 or 1
    # (.item() requires the logits to hold exactly one value)
    predicted_label = int(torch.round(torch.sigmoid(model_output['logits'])).item())

    # converting the PIL image to a NumPy array in [0, 1] for display
    image_np = np.clip(np.array(image) / 255.0, 0, 1)

    # displaying the image with the ground truth and predicted labels
    plt.imshow(image_np)
    plt.title(f"True Label: {label}, Predicted Label: {predicted_label}")
    plt.show()


# usage
# The sample index is drawn with random.randint, so each run of this cell may show a different meme.
show_random_image_with_prediction(dataset_test, model)