# Load annotations

In [9]:
import os
import cv2
import json
import torch
import tqdm
import numpy as np
import pandas as pd

from matplotlib import pyplot as plt

In [26]:
# Identifier of the model under inspection.
# NOTE: the per-model statistics loop further down reuses `model` as its loop
# variable, so this value is overwritten once that cell has run.
model = "visual_bert_coco"

# Locations of the Hateful Memes annotation files and images.
annotation_folder, image_folder = (
    "/opt/datasets/mmf/datasets/hateful_memes/defaults/annotations",
    "/opt/datasets/mmf/datasets/hateful_memes/defaults/images",
)

In [27]:
# Read the training annotations (JSON Lines: one JSON record per line) into a
# DataFrame and preview the first rows.
train_df = pd.read_json(
    path_or_buf=os.path.join(annotation_folder, "train.jsonl"),
    lines=True,
)
train_df.head()

Unnamed: 0,id,img,label,text
0,42953,img/42953.png,0,its their character not their color that matters
1,23058,img/23058.png,0,don't be afraid to love again everyone is not ...
2,13894,img/13894.png,0,putting bows on your pet
3,37408,img/37408.png,0,i love everything and everybody! except for sq...
4,82403,img/82403.png,0,"everybody loves chocolate chip cookies, even h..."


In [28]:
# Read the dev_seen annotations (JSON Lines: one JSON record per line) into a
# DataFrame and preview the first rows.
dev_seen_df = pd.read_json(
    path_or_buf=os.path.join(annotation_folder, "dev_seen.jsonl"),
    lines=True,
)
dev_seen_df.head()

Unnamed: 0,id,img,label,text
0,8291,img/08291.png,1,white people is this a shooting range
1,46971,img/46971.png,1,bravery at its finest
2,3745,img/03745.png,1,your order comes to $37.50 and your white priv...
3,83745,img/83745.png,1,it is time.. to send these parasites back to t...
4,80243,img/80243.png,1,mississippi wind chime


# Compute gradient weightage for each modality

In [53]:
def _l2_normalize(values):
    """Divide ``values`` by its L2 norm, leaving an all-zero input as zeros."""
    norm = np.linalg.norm(values)
    # A zero norm means every entry is zero. Dividing by it would yield NaNs
    # (plus a RuntimeWarning) that silently poison any mean/std computed over
    # many samples, so fall back to a neutral divisor instead.
    if norm == 0:
        norm = 1.0
    return values / norm


def get_gradient_weightage(idx):
    """Return L2-normalized per-feature gradient magnitudes for one sample.

    The saved gradients are read through ``load_npy`` from the module-level
    directories ``attr_text_dir`` and ``attr_img_dir``; both names must be
    bound before this function is called.

    Args:
        idx: Sample identifier; it is formatted directly into the file names
            ``{idx}_text_gradients.npy`` and ``{idx}_img_gradients.npy``.

    Returns:
        tuple: ``(text, img)`` arrays, each obtained by summing the loaded
        gradients over axis 1 and dividing by the L2 norm of the result.
        When a modality's summed gradients are all zero, zeros are returned
        for it instead of NaNs.
    """
    # Text: drop singleton dimensions, take magnitudes, then collapse axis 1.
    # Assumes the squeezed array is 2-D (presumably tokens x hidden size) - TODO confirm.
    text_gradients = load_npy(attr_text_dir, f"{idx}_text_gradients.npy").squeeze()
    text_gradients = np.abs(text_gradients)
    text_gradients = text_gradients.sum(axis=1)

    # Image: no explicit abs()/squeeze() here.
    # NOTE(review): the third positional argument (True) presumably makes
    # load_npy do the equivalent preprocessing - confirm against load_npy.
    img_gradients = load_npy(attr_img_dir, f"{idx}_img_gradients.npy", True)
    img_gradients = img_gradients.sum(axis=1)

    # As text and visual inputs have different number of input features,
    # we will normalize the text and visual inputs to better represent the attribution
    return _l2_normalize(text_gradients), _l2_normalize(img_gradients)

In [64]:
for model in ['visual_bert', 'visual_bert_coco']:
    # get_gradient_weightage() looks these two names up at module level, so
    # they have to be rebound before any sample of this model is processed.
    # NOTE: the loop variable also rebinds the module-level `model` name.
    attr_text_dir = f"../model_outputs/{model}/text"
    attr_img_dir = f"../model_outputs/{model}/img"

    # One scalar per dev_seen sample and modality: the sum of that sample's
    # normalized gradient vector.
    # NOTE(review): ids are parsed as integers (the preview shows 8291 for
    # img/08291.png), so the .npy file names are built without zero padding -
    # confirm this matches how the gradient files were written.
    text_weightages, visual_weightages = [], []
    for sample_id in dev_seen_df['id']:
        text_share, img_share = get_gradient_weightage(sample_id)
        text_weightages.append(text_share.sum())
        visual_weightages.append(img_share.sum())

    # Report mean followed by standard deviation for each modality.
    print(model)
    print(f"text inputs mean: {np.mean(text_weightages):.3f}, {np.std(text_weightages):.3f}")
    print(f"visual inputs mean: {np.mean(visual_weightages):.3f}, {np.std(visual_weightages):.3f}")
    print()

visual_bert
text inputs mean: 3.006, 0.804
visual inputs mean: 7.705, 2.179

visual_bert_coco
text inputs mean: 3.112, 0.843
visual inputs mean: 6.444, 0.976

