In [20]:
# imports
import math, statistics, time
from collections import defaultdict
import numpy as np
from datetime import datetime
import pickle
import pandas as pd
import torch.nn as nn
import torch
import numpy as np
from torch.utils.data import DataLoader, Dataset
from sentence_transformers import SentenceTransformer, LoggingHandler, losses, InputExample
from tqdm import tqdm

# HF token
# SECURITY: an access token must never be committed inside a notebook. The
# token that used to be hard-coded here has to be treated as leaked and revoked.
# It is now read from the HF_TOKEN environment variable instead.
import os
from huggingface_hub import HfApi, HfFolder

token = os.environ.get('HF_TOKEN')
api=HfApi()
folder=HfFolder()
if token:
    api.set_access_token(token)
    folder.save_token(token)
else:
    # Every model in this notebook is loaded from a local '../models/...' path,
    # so a missing token is reported rather than treated as fatal.
    print('HF_TOKEN is not set; skipping Hugging Face authentication.')
base_model = 'roberta-base'
device = 'cuda' if torch.cuda.is_available() else 'cpu'

num_epochs = 20
# NOTE(review): the save path ends in "_30" while num_epochs is 20 -- presumably
# because training resumes from a 20-epoch checkpoint; confirm this is intended.
# model_save_path = '../models/sentence_transformer_'+str(num_epochs)
model_save_path = '../models/sentence_transformer_30'

# `labels` maps a meme uuid to its class label; it is read as a module-level
# global by the dataset and evaluation code further down.
# NOTE: pickle.load can execute arbitrary code -- only load trusted files.
with open('../data/training_label.pkl', 'rb') as f:
    labels = pickle.load(f)

In [21]:
# load meme dataset
# The pickle holds a dict of lookup tables (its keys are printed below).
# NOTE: pickle.load can execute arbitrary code -- only load trusted files.
with open('../data/meme_900k_cleaned_data_v2.pkl', 'rb') as f:
    meme_dict = pickle.load(f)

uuid_total = len(meme_dict['uuid_label_dic'])
print("Keys in meme dict dataset:", meme_dict.keys())
print("Number of uuids:", uuid_total)

Keys in meme dict dataset: dict_keys(['label_uuid_dic', 'uuid_label_dic', 'uuid_caption_dic', 'uuid_image_path_dic', 'uuid_caption_cased_dic'])
Number of uuids: 300


In [22]:
# utility functions
def clean_and_unify_caption(caption):
    """Join the first two parts of a caption into one '; '-separated string.

    Args:
        caption: indexable whose items 0 and 1 are strings.

    Returns:
        Both parts with surrounding whitespace stripped, joined by '; '.
    """
    first_part = caption[0].strip()
    second_part = caption[1].strip()
    return f"{first_part}; {second_part}"

In [23]:
# create pandas dataframe
# One row per (uuid, caption) pair, restricted to the uuids present in `labels`.
training_uuids = labels.keys()
temp_arr = [
    [uuid, clean_and_unify_caption(caption)]
    for uuid in training_uuids
    for caption in meme_dict['uuid_caption_dic'][uuid]
]
df = pd.DataFrame(temp_arr, columns=['category', 'text'])

# split dataset
np.random.seed(42)
# Shuffle once with a fixed seed, then cut positionally at 90%. Slicing with
# .iloc yields the same two frames as np.split did, without routing through the
# deprecated DataFrame.swapaxes that np.split triggers on a DataFrame.
df_shuffled = df.sample(frac=1, random_state=42)
split_index = int(.9*len(df))
df_train = df_shuffled.iloc[:split_index]
df_test = df_shuffled.iloc[split_index:]
assert len(df_train) + len(df_test) == len(df), "split lost or duplicated rows"

print(len(df_train), len(df_test))

202500 22500


## Creating DataLoader

In [11]:
class Dataset():
    """In-memory pairing of caption texts with their labels.

    Labels are obtained by looking each row's 'category' value up in the
    module-level `labels` mapping.
    NOTE: this class shadows the `Dataset` name imported from torch.utils.data.
    """

    def __init__(self, df):
        """Read the 'category' and 'text' columns of `df` into plain lists."""
        self.labels = [labels[uuid] for uuid in df['category']]
        self.texts = list(df['text'])

    def __len__(self):
        """Number of stored examples."""
        return len(self.labels)

    def classes(self):
        """Return the list of labels, one entry per example."""
        return self.labels

    def get_batch_labels(self, idx):
        """Return the label(s) selected by `idx` wrapped in a numpy array."""
        selected = self.labels[idx]
        return np.array(selected)

    def get_batch_texts(self, idx):
        """Return the text(s) selected by `idx`."""
        return self.texts[idx]

    def __getitem__(self, idx):
        """Return a `(texts, labels)` pair for `idx`."""
        return self.get_batch_texts(idx), self.get_batch_labels(idx)

In [12]:
# Wrap both halves of the split; there is no separate validation split.
train_dataset = Dataset(df=df_train)
# val_dataset = Dataset(df_val)
test_dataset = Dataset(df=df_test)

In [13]:
class SentenceBertDataloader():
    """Iterable that turns labelled captions into contrastive sentence pairs.

    Each anchor text in a batch is paired with up to two captions of the same
    meme (label=1) and up to two captions of other memes (label=0), so a full
    batch of `batch_size` anchors yields up to `4 * batch_size` InputExamples.

    The loader can be iterated repeatedly: `__iter__` rewinds it and `__next__`
    raises StopIteration once the data is exhausted. Without that, every pass
    after the first would only produce empty batches.
    """

    # Default collate: hand back the raw list of InputExamples so the loader is
    # usable on its own. A trainer can replace it by assigning `collate_fn` on
    # the instance (SentenceTransformer.fit is expected to do so -- verify for
    # the installed version).
    collate_fn = staticmethod(lambda batch: batch)

    # Safety cap for the rejection sampling in sampleNegatives.
    max_sampling_attempts = 100

    def __init__(self, dataset, batch_size):
        """Index `dataset.labels` / `dataset.texts` for pair sampling.

        Args:
            dataset: object exposing equally long `labels` and `texts` sequences.
            batch_size: number of anchor texts consumed per batch.
        """
        self.batch_size=batch_size
        self.labels = np.array(dataset.labels)
        self.texts = np.array(dataset.texts)
        self.num_data_points = len(self.labels)
        self.num_meme_keys = len(set(self.labels))
        # max(..., 1) keeps an empty dataset from raising ZeroDivisionError.
        self.datapoints_per_meme = self.num_data_points//max(self.num_meme_keys, 1)

        # create mapping from meme id to list of texts for sampling +ve/-ve examples
        self.meme_id_text_dic = defaultdict(list)
        for meme_id, text in tqdm(zip(self.labels, self.texts)):
            self.meme_id_text_dic[meme_id].append(text)

        self.index = 0

    def __len__(self):
        """Number of full batches per pass (a trailing partial batch is not counted)."""
        return int(len(self.labels)//self.batch_size)

    def samplePositives(self, true_label, true_text):
        """Draw two captions (with replacement) of meme `true_label` other than `true_text`.

        Returns an empty list when the meme has no other caption; the previous
        rejection loop never terminated in that case.
        """
        # .get avoids inserting an empty entry into the defaultdict.
        candidates = [text for text in self.meme_id_text_dic.get(true_label, [])
                      if text != true_text]
        if not candidates:
            return []
        return list(np.random.choice(candidates, size=2))

    def sampleNegatives(self, true_label, true_text):
        """Draw up to two captions from memes other than `true_label`.

        Meme ids are drawn from the ids actually present in the data instead of
        from `range(num_meme_keys)`, so labels do not need to be contiguous.
        Fewer than two examples (possibly none) are returned when no other meme
        exists or the attempt cap is reached.
        """
        other_ids = [meme_id for meme_id, texts in self.meme_id_text_dic.items()
                     if meme_id != true_label and len(texts) > 0]
        negative_examples = []
        if not other_ids:
            return negative_examples
        attempts_left = self.max_sampling_attempts
        while len(negative_examples) < 2 and attempts_left > 0:
            attempts_left -= 1
            random_meme_id = other_ids[np.random.randint(0, len(other_ids))]
            random_text = np.random.choice(self.meme_id_text_dic[random_meme_id])
            # A caption identical to the anchor is not accepted as a negative.
            if random_text != true_text:
                negative_examples.append(random_text)
        return negative_examples

    def __iter__(self):
        """Rewind to the first example so every pass sees the data again."""
        self.index = 0
        return self

    def __next__(self):
        """Build the next batch of InputExamples and pass it through `collate_fn`."""
        if self.index >= self.num_data_points:
            raise StopIteration
        start, stop = self.index, self.index + self.batch_size
        X = self.texts[start:stop]
        y = self.labels[start:stop]
        X_final_batch = []
        for anchor, label in zip(X, y):
            for example in self.samplePositives(label, anchor):
                X_final_batch.append(InputExample(texts=[anchor, example], label=1))
            for example in self.sampleNegatives(label, anchor):
                X_final_batch.append(InputExample(texts=[anchor, example], label=0))

        self.index = stop
        return self.collate_fn(X_final_batch)

In [14]:
# 32 anchor texts per batch for both loaders.
train_loader = SentenceBertDataloader(dataset=train_dataset, batch_size=32)
# val_loader = SentenceBertDataloader(val_dataset, 32)
test_loader = SentenceBertDataloader(dataset=test_dataset, batch_size=32)

202500it [00:00, 790801.18it/s]
22500it [00:00, 712046.84it/s]


## Model Training

In [15]:
# Load the model from a local directory and attach the contrastive loss that
# consumes the label=1 / label=0 pairs produced by the dataloader.
model = SentenceTransformer(
    model_name_or_path='../models/sentence_transformer_roberta_20',
    device=device,
)
train_loss = losses.ContrastiveLoss(model=model)

In [None]:
# Train on the single (dataloader, loss) objective and write the result to
# `model_save_path`.
model.fit(
    train_objectives=[(train_loader, train_loss)],
    epochs=num_epochs,
    warmup_steps=100,
    output_path=model_save_path,
)

## Analyzing Results

In [24]:
from sentence_transformers.util import cos_sim

# Re-derive the device so this section also runs without the training cells.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

def topKPrediction(k, model, sentences, true_labels, uuid_to_emb_dict):
    """Count the sentences whose true label is among the k most similar categories.

    All sentence-to-category cosine similarities are computed in one `cos_sim`
    call and ranked with `torch.topk`. This replaces one `cos_sim` call per
    (sentence, category) pair followed by sorting tuples of 1x1 tensors.

    Args:
        k: number of top-ranked categories inspected per sentence.
        model: object whose `encode(sentences)` returns one embedding per sentence.
        sentences: list of texts to classify.
        true_labels: expected label for each sentence, aligned with `sentences`.
        uuid_to_emb_dict: maps a category uuid to its embedding vector; each uuid
            is translated to a label through the module-level `labels` mapping.

    Returns:
        Number of sentences with a hit in their top k (at most one per sentence).
    """
    if len(sentences) == 0 or not uuid_to_emb_dict or k <= 0:
        return 0

    query = torch.as_tensor(model.encode(sentences)).float()
    # Dict order fixes the column order of the similarity matrix, so labels and
    # embeddings are read in the same iteration order.
    category_labels = [labels[uuid] for uuid in uuid_to_emb_dict]
    category_matrix = torch.stack(
        [torch.as_tensor(emb) for emb in uuid_to_emb_dict.values()]
    ).float().to(query.device)

    similarities = cos_sim(query, category_matrix)  # (num_sentences, num_categories)
    top_k = min(k, len(category_labels))  # topk rejects k larger than the row length
    top_indices = torch.topk(similarities, k=top_k, dim=1).indices.tolist()

    final_score = 0
    for i in range(len(sentences)):
        if any(category_labels[j] == true_labels[i] for j in top_indices[i]):
            final_score += 1
    return final_score

In [25]:
def topKAccuracy(k, model, df_test, uuid_to_emb_dict, batch_size=512):
    """Fraction of rows in `df_test` whose true label is in the model's top k.

    Args:
        k: number of top-ranked categories that count as a hit. It is now
            forwarded to `topKPrediction`; previously 3 was always used.
        model: sentence encoder passed through to `topKPrediction`.
        df_test: frame with a `text` column and a `category` (uuid) column.
        uuid_to_emb_dict: maps a category uuid to its embedding vector.
        batch_size: number of texts scored per `topKPrediction` call.

    Returns:
        Hits divided by the number of rows; 0.0 for an empty frame.
    """
    texts = list(df_test.text)
    if not texts:
        # Avoid ZeroDivisionError on an empty evaluation set.
        return 0.0
    # uuids are translated to labels through the module-level `labels` mapping.
    true_meme_ids = [labels[uuid] for uuid in df_test.category]

    hits = 0
    for start in tqdm(range(0, len(texts), batch_size)):
        stop = start + batch_size
        hits += topKPrediction(k, model, texts[start:stop], true_meme_ids[start:stop], uuid_to_emb_dict)
    return hits/len(texts)

In [26]:
def getCategoryEmbeddings(df_train, model, batch_size=512):
    """Compute one mean embedding per category uuid.

    Args:
        df_train: frame with a `text` column and a `category` (uuid) column.
        model: object whose `encode(texts)` returns one embedding per text.
        batch_size: number of texts encoded per `model.encode` call.

    Returns:
        dict mapping each uuid to the element-wise mean of the embeddings of
        all its texts.
    """
    # Plain lists make the batch slicing below purely positional, whatever
    # index the (shuffled) frame carries.
    all_texts = list(df_train.text)
    all_uuids = list(df_train.category)

    embedding_sums = {}
    uuid_count_dict = defaultdict(int)

    for start in tqdm(range(0, len(all_texts), batch_size)):
        batch_texts = all_texts[start:start+batch_size]
        batch_uuids = all_uuids[start:start+batch_size]
        embeddings = model.encode(batch_texts)
        # The inner loop no longer reuses the batch offset's variable name.
        for uuid, embedding in zip(batch_uuids, embeddings):
            uuid_count_dict[uuid] += 1
            if uuid in embedding_sums:
                embedding_sums[uuid] = embedding_sums[uuid] + embedding
            else:
                embedding_sums[uuid] = embedding

    # Turn the running sums into means.
    return {uuid: total/uuid_count_dict[uuid] for uuid, total in embedding_sums.items()}

### 1. Baseline model

In [29]:
# load model
# Baseline model, read from a local directory. This rebinds `model`, the name
# the training section also uses.
model = SentenceTransformer(model_name_or_path='../models/roberta_base', device=device)

In [30]:
# get category embeddings for model
# One mean embedding per category uuid, computed over the training split.
category_embeddings = getCategoryEmbeddings(df_train=df_train, model=model)

100%|██████████| 396/396 [02:18<00:00,  2.87it/s]


In [34]:
# with open('../models/model_utils//roberta_base/category_embeddings.pkl', 'wb') as f:
#     pickle.dump(category_embeddings, f)

In [17]:
# get top k accuracy
# Top-3 accuracy of the baseline model on the held-out 10% split.
accuracy = topKAccuracy(
    k=3,
    model=model,
    df_test=df_test,
    uuid_to_emb_dict=category_embeddings,
)

100%|██████████| 44/44 [03:50<00:00,  5.25s/it]


In [18]:
# Baseline model: value returned by topKAccuracy on df_test.
print(accuracy)

0.4806666666666667


### 2. MLI V6 5 epochs

In [39]:
# load model
# Model read from the local 'sentence_transformer_5' directory.
model_mli_5 = SentenceTransformer(
    model_name_or_path='../models/sentence_transformer_5/',
    device=device,
)

In [40]:
# One mean embedding per category uuid, computed with model_mli_5 over the training split.
category_embeddings_mli_5 = getCategoryEmbeddings(df_train=df_train, model=model_mli_5)

100%|██████████| 396/396 [01:24<00:00,  4.69it/s]


In [41]:
# with open('../models/model_utils/sentence_transformer_5/category_embeddings.pkl', 'wb') as f:
#     pickle.dump(category_embeddings_mli_5, f)

In [23]:
# Top-3 accuracy of model_mli_5 on the held-out split.
accuracy_mli_5 = topKAccuracy(
    k=3,
    model=model_mli_5,
    df_test=df_test,
    uuid_to_emb_dict=category_embeddings_mli_5,
)

100%|██████████| 44/44 [03:42<00:00,  5.06s/it]


In [24]:
# model_mli_5: value returned by topKAccuracy on df_test.
print(accuracy_mli_5)

0.6351555555555556


### 3. Roberta 20 epochs

In [42]:
# load model
# Model read from the local 'sentence_transformer_roberta_20' directory.
model_roberta_20 = SentenceTransformer(
    model_name_or_path='../models/sentence_transformer_roberta_20/',
    device=device,
)

In [43]:
# One mean embedding per category uuid, computed with model_roberta_20 over the training split.
category_embeddings_roberta_20 = getCategoryEmbeddings(df_train=df_train, model=model_roberta_20)

100%|██████████| 396/396 [01:24<00:00,  4.68it/s]


In [44]:
# with open('../models/model_utils/sentence_transformer_roberta_20/category_embeddings.pkl', 'wb') as f:
#     pickle.dump(category_embeddings_roberta_20, f)

In [27]:
# Top-3 accuracy of model_roberta_20 on the held-out split.
accuracy_roberta_20 = topKAccuracy(
    k=3,
    model=model_roberta_20,
    df_test=df_test,
    uuid_to_emb_dict=category_embeddings_roberta_20,
)

100%|██████████| 44/44 [03:41<00:00,  5.03s/it]


In [28]:
# model_roberta_20: value returned by topKAccuracy on df_test.
print(accuracy_roberta_20)

0.6532444444444444
