In [1]:
# imports
import math, statistics, time
from collections import defaultdict
import numpy as np
from tqdm.auto import tqdm
from datetime import datetime
# import torch_xla
# import torch_xla.core.xla_model as xm
import pickle
import pandas as pd
import torch.nn as nn
import torch
import numpy as np

import warnings
warnings.filterwarnings("ignore")

# HF token
token = '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'
from huggingface_hub import HfApi, HfFolder
api=HfApi()
folder=HfFolder()
api.set_access_token(token)
folder.save_token(token)

with open('./models/data/training_label.pkl', 'rb') as f:
    labels = pickle.load(f)

In [2]:
# load meme dataset
meme_dict = None
with open('./meme_900k_cleaned_data.pkl', 'rb') as f:
    meme_dict = pickle.load(f)
print("Keys in meme dict dataset:", meme_dict.keys())
print("Number of uuids:", len(meme_dict['uuid_label_dic']))

Keys in meme dict dataset: dict_keys(['label_uuid_dic', 'uuid_label_dic', 'uuid_caption_dic', 'uuid_image_path_dic'])
Number of uuids: 300


In [3]:
# utility functions
def clean_and_unify_caption(caption):
    return caption[0].strip()+'; '+caption[1].strip()

In [4]:
# create pandas dataframe
training_uuids = labels.keys()
temp_arr = []
for uuid in training_uuids:
    for caption in meme_dict['uuid_caption_dic'][uuid]:
        temp_arr.append([uuid, clean_and_unify_caption(caption)])
df = pd.DataFrame(temp_arr, columns=['category', 'text'])

# split dataset
np.random.seed(42)
df_train, df_val, df_test = np.split(df.sample(frac=1, random_state=42), 
                                     [int(.8*len(df)), int(.9*len(df))])

print(len(df_train),len(df_val), len(df_test))

180000 22500 22500


In [5]:
from sentence_transformers import SentenceTransformer, InputExample, losses

In [6]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self, df):
        self.labels = [labels[label] for label in df['category']]
        self.texts = [text for text in df['text']]

    def __len__(self):
        return len(self.labels)

In [7]:
train_dataset = Dataset(df_train)
val_dataset = Dataset(df_val)
test_dataset = Dataset(df_test)

In [44]:
class SentenceBertDataloader:
    def __init__(self, dataset, tokenizer, batch_size):
        self.labels = np.array(dataset.labels)
        self.texts = np.array(dataset.texts)
        self.batch_size = batch_size
        self.num_data_points = len(self.labels)
        self.num_meme_keys = len(set(self.labels))
        self.datapoints_per_meme = self.num_data_points//self.num_meme_keys
        self.tokenizer = tokenizer
        
        # create mapping from meme id to list of texts for sampling +ve/-ve examples
        self.meme_id_text_dic = defaultdict(list)
        for meme_id, text in tqdm(zip(self.labels, self.texts)):
            self.meme_id_text_dic[meme_id].append(text)
    
    def __len__(self):
        return int(len(self.labels)/self.batch_size + 1)
    
    def samplePositives(self, true_label, true_text):
        count = 0
        positive_examples = []
        while count<2:
            random_text = np.random.choice(self.meme_id_text_dic[true_label])
            if random_text!=true_text:
                count+=1
                positive_examples.append(random_text)
        return positive_examples
    
    def sampleNegatives(self, true_label, true_text):
        count = 0
        negative_examples = []
        while count<2:
            random_meme_id = np.random.randint(0, self.num_meme_keys)
            random_text = np.random.choice(self.meme_id_text_dic[random_meme_id])
            if random_meme_id!=true_label and random_text!=true_text:
                count+=1
                negative_examples.append(random_text)
        return negative_examples
    
    def __iter__(self):
        # shuffle data
        indices = np.arange(self.num_data_points)
        np.random.shuffle(indices)
        X = self.texts[indices]
        y = self.labels[indices]
        
        for i in range(0, self.num_data_points, self.batch_size):
            X_batch = X[i:i+self.batch_size]
            y_batch = y[i:i+self.batch_size]
            
            X_final_batch = []
            y_final_batch = []
            for i in range(len(X_batch)):
                positive_examples = self.samplePositives(y_batch[i], X_batch[i])
                negative_examples = self.sampleNegatives(y_batch[i], X_batch[i])
                for example in positive_examples:
                    X_final_batch.append(tokenizer([X_batch[i], example],padding='max_length', max_length = 50, truncation=True, return_tensors="pt"))
                    y_final_batch.append(1)
                for example in negative_examples:
                    X_final_batch.append(tokenizer([X_batch[i], example],padding='max_length', max_length = 50, truncation=True, return_tensors="pt"))
                    y_final_batch.append(0)
            
            yield X_final_batch, torch.tensor(y_final_batch, dtype=torch.float)

In [45]:
train_loader = SentenceBertDataloader(train_dataset, tokenizer, 32)
val_loader = SentenceBertDataloader(val_dataset, tokenizer, 32)
test_loader = SentenceBertDataloader(test_dataset, tokenizer, 32)

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

In [49]:
model = SentenceTransformer('bert-base-uncased')
train_loss = losses.CosineSimilarityLoss(model=model)

No sentence-transformers model found with name /home/ykhandelwal/.cache/torch/sentence_transformers/bert-base-uncased. Creating a new one with MEAN pooling.
Some weights of the model checkpoint at /home/ykhandelwal/.cache/torch/sentence_transformers/bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassificat

In [50]:
model.fit(train_objectives=[(train_loader, train_loss)],epochs=1, warmup_steps=100)

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/5626 [00:00<?, ?it/s]

RuntimeError: The size of tensor a (2) must match the size of tensor b (128) at non-singleton dimension 0

In [27]:
for data in train_dataloader:
    temp = data

In [28]:
temp

([{'input_ids': tensor([[ 101, 2178, 3940,  102,    0],
           [ 101, 2026, 2034, 6251,  102]]),
   'token_type_ids': tensor([[0, 0, 0, 0, 0],
           [0, 0, 0, 0, 0]]),
   'attention_mask': tensor([[1, 1, 1, 1, 0],
           [1, 1, 1, 1, 1]])},
  {'input_ids': tensor([[  101, 15142,  6251,   102,     0],
           [  101,  2026,  2117,  6251,   102]]),
   'token_type_ids': tensor([[0, 0, 0, 0, 0],
           [0, 0, 0, 0, 0]]),
   'attention_mask': tensor([[1, 1, 1, 1, 0],
           [1, 1, 1, 1, 1]])}],
 tensor([0.3000, 0.8000]))

In [29]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [30]:
tokenizer.tokenize(['My first sentence', 'My second sentence'])

['my', 'first', 'sentence', 'my', 'second', 'sentence']

In [51]:
tokenizer(['My first sentence', 'My second sentence'],return_tensors="pt")

{'input_ids': tensor([[ 101, 2026, 2034, 6251,  102],
        [ 101, 2026, 2117, 6251,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1]])}

In [32]:
temp

([{'input_ids': tensor([[ 101, 2178, 3940,  102,    0],
           [ 101, 2026, 2034, 6251,  102]]),
   'token_type_ids': tensor([[0, 0, 0, 0, 0],
           [0, 0, 0, 0, 0]]),
   'attention_mask': tensor([[1, 1, 1, 1, 0],
           [1, 1, 1, 1, 1]])},
  {'input_ids': tensor([[  101, 15142,  6251,   102,     0],
           [  101,  2026,  2117,  6251,   102]]),
   'token_type_ids': tensor([[0, 0, 0, 0, 0],
           [0, 0, 0, 0, 0]]),
   'attention_mask': tensor([[1, 1, 1, 1, 0],
           [1, 1, 1, 1, 1]])}],
 tensor([0.3000, 0.8000]))