In [3]:
# imports
import sys
sys.path.append('../')
import math, statistics, time
from collections import defaultdict
import numpy as np
from datetime import datetime
import pickle
import pandas as pd
import torch.nn as nn
import torch
import numpy as np
from torch.utils.data import DataLoader, Dataset
from sentence_transformers import SentenceTransformer, LoggingHandler, losses, InputExample
from tqdm import tqdm
from utils.sbert_meme_classifier import Classifier

# Name of the base model, and the device string: CUDA when torch reports it
# as available, otherwise CPU.
base_model = 'roberta-base'
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

num_epochs = 20
# The save path is pinned to the "_30" directory instead of being derived from
# num_epochs; the derived variant is kept below for reference.
# model_save_path = '../models/sentence_transformer_'+str(num_epochs)
model_save_path = '../models/sentence_transformer_30'

# Unpickle `labels`; its keys are used as the training uuids when the
# dataframe is built.
with open('../data/training_label.pkl', 'rb') as f:
    labels = pickle.load(f)

In [4]:
# Load the pickled meme dataset, then report its top-level keys and the
# number of entries stored under 'uuid_label_dic'.
meme_dict = None
with open('../data/meme_900k_cleaned_data_v2.pkl', 'rb') as f:
    meme_dict = pickle.load(f)

print("Keys in meme dict dataset:", meme_dict.keys())
uuid_count = len(meme_dict['uuid_label_dic'])
print("Number of uuids:", uuid_count)

Keys in meme dict dataset: dict_keys(['label_uuid_dic', 'uuid_label_dic', 'uuid_caption_dic', 'uuid_image_path_dic', 'uuid_caption_cased_dic'])
Number of uuids: 300


In [5]:
# utility functions
def clean_and_unify_caption(caption):
    """Merge the first two parts of a caption into one string.

    Args:
        caption: indexable whose items 0 and 1 are strings; any further
            items are ignored.

    Returns:
        Both parts with surrounding whitespace stripped, joined by '; '.
    """
    first_part = caption[0].strip()
    second_part = caption[1].strip()
    return '; '.join([first_part, second_part])

In [6]:
# create pandas dataframe
# One row per caption: `category` holds the uuid the caption is listed under
# and `text` holds the caption after clean_and_unify_caption.
training_uuids = labels.keys()
temp_arr = [
    [uuid, clean_and_unify_caption(caption)]
    for uuid in training_uuids
    for caption in meme_dict['uuid_caption_dic'][uuid]
]
df = pd.DataFrame(temp_arr, columns=['category', 'text'])

# split dataset
# Kept as-is: seeds NumPy's global RNG (the shuffle below is already pinned
# by random_state).
np.random.seed(42)
# Shuffle once with a fixed seed, then cut 90% / 10% by position. Positional
# slicing gives the same two frames as np.split(...) without passing a
# DataFrame through NumPy, which depends on the deprecated DataFrame.swapaxes.
df_shuffled = df.sample(frac=1, random_state=42)
split_at = int(.9*len(df))
df_train, df_test = df_shuffled.iloc[:split_at], df_shuffled.iloc[split_at:]

print(len(df_train), len(df_test))

64800 7200


### Analysis for sbert models

In [10]:
# Values of k passed to topKAccuracy, and the names of the models to evaluate.
ks = [1, 3, 5, 10]
model_names = [
    'roberta_base',
    'sentence_transformer_5',
    'sentence_transformer_roberta_20',
    'sentence_transformer_roberta_30',
]

In [11]:
# Start from an empty mapping; the evaluation loop stores one entry per model name.
accuracy_dict = dict()

In [13]:
# Score every model in `model_names` with Classifier.topKAccuracy at the
# cut-offs in `ks`, storing each result under the model's name.
# Evaluate on the full `df_test` split: a one-row slice such as df_test[:1]
# would make every reported accuracy reflect a single example.
for model_name in model_names:
    clf = Classifier(model_name=model_name)
    accuracy_dict[model_name] = clf.topKAccuracy(ks, df_test)

In [14]:
# Disabled: uncomment to write accuracy_dict to 'accuracy_dict.pkl'.
# with open('accuracy_dict.pkl', 'wb') as f:
#     pickle.dump(accuracy_dict, f)

### Analysis for short sbert

In [7]:
# Values of k passed to topKAccuracy, and the single model evaluated in this section.
ks = [1, 3, 5, 10]
model_names = [
    'sentence_transformer_num_classes_24_num_epochs_5',
]

In [8]:
# Reset the results mapping so it only holds this section's models.
accuracy_dict = dict()

In [9]:
# For each listed model, build a Classifier and record its top-k accuracies
# on the full test split under the model's name.
for model_name in model_names:
    clf = Classifier(model_name=model_name)
    model_scores = clf.topKAccuracy(ks, df_test)
    accuracy_dict[model_name] = model_scores

100%|██████████| 15/15 [00:28<00:00,  1.89s/it]


In [10]:
# Display the collected results: model name -> {k: top-k accuracy}.
accuracy_dict

{'sentence_transformer_num_classes_24_num_epochs_5': {1: 0.6145833333333334,
  3: 0.7684722222222222,
  5: 0.8393055555555555,
  10: 0.9293055555555556}}