In [None]:
!pip install datasets

In [None]:
!wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz
!gzip -d cc.en.300.bin.gz

# Imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import time
from torch import nn
import torch
import spacy
import fasttext
from collections import Counter
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from torchmetrics import Accuracy
from datasets import load_dataset
import pickle

english = spacy.load("en_core_web_sm")
eng_tokenizer = english.tokenizer

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
VOCAB_SIZE=2000000
NUM_LAYERS=4 # 8
NUM_HEADS=6 # 10
DROPOUT=0.1
EMBEDDING_DIM=300



In [None]:
def build_vocab(ft, vocab_size, embedding_dim=EMBEDDING_DIM):
    word2idx = {"<SOS>": 0, "<EOS>": 1, "<PAD>": 2, "<UNK>": 3}
    idx2word = dict([(v, k) for k, v in word2idx.items()])
    mat = torch.zeros((vocab_size, embedding_dim), dtype=torch.float, device=DEVICE)
    count = 4
    for word in ft.words:
        word2idx[word] = count
        idx2word[count] = word
        mat[count] = torch.tensor(ft.get_word_vector(word))
        count += 1
        if count >= vocab_size:
            break
    return word2idx, idx2word, mat

ftmodel = fasttext.load_model("cc.en.300.bin")
word2idx, idx2word, emb_mat = build_vocab(ftmodel, VOCAB_SIZE)
del ftmodel

with open("vocab.pkl", "wb") as f:
    pickle.dump(word2idx, f)

with open("embedding.mat", "wb") as f:
    torch.save(emb_mat, f)

# RiddleSense dataset

In [3]:
with open("vocab.pkl", "rb") as f:
    word2idx = pickle.load(f)
    idx2word = dict([(v, k) for k, v in word2idx.items()])
    
with open("embedding.mat", "rb") as f:
    emb_mat = torch.load(f)

In [4]:
def convert_to_sentence(tokens, idx2word):
    sentence = []
    for tok in tokens:
        if tok not in [0, 1, 2]:
            sentence.append(idx2word[tok])
    return " ".join(sentence)

def convert_sentences_to_tokens(source, tokenizer):
    sentences = []
    for doc in tokenizer.pipe(source):
        sentences.append([token.text.lower() for token in doc])
    return sentences

# Training dataset

In [123]:
riddle_data = load_dataset("riddle_sense")
riddle_data = pd.DataFrame(riddle_data["train"])
riddle_data = riddle_data[riddle_data["answerKey"] != 'E']
riddle_data = riddle_data.reset_index()
second_data = pd.DataFrame.from_records(riddle_data["choices"])
riddle_data[["choice1", "choice2", "choice3", "choice4"]] = np.array(second_data["text"].tolist())[:, :4]
riddle_data["answer"] = pd.Categorical(riddle_data["answerKey"]).codes
riddle_data = riddle_data.drop(["index", "answerKey", "choices"], axis=1)
riddle_data.head()

  0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,question,choice1,choice2,choice3,choice4,answer
0,What gets smaller as it gets fuller?,bit,put,hole,rice,2
1,"Whats weightless, visible to the naked eye, an...",amoeba,vision,hole,hydride,2
2,"what is weightless, and colorless. . but when ...",hole,measuring,heft,color,0
3,"What is lighter then a feather, can be seen by...",hole,find,dust,sclera,0
4,"I am always Hungery, I must always be Fed. th...",cyan,rust,maroon,flame,3


In [124]:
sen_data = pd.read_csv("/kaggle/input/cs779-brainteaser/SP_train.csv")
sen_data["answer"] = sen_data["label"]
sen_data = sen_data.drop(["Unnamed: 0", "label"], axis=1)
sen_data.head()

Unnamed: 0,question,choice1,choice2,choice3,choice4,answer
0,Mr. and Mrs. Mustard have six daughters and ea...,Some daughters get married and have their own ...,Each daughter shares the same brother.,Some brothers were not loved by family and mov...,None of above.,1
1,The six daughters of Mr. and Mrs. Mustard each...,Some brothers were not loved by family and mov...,Some daughters get married and have their own ...,Each daughter shares the same brother.,None of above.,2
2,"A chess team has five players, and each player...",Each player shares the same coach.,Some players are backups and not allowed to play.,Some coaches get a raise.,None of above.,0
3,A woman shoots her husband. Then she holds him...,The woman gets arrested for murder after dinner.,The woman gets a new partner.,The woman was a photographer. She shot a pictu...,None of above.,2
4,An individual shoots their spouse. She continu...,The woman gets arrested for murder after dinner.,The woman was a photographer. She shot a pictu...,The woman gets a new partner.,None of above.,1


In [125]:
word_data = pd.read_csv("/kaggle/input/cs779-brainteaser/WP_train.csv")
word_data["answer"] = word_data["label"]
word_data = word_data.drop(["Unnamed: 0", "label"], axis=1)
word_data.head()

Unnamed: 0,question,choice1,choice2,choice3,choice4,answer
0,How do you spell COW in thirteen letters?,SEE OH DEREFORD,SEE O DOUBLE YOU.,COWCOWCOWCOWW,None of above.,1
1,"In thirteen letters, how do you spell COW?",SEE OH DEREFORD,COWCOWCOWCOWW,SEE O DOUBLE YOU.,None of above.,2
2,How do you spell COB in seven letters?,COBCOBB,COBBLER,SEE O BEE,None of above.,2
3,"If eleven plus two equals one, what does nine ...",Four.,Two.,Three.,None of above.,1
4,What does nine plus five equal if eleven plus ...,Three.,Two.,Four.,None of above.,1


In [127]:
data = pd.concat([riddle_data, sen_data, word_data], axis=0)
data.head()

Unnamed: 0,question,choice1,choice2,choice3,choice4,answer
0,What gets smaller as it gets fuller?,bit,put,hole,rice,2
1,"Whats weightless, visible to the naked eye, an...",amoeba,vision,hole,hydride,2
2,"what is weightless, and colorless. . but when ...",hole,measuring,heft,color,0
3,"What is lighter then a feather, can be seen by...",hole,find,dust,sclera,0
4,"I am always Hungery, I must always be Fed. th...",cyan,rust,maroon,flame,3


In [129]:
question = convert_sentences_to_tokens(data["question"], eng_tokenizer)
choice1 = convert_sentences_to_tokens(data["choice1"], eng_tokenizer)
choice2 = convert_sentences_to_tokens(data["choice2"], eng_tokenizer)
choice3 = convert_sentences_to_tokens(data["choice3"], eng_tokenizer)
choice4 = convert_sentences_to_tokens(data["choice4"], eng_tokenizer)
labels = data["answer"].to_numpy().astype(np.uint8)

In [130]:
del riddle_data, sen_data, word_data, data

In [95]:
class TrainDataset(DataLoader):
    def __init__(self, question, choice1, choice2, choice3, choice4, label, randomize_place=False):
        self.randomize_place = randomize_place
        self.question = np.array(question, dtype=object)
        self.choice1 = np.array(choice1, dtype=object)
        self.choice2 = np.array(choice2, dtype=object)
        self.choice3 = np.array(choice3, dtype=object)
        self.choice4 = np.array(choice4, dtype=object)
        self.label = np.array(label)
    
    def __len__(self):
        return self.question.shape[0]
    
    def __getitem__(self, idx):
        question = torch.tensor([word2idx.get(word, 3) for word in self.question[idx]])
        choice1 = torch.tensor([word2idx.get(word, 3) for word in self.choice1[idx]])
        choice2 = torch.tensor([word2idx.get(word, 3) for word in self.choice2[idx]])
        choice3 = torch.tensor([word2idx.get(word, 3) for word in self.choice3[idx]])
        choice4 = torch.tensor([word2idx.get(word, 3) for word in self.choice4[idx]])
        if self.randomize_place:
            ridx = np.argsort(np.random.randint(0, 256, 4))
            label = torch.tensor([ridx[self.label[idx]]])
            choice1, choice2, choice3, choice4 = np.array([choice1, choice2, choice3, choice4], dtype=object)[ridx]
        else:
            label = torch.tensor([self.label[idx]])
        return question, choice1, choice2, choice3, choice4, label
    
    
class TestDataset(DataLoader):
    def __init__(self, question, choice1, choice2, choice3, choice4, label=None):
        self.question = np.array(question, dtype=object)
        self.choice1 = np.array(choice1, dtype=object)
        self.choice2 = np.array(choice2, dtype=object)
        self.choice3 = np.array(choice3, dtype=object)
        self.choice4 = np.array(choice4, dtype=object)
        if label is not None:
            self.label = np.array(label, dtype=object)
        else:
            self.label = None
    
    def __len__(self):
        return self.question.shape[0]
    
    def __getitem__(self, idx):
        question = torch.tensor([word2idx.get(word, 3) for word in self.question[idx]])
        choice1 = torch.tensor([word2idx.get(word, 3) for word in self.choice1[idx]])
        choice2 = torch.tensor([word2idx.get(word, 3) for word in self.choice2[idx]])
        choice3 = torch.tensor([word2idx.get(word, 3) for word in self.choice3[idx]])
        choice4 = torch.tensor([word2idx.get(word, 3) for word in self.choice4[idx]])
        if self.label is not None:
            label = torch.tensor([self.label[idx]])
        else:
            label = torch.tensor([0])
        return question, choice1, choice2, choice3, choice4, label
    
def pad_collate(batch):
    (a, b, c, d, e, f) = zip(*batch)
    a = nn.utils.rnn.pad_sequence(a, batch_first=True, padding_value=2)
    b = nn.utils.rnn.pad_sequence(b, batch_first=True, padding_value=2)
    c = nn.utils.rnn.pad_sequence(c, batch_first=True, padding_value=2)
    d = nn.utils.rnn.pad_sequence(d, batch_first=True, padding_value=2)
    e = nn.utils.rnn.pad_sequence(e, batch_first=True, padding_value=2)
    f = torch.tensor(f)
    return a, b, c, d, e, f

def test_pad_collate(batch):
    (a, b, c, d, e) = zip(*batch)
    a = nn.utils.rnn.pad_sequence(a, batch_first=True, padding_value=2)
    b = nn.utils.rnn.pad_sequence(b, batch_first=True, padding_value=2)
    c = nn.utils.rnn.pad_sequence(c, batch_first=True, padding_value=2)
    d = nn.utils.rnn.pad_sequence(d, batch_first=True, padding_value=2)
    e = nn.utils.rnn.pad_sequence(e, batch_first=True, padding_value=2)
    return a, b, c, d, e

In [96]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=DROPOUT, max_len=5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)
        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-np.log(10000.0) / d_model))
        pe = torch.zeros(1, max_len, d_model)
        pe[0, :, 0::2] = torch.sin(position * div_term)
        pe[0, :, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[0, :x.size(1)]
        return self.dropout(x)


class BrainTeaserModel(nn.Module):
    def __init__(self, 
                 vocab_size=VOCAB_SIZE, 
                 embedding_dim=EMBEDDING_DIM, 
                 num_layers=NUM_LAYERS, 
                 num_heads=NUM_HEADS, 
                 dropout=DROPOUT, 
                 embed_mat=None, 
                 freeze=True,
                 n_class=4,
                 batch_first=True):
        super().__init__()
        
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.num_layers = num_layers
        self.num_heads = num_heads
        self.dropout_rate = dropout
        self.freeze=freeze
        self.n_class=n_class
        self.batch_first=True
        
        if embed_mat is None:
            self.embeddings = nn.Embedding(self.vocab_size, self.embedding_dim, padding_idx=2).to("cpu")
        else:
            self.embeddings = nn.Embedding.from_pretrained(embed_mat, padding_idx=2, freeze=self.freeze).to("cpu")
        self.pos_embeddings = PositionalEncoding(self.embedding_dim, dropout=self.dropout_rate).to(DEVICE)

        self.encoder_layer = nn.TransformerEncoderLayer(self.embedding_dim, nhead=self.num_heads, dropout=self.dropout_rate, batch_first=self.batch_first)
        self.decoder_layer = nn.TransformerDecoderLayer(self.embedding_dim, nhead=self.num_heads, dropout=self.dropout_rate, batch_first=self.batch_first)
        self.encoder1 = nn.TransformerEncoder(self.encoder_layer, self.num_layers).to(DEVICE)
        self.decoder1 = nn.TransformerDecoder(self.decoder_layer, self.num_layers).to(DEVICE)
        self.decoder2 = nn.TransformerDecoder(self.decoder_layer, self.num_layers).to(DEVICE)
        self.decoder3 = nn.TransformerDecoder(self.decoder_layer, self.num_layers).to(DEVICE)
        self.decoder4 = nn.TransformerDecoder(self.decoder_layer, self.num_layers).to(DEVICE)
        self.linear1 = nn.LazyLinear(self.n_class).to(DEVICE)
        
    def forward(self, a, b, c, d, e):
        a = self.pos_embeddings(self.embeddings(a.cpu()).cuda())
        b = self.pos_embeddings(self.embeddings(b.cpu()).cuda())
        c = self.pos_embeddings(self.embeddings(c.cpu()).cuda())
        d = self.pos_embeddings(self.embeddings(d.cpu()).cuda())
        e = self.pos_embeddings(self.embeddings(e.cpu()).cuda())
        
        a = self.encoder1(a)
        b = self.decoder1(b, a).mean(dim=1)
        c = self.decoder2(c, a).mean(dim=1)
        d = self.decoder3(d, a).mean(dim=1)
        e = self.decoder4(e, a).mean(dim=1)
        x = torch.cat([b, c, d, e], 1)
        x = self.linear1(x)
        return x

In [185]:
model = BrainTeaserModel(vocab_size=VOCAB_SIZE, 
                         embedding_dim=EMBEDDING_DIM, 
                         num_layers=NUM_LAYERS, 
                         num_heads=NUM_HEADS, 
                         dropout=0, 
                         embed_mat=emb_mat, 
                         freeze=True, 
                         batch_first=True)
# model.load_state_dict(torch.load("brain-teaser-4.pth"))
optimizer = torch.optim.Adam(model.parameters(), lr=1e-6)
criterion = nn.CrossEntropyLoss()
dataset = TrainDataset(question, choice1, choice2, choice3, choice4, labels, randomize_place=False)
sampler = torch.utils.data.RandomSampler(dataset, num_samples=50000)
training_data = DataLoader(dataset, batch_size=5, collate_fn=pad_collate, sampler=sampler)
accuracy = Accuracy(task="multiclass", num_classes=4).to(DEVICE)



In [None]:
EPOCHS=1000
for epoch in range(1, EPOCHS):
    postfix = {"loss": 0, "accuracy": 0}
    loss_val = 0
    acc_val = 0
    bar = tqdm(training_data, desc=f'Epoch {epoch}', postfix=postfix)
    for idx, (A, B, C, D, E, F) in enumerate(bar):
        A = A.to(DEVICE)
        B = B.to(DEVICE)
        C = C.to(DEVICE)
        D = D.to(DEVICE)
        E = E.to(DEVICE)
        F = F.to(DEVICE)

        model.train()
        optimizer.zero_grad()
        preds = model(A, B, C, D, E)
        loss = criterion(preds, F.long())
        loss.backward()
        optimizer.step()

        model.eval()
        with torch.inference_mode():
            if not np.isnan(loss.item()):
                loss_val += loss.item()
            acc_val += accuracy(preds, F).item()
            postfix["loss"] = loss_val / (idx + 1)
            postfix["accuracy"] = acc_val / (idx + 1)
        bar.set_postfix(postfix)
    torch.save(model.state_dict(), f"brain-teaser-4.pth")

In [198]:
torch.save(model.state_dict(), f"brain-teaser-4.pth")

# Validation

In [187]:
spval_data = pd.read_csv("/kaggle/input/cs779-brainteaser/SP_validation.csv")
spval_data["answer"] = spval_data["label"]
spval_data = spval_data.drop(["Unnamed: 0", "label"], axis=1)

sp_question = convert_sentences_to_tokens(spval_data["question"], eng_tokenizer)
sp_choice1 = convert_sentences_to_tokens(spval_data["choice1"], eng_tokenizer)
sp_choice2 = convert_sentences_to_tokens(spval_data["choice2"], eng_tokenizer)
sp_choice3 = convert_sentences_to_tokens(spval_data["choice3"], eng_tokenizer)
sp_choice4 = convert_sentences_to_tokens(spval_data["choice4"], eng_tokenizer)
sp_labels = spval_data["answer"].to_numpy().astype(np.uint8)

In [188]:
val_dataset = TestDataset(sp_question, sp_choice1, sp_choice2, sp_choice3, sp_choice4, sp_labels)
val_data = DataLoader(val_dataset, batch_size=5, collate_fn=pad_collate)

sp_data = []
bar = tqdm(val_data, postfix={"accuracy": 0})
acc = 0
for idx, (A, B, C, D, E, F) in enumerate(bar):
    A = A.to(DEVICE)
    B = B.to(DEVICE)
    C = C.to(DEVICE)
    D = D.to(DEVICE)
    E = E.to(DEVICE)
    F = F.to(DEVICE)
    
    preds = model(A, B, C, D, E)
    preds = preds.argmax(1)
    acc += accuracy(preds, F).item()
    bar.set_postfix({"accuracy": acc / (idx + 1)})
    
    for i in range(len(preds)):
        sp_data.append(preds[i].item())
        
sp_data = np.array(sp_data)

100%|██████████| 20/20 [00:00<00:00, 35.11it/s, accuracy=0.685]


In [189]:
wpval_data = pd.read_csv("/kaggle/input/cs779-brainteaser/WP_validation.csv")
wpval_data["answer"] = wpval_data["label"]
wpval_data = wpval_data.drop(["Unnamed: 0", "label"], axis=1)
wpval_data = wpval_data.iloc[2:]

wp_question = convert_sentences_to_tokens(wpval_data["question"], eng_tokenizer)
wp_choice1 = convert_sentences_to_tokens(wpval_data["choice1"], eng_tokenizer)
wp_choice2 = convert_sentences_to_tokens(wpval_data["choice2"], eng_tokenizer)
wp_choice3 = convert_sentences_to_tokens(wpval_data["choice3"], eng_tokenizer)
wp_choice4 = convert_sentences_to_tokens(wpval_data["choice4"], eng_tokenizer)
wp_labels = wpval_data["answer"].to_numpy().astype(np.uint8)

In [190]:
val_dataset = TestDataset(wp_question, wp_choice1, wp_choice2, wp_choice3, wp_choice4, wp_labels)
val_data = DataLoader(val_dataset, batch_size=5, collate_fn=pad_collate)

wp_data = []
bar = tqdm(val_data, postfix={"accuracy": 0})
acc = 0
for idx, (A, B, C, D, E, F) in enumerate(bar):
    A = A.to(DEVICE)
    B = B.to(DEVICE)
    C = C.to(DEVICE)
    D = D.to(DEVICE)
    E = E.to(DEVICE)
    F = F.to(DEVICE)
    
    preds = model(A, B, C, D, E)
    preds = preds.argmax(1)
    acc += accuracy(preds, F).item()
    bar.set_postfix({"accuracy": acc / (idx + 1)})
    
    for i in range(len(preds)):
        wp_data.append(preds[i].item())
        
wp_data = np.array(wp_data)

100%|██████████| 16/16 [00:00<00:00, 36.08it/s, accuracy=0.6]  


In [194]:
results = []

def calculate_acc(pred, true):
    return pred[pred == true].size / pred.size

original_acc = calculate_acc(sp_data[0::3], sp_labels[0::3])
semantic_acc = calculate_acc(sp_data[1::3], sp_labels[1::3])
context_acc = calculate_acc(sp_data[2::3], sp_labels[2::3])

original_semantic_acc = 0
original_semantic_context_acc = 0
for i in range(0, len(sp_data), 3):
    original_semantic_acc += np.floor(calculate_acc(sp_data[i: i+2], sp_labels[i: i+2]))
    original_semantic_context_acc += np.floor(calculate_acc(sp_data[i: i+3], sp_labels[i: i+3]))
original_semantic_acc /= (len(sp_data) // 3)
original_semantic_context_acc /= (len(sp_data) // 3)
total_acc = calculate_acc(sp_data, sp_labels)
results.append(["Sentence", original_acc, semantic_acc, context_acc, original_semantic_acc, original_semantic_context_acc, total_acc])
print("Sentence", original_acc, semantic_acc, context_acc, original_semantic_acc, original_semantic_context_acc, total_acc)

Sentence 0.7272727272727273 0.6666666666666666 0.6666666666666666 0.5151515151515151 0.36363636363636365 0.6868686868686869


In [195]:
def calculate_acc(pred, true):
    return pred[pred == true].size / pred.size

original_acc = calculate_acc(wp_data[0::3], wp_labels[0::3])
semantic_acc = calculate_acc(wp_data[1::3], wp_labels[1::3])
context_acc = calculate_acc(wp_data[2::3], wp_labels[2::3])

original_semantic_acc = 0
original_semantic_context_acc = 0
for i in range(0, len(wp_data), 3):
    original_semantic_acc += np.floor(calculate_acc(wp_data[i: i+2], wp_labels[i: i+2]))
    original_semantic_context_acc += np.floor(calculate_acc(wp_data[i: i+3], wp_labels[i: i+3]))
original_semantic_acc /= (len(wp_data) // 3)
original_semantic_context_acc /= (len(wp_data) // 3)
total_acc = calculate_acc(wp_data, wp_labels)
results.append(["Word", original_acc, semantic_acc, context_acc, original_semantic_acc, original_semantic_context_acc, total_acc])
print("Word", original_acc, semantic_acc, context_acc, original_semantic_acc, original_semantic_context_acc, total_acc)

Word 0.5384615384615384 0.5384615384615384 0.7692307692307693 0.38461538461538464 0.3076923076923077 0.6153846153846154


In [197]:
results = pd.DataFrame(results, columns=["Type", "Original", "Semantic", "Context", "Original+Semantic", "Original+Semantic+Reconstruction", "Accuracy"])
results.to_csv("results.csv", index=False)