In [None]:
!pip install datasets

In [None]:
!wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz
!gzip -d cc.en.300.bin.gz

# Imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import time
from torch import nn
import torch
import spacy
import fasttext
from collections import Counter
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from torchmetrics import Accuracy
from datasets import load_dataset
import pickle

# spaCy English pipeline; the cells shown only use its tokenizer.
english = spacy.load("en_core_web_sm")
eng_tokenizer = english.tokenizer

# Use the GPU when one is available, otherwise fall back to the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# Number of rows of the embedding table (build_vocab reserves rows 0-3 for special tokens).
VOCAB_SIZE=2000000
# Depth of each transformer encoder/decoder stack.
NUM_LAYERS=4 # 8
# Attention heads per transformer layer.
NUM_HEADS=6 # 10
# Dropout probability used by the positional encoding and the transformer layers.
DROPOUT=0.1
# Width of each word vector / transformer model dimension.
EMBEDDING_DIM=300

In [None]:
# os.remove("cc.en.300.bin.gz")
# os.remove("machine-translation-transformer.pth")

# Loading data

In [None]:
# Load the SP training records and keep only the question, the options and the label.
sp_train_records = np.load("/kaggle/input/cs779-brainteaser/SP-train.npy", allow_pickle=True)
data = pd.DataFrame.from_records(sp_train_records).drop(
    columns=["answer", "distractor1", "distractor2", "distractor(unsure)", "choice_order"]
)
# Spread each row's list of options over four dedicated columns.
data[["choice1", "choice2", "choice3", "choice4"]] = np.array(data["choice_list"].to_list())
data = data.drop(columns=["choice_list", "id"])
data.head()

In [None]:
# Load the validation questions and spread the list of options over four columns.
sp_val_records = np.load("/kaggle/input/cs779-brainteaser/SP_val_question_random.npy", allow_pickle=True)
val_data = pd.DataFrame.from_records(sp_val_records)
val_data[["choice1", "choice2", "choice3", "choice4"]] = np.array(val_data["choice_list"].to_list())
val_data = val_data.drop(columns=["choice_list"])
val_data.head()

# Preprocessing data

In [None]:
def build_vocab(ft, vocab_size, embedding_dim=EMBEDDING_DIM):
    """Build the vocabulary maps and the embedding matrix from a fastText model.

    Indices 0-3 are reserved for ``<SOS>``, ``<EOS>``, ``<PAD>`` and ``<UNK>``;
    their embedding rows stay all-zero. The words of ``ft.words`` then receive
    consecutive indices starting at 4 until ``vocab_size`` rows are in use or
    the word list is exhausted.

    Args:
        ft: object exposing ``words`` (iterable of str) and ``get_word_vector(word)``.
        vocab_size: number of rows of the embedding matrix, special tokens included.
        embedding_dim: length of each word vector.

    Returns:
        ``(word2idx, idx2word, mat)`` where ``mat`` is a float tensor of shape
        ``(vocab_size, embedding_dim)`` allocated on ``DEVICE``.
    """
    word2idx = {"<SOS>": 0, "<EOS>": 1, "<PAD>": 2, "<UNK>": 3}
    idx2word = {index: token for token, index in word2idx.items()}
    mat = torch.zeros((vocab_size, embedding_dim), dtype=torch.float, device=DEVICE)
    first_free = len(word2idx)
    # Pairing the words with range(first_free, vocab_size) bounds the row index
    # before any write, so a vocab_size of 4 or less no longer raises IndexError.
    for index, word in zip(range(first_free, vocab_size), ft.words):
        word2idx[word] = index
        idx2word[index] = word
        mat[index] = torch.tensor(ft.get_word_vector(word))
    return word2idx, idx2word, mat

# Load the fastText binary, derive the vocabulary and embedding matrix from it,
# then drop the reference to the fastText model (the cells shown do not use it again).
ftmodel = fasttext.load_model("cc.en.300.bin")
word2idx, idx2word, emb_mat = build_vocab(ftmodel, VOCAB_SIZE)
del ftmodel

In [None]:
# Write the embedding matrix and the vocabulary to disk; the next cell reads both files back.
with open("embedding.mat", "wb") as matrix_file:
    torch.save(emb_mat, matrix_file)

with open("vocab.pkl", "wb") as vocab_file:
    pickle.dump(word2idx, vocab_file)

In [12]:
# Reload the cached vocabulary and rebuild the reverse (index -> word) map.
with open("vocab.pkl", "rb") as f:
    word2idx = pickle.load(f)
idx2word = {index: word for word, index in word2idx.items()}

# map_location keeps the load working when the tensor was saved from a device
# this session does not have (e.g. a CUDA tensor reloaded on a CPU-only kernel).
with open("embedding.mat", "rb") as f:
    emb_mat = torch.load(f, map_location=DEVICE)

# Tokenize data

In [None]:
def convert_to_sentence(tokens, idx2word):
    """Turn a sequence of token ids back into a space-separated string.

    Ids 0, 1 and 2 are skipped; every other id is looked up in ``idx2word``
    (a missing id raises ``KeyError``).
    """
    skipped_ids = [0, 1, 2]
    return " ".join(idx2word[tok] for tok in tokens if tok not in skipped_ids)

def convert_sentences_to_tokens(source, tokenizer):
    """Tokenise every text in ``source`` and lower-case each token.

    ``tokenizer.pipe(source)`` must yield one iterable of tokens per text,
    each token exposing a ``text`` attribute. Returns a list with one list
    of lower-cased token strings per input text.
    """
    return [[tok.text.lower() for tok in doc] for doc in tokenizer.pipe(source)]

In [None]:
# Shuffle the row positions once, reproducibly, then hold out everything after
# the first N_TRAIN rows as a local test split. A seeded permutation replaces
# argsort over random ints in [0, 256), whose many ties are ordered by the sort
# rather than by chance and whose result changed on every run.
SPLIT_SEED = 0
N_TRAIN = 457
idx = np.random.default_rng(SPLIT_SEED).permutation(len(data))
idx, ridx = idx[:N_TRAIN], idx[N_TRAIN:]

In [None]:
# Tokenise the question and the four choices of the training rows (positions in `idx`).
question, choice1, choice2, choice3, choice4 = (
    convert_sentences_to_tokens(data[column].iloc[idx], eng_tokenizer)
    for column in ("question", "choice1", "choice2", "choice3", "choice4")
)
labels = data["label"].iloc[idx]

In [None]:
# Tokenise the question and the four choices of the held-out rows (positions in `ridx`).
test_question, test_choice1, test_choice2, test_choice3, test_choice4 = (
    convert_sentences_to_tokens(data[column].iloc[ridx], eng_tokenizer)
    for column in ("question", "choice1", "choice2", "choice3", "choice4")
)
test_labels = data["label"].iloc[ridx]

In [None]:
# Tokenise the question and the four choices of the validation frame.
val_question, val_choice1, val_choice2, val_choice3, val_choice4 = (
    convert_sentences_to_tokens(val_data[column], eng_tokenizer)
    for column in ("question", "choice1", "choice2", "choice3", "choice4")
)

# Analyzing dataset

In [None]:
import seaborn as sns

# Distribution of the `label` column of the training frame, drawn with 4 bins.
sns.histplot(data["label"], bins=4)

In [None]:
# Token frequencies over the questions and all four choices of the training split.
c = Counter()
for row_fields in zip(question, choice1, choice2, choice3, choice4):
    for tokens in row_fields:
        c.update(tokens)

# Bar chart of the 50 most frequent tokens (column 0 = token, column 1 = count).
d = pd.DataFrame(c.most_common(50))
plt.figure(figsize=(30, 10))
sns.barplot(d, x=0, y=1)

# Dataset

In [None]:
class TrainDataset(Dataset):
    """Map-style dataset of tokenised multiple-choice questions with labels.

    Each item is ``(question, choice1, choice2, choice3, choice4, label)``:
    five 1-D tensors of vocabulary ids (unknown words map to id 3) and a
    one-element tensor holding the position of the correct choice.

    Args:
        question, choice1, choice2, choice3, choice4: per-sample lists of
            token strings.
        label: per-sample position of the correct choice.
        randomize_place: when True, the four choices are shuffled on every
            access and the label is remapped to follow the correct choice.
        vocab: optional word -> id mapping; when omitted, the module-level
            ``word2idx`` is looked up at access time.
    """

    def __init__(self, question, choice1, choice2, choice3, choice4, label, randomize_place=False, vocab=None):
        # Derives from Dataset rather than DataLoader: this object is the data
        # source handed to a DataLoader and never ran DataLoader.__init__.
        self.randomize_place = randomize_place
        self.vocab = vocab
        self.question = np.array(question, dtype=object)
        self.choice1 = np.array(choice1, dtype=object)
        self.choice2 = np.array(choice2, dtype=object)
        self.choice3 = np.array(choice3, dtype=object)
        self.choice4 = np.array(choice4, dtype=object)
        self.label = np.array(label)

    def __len__(self):
        return self.question.shape[0]

    def _encode(self, tokens):
        """Map token strings to a tensor of vocabulary ids (3 for unknown words)."""
        vocab = word2idx if self.vocab is None else self.vocab
        return torch.tensor([vocab.get(word, 3) for word in tokens])

    def __getitem__(self, idx):
        question = self._encode(self.question[idx])
        choices = [
            self._encode(column[idx])
            for column in (self.choice1, self.choice2, self.choice3, self.choice4)
        ]
        if self.randomize_place:
            # order[j] is the original slot of the choice now shown at position j.
            order = np.random.permutation(4)
            # Index a plain list: a NumPy object array of equal-length tensors
            # would be split into scalars instead of staying four tensors.
            choices = [choices[source] for source in order]
            # The correct choice lands where order equals its original slot,
            # i.e. the inverse permutation (order[label] is the wrong direction).
            new_position = int(np.flatnonzero(order == self.label[idx])[0])
            label = torch.tensor([new_position])
        else:
            label = torch.tensor([self.label[idx]])
        choice1, choice2, choice3, choice4 = choices
        return question, choice1, choice2, choice3, choice4, label
    
    
class TestDataset(Dataset):
    """Map-style dataset of tokenised multiple-choice questions for evaluation.

    Each item is ``(question, choice1, choice2, choice3, choice4, label)``:
    five 1-D tensors of vocabulary ids (unknown words map to id 3) and a
    one-element label tensor. When no labels are supplied the label is a
    placeholder ``0`` so items keep the same six-field layout.

    Args:
        question, choice1, choice2, choice3, choice4: per-sample lists of
            token strings.
        label: optional per-sample position of the correct choice.
        vocab: optional word -> id mapping; when omitted, the module-level
            ``word2idx`` is looked up at access time.
    """

    def __init__(self, question, choice1, choice2, choice3, choice4, label=None, vocab=None):
        # Derives from Dataset rather than DataLoader: this object is the data
        # source handed to a DataLoader and never ran DataLoader.__init__.
        self.vocab = vocab
        self.question = np.array(question, dtype=object)
        self.choice1 = np.array(choice1, dtype=object)
        self.choice2 = np.array(choice2, dtype=object)
        self.choice3 = np.array(choice3, dtype=object)
        self.choice4 = np.array(choice4, dtype=object)
        self.label = None if label is None else np.array(label, dtype=object)

    def __len__(self):
        return self.question.shape[0]

    def _encode(self, tokens):
        """Map token strings to a tensor of vocabulary ids (3 for unknown words)."""
        vocab = word2idx if self.vocab is None else self.vocab
        return torch.tensor([vocab.get(word, 3) for word in tokens])

    def __getitem__(self, idx):
        question = self._encode(self.question[idx])
        choice1 = self._encode(self.choice1[idx])
        choice2 = self._encode(self.choice2[idx])
        choice3 = self._encode(self.choice3[idx])
        choice4 = self._encode(self.choice4[idx])
        if self.label is not None:
            label = torch.tensor([self.label[idx]])
        else:
            label = torch.tensor([0])
        return question, choice1, choice2, choice3, choice4, label
    
def pad_collate(batch):
    """Collate six-field samples into padded batch tensors.

    ``batch`` is a sequence of ``(question, choice1, choice2, choice3,
    choice4, label)`` items. The five token sequences are each right-padded
    with id 2 to the longest sequence of that field (batch dimension first);
    the labels are gathered into one tensor. Returns the six tensors in the
    same field order.
    """
    question, first, second, third, fourth, labels = zip(*batch)
    padded = tuple(
        nn.utils.rnn.pad_sequence(sequences, batch_first=True, padding_value=2)
        for sequences in (question, first, second, third, fourth)
    )
    return (*padded, torch.tensor(labels))

def test_pad_collate(batch):
    (a, b, c, d, e) = zip(*batch)
    a = nn.utils.rnn.pad_sequence(a, batch_first=True, padding_value=2)
    b = nn.utils.rnn.pad_sequence(b, batch_first=True, padding_value=2)
    c = nn.utils.rnn.pad_sequence(c, batch_first=True, padding_value=2)
    d = nn.utils.rnn.pad_sequence(d, batch_first=True, padding_value=2)
    e = nn.utils.rnn.pad_sequence(e, batch_first=True, padding_value=2)
    return a, b, c, d, e

# First Model

In [None]:
class PositionalEncoding(nn.Module):
    """Add a fixed sinusoidal position table to the input, then apply dropout.

    The table has shape ``(1, max_len, d_model)`` and is registered as the
    buffer ``pe``: even feature columns hold sines, odd columns cosines.

    Args:
        d_model: size of the feature dimension.
        dropout: dropout probability applied after the addition.
        max_len: number of positions the table covers.
    """

    def __init__(self, d_model, dropout=DROPOUT, max_len=5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)
        positions = torch.arange(max_len).unsqueeze(1)
        frequencies = torch.exp(torch.arange(0, d_model, 2) * (-np.log(10000.0) / d_model))
        angles = positions * frequencies
        table = torch.zeros(1, max_len, d_model)
        table[0, :, 0::2] = torch.sin(angles)
        table[0, :, 1::2] = torch.cos(angles)
        self.register_buffer('pe', table)

    def forward(self, x):
        # The sequence length is read from dim 1 of x; the first seq_len rows
        # of the table are broadcast-added to it.
        seq_len = x.size(1)
        return self.dropout(x + self.pe[0, :seq_len])


class BrainTeaserModel(nn.Module):
    """Score four answer choices against a question.

    The question is embedded and passed through a transformer encoder. Each
    choice is embedded and passed through its own transformer decoder with the
    encoded question as memory; every decoder output is averaged over dim 1,
    the four averages are concatenated and a linear layer maps them to
    ``n_class`` scores.

    Args:
        vocab_size: rows of the embedding table when ``embed_mat`` is None.
        embedding_dim: embedding width and transformer model dimension.
        num_layers: layers in the encoder and in each decoder.
        num_heads: attention heads per layer.
        dropout: dropout probability for the positional encoding and layers.
        embed_mat: optional pre-trained embedding matrix.
        freeze: passed to ``nn.Embedding.from_pretrained``; it has no effect
            when ``embed_mat`` is None.
        n_class: number of output scores.
        batch_first: accepted but not used; ``self.batch_first`` is always True.
    """
    def __init__(self, 
                 vocab_size=VOCAB_SIZE, 
                 embedding_dim=EMBEDDING_DIM, 
                 num_layers=NUM_LAYERS, 
                 num_heads=NUM_HEADS, 
                 dropout=DROPOUT, 
                 embed_mat=None, 
                 freeze=True,
                 n_class=4,
                 batch_first=True):
        super().__init__()
        
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.num_layers = num_layers
        self.num_heads = num_heads
        self.dropout_rate = dropout
        self.freeze=freeze
        self.n_class=n_class
        # NOTE(review): the batch_first argument is ignored here; forward() also
        # pools over dim 1, so only batch-first inputs are handled.
        self.batch_first=True
        
        # Index 2 is the padding id used by the collate functions.
        if embed_mat is None:
            self.embeddings = nn.Embedding(self.vocab_size, self.embedding_dim, padding_idx=2)
        else:
            self.embeddings = nn.Embedding.from_pretrained(embed_mat, padding_idx=2, freeze=self.freeze)
        self.pos_embeddings = PositionalEncoding(self.embedding_dim, dropout=self.dropout_rate)

        # NOTE(review): the template layers are stored as attributes, so their
        # parameters are part of state_dict() and model.parameters() as well.
        # nn.TransformerEncoder/Decoder are documented to copy the layer they
        # receive, so the four decoders should not share weights — confirm.
        self.encoder_layer = nn.TransformerEncoderLayer(self.embedding_dim, nhead=self.num_heads, dropout=self.dropout_rate, batch_first=self.batch_first)
        self.decoder_layer = nn.TransformerDecoderLayer(self.embedding_dim, nhead=self.num_heads, dropout=self.dropout_rate, batch_first=self.batch_first)
        self.encoder1 = nn.TransformerEncoder(self.encoder_layer, self.num_layers)
        self.decoder1 = nn.TransformerDecoder(self.decoder_layer, self.num_layers)
        self.decoder2 = nn.TransformerDecoder(self.decoder_layer, self.num_layers)
        self.decoder3 = nn.TransformerDecoder(self.decoder_layer, self.num_layers)
        self.decoder4 = nn.TransformerDecoder(self.decoder_layer, self.num_layers)
        # Input width is left to LazyLinear; forward() feeds it the four pooled
        # decoder outputs concatenated along dim 1.
        self.linear1 = nn.LazyLinear(self.n_class)
        
    def forward(self, a, b, c, d, e):
        """Return scores for the four choices b, c, d, e given the question a.

        All five arguments are id tensors; presumably shaped (batch, seq) as
        produced by pad_collate — TODO confirm.
        """
        a = self.pos_embeddings(self.embeddings(a))
        b = self.pos_embeddings(self.embeddings(b))
        c = self.pos_embeddings(self.embeddings(c))
        d = self.pos_embeddings(self.embeddings(d))
        e = self.pos_embeddings(self.embeddings(e))
        
        # NOTE(review): no padding masks are passed, so padded positions take
        # part in attention and in the means below.
        a = self.encoder1(a)
        b = self.decoder1(b, a).mean(dim=1)
        c = self.decoder2(c, a).mean(dim=1)
        d = self.decoder3(d, a).mean(dim=1)
        e = self.decoder4(e, a).mean(dim=1)
#         x = torch.cat([a.mean(dim=1), b, c, d, e], 1)
        x = torch.cat([b, c, d, e], 1)
        x = self.linear1(x)
        return x

# Instantiation

In [None]:
# Model with a randomly initialised embedding table (the pre-trained matrix is
# left commented out); its weights are then restored from a checkpoint.
model = BrainTeaserModel(vocab_size=VOCAB_SIZE,
                         embedding_dim=EMBEDDING_DIM,
                         num_layers=NUM_LAYERS,
                         num_heads=NUM_HEADS,
                         dropout=DROPOUT,
#                          embed_mat=emb_mat,
                         freeze=True,
                         batch_first=True).to(DEVICE)
# map_location lets a checkpoint written on a GPU load on a CPU-only session.
model.load_state_dict(torch.load("brain-teaser-1.pth", map_location=DEVICE))
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
# The training loop steps the scheduler once per batch: 10000 samples per pass
# at batch size 5 is 2000 steps, so the rate halves every 5 passes.
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 2000*5, gamma=0.5)
criterion = nn.CrossEntropyLoss()
dataset = TrainDataset(question, choice1, choice2, choice3, choice4, labels)
# Each pass over training_data draws 10000 samples from the dataset.
sampler = torch.utils.data.RandomSampler(dataset, num_samples=10000)
training_data = DataLoader(dataset, batch_size=5, collate_fn=pad_collate, sampler=sampler)
accuracy = Accuracy(task="multiclass", num_classes=4).to(DEVICE)

# Training

In [None]:
EPOCHS=1000
# range(1, EPOCHS + 1) runs exactly EPOCHS passes; range(1, EPOCHS) stopped one short.
for epoch in range(1, EPOCHS + 1):
    postfix = {"loss": 0, "accuracy": 0}
    loss_val = 0
    acc_val = 0
    bar = tqdm(training_data, desc=f'Epoch {epoch}', postfix=postfix)
    model.train()
    for idx, (A, B, C, D, E, F) in enumerate(bar):
        A = A.to(DEVICE)
        B = B.to(DEVICE)
        C = C.to(DEVICE)
        D = D.to(DEVICE)
        E = E.to(DEVICE)
        F = F.to(DEVICE)

        optimizer.zero_grad()
        preds = model(A, B, C, D, E)
        loss = criterion(preds, F.long())
        # A NaN/inf loss is neither back-propagated nor stepped: doing so would
        # write non-finite values into the weights. The scheduler advances only
        # together with a real optimizer step.
        if torch.isfinite(loss):
            loss.backward()
            optimizer.step()
            scheduler.step()
            loss_val += loss.item()

        # Running averages shown in the progress bar (per-batch means).
        with torch.inference_mode():
            acc_val += accuracy(preds, F).item()
        postfix["loss"] = loss_val / (idx + 1)
        postfix["accuracy"] = acc_val / (idx + 1)
        bar.set_postfix(postfix)
    # Leave the model in eval mode after every pass, as the per-batch
    # train()/eval() toggling did, then checkpoint it.
    model.eval()
    torch.save(model.state_dict(), "brain-teaser-2.pth")

In [None]:
# Manual checkpoint of the current weights (same file the training loop writes).
current_weights = model.state_dict()
torch.save(current_weights, "brain-teaser-2.pth")

In [None]:
# Loader over the held-out rows tokenised above; no earlier cell defined
# `test_data`, so this cell failed on a fresh kernel.
test_data = DataLoader(
    TestDataset(test_question, test_choice1, test_choice2, test_choice3, test_choice4, test_labels),
    batch_size=5,
    collate_fn=pad_collate,
)

# NOTE(review): this rebinds `data`, which earlier cells use for the training
# DataFrame; reload it before re-running those cells.
data = []
bar = tqdm(test_data, postfix={"accuracy": 0})
acc = 0
model.eval()
with torch.inference_mode():
    for idx, (A, B, C, D, E, F) in enumerate(bar):
        A = A.to(DEVICE)
        B = B.to(DEVICE)
        C = C.to(DEVICE)
        D = D.to(DEVICE)
        E = E.to(DEVICE)
        F = F.to(DEVICE)

        preds = model(A, B, C, D, E)
        preds = preds.argmax(1)
        acc += accuracy(preds, F.long()).item()
        for i in range(len(preds)):
            # Print the decoded examples for the first batch only; every
            # batch still contributes to the predictions and the accuracy.
            if idx == 0:
                q = convert_to_sentence(A[i].cpu().numpy(), idx2word)
                c1 = convert_to_sentence(B[i].cpu().numpy(), idx2word)
                c2 = convert_to_sentence(C[i].cpu().numpy(), idx2word)
                c3 = convert_to_sentence(D[i].cpu().numpy(), idx2word)
                c4 = convert_to_sentence(E[i].cpu().numpy(), idx2word)
                print(f"Question: {q}\n1. {c1}\n2. {c2}\n3. {c3}\n4. {c4}")
                print(f"Prediction: {preds[i].item()+1}\t Correct: {F[i].item()+1}\n\n")
            data.append(str(preds[i].item()))
        # Previously unreachable behind an unconditional `break`.
        bar.set_postfix({"accuracy": acc / (idx + 1)})

# Race Dataset

In [None]:
race_data = load_dataset("race", "high")
race_data = pd.DataFrame(race_data["train"])
# Prepend the article so that passage and question form one input sequence.
race_data["question"] = "Context: "+ race_data["article"] + "\nQuestion: " + race_data["question"]
race_data[["choice1", "choice2", "choice3", "choice4"]] = race_data["options"].tolist()
# Pin the letter order so A-D always map to 0-3, whichever letters occur in
# the split; any other value becomes -1 and is caught by the assert.
race_data["answer"] = pd.Categorical(race_data["answer"], categories=["A", "B", "C", "D"]).codes
assert (race_data["answer"] >= 0).all(), "unexpected answer key in RACE"
race_data = race_data.drop(["example_id", "options", "article"], axis=1)
race_data.head()

In [None]:
# Tokenise the RACE questions and choices; labels become a uint8 array.
question, choice1, choice2, choice3, choice4 = (
    convert_sentences_to_tokens(race_data[column], eng_tokenizer)
    for column in ("question", "choice1", "choice2", "choice3", "choice4")
)
labels = race_data["answer"].to_numpy().astype(np.uint8)

In [None]:
# Drop the DataFrame now that its columns have been tokenised.
del race_data

In [None]:
class PositionalEncoding(nn.Module):
    """Add a fixed sinusoidal position table to the input, then apply dropout.

    The table has shape ``(1, max_len, d_model)`` and is registered as the
    buffer ``pe``: even feature columns hold sines, odd columns cosines.

    Args:
        d_model: size of the feature dimension.
        dropout: dropout probability applied after the addition.
        max_len: number of positions the table covers.
    """

    def __init__(self, d_model, dropout=DROPOUT, max_len=5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)
        positions = torch.arange(max_len).unsqueeze(1)
        frequencies = torch.exp(torch.arange(0, d_model, 2) * (-np.log(10000.0) / d_model))
        angles = positions * frequencies
        table = torch.zeros(1, max_len, d_model)
        table[0, :, 0::2] = torch.sin(angles)
        table[0, :, 1::2] = torch.cos(angles)
        self.register_buffer('pe', table)

    def forward(self, x):
        # The sequence length is read from dim 1 of x; the first seq_len rows
        # of the table are broadcast-added to it.
        seq_len = x.size(1)
        return self.dropout(x + self.pe[0, :seq_len])


class BrainTeaserModel(nn.Module):
    """Score four answer choices against a question.

    The question is embedded and passed through a transformer encoder. Each
    choice is embedded and passed through its own transformer decoder with the
    encoded question as memory; every decoder output is averaged over dim 1,
    the four averages are concatenated and a linear layer maps them to
    ``n_class`` scores.

    Args:
        vocab_size: rows of the embedding table when ``embed_mat`` is None.
        embedding_dim: embedding width and transformer model dimension.
        num_layers: layers in the encoder and in each decoder.
        num_heads: attention heads per layer.
        dropout: dropout probability for the positional encoding and layers.
        embed_mat: optional pre-trained embedding matrix.
        freeze: passed to ``nn.Embedding.from_pretrained``; it has no effect
            when ``embed_mat`` is None.
        n_class: number of output scores.
        batch_first: accepted but not used; ``self.batch_first`` is always True.
    """
    def __init__(self, 
                 vocab_size=VOCAB_SIZE, 
                 embedding_dim=EMBEDDING_DIM, 
                 num_layers=NUM_LAYERS, 
                 num_heads=NUM_HEADS, 
                 dropout=DROPOUT, 
                 embed_mat=None, 
                 freeze=True,
                 n_class=4,
                 batch_first=True):
        super().__init__()
        
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.num_layers = num_layers
        self.num_heads = num_heads
        self.dropout_rate = dropout
        self.freeze=freeze
        self.n_class=n_class
        # NOTE(review): the batch_first argument is ignored here; forward() also
        # pools over dim 1, so only batch-first inputs are handled.
        self.batch_first=True
        
        # Index 2 is the padding id used by the collate functions.
        if embed_mat is None:
            self.embeddings = nn.Embedding(self.vocab_size, self.embedding_dim, padding_idx=2)
        else:
            self.embeddings = nn.Embedding.from_pretrained(embed_mat, padding_idx=2, freeze=self.freeze)
        self.pos_embeddings = PositionalEncoding(self.embedding_dim, dropout=self.dropout_rate)

        # NOTE(review): the template layers are stored as attributes, so their
        # parameters are part of state_dict() and model.parameters() as well.
        # nn.TransformerEncoder/Decoder are documented to copy the layer they
        # receive, so the four decoders should not share weights — confirm.
        self.encoder_layer = nn.TransformerEncoderLayer(self.embedding_dim, nhead=self.num_heads, dropout=self.dropout_rate, batch_first=self.batch_first)
        self.decoder_layer = nn.TransformerDecoderLayer(self.embedding_dim, nhead=self.num_heads, dropout=self.dropout_rate, batch_first=self.batch_first)
        self.encoder1 = nn.TransformerEncoder(self.encoder_layer, self.num_layers)
        self.decoder1 = nn.TransformerDecoder(self.decoder_layer, self.num_layers)
        self.decoder2 = nn.TransformerDecoder(self.decoder_layer, self.num_layers)
        self.decoder3 = nn.TransformerDecoder(self.decoder_layer, self.num_layers)
        self.decoder4 = nn.TransformerDecoder(self.decoder_layer, self.num_layers)
        # Input width is left to LazyLinear; forward() feeds it the four pooled
        # decoder outputs concatenated along dim 1.
        self.linear1 = nn.LazyLinear(self.n_class)
        
    def forward(self, a, b, c, d, e):
        """Return scores for the four choices b, c, d, e given the question a.

        All five arguments are id tensors; presumably shaped (batch, seq) as
        produced by pad_collate — TODO confirm.
        """
        a = self.pos_embeddings(self.embeddings(a))
        b = self.pos_embeddings(self.embeddings(b))
        c = self.pos_embeddings(self.embeddings(c))
        d = self.pos_embeddings(self.embeddings(d))
        e = self.pos_embeddings(self.embeddings(e))
        
        # NOTE(review): no padding masks are passed, so padded positions take
        # part in attention and in the means below.
        a = self.encoder1(a)
        b = self.decoder1(b, a).mean(dim=1)
        c = self.decoder2(c, a).mean(dim=1)
        d = self.decoder3(d, a).mean(dim=1)
        e = self.decoder4(e, a).mean(dim=1)
#         x = torch.cat([a.mean(dim=1), b, c, d, e], 1)
        x = torch.cat([b, c, d, e], 1)
        x = self.linear1(x)
        return x

In [11]:
# Build the model on top of the pre-trained embedding matrix. `emb_mat` must
# already exist in the kernel (returned by build_vocab or reloaded from
# "embedding.mat"); running this cell before either raises NameError.
model = BrainTeaserModel(vocab_size=VOCAB_SIZE, 
                         embedding_dim=EMBEDDING_DIM, 
                         num_layers=NUM_LAYERS, 
                         num_heads=NUM_HEADS, 
                         dropout=DROPOUT, 
                         embed_mat=emb_mat, 
                         freeze=True, 
                         batch_first=True).to(DEVICE)
# model.load_state_dict(torch.load("brain-teaser-2.pth"))
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
# The training loop steps the scheduler once per batch: 50000 samples per pass
# at batch size 5 is 10000 steps, so the rate is scaled by 0.15 every 5 passes.
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 50000, gamma=0.15)
criterion = nn.CrossEntropyLoss()
dataset = TrainDataset(question, choice1, choice2, choice3, choice4, labels)
# Each pass over training_data draws 50000 samples from the dataset.
sampler = torch.utils.data.RandomSampler(dataset, num_samples=50000)
training_data = DataLoader(dataset, batch_size=5, collate_fn=pad_collate, sampler=sampler)
accuracy = Accuracy(task="multiclass", num_classes=4).to(DEVICE)

NameError: name 'emb_mat' is not defined

In [None]:
EPOCHS=1000
# range(1, EPOCHS + 1) runs exactly EPOCHS passes; range(1, EPOCHS) stopped one short.
for epoch in range(1, EPOCHS + 1):
    postfix = {"loss": 0, "accuracy": 0}
    loss_val = 0
    acc_val = 0
    bar = tqdm(training_data, desc=f'Epoch {epoch}', postfix=postfix)
    model.train()
    for idx, (A, B, C, D, E, F) in enumerate(bar):
        A = A.to(DEVICE)
        B = B.to(DEVICE)
        C = C.to(DEVICE)
        D = D.to(DEVICE)
        E = E.to(DEVICE)
        F = F.to(DEVICE)

        optimizer.zero_grad()
        preds = model(A, B, C, D, E)
        loss = criterion(preds, F.long())
        # A NaN/inf loss is neither back-propagated nor stepped: doing so would
        # write non-finite values into the weights. The scheduler advances only
        # together with a real optimizer step.
        if torch.isfinite(loss):
            loss.backward()
            optimizer.step()
            scheduler.step()
            loss_val += loss.item()

        # Running averages shown in the progress bar (per-batch means).
        with torch.inference_mode():
            acc_val += accuracy(preds, F).item()
        postfix["loss"] = loss_val / (idx + 1)
        postfix["accuracy"] = acc_val / (idx + 1)
        bar.set_postfix(postfix)
    # Leave the model in eval mode after every pass, as the per-batch
    # train()/eval() toggling did, then checkpoint it.
    model.eval()
    torch.save(model.state_dict(), "brain-teaser-2.pth")

In [None]:
# Manual checkpoint of the current weights (same file the training loop writes).
current_weights = model.state_dict()
torch.save(current_weights, "brain-teaser-2.pth")

In [None]:
# Debug probe: take the first batch yielded by `bar` (presumably the progress
# bar left over from the training cell — verify) and print the distinct token
# ids, with their counts, of its first question.
for idx, (A, B, C, D, E, F) in enumerate(bar):
    print(A[0].unique(return_counts=True))
    break

# RiddleSense dataset

In [None]:
# Reload the cached vocabulary and rebuild the reverse (index -> word) map.
with open("vocab.pkl", "rb") as f:
    word2idx = pickle.load(f)
idx2word = {index: word for word, index in word2idx.items()}

# map_location keeps the load working when the tensor was saved from a device
# this session does not have (e.g. a CUDA tensor reloaded on a CPU-only kernel).
with open("embedding.mat", "rb") as f:
    emb_mat = torch.load(f, map_location=DEVICE)

In [2]:
def convert_to_sentence(tokens, idx2word):
    """Turn a sequence of token ids back into a space-separated string.

    Ids 0, 1 and 2 are skipped; every other id is looked up in ``idx2word``
    (a missing id raises ``KeyError``).
    """
    skipped_ids = [0, 1, 2]
    return " ".join(idx2word[tok] for tok in tokens if tok not in skipped_ids)

def convert_sentences_to_tokens(source, tokenizer):
    """Tokenise every text in ``source`` and lower-case each token.

    ``tokenizer.pipe(source)`` must yield one iterable of tokens per text,
    each token exposing a ``text`` attribute. Returns a list with one list
    of lower-cased token strings per input text.
    """
    return [[tok.text.lower() for tok in doc] for doc in tokenizer.pipe(source)]

In [3]:
riddle_data = load_dataset("riddle_sense")
riddle_data = pd.DataFrame(riddle_data["train"])
# Keep only riddles whose answer is one of the first four options, so that
# the fifth option can be dropped below.
riddle_data = riddle_data[riddle_data["answerKey"] != 'E']
riddle_data = riddle_data.reset_index(drop=True)
second_data = pd.DataFrame.from_records(riddle_data["choices"])
riddle_data[["choice1", "choice2", "choice3", "choice4"]] = np.array(second_data["text"].tolist())[:, :4]
# Pin the letter order so A-D always map to 0-3, whichever letters occur in
# the split; any other value becomes -1 and is caught by the assert.
riddle_data["answer"] = pd.Categorical(riddle_data["answerKey"], categories=["A", "B", "C", "D"]).codes
assert (riddle_data["answer"] >= 0).all(), "unexpected answer key in riddle_sense"
riddle_data = riddle_data.drop(["answerKey", "choices"], axis=1)
riddle_data.head()

Downloading builder script:   0%|          | 0.00/2.09k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.34k [00:00<?, ?B/s]

Downloading and preparing dataset riddle_sense/default (download: 1.99 MiB, generated: 1.09 MiB, post-processed: Unknown size, total: 3.08 MiB) to /root/.cache/huggingface/datasets/riddle_sense/default/0.1.0/1b311d24c97e1fd41975315faf11fd918a56db0289367a99944ef0fa3dfd6811...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/375k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/414k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/3510 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1021 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1184 [00:00<?, ? examples/s]

Dataset riddle_sense downloaded and prepared to /root/.cache/huggingface/datasets/riddle_sense/default/0.1.0/1b311d24c97e1fd41975315faf11fd918a56db0289367a99944ef0fa3dfd6811. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,question,choice1,choice2,choice3,choice4,answer
0,What gets smaller as it gets fuller?,bit,put,hole,rice,2
1,"Whats weightless, visible to the naked eye, an...",amoeba,vision,hole,hydride,2
2,"what is weightless, and colorless. . but when ...",hole,measuring,heft,color,0
3,"What is lighter then a feather, can be seen by...",hole,find,dust,sclera,0
4,"I am always Hungery, I must always be Fed. th...",cyan,rust,maroon,flame,3


In [4]:
# Sentence-puzzle training records reduced to question, four choices and answer.
sp_records = np.load("/kaggle/input/cs779-brainteaser/SP-train.npy", allow_pickle=True)
sen_data = pd.DataFrame.from_records(sp_records).drop(
    columns=["answer", "distractor1", "distractor2", "distractor(unsure)", "choice_order"]
)
sen_data[["choice1", "choice2", "choice3", "choice4"]] = np.array(sen_data["choice_list"].to_list())
# The numeric label is stored under "answer", appended after the choice columns.
sen_data["answer"] = sen_data["label"]
sen_data = sen_data.drop(columns=["choice_list", "id", "label"])
sen_data.head()

Unnamed: 0,question,choice1,choice2,choice3,choice4,answer
0,Mr. and Mrs. Mustard have six daughters and ea...,Some daughters get married and have their own ...,Each daughter shares the same brother.,Some brothers were not loved by family and mov...,None of above.,1
1,The six daughters of Mr. and Mrs. Mustard each...,Some brothers were not loved by family and mov...,Some daughters get married and have their own ...,Each daughter shares the same brother.,None of above.,2
2,"A chess team has five players, and each player...",Each player shares the same coach.,Some players are backups and not allowed to play.,Some coaches get a raise.,None of above.,0
3,A woman shoots her husband. Then she holds him...,The woman gets arrested for murder after dinner.,The woman gets a new partner.,The woman was a photographer. She shot a pictu...,None of above.,2
4,An individual shoots their spouse. She continu...,The woman gets arrested for murder after dinner.,The woman was a photographer. She shot a pictu...,The woman gets a new partner.,None of above.,1


In [5]:
# Word-puzzle training records reduced to question, four choices and answer.
wp_records = np.load("/kaggle/input/cs779-brainteaser/WP-train.npy", allow_pickle=True)
word_data = pd.DataFrame.from_records(wp_records).drop(
    columns=["answer", "distractor1", "distractor2", "distractor(unsure)", "choice_order"]
)
word_data[["choice1", "choice2", "choice3", "choice4"]] = np.array(word_data["choice_list"].to_list())
# The numeric label is stored under "answer", appended after the choice columns.
word_data["answer"] = word_data["label"]
word_data = word_data.drop(columns=["choice_list", "id", "label"])
word_data.head()

Unnamed: 0,question,choice1,choice2,choice3,choice4,answer
0,How do you spell COW in thirteen letters?,SEE OH DEREFORD,SEE O DOUBLE YOU.,COWCOWCOWCOWW,None of above.,1
1,"In thirteen letters, how do you spell COW?",SEE OH DEREFORD,COWCOWCOWCOWW,SEE O DOUBLE YOU.,None of above.,2
2,How do you spell COB in seven letters?,COBCOBB,COBBLER,SEE O BEE,None of above.,2
3,"If eleven plus two equals one, what does nine ...",Four.,Two.,Three.,None of above.,1
4,What does nine plus five equal if eleven plus ...,Three.,Two.,Four.,None of above.,1


In [6]:
# Stack the three frames row-wise (concat's default axis); each keeps its own row index.
combined_frames = [riddle_data, sen_data, word_data]
data = pd.concat(combined_frames)
data.head()

Unnamed: 0,question,choice1,choice2,choice3,choice4,answer
0,What gets smaller as it gets fuller?,bit,put,hole,rice,2
1,"Whats weightless, visible to the naked eye, an...",amoeba,vision,hole,hydride,2
2,"what is weightless, and colorless. . but when ...",hole,measuring,heft,color,0
3,"What is lighter then a feather, can be seen by...",hole,find,dust,sclera,0
4,"I am always Hungery, I must always be Fed. th...",cyan,rust,maroon,flame,3


In [7]:
# Tokenise the combined questions and choices; labels become a uint8 array.
question, choice1, choice2, choice3, choice4 = (
    convert_sentences_to_tokens(data[column], eng_tokenizer)
    for column in ("question", "choice1", "choice2", "choice3", "choice4")
)
labels = data["answer"].to_numpy().astype(np.uint8)

In [8]:
# Drop the source frames now that their columns have been tokenised.
del riddle_data, sen_data, word_data, data

In [9]:
class TrainDataset(Dataset):
    """Map-style dataset of tokenised multiple-choice questions with labels.

    Each item is ``(question, choice1, choice2, choice3, choice4, label)``:
    five 1-D tensors of vocabulary ids (unknown words map to id 3) and a
    one-element tensor holding the position of the correct choice.

    Args:
        question, choice1, choice2, choice3, choice4: per-sample lists of
            token strings.
        label: per-sample position of the correct choice.
        randomize_place: when True, the four choices are shuffled on every
            access and the label is remapped to follow the correct choice.
        vocab: optional word -> id mapping; when omitted, the module-level
            ``word2idx`` is looked up at access time.
    """

    def __init__(self, question, choice1, choice2, choice3, choice4, label, randomize_place=False, vocab=None):
        # Derives from Dataset rather than DataLoader: this object is the data
        # source handed to a DataLoader and never ran DataLoader.__init__.
        self.randomize_place = randomize_place
        self.vocab = vocab
        self.question = np.array(question, dtype=object)
        self.choice1 = np.array(choice1, dtype=object)
        self.choice2 = np.array(choice2, dtype=object)
        self.choice3 = np.array(choice3, dtype=object)
        self.choice4 = np.array(choice4, dtype=object)
        self.label = np.array(label)

    def __len__(self):
        return self.question.shape[0]

    def _encode(self, tokens):
        """Map token strings to a tensor of vocabulary ids (3 for unknown words)."""
        vocab = word2idx if self.vocab is None else self.vocab
        return torch.tensor([vocab.get(word, 3) for word in tokens])

    def __getitem__(self, idx):
        question = self._encode(self.question[idx])
        choices = [
            self._encode(column[idx])
            for column in (self.choice1, self.choice2, self.choice3, self.choice4)
        ]
        if self.randomize_place:
            # order[j] is the original slot of the choice now shown at position j.
            order = np.random.permutation(4)
            # Index a plain list: a NumPy object array of equal-length tensors
            # would be split into scalars instead of staying four tensors.
            choices = [choices[source] for source in order]
            # The correct choice lands where order equals its original slot,
            # i.e. the inverse permutation (order[label] is the wrong direction).
            new_position = int(np.flatnonzero(order == self.label[idx])[0])
            label = torch.tensor([new_position])
        else:
            label = torch.tensor([self.label[idx]])
        choice1, choice2, choice3, choice4 = choices
        return question, choice1, choice2, choice3, choice4, label
    
    
class TestDataset(Dataset):
    """Map-style dataset of tokenised multiple-choice questions for evaluation.

    Each item is ``(question, choice1, choice2, choice3, choice4, label)``:
    five 1-D tensors of vocabulary ids (unknown words map to id 3) and a
    one-element label tensor. When no labels are supplied the label is a
    placeholder ``0`` so items keep the same six-field layout.

    Args:
        question, choice1, choice2, choice3, choice4: per-sample lists of
            token strings.
        label: optional per-sample position of the correct choice.
        vocab: optional word -> id mapping; when omitted, the module-level
            ``word2idx`` is looked up at access time.
    """

    def __init__(self, question, choice1, choice2, choice3, choice4, label=None, vocab=None):
        # Derives from Dataset rather than DataLoader: this object is the data
        # source handed to a DataLoader and never ran DataLoader.__init__.
        self.vocab = vocab
        self.question = np.array(question, dtype=object)
        self.choice1 = np.array(choice1, dtype=object)
        self.choice2 = np.array(choice2, dtype=object)
        self.choice3 = np.array(choice3, dtype=object)
        self.choice4 = np.array(choice4, dtype=object)
        self.label = None if label is None else np.array(label, dtype=object)

    def __len__(self):
        return self.question.shape[0]

    def _encode(self, tokens):
        """Map token strings to a tensor of vocabulary ids (3 for unknown words)."""
        vocab = word2idx if self.vocab is None else self.vocab
        return torch.tensor([vocab.get(word, 3) for word in tokens])

    def __getitem__(self, idx):
        question = self._encode(self.question[idx])
        choice1 = self._encode(self.choice1[idx])
        choice2 = self._encode(self.choice2[idx])
        choice3 = self._encode(self.choice3[idx])
        choice4 = self._encode(self.choice4[idx])
        if self.label is not None:
            label = torch.tensor([self.label[idx]])
        else:
            label = torch.tensor([0])
        return question, choice1, choice2, choice3, choice4, label
    
def pad_collate(batch):
    """Collate six-field samples into padded batch tensors.

    ``batch`` is a sequence of ``(question, choice1, choice2, choice3,
    choice4, label)`` items. The five token sequences are each right-padded
    with id 2 to the longest sequence of that field (batch dimension first);
    the labels are gathered into one tensor. Returns the six tensors in the
    same field order.
    """
    question, first, second, third, fourth, labels = zip(*batch)
    padded = tuple(
        nn.utils.rnn.pad_sequence(sequences, batch_first=True, padding_value=2)
        for sequences in (question, first, second, third, fourth)
    )
    return (*padded, torch.tensor(labels))

def test_pad_collate(batch):
    (a, b, c, d, e) = zip(*batch)
    a = nn.utils.rnn.pad_sequence(a, batch_first=True, padding_value=2)
    b = nn.utils.rnn.pad_sequence(b, batch_first=True, padding_value=2)
    c = nn.utils.rnn.pad_sequence(c, batch_first=True, padding_value=2)
    d = nn.utils.rnn.pad_sequence(d, batch_first=True, padding_value=2)
    e = nn.utils.rnn.pad_sequence(e, batch_first=True, padding_value=2)
    return a, b, c, d, e

In [10]:
class PositionalEncoding(nn.Module):
    """Add a fixed sinusoidal position table to the input, then apply dropout.

    The table has shape ``(1, max_len, d_model)`` and is registered as the
    buffer ``pe``: even feature columns hold sines, odd columns cosines.

    Args:
        d_model: size of the feature dimension.
        dropout: dropout probability applied after the addition.
        max_len: number of positions the table covers.
    """

    def __init__(self, d_model, dropout=DROPOUT, max_len=5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)
        positions = torch.arange(max_len).unsqueeze(1)
        frequencies = torch.exp(torch.arange(0, d_model, 2) * (-np.log(10000.0) / d_model))
        angles = positions * frequencies
        table = torch.zeros(1, max_len, d_model)
        table[0, :, 0::2] = torch.sin(angles)
        table[0, :, 1::2] = torch.cos(angles)
        self.register_buffer('pe', table)

    def forward(self, x):
        # The sequence length is read from dim 1 of x; the first seq_len rows
        # of the table are broadcast-added to it.
        seq_len = x.size(1)
        return self.dropout(x + self.pe[0, :seq_len])


class BrainTeaserModel(nn.Module):
    """Transformer that scores four answer choices against a question.

    The question is run through a Transformer encoder. Each choice is run
    through its own Transformer decoder with the encoded question as memory.
    The four decoder outputs are averaged over dim 1, concatenated and
    projected to ``n_class`` logits.

    The embedding table is placed on the CPU while the positional encoding,
    encoder, decoders and output layer are placed on ``DEVICE``; ``forward``
    moves the embedded tokens between the two.

    Args:
        vocab_size: Rows of the embedding table when ``embed_mat`` is None.
        embedding_dim: Embedding / model width.
        num_layers: Number of layers in the encoder and in each decoder.
        num_heads: Attention heads per layer.
        dropout: Dropout probability for the positional encoding and the layers.
        embed_mat: Optional pretrained embedding matrix; when given it replaces
            a freshly initialised table.
        freeze: Passed to ``nn.Embedding.from_pretrained`` for ``embed_mat``.
        n_class: Number of output logits.
        batch_first: Accepted for call compatibility but not used; the model
            always operates batch-first.
    """

    def __init__(self,
                 vocab_size=VOCAB_SIZE,
                 embedding_dim=EMBEDDING_DIM,
                 num_layers=NUM_LAYERS,
                 num_heads=NUM_HEADS,
                 dropout=DROPOUT,
                 embed_mat=None,
                 freeze=True,
                 n_class=4,
                 batch_first=True):
        super().__init__()

        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.num_layers = num_layers
        self.num_heads = num_heads
        self.dropout_rate = dropout
        self.freeze=freeze
        self.n_class=n_class
        # Always batch-first: forward() pools over dim 1 and the positional
        # encoding reads the sequence length from dim 1.
        self.batch_first=True

        # The embedding table stays on the CPU; index 2 is the padding token.
        if embed_mat is None:
            self.embeddings = nn.Embedding(self.vocab_size, self.embedding_dim, padding_idx=2).to("cpu")
        else:
            self.embeddings = nn.Embedding.from_pretrained(embed_mat, padding_idx=2, freeze=self.freeze).to("cpu")
        self.pos_embeddings = PositionalEncoding(self.embedding_dim, dropout=self.dropout_rate).to(DEVICE)

        # Template layers. nn.TransformerEncoder / nn.TransformerDecoder deep-copy
        # the layer they are given, so the four decoders do not share weights.
        # The templates remain registered sub-modules, so their parameters are
        # part of the state dict; the attribute names must not change.
        self.encoder_layer = nn.TransformerEncoderLayer(self.embedding_dim, nhead=self.num_heads, dropout=self.dropout_rate, batch_first=self.batch_first)
        self.decoder_layer = nn.TransformerDecoderLayer(self.embedding_dim, nhead=self.num_heads, dropout=self.dropout_rate, batch_first=self.batch_first)
        self.encoder1 = nn.TransformerEncoder(self.encoder_layer, self.num_layers).to(DEVICE)
        self.decoder1 = nn.TransformerDecoder(self.decoder_layer, self.num_layers).to(DEVICE)
        self.decoder2 = nn.TransformerDecoder(self.decoder_layer, self.num_layers).to(DEVICE)
        self.decoder3 = nn.TransformerDecoder(self.decoder_layer, self.num_layers).to(DEVICE)
        self.decoder4 = nn.TransformerDecoder(self.decoder_layer, self.num_layers).to(DEVICE)
        # Input width is inferred lazily (on first use or when a state dict is loaded).
        self.linear1 = nn.LazyLinear(self.n_class).to(DEVICE)

    def _embed(self, tokens):
        """Look up embeddings on the CPU, then add positional encodings on DEVICE."""
        # .to(DEVICE) instead of .cuda(): DEVICE falls back to "cpu" when no GPU
        # is available, whereas .cuda() would raise there.
        return self.pos_embeddings(self.embeddings(tokens.cpu()).to(DEVICE))

    def forward(self, a, b, c, d, e):
        """Return logits for question ``a`` and the four choices ``b``..``e``.

        All inputs are batches of token indices. No padding masks are passed to
        the encoder/decoders, so padded positions take part in attention and in
        the mean over dim 1.
        """
        # Embed all five inputs first, in order, before any transformer layer runs.
        a, b, c, d, e = (self._embed(tokens) for tokens in (a, b, c, d, e))

        memory = self.encoder1(a)
        decoders = (self.decoder1, self.decoder2, self.decoder3, self.decoder4)
        pooled = [
            decoder(choice, memory).mean(dim=1)
            for decoder, choice in zip(decoders, (b, c, d, e))
        ]
        return self.linear1(torch.cat(pooled, 1))

In [13]:
# Build the model around the pretrained embedding matrix and resume from the
# saved checkpoint.
model = BrainTeaserModel(vocab_size=VOCAB_SIZE,
                         embedding_dim=EMBEDDING_DIM,
                         num_layers=NUM_LAYERS,
                         num_heads=NUM_HEADS,
                         dropout=DROPOUT,
                         embed_mat=emb_mat,
                         freeze=True,
                         batch_first=True)
# map_location="cpu" lets a checkpoint written on a GPU machine be read on any
# machine; load_state_dict then copies each tensor into the parameter that
# already lives on its target device.
model.load_state_dict(torch.load("brain-teaser-3.pth", map_location="cpu"))
optimizer = torch.optim.Adam(model.parameters(), lr=1e-6)
criterion = nn.CrossEntropyLoss()
dataset = TrainDataset(question, choice1, choice2, choice3, choice4, labels, randomize_place=False)
# One pass over the loader draws 50,000 random samples, i.e. 10,000 batches of 5.
sampler = torch.utils.data.RandomSampler(dataset, num_samples=50000)
training_data = DataLoader(dataset, batch_size=5, collate_fn=pad_collate, sampler=sampler)
accuracy = Accuracy(task="multiclass", num_classes=4).to(DEVICE)



In [None]:
# Train for EPOCHS passes over `training_data`, showing running loss/accuracy
# in the progress bar and writing a checkpoint after every epoch.
EPOCHS=1000
try:
    # Upper bound is inclusive so that exactly EPOCHS epochs are run.
    for epoch in range(1, EPOCHS + 1):
        postfix = {"loss": 0, "accuracy": 0}
        loss_val = 0
        acc_val = 0
        bar = tqdm(training_data, desc=f'Epoch {epoch}', postfix=postfix)
        model.train()
        for idx, (A, B, C, D, E, F) in enumerate(bar):
            A = A.to(DEVICE)
            B = B.to(DEVICE)
            C = C.to(DEVICE)
            D = D.to(DEVICE)
            E = E.to(DEVICE)
            F = F.to(DEVICE)

            optimizer.zero_grad()
            preds = model(A, B, C, D, E)
            loss = criterion(preds, F.long())
            batch_loss = loss.item()
            # Back-propagating a NaN/inf loss would push non-finite values into
            # the weights, so such batches are skipped rather than applied.
            if np.isfinite(batch_loss):
                loss.backward()
                optimizer.step()
                loss_val += batch_loss

            # Metric bookkeeping only; no gradients are needed here.
            with torch.inference_mode():
                acc_val += accuracy(preds, F).item()
            # Running means over all batches seen so far in this epoch.
            postfix["loss"] = loss_val / (idx + 1)
            postfix["accuracy"] = acc_val / (idx + 1)
            bar.set_postfix(postfix)
        torch.save(model.state_dict(), "brain-teaser-3.pth")
finally:
    # Leave the model in eval mode however the loop ends (including an interrupt).
    model.eval()

Epoch 1: 100%|██████████| 10000/10000 [13:49<00:00, 12.06it/s, loss=0.419, accuracy=0.835]
Epoch 2: 100%|█████████▉| 9966/10000 [13:46<00:02, 12.26it/s, loss=0.389, accuracy=0.847]

In [None]:
# Save the current weights to the same checkpoint file the training loop writes.
torch.save(model.state_dict(), "brain-teaser-3.pth")

# Validation

In [18]:
# Load the WP practice evaluation file and spread each row's choice list over
# four separate columns.
wp_records = np.load("/kaggle/input/cs779-brainteaser/WP_eval_data_for_practice.npy", allow_pickle=True)
val_data = pd.DataFrame.from_records(wp_records)
choice_columns = ["choice1", "choice2", "choice3", "choice4"]
val_data[choice_columns] = np.array(val_data["choice_list"].to_list())
val_data = val_data.drop(columns=["choice_list"])
val_data.head()

Unnamed: 0,question,choice1,choice2,choice3,choice4
0,What kind of nut has no shell?,A peanut.,A Doughnut.,A walnut.,None of above.
1,Which nut doesn't have a shell?,A Doughnut.,A walnut.,A peanut.,None of above.
2,Which type of bell doesn't make a sound?\n\n,A fire bell.,A cow bell.,A Bluebell.,None of above.
3,What does a stone become when in the water?,A whetstone.,A limestone,A sandstone.,None of above.
4,What changes a stone makes when submerged in w...,A whetstone.,A sandstone.,A limestone,None of above.


In [19]:
# Tokenize the question and each of the four choice columns, in that order.
val_question, val_choice1, val_choice2, val_choice3, val_choice4 = (
    convert_sentences_to_tokens(val_data[column], eng_tokenizer)
    for column in ("question", "choice1", "choice2", "choice3", "choice4")
)

In [20]:
val_dataset = TestDataset(val_question, val_choice1, val_choice2, val_choice3, val_choice4)
# NOTE: this rebinds `val_data` from the DataFrame to a DataLoader, so the
# tokenization cell cannot be re-run after this one without reloading the frame.
val_data = DataLoader(val_dataset, batch_size=5, collate_fn=pad_collate)

# Set to True to print every decoded question with its choices and prediction.
SHOW_PREDICTIONS = False

# Predicted class index (as a string) for every sample, in dataset order.
data = []
# Make sure dropout is disabled regardless of which cells ran before this one,
# and skip autograd bookkeeping during inference.
model.eval()
with torch.no_grad():
    # The sixth batch element (labels) is not used for prediction.
    for A, B, C, D, E, _ in tqdm(val_data):
        A, B, C, D, E = (t.to(DEVICE) for t in (A, B, C, D, E))

        preds = model(A, B, C, D, E).argmax(1)

        if SHOW_PREDICTIONS:
            # Decoding is only done on request; it is not needed for the answers.
            for i in range(len(preds)):
                q = convert_to_sentence(A[i].cpu().numpy(), idx2word)
                c1 = convert_to_sentence(B[i].cpu().numpy(), idx2word)
                c2 = convert_to_sentence(C[i].cpu().numpy(), idx2word)
                c3 = convert_to_sentence(D[i].cpu().numpy(), idx2word)
                c4 = convert_to_sentence(E[i].cpu().numpy(), idx2word)
                print(f"Question: {q}\n1. {c1}\n2. {c2}\n3. {c3}\n4. {c4}")
                print(f"Prediction: {preds[i].item()+1}\n\n")

        data.extend(str(p) for p in preds.tolist())

100%|██████████| 24/24 [00:00<00:00, 30.84it/s]


In [21]:
# Write the collected prediction strings to disk, separated by newlines.
predictions_text = "\n".join(data)
with open("answer_word.txt", "w") as answer_file:
    answer_file.write(predictions_text)