In [7]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
from copy import copy
from tqdm import tqdm
import json
import pandas as pd

import torch
from transformers import BertModel, BertTokenizerFast
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader

from rouge import Rouge

### Hyperparameters

In [8]:
NUM_CLASSES = 4
MODEL_NAME = 'bert-base-uncased'
MAX_LEN = 128

LR = 2e-5
TRAINING_BATCH_SIZE = 16
VAL_BATCH_SIZE = 1
EPOCHS = 4
DROPOUT = 0.3
NUM_WORKERS = 4

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

### Read dataset

In [9]:
loc = './dataset/'

train = pd.read_csv(loc + 'train.csv')
test = pd.read_json(loc + 'test.jsonl', lines=True)
val = pd.read_csv(loc + 'valid.csv')

### Create Dataset

In [10]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self, df, tokenizer, max_len, add_CLS=True):
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.add_CLS = add_CLS

    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, index):
        row = self.df.iloc[index]
        id = row['id']
        context = row['context']
        question = row['question']
        opt1 = row['answer0']
        opt2 = row['answer1']
        opt3 = row['answer2']
        opt4 = row['answer3']
        correct_opt = row['label']

        context_emb = self.tokenizer(context, max_length=self.max_len, padding='max_length', 
                                     return_attention_mask=True, truncation=True, 
                                     add_special_tokens=self.add_CLS, return_tensors='pt')
        print(context_emb)
        return 0


**Initialize the pretrained tokenizer**

In [11]:
tokenizer = BertTokenizerFast.from_pretrained(MODEL_NAME)

In [12]:
train_dataset = Dataset(train, tokenizer, MAX_LEN)
val_dataset = Dataset(val, tokenizer, MAX_LEN)
test_dataset = Dataset(test, tokenizer, MAX_LEN)

In [13]:
train_dataset[0]

{'input_ids': tensor([[ 101, 2204, 2214, 2162, 1998, 2711, 1048, 1024, 1045, 2387, 2119, 1997,
         2122, 4996, 9317, 2305, 1010, 1998, 2027, 2119, 8682, 2033, 2185, 1012,
         5667, 1012, 2204, 2214, 2162, 2003, 6490, 1998, 3084, 2033, 2868, 1012,
         1045, 2428, 2064, 2025, 2393, 2021, 2022, 3407, 2043, 1045, 4952, 2000,
         2068, 1025, 1045, 2228, 2009, 1005, 1055, 1996, 2755, 2008, 2027, 2790,
         2061, 3407, 3209, 2043, 2027, 2209, 1012,  102,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0,

0