In [None]:
import json
import pandas as pd
import numpy as np
import torch
import matplotlib
import matplotlib.pyplot as plt
from torch import nn
import torch.nn.functional as F
#import spacy
from collections import Counter
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
import random
import nltk
from nltk.tokenize import word_tokenize
from collections import Counter
nltk.download('punkt')
import os
import re
from nltk.corpus import stopwords
nltk.download('stopwords')
import torch.optim as optim

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [None]:
from google.colab import drive
# Mount Google Drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Locations of the train/dev JSON files on the mounted Drive.
train_data_path = '/content/drive/MyDrive/BIDAF/train-v2.0.json'
dev_data_path = '/content/drive/MyDrive/BIDAF/dev-v2.0.json'
# Decode explicitly as UTF-8 (the same way the GloVe file is opened) so the
# result does not depend on the platform's default locale encoding.
with open(train_data_path, 'r', encoding='utf-8') as f:
    train_data = json.load(f)
with open(dev_data_path, 'r', encoding='utf-8') as f:
    dev_data = json.load(f)

In [None]:
def data_to_df(data:dict)->list:
  """Flatten a nested QA dict into one flat record per answer.

  Walks ``data['data'] -> paragraphs -> qas -> answers`` and emits, for every
  answer, a dict with the keys ``id``, ``context``, ``question``, ``ans_idx``
  (``[start, start + len(answer)]`` character offsets) and ``answer``.
  Questions whose ``answers`` list is empty produce no record.

  Args:
      data: parsed JSON dict with a top-level ``'data'`` list.

  Returns:
      A list of dicts, in the order the answers are encountered.
  """
  records = []

  for article in data['data']:
      for paragraph in article['paragraphs']:
          passage = paragraph['context']

          for qa in paragraph['qas']:
              qa_id, query = qa['id'], qa['question']

              # One record per listed answer; the end offset is derived from
              # the answer length rather than read from the file.
              records.extend(
                  {
                      "id": qa_id,
                      "context": passage,
                      "question": query,
                      "ans_idx": [ans['answer_start'], ans['answer_start'] + len(ans['text'])],
                      "answer": ans['text'],
                  }
                  for ans in qa['answers']
              )
  return records

In [None]:
# Flatten the loaded train/dev JSON into per-answer records, then wrap each
# record list in a DataFrame (one row per answer).
f_train=data_to_df(train_data)
f_dev=data_to_df(dev_data)
train_df = pd.DataFrame(f_train)
dev_df = pd.DataFrame(f_dev)

In [None]:
# Lower-case every text column of both splits so that tokenisation and
# vocabulary lookups are case-insensitive.
for frame in (train_df, dev_df):
    for column in ('context', 'question', 'answer'):
        frame[column] = frame[column].str.lower()

In [None]:
dev_df.head(5)

Unnamed: 0,id,context,question,ans_idx,answer
0,56ddde6b9a695914005b9628,the normans (norman: nourmands; french: norman...,in what country is normandy located?,"[159, 165]",france
1,56ddde6b9a695914005b9628,the normans (norman: nourmands; french: norman...,in what country is normandy located?,"[159, 165]",france
2,56ddde6b9a695914005b9628,the normans (norman: nourmands; french: norman...,in what country is normandy located?,"[159, 165]",france
3,56ddde6b9a695914005b9628,the normans (norman: nourmands; french: norman...,in what country is normandy located?,"[159, 165]",france
4,56ddde6b9a695914005b9629,the normans (norman: nourmands; french: norman...,when were the normans in normandy?,"[94, 117]",10th and 11th centuries


In [None]:
train_df.head(5)

Unnamed: 0,id,context,question,ans_idx,answer
0,56be85543aeaaa14008c9063,beyoncé giselle knowles-carter (/biːˈjɒnseɪ/ b...,when did beyonce start becoming popular?,"[269, 286]",in the late 1990s
1,56be85543aeaaa14008c9065,beyoncé giselle knowles-carter (/biːˈjɒnseɪ/ b...,what areas did beyonce compete in when she was...,"[207, 226]",singing and dancing
2,56be85543aeaaa14008c9066,beyoncé giselle knowles-carter (/biːˈjɒnseɪ/ b...,when did beyonce leave destiny's child and bec...,"[526, 530]",2003
3,56bf6b0f3aeaaa14008c9601,beyoncé giselle knowles-carter (/biːˈjɒnseɪ/ b...,in what city and state did beyonce grow up?,"[166, 180]","houston, texas"
4,56bf6b0f3aeaaa14008c9602,beyoncé giselle knowles-carter (/biːˈjɒnseɪ/ b...,in which decade did beyonce become famous?,"[276, 286]",late 1990s


In [None]:
len(train_df)

86821

In [None]:
len(dev_df)

20302

In [None]:
#Mini sample for training and testing
# Seed the row sampling as well as the split so a re-run draws the same subset.
train_val_sample = train_df.sample(100, random_state = 47)
train_df_mini, dev_df_mini = train_test_split(train_val_sample, test_size = 0.2, random_state = 47, shuffle = True)
test_df_mini = dev_df.sample(50, random_state = 47)

In [None]:
len(train_df_mini)

80

In [None]:
len(dev_df_mini)

20

In [None]:
len(test_df_mini)

50

In [None]:
def unique_sents(dfs: list):
    """Collect the distinct context and question strings across several frames.

    Args:
        dfs: iterable of DataFrames, each with ``context`` and ``question``
            columns.

    Returns:
        A sorted list of the unique strings. Sorting matters: iterating a set
        of strings follows hash order, which changes between interpreter
        sessions, and the vocabulary built from this list assigns word ids in
        encounter order. A deterministic order keeps those ids (and anything
        saved to disk that is indexed by them) stable across kernel restarts.
    """
    unique_texts = set()
    for df in dfs:
        unique_texts.update(df['context'].unique())
        unique_texts.update(df['question'].unique())

    return sorted(unique_texts)

def build_vocab(dfs):
    """Build word-level and character-level vocabularies from the given frames.

    Every unique context/question string is word-tokenised for the word
    vocabulary and iterated character by character for the char vocabulary.
    Both vocabularies start with ``'<unk>'`` (index 0) and ``'<pad>'``
    (index 1). All words are kept; characters are kept only when they occur
    at least 10 times.

    Returns:
        ``(word2idx, idx2word, word_vocab, char2idx, char_vocab)``.
    """
    word_counts = Counter()
    char_counts = Counter()
    for sentence in unique_sents(dfs):
        word_counts.update(word_tokenize(sentence))
        char_counts.update(sentence)

    # Words: specials first, then every distinct token in first-seen order.
    word_vocab = ['<unk>', '<pad>'] + list(word_counts)
    word2idx = {}
    for position, token in enumerate(word_vocab):
        word2idx[token] = position
    idx2word = {}
    for token, position in word2idx.items():
        idx2word[position] = token

    # Characters: drop rare ones so they fall back to '<unk>'.
    frequent_chars = [ch for ch in char_counts if char_counts[ch] >= 10]
    char_vocab = ['<unk>', '<pad>'] + frequent_chars
    char2idx = {}
    for position, ch in enumerate(char_vocab):
        char2idx[ch] = position

    return word2idx, idx2word, word_vocab, char2idx, char_vocab


In [None]:
# Build the vocabularies from the full train and dev frames together, so the
# word vocabulary contains every token seen in either split.
word2idx, idx2word, word_vocab, char2idx, char_vocab = build_vocab([train_df, dev_df])
print("len of word2idx:",len(word2idx))

len of word2idx: 106972


In [None]:
def convert_to_ids(df, word2idx):
    """Add ``context_ids`` and ``question_ids`` columns holding word-id lists.

    Each text is word-tokenised and every token is mapped through
    ``word2idx``; unknown tokens map to the id of ``'<unk>'``. The frame is
    modified in place and also returned.
    """
    def text_to_ids(text):
        ids = []
        for token in word_tokenize(text):
            ids.append(word2idx.get(token, word2idx['<unk>']))
        return ids

    for source_col, target_col in (('context', 'context_ids'), ('question', 'question_ids')):
        df[target_col] = df[source_col].apply(text_to_ids)
    return df


In [None]:
# Attach the token-id columns to the full train and dev frames.
train_df = convert_to_ids(train_df,word2idx)
dev_df = convert_to_ids(dev_df,word2idx)

In [None]:
# Attach the token-id columns to the mini train/validation subsets as well.
train_df_mini = convert_to_ids(train_df_mini,word2idx)
dev_df_mini = convert_to_ids(dev_df_mini,word2idx)

In [None]:
print(train_df_mini.head(2))
print(dev_df_mini.head(2))

                             id  \
61175  57299a913f37b319004784ee   
37035  572616bfec44d21400f3d8a6   

                                                 context  \
61175  camouflage is an important defense strategy, w...   
37035  directly underneath the state apartments is a ...   

                                                question     ans_idx  \
61175  what does coloration and shape help an insect ...    [94, 99]   
37035  what type of entertaining are the semi-state a...  [172, 196]   

                         answer  \
61175                     blend   
37035  less formal entertaining   

                                             context_ids  \
61175  [14360, 64, 36, 1324, 6706, 269, 4, 53, 7030, ...   
37035  [7198, 14167, 8, 841, 5179, 64, 141, 16239, 16...   

                                            question_ids  
61175  [29, 242, 49968, 34, 2719, 459, 36, 4717, 1087...  
37035  [29, 425, 16, 24853, 125, 8, 57979, 5179, 601,...  
                             id 

In [None]:
def add_ans_label(df, idx2word):
    """Attach token-level answer spans and drop rows that cannot be labelled.

    For every row the answer and the context are word-tokenised and the
    answer's token sequence is searched for inside the context. The resulting
    ``[start_token, end_token]`` (inclusive) is stored in ``ans_label_idx``.

    When the answer tokens occur more than once in the context, the occurrence
    closest to the annotated character offset (``ans_idx[0]``) is chosen
    instead of blindly taking the first one; the offset is converted to an
    approximate token position by tokenising the context prefix.

    Rows are dropped when the answer tokenises to nothing, when it is not
    found as a token sequence in the context, or when the ids stored in
    ``context_ids`` do not map back (via ``idx2word``) to the first/last
    answer token.

    Args:
        df: frame with ``ans_idx``, ``answer``, ``context`` and
            ``context_ids`` columns. A placeholder ``ans_label_idx`` column is
            added to it in place.
        idx2word: mapping from word id to token.

    Returns:
        A new frame without the rows that could not be labelled.
    """
    err_idx = set()
    df['ans_label_idx'] = ' '
    for index, row in df.iterrows():
        answer_start, _answer_end = row['ans_idx']
        answer_tokens = word_tokenize(row['answer'])
        context_tokens = word_tokenize(row['context'])
        n_answer = len(answer_tokens)

        # An empty answer would trivially "match" at position 0 and then fail
        # on answer_tokens[0]; treat it as unlabelled instead.
        if n_answer == 0:
            err_idx.add(index)
            continue

        candidates = [
            i for i in range(len(context_tokens) - n_answer + 1)
            if context_tokens[i:i + n_answer] == answer_tokens
        ]
        if not candidates:
            err_idx.add(index)
            continue

        if len(candidates) == 1:
            start_idx = candidates[0]
        else:
            # Disambiguate repeated answers with the annotated char offset.
            approx_start = len(word_tokenize(row['context'][:answer_start]))
            start_idx = min(candidates, key=lambda i: abs(i - approx_start))
        end_idx = start_idx + n_answer - 1

        # Sanity check: the id sequence must agree with the token match.
        context_ids = row['context_ids']
        if idx2word.get(context_ids[start_idx]) != answer_tokens[0] or \
           idx2word.get(context_ids[end_idx]) != answer_tokens[-1]:
            err_idx.add(index)
            continue

        df.at[index, 'ans_label_idx'] = [start_idx, end_idx]

    df_final = df.drop(index=list(err_idx))

    return df_final

In [None]:
# Compute token-level answer spans for both splits; rows whose answer cannot
# be located in the tokenised context are removed by add_ans_label.
train_df = add_ans_label(train_df, idx2word)
dev_df = add_ans_label(dev_df, idx2word)

In [None]:
len(word_vocab)

106972

In [None]:
#glove embedding
def glove_dict():
    """Parse the GloVe text file into a ``word -> float32 vector`` dict.

    Each line is split on whitespace: the first field is the word and the
    remaining fields are its vector components.
    """
    embeddings = {}
    with open('/content/drive/MyDrive/BIDAF/glove.6B.100d.txt', "r", encoding="utf-8") as glove_file:
        for raw_line in glove_file:
            fields = raw_line.split()
            embeddings[fields[0]] = np.asarray(fields[1:], "float32")

    # The with-block has already closed the file; no explicit close is needed.
    return embeddings

# NOTE(review): this rebinds the name `glove_dict` from the loader function to
# the dict it returns, so re-running this line without re-running the `def`
# above fails because a dict is not callable.
glove_dict = glove_dict()
def create_weights_matrix(glove_dict):
    """Build the embedding matrix for the module-level ``word_vocab``.

    Row ``i`` holds the GloVe vector of ``word_vocab[i]``; words that have no
    entry in ``glove_dict`` keep an all-zero row.

    Args:
        glove_dict: mapping from word to a 100-dimensional vector.

    Returns:
        A ``(len(word_vocab), 100)`` float array.
    """
    weights_matrix = np.zeros((len(word_vocab), 100)) #300
    for i, word in enumerate(word_vocab):
        try:
            weights_matrix[i] = glove_dict[word]
        except KeyError:
            # Out-of-GloVe word: leave the zero row. Any other error (e.g. a
            # vector of the wrong size) is a real problem and must surface.
            continue
    return weights_matrix

In [None]:
# Build the vocabulary-aligned embedding matrix and cache it on Drive.
# Row i of the saved array corresponds to word_vocab[i].
weights_matrix = create_weights_matrix(glove_dict)
np.save('/content/drive/MyDrive/BIDAF/bidafglove_fullvocab_100d.npy', weights_matrix)

In [None]:
# NOTE(review): `glove_dict` is rebound here to the saved embedding matrix
# (a NumPy array), no longer the word -> vector dict.
glove_dict= np.load('/content/drive/MyDrive/BIDAF/bidafglove_fullvocab_100d.npy')

In [None]:
glove_dict.shape

(106972, 100)

In [None]:
print(glove_dict[:100])

[[ 0.          0.          0.         ...  0.          0.
   0.        ]
 [ 0.          0.          0.         ...  0.          0.
   0.        ]
 [-0.27891001 -0.22973999 -0.47454    ...  0.42433     0.14577
  -0.59513003]
 ...
 [ 0.37601     0.31900999  0.91013998 ... -1.30990005 -0.087775
   1.1415    ]
 [ 0.30658001  0.40744001  0.047907   ... -0.24964     0.51479
   0.50348997]
 [ 0.027048   -0.053833    0.16562    ... -0.50854999  0.13723999
   0.046243  ]]


In [None]:
weights_matrix.shape

(106972, 100)

In [None]:
# Re-draw the mini splits from the labelled frames. The sampling is seeded
# (like the split below) so a re-run trains and evaluates on the same rows.
train_val_sample = train_df.sample(1000, random_state = 47)
train_df_mini, dev_df_mini = train_test_split(train_val_sample, test_size = 0.2, random_state = 47, shuffle = True)
test_df_mini = dev_df.sample(500, random_state = 47)

In [None]:

class DatasetLoader(Dataset):
    """Pre-batched dataset: every item is one complete, padded batch.

    ``data`` is sliced into consecutive chunks of ``batch_size`` rows at
    construction time; ``dataset[i]`` converts chunk ``i`` into tensors.
    Word and character sequences are padded with index 1.
    """

    def __init__(self, data, batch_size):
        self.batch_size = batch_size
        self.data = [data[i:i+self.batch_size] for i in range(0, len(data), self.batch_size)]

    def __len__(self):
        """Number of batches."""
        return len(self.data)

    def __getitem__(self, index):
        """Return batch ``index`` as model-ready tensors plus raw metadata."""
        batch = self.data[index]
        return self.process_batch(batch)

    def process_batch(self, batch):
        """Turn one chunk of rows into padded tensors.

        Returns:
            ``(padded_context, padded_question, char_ctx, char_ques, label,
            ctx_text, answer_text, ids)`` where the first four are LongTensors
            of word / character ids, ``label`` is a LongTensor built from the
            ``ans_label_idx`` column, and the last three are plain lists.
        """
        # Token spans used to be computed here via get_span() but were never
        # returned; that extra tokenisation pass per batch has been removed.
        ctx_text = list(batch.context)
        answer_text = list(batch.answer)

        padded_context = self.pad_sequences([torch.LongTensor(ctx) for ctx in batch.context_ids])
        char_ctx = self.pad_char_sequences([self.make_char_vector(ctx) for ctx in batch.context])

        padded_question = self.pad_sequences([torch.LongTensor(ques) for ques in batch.question_ids])
        char_ques = self.pad_char_sequences([self.make_char_vector(ques) for ques in batch.question])

        ids = list(batch.id)
        label = torch.LongTensor(list(batch.ans_label_idx))

        return padded_context, padded_question, char_ctx, char_ques, label, ctx_text, answer_text, ids

    def pad_sequences(self, sequences):
        """Right-pad 1-D id tensors with 1 to the longest length in the batch."""
        max_len = max(len(seq) for seq in sequences)
        padded_seq = torch.ones(len(sequences), max_len).long()
        for i, seq in enumerate(sequences):
            padded_seq[i, :len(seq)] = seq
        return padded_seq

    def pad_char_sequences(self, sequences):
        """Pad nested char-id lists with 1 to ``[batch, max_words, max_word_len]``."""
        max_seq_len = max(len(seq) for seq in sequences)
        max_word_len = max(len(word) for seq in sequences for word in seq)
        padded_seq = torch.ones(len(sequences), max_seq_len, max_word_len).long()
        for i, seq in enumerate(sequences):
            for j, word in enumerate(seq):
                padded_seq[i, j, :len(word)] = torch.tensor(word)
        return padded_seq

    def make_char_vector(self, sentence):
        """Map each token of ``sentence`` to a list of char ids (unknown -> 0).

        Uses the module-level ``char2idx`` mapping.
        """
        char_vec = []
        for word in word_tokenize(sentence):
            char_idx = [char2idx.get(ch, 0) for ch in word]
            char_vec.append(char_idx)
        return char_vec

    def get_span(self, text):
        """Return ``(0, len(token))`` for every token of ``text``.

        NOTE(review): every span starts at 0, so these are token lengths, not
        character offsets into ``text`` — confirm the intent before using it.
        """
        tokens = word_tokenize(text)
        span = [(0, len(token)) for token in tokens]
        return span


In [None]:

# Wrap each mini split in a pre-batched dataset of 50 rows per batch.
train_dataset = DatasetLoader(train_df_mini, 50)
valid_dataset = DatasetLoader(dev_df_mini,50)
test_dataset = DatasetLoader(test_df_mini,50)

In [None]:
#Model
#T, C, C=1-T, y = h(x)*t(x)+x(1-t(x)) #implement 2 layers
'''class Highway(nn.Module):
    def __init__(self, input_size):
        super(Highway, self).__init__()
        self.transform_gate = nn.Linear(input_size, input_size)
        self.highway = nn.Linear(input_size, input_size)
        self.activation = nn.ReLU()

    def forward(self, x):
        transform_gate = torch.sigmoid(self.transform_gate(x))
        transformed = self.activation(self.highway(x))
        carry = 1 - transform_gate
        output = transform_gate * transformed + carry * x
        return output'''

#2 layer HN
class Highway(nn.Module):
  """Two stacked highway layers over the last dimension.

  Each layer computes ``y = t * relu(H(x)) + (1 - t) * x`` with the transform
  gate ``t = sigmoid(T(x))``; input and output sizes are identical.
  """

  def __init__(self,input_size):
    super(Highway, self).__init__()
    self.input_size = input_size
    self.t_gate1 = nn.Linear(input_size, input_size)
    self.t_gate2 = nn.Linear(input_size, input_size)
    self.h_layer1 = nn.Linear(input_size, input_size)
    self.h_layer2 = nn.Linear(input_size, input_size)
    self.activation = nn.ReLU()

  def forward(self, x):
    """Apply both highway layers in sequence and return the result."""
    stages = (
        (self.t_gate1, self.h_layer1),
        (self.t_gate2, self.h_layer2),
    )
    for gate_layer, transform_layer in stages:
      gate = torch.sigmoid(gate_layer(x))
      candidate = self.activation(transform_layer(x))
      # Gate mixes the transformed signal with the untouched input (carry).
      x = candidate*gate + x*(1-gate)
    return x

#Character Embedding Layer (rewrite)
class CharEmbedding(nn.Module):
  """Character-level CNN word encoder.

  Characters are embedded, a 2-D convolution is slid along each word's
  characters, and the result is max-pooled over the word so that every word
  becomes one ``output_shape``-dimensional vector.

  Args:
      c_vocab_size: number of character ids.
      c_emb_dim: character embedding size.
      output_shape: number of convolution filters (= output feature size).
      cnn_window_size: convolution kernel size, ``(height, width)``; the
          width is the number of characters covered by one window.
  """

  def __init__(self,  c_vocab_size, c_emb_dim, output_shape , cnn_window_size ):
    super(CharEmbedding, self).__init__()
    self.c_emb_dim = c_emb_dim
    self.char_embedding = nn.Embedding(c_vocab_size, c_emb_dim, padding_idx = 1)
    self.conv = nn.Conv2d(in_channels = 1, out_channels = output_shape, kernel_size = cnn_window_size)
    self.dropout = nn.Dropout(.2)
    self.act = nn.ReLU()

  def forward(self, x):
    """Encode ``x`` of shape [bs, seq_len, w_len] into [bs, seq_len, output]."""
    batch_size, seq_len, word_len = x.shape

    # The convolution needs at least `kernel width` characters per word; when
    # the longest word in the batch is shorter, right-pad with the pad id.
    kernel_width = self.conv.kernel_size[1]
    if word_len < kernel_width:
      x = F.pad(x, (0, kernel_width - word_len), value=self.char_embedding.padding_idx)
      word_len = kernel_width

    x = self.dropout(self.char_embedding(x)) #[bs, seq_len, w_len, c_emb_dim]
    x = x.permute(0,1,3,2) #[bs, seq_len, c_emb_dim, w_len]
    x = x.reshape(-1, 1, self.c_emb_dim, word_len) #[bs*seq_len, 1, c_emb_dim, w_len]
    x = self.act(self.conv(x)) #[bs*seq_len, output, 1, w_out]
    # Squeeze explicit axes only: a bare squeeze() would also remove the
    # word axis when w_out == 1 or the leading axis when bs*seq_len == 1.
    x = x.squeeze(2) #[bs*seq_len, output, w_out]
    x = F.max_pool1d(x, x.shape[2]).squeeze(2) #[bs*seq_len, output]
    x = x.view(batch_size, seq_len, -1) #[bs, seq_len, output]
    return x


#Bidirectional LSTM applied to the concatenated word-embedding and char-embedding output
class ContextEmbedding(nn.Module):
  """Single-layer bidirectional LSTM encoder.

  Maps ``[bs, seq_len, input_size]`` to ``[bs, seq_len, 2 * hidden_dim]``
  (forward and backward states concatenated).
  """
  def __init__(self,input_size, hidden_dim):
    super(ContextEmbedding, self).__init__()
    # No `dropout=` here: nn.LSTM only applies it between stacked layers, so
    # on a one-layer LSTM it does nothing except raise a UserWarning.
    self.ctx_emb = nn.LSTM(input_size, hidden_dim, batch_first=True, bidirectional=True)
  def forward(self,x):
    """Return the per-timestep outputs; the final (h, c) state is discarded."""
    op,_h = self.ctx_emb(x)
    return op


class AttnLayer(nn.Module):
  """Bidirectional attention flow between a context and a question.

  With context ``H`` of shape [bs, T, d] and question ``U`` of shape
  [bs, J, d], a similarity score ``w . [h; u; h*u]`` is computed for every
  (context word, question word) pair. The output is
  ``G = [H; U~; H*U~; H*H~]`` of shape [bs, T, 4*d], where ``U~`` is the
  context-to-question attention and ``H~`` the question-to-context attention
  repeated over all context positions.

  Args:
      input_size: size of the concatenated pair feature ``[h; u; h*u]``,
          i.e. three times the per-token feature size.
  """
  def __init__(self,input_size):
    super(AttnLayer, self).__init__()
    self.sim_w = nn.Linear(input_size ,1 , bias=False) #w

  def forward(self, con, ques):
    ctx_len, q_len = con.size(1), ques.size(1)

    # Tile both sides to [bs, T, J, d] so every context/question pair lines up.
    con_tiled = con.unsqueeze(2).expand(-1, -1, q_len, -1)
    ques_tiled = ques.unsqueeze(1).expand(-1, ctx_len, -1, -1)
    pair_feats = torch.cat([con_tiled, ques_tiled, torch.mul(con_tiled, ques_tiled)], 3)
    sim_mat = self.sim_w(pair_feats).view(-1, ctx_len, q_len) #[bs, T, J]

    # Context-to-question: each context word attends over the question words.
    c2q_weights = F.softmax(sim_mat, dim=-1)
    c2q_attn = torch.matmul(c2q_weights, ques)

    # Question-to-context: one distribution over context words, taken from the
    # best-matching question word per context position, then broadcast over T.
    q2c_weights = F.softmax(torch.max(sim_mat, 2)[0], dim=-1)
    q2c_attn = torch.matmul(q2c_weights.unsqueeze(1), con).expand(-1, ctx_len, -1)

    return torch.cat([con, c2q_attn, torch.mul(con, c2q_attn), torch.mul(con , q2c_attn)], 2) #8d


In [None]:
class Bidaf(nn.Module):
  """BiDAF span-prediction model.

  Pipeline: frozen GloVe word embedding + char-CNN embedding -> highway
  network -> contextual BiLSTM -> attention flow -> 2-layer modelling BiLSTM
  -> two linear heads producing start and end scores per context token.

  ``args`` must provide ``emb_size``, ``h_dim``, ``c_vocab_size``,
  ``c_emb_dim``, ``output_shape`` and ``cnn_window_size``.
  NOTE(review): the layer sizes only line up when ``h_dim == emb_size`` and
  ``output_shape == emb_size`` (both hold for the configuration used in this
  notebook) — confirm before changing either.
  """

  def glove_embedding(self):
      """Load the cached vocabulary-aligned GloVe matrix as a frozen embedding."""
      weights_matrix = np.load('/content/drive/MyDrive/BIDAF/bidafglove_fullvocab_100d.npy')
      embedding = nn.Embedding.from_pretrained(torch.FloatTensor(weights_matrix).to(self.device),freeze=True)

      return embedding

  def __init__(self, args):
    super(Bidaf,self).__init__()
    # Uses the module-level `device` chosen at the top of the notebook.
    self.device = device

    #Highway network
    self.highway_net = Highway(args.emb_size*2)
    #Word embedding
    self.word_embedding = self.glove_embedding()
    #CNN char embedding
    self.char_embedding = CharEmbedding(args.c_vocab_size, args.c_emb_dim, args.output_shape , args.cnn_window_size)

    #Contextual embedding
    self.ctx_embedding = ContextEmbedding(args.emb_size * 2, args.h_dim)

    #Attention layer
    self.attention_layer = AttnLayer(args.emb_size*6) #w(s) = 6d

    #Modelling layer
    self.model_layer = nn.LSTM(args.emb_size*8, args.emb_size, num_layers = 2, batch_first = True, dropout = 0.2, bidirectional = True)

    #Output layer
    self.p1_w = nn.Linear(args.emb_size*10, 1, bias = False) #g,m
    # Single-layer LSTM: `dropout=` would have no effect here and only warns.
    self.op_lstm = nn.LSTM(args.emb_size*2, args.h_dim, bidirectional=True, batch_first= True)
    self.p2_w = nn.Linear(args.emb_size*10, 1, bias = False) #g,m (8+2)


  def forward(self, ctx, ques, char_ctx, char_ques):
    """Score every context token as answer start / answer end.

    Args:
        ctx, ques: word ids, [bs, ctx_len] and [bs, q_len].
        char_ctx, char_ques: char ids, [bs, ctx_len, w_len] and
            [bs, q_len, w_len].

    Returns:
        ``(p1, p2)``: unnormalised start and end scores, each [bs, ctx_len].
    """
    #Word embedding
    c_w_emb = self.word_embedding(ctx)
    q_w_emb = self.word_embedding(ques)

    #CNN char embedding
    c_ch_emb = self.char_embedding(char_ctx)
    q_ch_emb = self.char_embedding(char_ques)

    #Highway net
    c_input = torch.cat([c_w_emb, c_ch_emb],dim=2)
    q_input = torch.cat([q_w_emb, q_ch_emb],dim=2)
    hw_c = self.highway_net(c_input)
    hw_q = self.highway_net(q_input)

    #Contextual embedding
    c_contextual_emb = self.ctx_embedding(hw_c)
    q_contextual_emb = self.ctx_embedding(hw_q)

    #Attention layer
    G = self.attention_layer(c_contextual_emb,q_contextual_emb)

    #Modelling layer
    M, _h = self.model_layer(G)

    #Output layer
    # squeeze(-1) removes only the score axis, so a batch of one keeps its
    # batch dimension (a bare squeeze() would return a 1-D tensor).
    p1 = self.p1_w(torch.cat([G,M],dim=2)).squeeze(-1)
    M2, _h = self.op_lstm(M)
    # The end pointer has its own projection; previously p1_w was reused here
    # and p2_w was never trained or used.
    p2 = self.p2_w(torch.cat([G,M2],dim=2)).squeeze(-1)
    return p1, p2


In [None]:
def preprocess_answer(answer):
    """Normalise an answer string for metric comparison.

    Lower-cases and strips the text, deletes every character that is neither
    a word character nor whitespace, tokenises the result, removes English
    stop words and re-joins the remaining tokens with single spaces.
    """
    cleaned = re.sub(r'[^\w\s]', '', answer.lower().strip())
    tokens = word_tokenize(cleaned)
    stop_words = set(stopwords.words('english'))
    return ' '.join(token for token in tokens if token not in stop_words)

def max_pred_val(metric_fn, prediction, ground_truths):
    """Return the best ``metric_fn(prediction, truth)`` over all ground truths.

    The running best starts at 0, so the result is never below 0 and is 0
    when ``ground_truths`` is empty.
    """
    best = 0
    for truth in ground_truths:
        # max() keeps `best` on ties, matching a strict "greater than" update.
        best = max(best, metric_fn(prediction, truth))
    return best


def f1_score(prediction, ground_truth):
    """Token-level F1 between a predicted and a reference answer.

    Both strings are normalised with ``preprocess_answer`` and split on
    whitespace; the overlap is the multiset intersection of the two token
    lists. Returns 0 when there is no overlap.
    """
    pred_tokens = preprocess_answer(prediction).split()
    truth_tokens = preprocess_answer(ground_truth).split()

    overlap = sum((Counter(pred_tokens) & Counter(truth_tokens)).values())
    if overlap == 0:
      return 0

    precision = overlap / len(pred_tokens)
    recall = overlap / len(truth_tokens)
    return (2 * precision * recall) / (precision + recall)


def exact_match_score(prediction, ground_truth):
    """True when both answers are identical after ``preprocess_answer``."""
    normalized_prediction = preprocess_answer(prediction)
    normalized_truth = preprocess_answer(ground_truth)
    return normalized_prediction == normalized_truth

def evaluate(predictions, valid_dataset):
    """Compute F1 and exact-match percentages for a set of predictions.

    Reference answers are grouped by question id, and each question is scored
    once against the best-matching of its references. Scoring every row
    separately would count a question with several reference answers several
    times and would penalise a prediction for not matching all of them.
    Questions without a prediction count as 0 for both metrics.

    Args:
        predictions: dict mapping question id to the predicted answer text.
        valid_dataset: DataFrame with ``id`` and ``answer`` columns.

    Returns:
        ``(f1, exact_match)`` as percentages over the distinct question ids;
        ``(0.0, 0.0)`` when the frame is empty.
    """
    ground_truths_by_id = {}
    for qa_id, answer in zip(valid_dataset['id'], valid_dataset['answer']):
        ground_truths_by_id.setdefault(qa_id, []).append(answer)

    total = len(ground_truths_by_id)
    if total == 0:
        # Nothing to score; avoid dividing by zero.
        return 0.0, 0.0

    f1 = exact_match = 0
    for qa_id, ground_truths in ground_truths_by_id.items():
        if qa_id not in predictions:
            continue

        prediction = predictions[qa_id]

        exact_match += max_pred_val(
            exact_match_score, prediction, ground_truths)

        f1 += max_pred_val(
            f1_score, prediction, ground_truths)

    exact_match = 100.0 * exact_match / total
    f1 = 100.0 * f1 / total

    return f1, exact_match


In [None]:
class ModelArgs:
    """Plain container for the model hyper-parameters.

    Attributes mirror the constructor arguments; ``device`` is ``'cuda'``
    when a GPU is available and ``'cpu'`` otherwise.
    """
    def __init__(self, c_vocab_size, c_emb_dim, output_shape, cnn_window_size, emb_size, h_dim):
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        # Honour the argument instead of silently reading the global
        # `char_vocab`, so the class works with any vocabulary size.
        self.c_vocab_size = c_vocab_size
        self.c_emb_dim = c_emb_dim
        self.output_shape = output_shape
        self.cnn_window_size = cnn_window_size
        self.emb_size = emb_size
        self.h_dim = h_dim

# Hyper-parameters for this run. The convolution window is
# (c_emb_dim, 5): it spans the whole character-embedding axis and 5 characters.
args = ModelArgs(
    c_vocab_size=len(char_vocab),
    c_emb_dim=8,
    output_shape = 100,
    cnn_window_size=(8,5),#window size = 5
    emb_size=100,
    h_dim=100
)

In [None]:

import torch.optim as optim
import os
# The model returns raw scores; LogSoftmax over the token axis (dim=1)
# followed by NLLLoss gives a cross-entropy loss over context positions.
m = nn.LogSoftmax(dim=1)
#criterion =  nn.CrossEntropyLoss()
criterion = nn.NLLLoss()
model = Bidaf(args).to(device)
optimizer = optim.AdamW(model.parameters(),lr=0.001)
#optimizer = optim.Adadelta(model.parameters(),lr=0.05)
num_epochs = 10
#scheduler = optim.lr_scheduler.ExponentialLR(optimizer, gamma = 0.0009)

# Training loop
# Per-epoch average losses (used for plotting) and best-checkpoint tracking.
train_losses = []
val_losses = []
best_val_loss = float('inf')
best_checkpoint_path = None

checkpoint_dir = "/content/drive/MyDrive/BIDAF/checkpoints_test_100/"
os.makedirs(checkpoint_dir, exist_ok=True)

# Both loss logs are opened in "w" mode, so each run overwrites the previous logs.
with open("/content/drive/MyDrive/BIDAF/train_loss_test_100.txt", "w") as train_file, open("/content/drive/MyDrive/BIDAF/val_loss_test_100.txt", "w") as val_file:
    for epoch in range(num_epochs):
        model.train()
        epoch_train_losses = []
        num_batches = len(train_dataset)

        # Each item of train_dataset is already one complete padded batch.
        for i, batch in enumerate(train_dataset):
            optimizer.zero_grad()
            context, question, char_ctx, char_ques, label, ctx, answers, ids = batch

            context, question, char_ctx, char_ques, label = context.to(device), question.to(device),\
                                      char_ctx.to(device), char_ques.to(device), label.to(device)

            # Forward pass
            output_start, output_end = model(context, question, char_ctx, char_ques)

            # Calculate loss
            # label[:, 0] is the start token index, label[:, 1] the end token index.
            loss = criterion(m(output_start), label[:, 0]) + criterion(m(output_end), label[:, 1])
            epoch_train_losses.append(loss.item())

            # Backward pass and optimization
            loss.backward()
            optimizer.step()
            print(f"Epoch [{epoch+1}/{num_epochs}], Batch [{i+1}/{num_batches}], Loss: {loss.item():.4f}")
        train_loss_avg = sum(epoch_train_losses) / len(epoch_train_losses)

        #scheduler.step()





        # Validation pass: no gradient tracking, dropout disabled via eval().
        model.eval()
        num_val_batches = len(valid_dataset)
        with torch.no_grad():
            epoch_val_losses = []
            # Maps question id -> predicted answer text for this epoch.
            predictions = {}
            for val_batch in valid_dataset:

                val_context, val_question, val_char_ctx, val_char_ques, val_label, val_ctx, val_answers, val_ids = val_batch
                val_context, val_question, val_char_ctx, val_char_ques, val_label = val_context.to(device), val_question.to(device), val_char_ctx.to(device), val_char_ques.to(device), val_label.to(device)

                # Forward pass
                val_output_start, val_output_end = model(val_context, val_question, val_char_ctx, val_char_ques)

                # Process predictions
                # Start and end are picked independently; when end < start the
                # slice below is empty and the predicted answer is ''.
                for i in range(len(val_ids)):
                    id = val_ids[i]
                    start_idx = torch.argmax(val_output_start[i])
                    end_idx = torch.argmax(val_output_end[i])
                    # Convert token indices to words
                    answer_tokens = val_context[i][start_idx:end_idx+1]
                    answer = ' '.join([idx2word[idx.item()] for idx in answer_tokens])
                    predictions[id] = answer

                val_loss = criterion(m(val_output_start), val_label[:, 0]) + criterion(m(val_output_end), val_label[:, 1])
                epoch_val_losses.append(val_loss.item())

            # Calculate average validation loss for the epoch
            val_loss_avg = (sum(epoch_val_losses) / len(epoch_val_losses))

            # Keep a separate copy of the weights whenever validation loss improves.
            if val_loss_avg < best_val_loss:
                best_val_loss = val_loss_avg
                best_checkpoint_path = os.path.join(checkpoint_dir, f"best_model_epoch_{epoch + 1}.pt")
                torch.save(model.state_dict(), best_checkpoint_path)
                print(f"Best model checkpoint saved at {best_checkpoint_path}, Epoch: {epoch + 1}, Val Loss: {val_loss_avg:.4f}")


        train_file.write(f"Epoch {epoch + 1}, Train Loss: {train_loss_avg:.4f}\n")
        val_file.write(f"Epoch {epoch + 1}, Val Loss: {val_loss_avg:.4f}\n")

        train_losses.append(train_loss_avg)
        val_losses.append(val_loss_avg)

        # Answer-level metrics on the validation frame for this epoch.
        f1,em = evaluate(predictions, dev_df_mini)
        #val_file.write(f"Epoch {epoch + 1}, EM: {em:.4f}, F1: {f1:.4f}\n")
        print(f"Epoch {epoch + 1}, EM: {em:.4f}, F1: {f1:.4f}")

        # Opened in append mode: metrics from earlier runs stay in this file.
        with open("/content/drive/MyDrive//BIDAF/val_em_f1_test_100.txt", "a") as eval_file:
            eval_file.write(f"Epoch {epoch + 1}, EM: {em:.4f}, F1: {f1:.4f}\n")

        # A checkpoint is also written after every epoch, regardless of the loss.
        checkpoint_path = os.path.join(checkpoint_dir, f"epoch_{epoch + 1}.pt")
        torch.save(model.state_dict(), checkpoint_path)
        print(f"Checkpoint saved at {checkpoint_path}")





# Reload the checkpoint with the lowest validation loss and evaluate THAT
# model on the test split. (Previously the checkpoint was loaded into
# `best_model` but inference was run with `model`, i.e. the last-epoch weights.)
best_model = Bidaf(args).to(device)
best_model.load_state_dict(torch.load(best_checkpoint_path))
best_model.eval()

num_test_batches = len(test_dataset)
# Maps question id -> predicted answer text.
predictions = {}
with torch.no_grad():
    for test_batch in test_dataset:

        test_context, test_question, test_char_ctx, test_char_ques, test_label, test_ctx, test_answers, test_ids = test_batch
        test_context, test_question, test_char_ctx, test_char_ques, test_label = test_context.to(device), test_question.to(device), test_char_ctx.to(device), test_char_ques.to(device), test_label.to(device)

        # Forward pass
        test_output_start, test_output_end = best_model(test_context, test_question, test_char_ctx, test_char_ques)

        # Process predictions
        for i in range(len(test_ids)):
            qa_id = test_ids[i]
            start_idx = torch.argmax(test_output_start[i])
            end_idx = torch.argmax(test_output_end[i])
            # Convert token indices to words; an end before the start yields ''.
            answer_tokens = test_context[i][start_idx:end_idx+1]
            answer = ' '.join([idx2word[idx.item()] for idx in answer_tokens])
            predictions[qa_id] = answer

# One "<id>\t<answer>" line per prediction.
with open("/content/drive/MyDrive/BIDAF/test_predictions.txt", "w") as pred_file:
    for qa_id, answer in predictions.items():
        pred_file.write(f"{qa_id}\t{answer}\n")

f1,em = evaluate(predictions, test_df_mini)
print(f" EM: {em:.4f}, F1: {f1:.4f}")

# Append mode: results from earlier runs stay in this file.
with open("/content/drive/MyDrive//BIDAF/test_em_f1_test_100.txt", "a") as eval_file:
    eval_file.write(f" EM: {em:.4f}, F1: {f1:.4f}\n")


In [None]:
print(predictions)

{'5728ebcb3acd2414000e01da': 'civil disobedients', '572941273f37b319004781ae': '', '57115b8b50c2381900b54a89': 'oliver evans', '571cbe35dd7acb1400e4c13f': 'vegetation canopies', '5728ec6a4b864d19001650aa': 'las vegas', '572885023acd2414000dfa87': 'mongols', '572807802ca10214002d9bf8': 'legrande', '57339ad74776f41900660e88': 'teatr wielki', '572945b11d04691400779230': 'temperatures and sea levels have been rising at or above the maximum rates proposed during the last ipcc report in 2001. the study compared ipcc 2001 projections on temperature and sea level change with observations', '5733f8dc4776f419006615f9': "johnson 's expedition was better organized than shirley 's , which was noticed by new france 's governor , the marquis de vaudreuil", '57340b1bd058e614000b686a': 'provisions was the reservation of lands west of the appalachian mountains to its indian population , a demarcation that was at best a temporary impediment to a rising tide of westward-bound settlers', '57115b2850c238190

In [None]:
dev_df.head(5)

Unnamed: 0,id,context,question,ans_idx,answer,context_ids,question_ids,ans_label_idx
0,56ddde6b9a695914005b9628,the normans (norman: nourmands; french: norman...,in what country is normandy located?,"[159, 165]",france,"[9, 24365, 302, 7277, 363, 60826, 785, 99, 363...","[13, 243, 1272, 92, 5667, 978, 155]","[34, 34]"
1,56ddde6b9a695914005b9628,the normans (norman: nourmands; french: norman...,in what country is normandy located?,"[159, 165]",france,"[9, 24365, 302, 7277, 363, 60826, 785, 99, 363...","[13, 243, 1272, 92, 5667, 978, 155]","[34, 34]"
2,56ddde6b9a695914005b9628,the normans (norman: nourmands; french: norman...,in what country is normandy located?,"[159, 165]",france,"[9, 24365, 302, 7277, 363, 60826, 785, 99, 363...","[13, 243, 1272, 92, 5667, 978, 155]","[34, 34]"
3,56ddde6b9a695914005b9628,the normans (norman: nourmands; french: norman...,in what country is normandy located?,"[159, 165]",france,"[9, 24365, 302, 7277, 363, 60826, 785, 99, 363...","[13, 243, 1272, 92, 5667, 978, 155]","[34, 34]"
4,56ddde6b9a695914005b9629,the normans (norman: nourmands; french: norman...,when were the normans in normandy?,"[94, 117]",10th and 11th centuries,"[9, 24365, 302, 7277, 363, 60826, 785, 99, 363...","[103, 131, 9, 24365, 13, 5667, 155]","[21, 24]"


In [None]:
# Plotting training and validation losses (per-epoch averages)
plt.title('AdamW')
plt.plot(train_losses,label='Train Loss')
plt.plot(val_losses, label='Validation Loss')
plt.xlabel('Epochs')
# Both curves share this axis, so label it generically; the legend tells
# the two series apart.
plt.ylabel('Loss')
plt.legend()
plt.show()