In [1]:
# !git clone https://github.com/Pennywise881/question_answering_assignment_celloscope.git

In [2]:
# !wget -nc -q {"https://nlp.stanford.edu/data/glove.6B.zip"}

In [3]:
# %pwd
# %cd /content/question_answering_assignment_celloscope
# %mkdir output

In [4]:
# !unzip '/home/nafi/Documents/Work/question_answering_assignment_celloscope/glove.6B.zip' -d '/home/nafi/Documents/Work/question_answering_assignment_celloscope/glove_6B' 

In [5]:
# !pip install -r requirements_new.txt
# !python -m spacy download en

In [6]:
# external libraries
import numpy as np
import pickle
import os
import json
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
# from tensorboardX import SummaryWriter
from tqdm import tqdm

# internal utilities
import config
from model import BiDAF
from data_loader import SquadDataset
from utils import save_checkpoint, compute_batch_metrics

2022-07-04 05:21:44.930174: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-07-04 05:21:44.930194: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [7]:
# print(config.train_dir)

In [8]:
# Preprocessing values used for training (recorded alongside the experiment).
prepro_params = {
    "max_words": config.max_words,
    "word_embedding_size": config.word_embedding_size,
    "char_embedding_size": config.char_embedding_size,
    "max_len_context": config.max_len_context,
    "max_len_question": config.max_len_question,
    "max_len_word": config.max_len_word
}

# Hyper-parameters setup.
hyper_params = {
    "num_epochs": config.num_epochs,
    "batch_size": config.batch_size,
    "learning_rate": config.learning_rate,
    "hidden_size": config.hidden_size,
    "char_channel_width": config.char_channel_width,
    "char_channel_size": config.char_channel_size,
    "drop_prob": config.drop_prob,
    "cuda": config.cuda,
    "pretrained": config.pretrained
}

experiment_params = {"preprocessing": prepro_params, "model": hyper_params}

# Train on GPU only when config.cuda is set AND a CUDA device is actually usable;
# otherwise fall back to CPU instead of failing later when tensors are moved.
if hyper_params["cuda"] and not torch.cuda.is_available():
    print("config.cuda is set but no CUDA device is available; falling back to CPU.")
device = torch.device("cuda" if hyper_params["cuda"] and torch.cuda.is_available() else "cpu")
torch.manual_seed(42)

# Define a path to save experiment logs. makedirs also creates the parent
# "output" directory when it is missing and is a no-op if the path already exists.
experiment_path = "output/{}".format(config.exp)
os.makedirs(experiment_path, exist_ok=True)

# Save the preprocessing and model parameters used for this training experiment.
with open(os.path.join(experiment_path, "config_{}.json".format(config.exp)), "w") as f:
    json.dump(experiment_params, f)

# start TensorboardX writer
# writer = SummaryWriter(experiment_path)

# NOTE(review): allow_pickle=True and pickle.load below execute arbitrary code
# embedded in the files. Only load artifacts produced by this project's own
# preprocessing step, never files from an untrusted source.

# Open the feature files and store each array in its own variable (train + dev).
train_features = np.load(os.path.join(config.train_dir, "train_features.npz"), allow_pickle=True)
t_w_context = train_features["context_idxs"]
t_c_context = train_features["context_char_idxs"]
t_w_question = train_features["question_idxs"]
t_c_question = train_features["question_char_idxs"]
t_labels = train_features["label"]

dev_features = np.load(os.path.join(config.dev_dir, "dev_features.npz"), allow_pickle=True)
d_w_context = dev_features["context_idxs"]
d_c_context = dev_features["context_char_idxs"]
d_w_question = dev_features["question_idxs"]
d_c_question = dev_features["question_char_idxs"]
d_labels = dev_features["label"]

# Load the embedding matrices created for our word and character vocabularies.
with open(os.path.join(config.train_dir, "word_embeddings.pkl"), "rb") as f:
    word_embedding_matrix = pickle.load(f)
with open(os.path.join(config.train_dir, "char_embeddings.pkl"), "rb") as f:
    char_embedding_matrix = pickle.load(f)

# Load the mapping between words and idxs, and build the inverse mapping.
with open(os.path.join(config.train_dir, "word2idx.pkl"), "rb") as f:
    word2idx = pickle.load(f)

idx2word = {idx: word for word, idx in word2idx.items()}

# Transform the embedding matrices into float32 tensors.
word_embedding_matrix = torch.from_numpy(np.array(word_embedding_matrix)).type(torch.float32)
char_embedding_matrix = torch.from_numpy(np.array(char_embedding_matrix)).type(torch.float32)

# load datasets
# train_dataset = SquadDataset(t_w_context, t_c_context, t_w_question, t_c_question, t_labels)
# valid_dataset = SquadDataset(d_w_context, d_c_context, d_w_question, d_c_question, d_labels)

Some examples have empty labels, which causes an error in the DataLoader. The next cell therefore drops those rows, keeping only the examples whose label contains both a start position and an end position.

In [9]:
def get_encodings(word_context, char_context, word_question, char_question, labels):
    """Drop every example whose label is empty and bundle the remaining arrays.

    Args:
        word_context: per-example word-level context array.
        char_context: per-example character-level context array.
        word_question: per-example word-level question array.
        char_question: per-example character-level question array.
        labels: per-example label sequences; an example is discarded when its
            label sequence has no entries.

    Returns:
        A dict with the keys 'word_context', 'char_context', 'word_question',
        'char_question' and 'labels'. Each value is a copy of the matching
        input with the rows of the empty-label examples removed along axis 0.
    """
    # Row positions of the examples that carry no label entries.
    unlabeled_rows = [row for row, label in enumerate(labels) if len(list(label)) == 0]

    named_arrays = (
        ('word_context', word_context),
        ('char_context', char_context),
        ('word_question', word_question),
        ('char_question', char_question),
        ('labels', labels),
    )
    # The same rows are removed from every array so the examples stay aligned.
    return {name: np.delete(array, unlabeled_rows, 0) for name, array in named_arrays}

# Filter the train and dev splits the same way: examples with empty labels are removed.
train_encodings = get_encodings(
    word_context=t_w_context,
    char_context=t_c_context,
    word_question=t_w_question,
    char_question=t_c_question,
    labels=t_labels,
)
val_encodings = get_encodings(
    word_context=d_w_context,
    char_context=d_c_context,
    word_question=d_w_question,
    char_question=d_c_question,
    labels=d_labels,
)

In [10]:
# print(type(t_w_context[0]))
# print(type(train_encodings['word_context'][0]))
# # print(t_w_context[0])
# # print(train_encodings['word_context'][0])
# # print(type(train_encodings['char_context']))
# # print(type(train_encodings['word_question']))
# # print(type(train_encodings['char_question']))
# # print(type(train_encodings['labels']))
# # print(type(t_c_context[0]))
# print(train_encodings['word_context'].shape)
# print(train_encodings['char_context'].shape)
# print(train_encodings['word_question'].shape)
# print(train_encodings['char_question'].shape)
# print(train_encodings['labels'].shape)

In [11]:
# print(len(train_encodings['word_context']))
# print(len(train_encodings['questions']))
# print(len(train_encodings['labels']))

# class SquadDataset(torch.utils.data.Dataset):
#     def __init__(self, encodings):
#         self.encodings = encodings

#     def __getitem__(self, idx):
#         return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

#     def __len__(self):
#         return len(self.encodings['word_context'])

# Wrap the filtered train and dev encodings in datasets (train first, then dev).
train_dataset, valid_dataset = map(SquadDataset, (train_encodings, val_encodings))
# train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
# for i, batch in enumerate(train_loader):
#     pass

In [12]:
# import torch.utils.data as data


# class SquadDataset(data.Dataset):
#     """Custom Dataset for SQuAD data compatible with torch.utils.data.DataLoader."""

#     def __init__(self, w_context, c_context, w_question, c_question, labels):
#         """Set the path for context, question and labels."""
#         self.w_context = w_context
#         self.c_context = c_context
#         self.w_question = w_question
#         self.c_question = c_question
#         self.labels = labels

#     def __getitem__(self, index):
#         """Returns one data tuple of the form ( word context, character context, word question,
#          character question, answer)."""
#         # return self.w_context[index], self.c_context[index], self.w_question[index], self.c_question[index],\
#         #        self.labels[index]
#         return self.w_context[index], self.w_question[index], self.labels[index]

#     def __len__(self):
#         return len(self.w_context)

# train_dataset = SquadDataset(t_w_context, t_c_context, t_w_question, t_c_question, t_labels)
# valid_dataset = SquadDataset(d_w_context, d_c_context, d_w_question, d_c_question, d_labels)

# print(train_encodings.keys())
# print(len(train_encodings['context']))
# print(len(train_encodings['questions'][0]))
# print(len(train_encodings['labels']))
# print("Word context shape:", t_w_context[0].shape)
# print("Char context shape:", t_c_context.shape)
# print("Word question shape:", t_w_question.shape)
# print("Char question shape:", t_c_question.shape)
# print("Labels shape:", t_labels.shape)

# train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
# foo = next(iter(train_loader))['context']
# print(foo.shape)
# for i, batch in enumerate(train_loader):
#     print(batch['context'].shape)
#     print(batch['question'].shape)
    # print(batch['question'].shape)
    # print(i)


In [13]:
# Build the data generators. Training batches are shuffled; validation batches
# are not, since shuffling brings no benefit for evaluation and only makes
# runs harder to compare.
train_dataloader = DataLoader(train_dataset,
                              shuffle=True,
                              batch_size=hyper_params["batch_size"],
                              num_workers=4)

valid_dataloader = DataLoader(valid_dataset,
                              shuffle=False,
                              batch_size=hyper_params["batch_size"],
                              num_workers=4)

print("Length of training data loader is:", len(train_dataloader))
print("Length of valid data loader is:", len(valid_dataloader))

# Create the model from the pre-computed word and character embedding matrices.
model = BiDAF(word_vectors=word_embedding_matrix,
              char_vectors=char_embedding_matrix,
              hidden_size=hyper_params["hidden_size"],
              drop_prob=hyper_params["drop_prob"])

if hyper_params["pretrained"]:
    # Resume from the best saved model. The checkpoint file is read once, and
    # map_location lets a checkpoint saved on GPU be restored on a CPU-only machine.
    best_checkpoint = torch.load(os.path.join(experiment_path, "model.pkl"), map_location=device)
    model.load_state_dict(best_checkpoint["state_dict"])
    best_valid_loss = best_checkpoint["best_valid_loss"]
    del best_checkpoint  # do not keep a second copy of the weights alive

    # The epoch counter is stored in the separate "last checkpoint" file.
    epoch_checkpoint = torch.load(os.path.join(experiment_path, "model_last_checkpoint.pkl"),
                                  map_location=device)["epoch"]
    print("Best validation loss obtained after {} epochs is: {}".format(epoch_checkpoint, best_valid_loss))
else:
    # No previous run: any finite validation loss counts as an improvement.
    best_valid_loss = float("inf")
    epoch_checkpoint = 0

model.to(device)

# Define loss and optimizer.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adadelta(model.parameters(), hyper_params["learning_rate"], weight_decay=1e-4)

Length of training data loader is: 1353
Length of valid data loader is: 92


In [14]:
# next(iter(train_dataset[0]))

In [15]:
# train_dataloader = DataLoader(train_dataset,
#                               # shuffle=True,
#                               batch_size=2)
#                               # num_workers=4)

# print("Length of training data loader is:", len(train_dataloader))

# print(train_dataset.__getitem__(5)[0].shape)
# print(train_dataset.__getitem__(5)[1].shape)
# print(train_dataset.__getitem__(5)[2].shape)
# print(train_dataset.__getitem__(5)[3].shape)
# print(train_dataset.__getitem__(5)[4].shape)
# print(train_dataset.__getitem__(5)[4])

# # print(train_dataloader.__len__())
# Sanity check: push one training batch through the model without tracking
# gradients and print the combined loss of the two predictions.
with torch.no_grad():
    print(device)
    for i, batch in enumerate(train_dataloader):
        # Index tensors are cast to int64 and moved to the selected device.
        w_context = batch['word_context'].long().to(device)
        c_context = batch['char_context'].long().to(device)
        w_question = batch['word_question'].long().to(device)
        c_question = batch['char_question'].long().to(device)
        # Column 0 of the labels is the target for pred1, column 1 for pred2.
        label1 = batch['labels'][:, 0].long().to(device)
        label2 = batch['labels'][:, 1].long().to(device)

        pred1, pred2 = model(w_context, c_context, w_question, c_question)
        loss = criterion(pred1, label1) + criterion(pred2, label2)
        print(loss)
        # Only the first batch is needed for this check.
        break

cuda
tensor(9.8299, device='cuda:0')
