In [2]:
from os import path

In [3]:
%load_ext autoreload
%autoreload 2

import os

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils import clip_grad_norm_
from torch.utils.data import DataLoader

from constants import DatasetPaths

from DataLoader import SquadDataset, collate_fn, GloVeEmbeddings
from models import EncoderBILSTM, DecoderLSTM
from train import train, greedy_search

In [4]:
!wget 'http://nlp.stanford.edu/data/glove.840B.300d.zip'
!unzip 'glove.840B.300d.zip'
!rm glove.840B.300d.zip
!mv glove.840B.300d.txt data/

'wget' is not recognized as an internal or external command,
operable program or batch file.
'unzip' is not recognized as an internal or external command,
operable program or batch file.
'rm' is not recognized as an internal or external command,
operable program or batch file.
'mv' is not recognized as an internal or external command,
operable program or batch file.


In [5]:
!mkdir dataset
!curl -o 'dataset/squad-train-v1.1.json' 'https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json'
!curl -o 'dataset/squad-dev-v1.1.json' 'https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json'

A subdirectory or file dataset already exists.
curl: (3) URL using bad/illegal format or missing URL
curl: (3) URL using bad/illegal format or missing URL


In [6]:
from DataProcessor import SquadPreProcessor, GlovePreproccesor

train_ds = SquadPreProcessor(path=DatasetPaths["squad"]["train"], split="train", q_vocab_size=45000, a_vocab_size=28000)
paragraphs, question_answer_pairs = train_ds.preprocess()
train_ds.persist(paragraphs, question_answer_pairs)

dev_ds = SquadPreProcessor(path=DatasetPaths["squad"]["dev"], split="dev", q_vocab_size=45000, a_vocab_size=28000)
paragraphs, question_answer_pairs = dev_ds.preprocess()
dev_ds.persist(paragraphs, question_answer_pairs)

GlovePreproccesor().obtain_glove_embeddings(glove_filename=DatasetPaths["glove"]["original-embeddings"],
                                            word_to_ix=train_ds.a_word_to_idx,
                                            pruned_glove_filename=DatasetPaths["glove"]["answer-embeddings"])

GlovePreproccesor().obtain_glove_embeddings(glove_filename=DatasetPaths["glove"]["original-embeddings"],
                                            word_to_ix=train_ds.q_word_to_idx,
                                            pruned_glove_filename=DatasetPaths["glove"]["question-embeddings"])


AssertionError: Dataset file [dataset/squad-train-v1.1.json] doesn't exist

In [None]:

import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np

def plot_losses(losses):
  plt.plot(losses)

  plt.xlabel('Epoch')
  plt.ylabel('Loss')
  plt.title('Loss vs Epoch')
  plt.grid(True)

  plt.show()


In [None]:
use_cuda = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

train_dataset = SquadDataset(split="train")
word_to_idx_sent = train_dataset.get_answer_word_to_idx()
word_to_idx_q = train_dataset.get_question_idx_to_word()

train_vocab_size_sent = len(word_to_idx_sent)
train_vocab_size_q = len(word_to_idx_q)
num_epoch = 15
batch_size = 64
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=0, collate_fn=collate_fn, pin_memory=True)

word_embeddings_glove_q = GloVeEmbeddings.load_glove_embeddings(True)
word_embeddings_glove_sent = GloVeEmbeddings.load_glove_embeddings(False)

encoder = EncoderBILSTM(vocab_size=train_vocab_size_sent, n_layers=2, embedding_dim=300, hidden_dim=500, dropout=0, embeddings=word_embeddings_glove_sent)
decoder = DecoderLSTM(vocab_size=train_vocab_size_q, embedding_dim=300, hidden_dim=500, n_layers=1, encoder_hidden_dim=500, embeddings=word_embeddings_glove_q)

if use_cuda:
    encoder = encoder.cuda()
    decoder = decoder.cuda()

n_train = len(train_loader)
batch_per_epoch = n_train // batch_size

criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer_enc = torch.optim.RMSprop(encoder.parameters(), lr=1, momentum=0.7)
optimizer_dec = torch.optim.RMSprop(decoder.parameters(), lr=1, momentum=0.7)

if not os.path.isdir("model_weights"):
    os.makedirs("model_weights", exist_ok=True)

losses= train(encoder=encoder, decoder=decoder, epoch_count=num_epoch, batch_per_epoch=batch_per_epoch, idx_to_word_q=None,
                   train_loader=train_loader, criterion=criterion, optimizer_enc=optimizer_enc, optimizer_dec=optimizer_dec,
                   is_cuda=use_cuda, debug=False)



In [None]:
plot_losses(losses)

In [None]:
def predict():
  predict_batch_size = 10

  dev_dataset = SquadDataset(split="dev")

  dev_loader = DataLoader(
      dev_dataset, batch_size=predict_batch_size, shuffle=True, num_workers=0, collate_fn=collate_fn, pin_memory=True)
  dev_idx_to_word_q = dev_dataset.get_question_idx_to_word()
  dev_idx_to_word_sent = dev_dataset.get_answer_idx_to_word()

  encoder = EncoderBILSTM(vocab_size=train_vocab_size_sent, n_layers=2, embedding_dim=300, hidden_dim=500, dropout=0, embeddings=word_embeddings_glove_sent)
  decoder = DecoderLSTM(vocab_size=train_vocab_size_q, embedding_dim=300, hidden_dim=500, n_layers=1, encoder_hidden_dim=500, embeddings=word_embeddings_glove_q)
  if use_cuda:
    encoder.cuda()
    decoder.cuda()
  encoder.load_state_dict(torch.load("model_weights/1-encoder.pth"))
  decoder.load_state_dict(torch.load("model_weights/1-decoder.pth"))

  idx_to_word_sent = train_dataset.get_answer_idx_to_word()
  idx_to_word_q = train_dataset.get_question_idx_to_word()

  greedy_search(encoder, decoder, train_loader, True, idx_to_word_q, idx_to_word_sent, batch_size=predict_batch_size)
  greedy_search(encoder, decoder, dev_loader, True, dev_idx_to_word_q, dev_idx_to_word_sent, batch_size=predict_batch_size)

predict()
