In [None]:
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader
from torch.optim import SGD
import nltk
from typing import *
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from tqdm.notebook import tqdm
# Fetch the NLTK resources needed by the helpers imported above:
# the stop-word corpus, the 'punkt' tokenizer models and the WordNet corpus.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
import string
import numpy as np
import json

# Use the first GPU only when CUDA is really available, otherwise the CPU.
# `torch.cuda.is_available` has to be *called*: the bare function object is
# always truthy, which would select "cuda:0" even on a CPU-only machine.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# NOTE(review): this only binds a Python variable; it does not set the
# CUDA_LAUNCH_BLOCKING environment variable. Kept so the name stays defined.
CUDA_LAUNCH_BLOCKING=1

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [None]:
import pickle
from google.colab import drive
# Mount Google Drive at /content/drive (Colab only).
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import json
# Locations of the training and development splits on the mounted Drive.
dataset_path = "/content/drive/MyDrive/Colab Notebooks/nlp2021-hw1/data/train.jsonl"
dev_dataset_path = "/content/drive/MyDrive/Colab Notebooks/nlp2021-hw1/data/dev.jsonl"



In [None]:
# Load the pickled word-vector table used to build the vocabulary.
# NOTE(review): this path is relative ('drive/My Drive/...') while the dataset
# paths are absolute ('/content/drive/MyDrive/...'); it only resolves when the
# working directory is /content — confirm.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
path_to_vocab = 'drive/My Drive/Colab Notebooks/nlp2021-hw1/model/word_vocab.pickle'
with open(path_to_vocab, 'rb') as handle:
  word_vects = pickle.load(handle)

In [None]:



class Vocabulary(object):
  """Two-way mapping between tokens and integer indices.

  Indices 0..N-1 follow the iteration order of `word_vects`; the optional
  special tokens (unknown, separator, padding) are appended after them, in
  that order.
  """

  def __init__(self, word_vects, add_unk=True, unk_token="<UNK>", sep_token = '<SEP>', add_sep=True, pad_token = '<PAD>', add_pad=True):
    """
    args:
    word_vects: iterable of tokens (iterating a dict yields its keys); kept as `self.word_vects`
    add_unk (bool): add `unk_token` and make `find_index` fall back to it
    unk_token (str): token used for out-of-vocabulary words
    sep_token (str): separator token
    add_sep (bool): add `sep_token` to the vocabulary
    pad_token (str): padding token
    add_pad (bool): add `pad_token` to the vocabulary
    """
    tokens = list(word_vects)
    self.token_to_idx = {token: index for index, token in enumerate(tokens)}
    self.idx_to_token = dict(enumerate(tokens))
    self.word_vects = word_vects
    self.add_unk = add_unk
    self.add_sep = add_sep
    self.unk_token = unk_token
    self.sep_token = sep_token
    self.add_pad = add_pad
    self.pad_token = pad_token
    # The *_index attributes only exist when the matching flag is set.
    if add_unk:
      self.unk_index = self.add_token(unk_token)
    if add_sep:
      self.sep_index = self.add_token(sep_token)
    if add_pad:
      self.pad_index = self.add_token(pad_token)

  def add_token(self, token):
    "update mapping dicts with the input token and return its index"
    if token in self.token_to_idx:
      return self.token_to_idx[token]
    index = len(self.token_to_idx)
    self.token_to_idx[token] = index
    self.idx_to_token[index] = token
    return index

  def find_index(self, token):
    "find the index associated to the input token (the UNK index if unknown and add_unk is set, else KeyError)"
    if self.add_unk:
      return self.token_to_idx.get(token, self.unk_index)
    return self.token_to_idx[token]

  def find_token(self, index):
    "find the token associated to the input index; raises KeyError if the index is unknown"
    if index not in self.idx_to_token:
      # The offending index is now interpolated into the message.
      raise KeyError(f"the index ({index}) is not in the Vocabulary")
    return self.idx_to_token[index]

  def __len__(self):
    return len(self.token_to_idx)

In [None]:
# Build the vocabulary from the loaded word vectors (special tokens are added by default).
vocab = Vocabulary(word_vects)

In [None]:
len(vocab)

400003

In [None]:
class Preprocesser():
  """Converts raw sentence-pair records into index sequences over a vocabulary.

  Also owns the embedding matrix (one row per vocabulary index) built from
  the vocabulary's word vectors.
  """

  def __init__(self, vocab, device=None):
    """
    args:
    vocab: object exposing word_vects, idx_to_token, find_index, sep_index and __len__
    device (torch.device, optional): where the embedding matrix is stored;
      defaults to "cuda:0" when CUDA is available, otherwise the CPU
    """
    if device is None:
      device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    self.device = device
    self.vocab = vocab
    self.cos = nn.CosineSimilarity(dim=1, eps=1e-6).to(device)
    # Tensor.to() returns a new tensor, so the result must be kept.
    self.matrix = self.prepare_matrix(vocab).to(device)
    # Built lazily on first use of remove_stopwords and then reused.
    self._lemmatizer = None
    self._stop = None

  def remove_stopwords(self, sentence):
    "tokenize the sentence, drop stop words and punctuation, return lower-cased lemmas"
    if getattr(self, "_stop", None) is None:
      self._lemmatizer = WordNetLemmatizer()
      self._stop = set(stopwords.words('english') + list(string.punctuation))
    # Compare lower-cased tokens: the stop list is lower-case, so a
    # case-sensitive check let capitalised stop words ("In", "A") through.
    return [self._lemmatizer.lemmatize(word).lower()
            for word in word_tokenize(sentence)
            if word.lower() not in self._stop]

  def word_to_vec(self, word):
    "return the embedding row of the word, or a zero vector if the lookup raises KeyError"
    try:
      return self.matrix[self.vocab.find_index(word)]
    except KeyError:
      # Same width, dtype and device as the matrix rows.
      return torch.zeros(self.matrix.size(1), dtype=self.matrix.dtype, device=self.matrix.device)

  def preprocess(self, data):
    """
    Enrich every entry of `data` in place and return `data`.

    Keys added: "target" (sentence1[start1:end1]), "words_1"/"words_2"
    (filtered lemmas), "idx_1"/"idx_2" (their vocabulary indices) and "X"
    (idx_1 + [SEP] + idx_2 + [SEP]).
    """
    sep = self.vocab.sep_index
    for entry in data:
      start = int(entry["start1"])
      end = int(entry["end1"])
      entry["target"] = entry["sentence1"][start:end]
      entry["words_1"] = self.remove_stopwords(entry["sentence1"])
      entry["words_2"] = self.remove_stopwords(entry["sentence2"])
      entry["idx_1"] = [self.vocab.find_index(word) for word in entry["words_1"]]
      entry["idx_2"] = [self.vocab.find_index(word) for word in entry["words_2"]]
      # Build a fresh list: extending idx_1 itself would silently append
      # the separator and the second sentence to entry["idx_1"] as well.
      entry["X"] = entry["idx_1"] + [sep] + entry["idx_2"] + [sep]
    return data

  def find_similars(self, word, k=5):
    "return the indices of the k matrix rows with the highest cosine similarity to the vector `word`"
    # Match the matrix dtype/device, then view the vector as one row per matrix row.
    query = word.to(self.matrix).reshape(1, -1).expand(self.matrix.size(0), -1)
    sims = self.cos(query, self.matrix)
    return torch.topk(sims, min(k, sims.numel())).indices.tolist()

  def prepare_matrix(self, vocab):
    """
    Build a (len(vocab), embedding_dim) tensor whose row i is the vector of token i.

    Special tokens and tokens whose vector cannot be stored keep an all-zero
    row; each such token is printed.
    """
    num_tokens = len(vocab)
    first_key = next(iter(vocab.word_vects), None)
    if first_key is None:
      raise ValueError("word_vects is empty: cannot infer the embedding size")
    # Infer the width from any stored vector instead of a hard-coded word.
    embedding_dim = len(vocab.word_vects[first_key])

    embedding_matrix = np.zeros((num_tokens, embedding_dim))
    special_tokens = ("<UNK>", "<SEP>", "<PAD>")
    for idx, token in vocab.idx_to_token.items():
      word = str(token)
      if word in special_tokens:
        print(word)
        continue
      try:
        embedding_matrix[idx] = vocab.word_vects[word]
      except (KeyError, ValueError, TypeError):
        # Missing or malformed vector: leave the zero row and report the token.
        print(word)
    return torch.tensor(embedding_matrix)

  







  

In [None]:
# Build the preprocesser (this also builds the embedding matrix from the vocabulary).
preprocesser = Preprocesser(vocab)


<UNK>
<SEP>
<PAD>


In [None]:
class SentenceDataset(Dataset):
  """Dataset of (index tensor, 0/1 label) pairs read from a JSON-lines file.

  Usage: create it (optionally with the file path), then call `_init_data`
  with a preprocesser to load the file and build `self.samples`.
  """

  # Padding id used until a vocabulary-provided pad index is known.
  DEFAULT_PAD_INDEX = 400002

  def __init__(self, dataset_path=None):
    """
    args:
    dataset_path (str, optional): path of the .jsonl file; may also be set
      later through the `path` attribute
    """
    super().__init__()
    self.path = dataset_path
    self.data = []
    self.samples = []
    self.pad_index = self.DEFAULT_PAD_INDEX

  def _init_data(self, preprocesser):
    "read `self.path`, run `preprocesser.preprocess` on the records and build the samples"
    with open(self.path, encoding="utf-8") as f:
      # One JSON object per line; blank lines are skipped.
      self.data = [json.loads(line) for line in f if line.strip()]
    self.data = preprocesser.preprocess(self.data)
    # Pad with the vocabulary's own pad index when the preprocesser exposes one.
    vocab = getattr(preprocesser, "vocab", None)
    self.pad_index = getattr(vocab, "pad_index", self.DEFAULT_PAD_INDEX)
    self.get_samples()

  def convert(self, sample):
    "map the string label to an integer: 1 for \"True\", 0 otherwise"
    return 1 if sample["label"] == "True" else 0

  def __len__(self):
    return len(self.samples)

  def __getitem__(self, idx):
    return self.samples[idx]

  def get_samples(self):
    "build `self.samples` as (LongTensor of entry['X'], int label) tuples"
    # An explicit dtype keeps empty sequences integer-typed too.
    self.samples = [(torch.tensor(entry["X"], dtype=torch.long), self.convert(entry)) for entry in self.data]

  def collate_fn(self, samples):
    """
    Batch a list of samples.

    returns: (X, lengths, y) on the notebook-level `device`, where X is the
    batch-first padded index matrix, lengths the unpadded sequence lengths
    and y the labels.
    """
    X = [sample[0] for sample in samples]
    lengths = torch.tensor([x.size(0) for x in X], dtype=torch.long).to(device)
    X = torch.nn.utils.rnn.pad_sequence(X, batch_first=True, padding_value=self.pad_index).to(device)
    y = torch.tensor([sample[1] for sample in samples]).to(device)
    return X, lengths, y


In [None]:

def _build_split(jsonl_path):
  "create a SentenceDataset for one split and load it through the shared preprocesser"
  split = SentenceDataset()
  split.path = jsonl_path
  split._init_data(preprocesser)
  return split


# Training split first, then the development split.
dataset_path = "/content/drive/MyDrive/Colab Notebooks/nlp2021-hw1/data/train.jsonl"
dev_dataset_path = "/content/drive/MyDrive/Colab Notebooks/nlp2021-hw1/data/dev.jsonl"
dataset = _build_split(dataset_path)
dev_dataset = _build_split(dev_dataset_path)

In [None]:
# Inspect the first preprocessed training record.
dataset.data[0]

{'X': [6,
  4710,
  6409,
  4984,
  6689,
  802,
  638,
  282,
  546,
  2951,
  380,
  400001,
  7,
  2148,
  282,
  1698,
  52,
  4424,
  12818,
  87,
  122,
  400001],
 'end1': '73',
 'end2': '14',
 'id': 'train.0',
 'idx_1': [6,
  4710,
  6409,
  4984,
  6689,
  802,
  638,
  282,
  546,
  2951,
  380,
  400001,
  7,
  2148,
  282,
  1698,
  52,
  4424,
  12818,
  87,
  122,
  400001],
 'idx_2': [7, 2148, 282, 1698, 52, 4424, 12818, 87, 122],
 'label': 'False',
 'lemma': 'play',
 'pos': 'NOUN',
 'sentence1': 'In that context of coordination and integration, Bolivia holds a key play in any process of infrastructure development.',
 'sentence2': 'A musical play on the same subject was also staged in Kathmandu for three days.',
 'start1': '69',
 'start2': '10',
 'target': 'play',
 'words_1': ['in',
  'context',
  'coordination',
  'integration',
  'bolivia',
  'hold',
  'key',
  'play',
  'process',
  'infrastructure',
  'development'],
 'words_2': ['a',
  'musical',
  'play',
  'subjec

In [None]:
batch_size = 32
# Reshuffle the training examples at every epoch so batches are not always
# drawn in file order; the development loader keeps a fixed order.
train_dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, collate_fn=dataset.collate_fn)
dev_dataloader = DataLoader(dev_dataset, batch_size=batch_size, shuffle=False, collate_fn=dev_dataset.collate_fn)

In [None]:
class Lstm_Classifier(nn.Module):
  """Binary classifier: pretrained embeddings -> 2-layer LSTM -> 2 linear layers -> sigmoid.

  The LSTM output at the last non-padded position of every sequence is used
  as the summary vector of that sequence.
  """

  def __init__(self, matrix, hidden, device):
    """
    args:
    matrix (Tensor): 2-D pretrained embedding table, one row per vocabulary index
    hidden (int): LSTM hidden size and width of the first linear layer
    device (torch.device): device the embedding layer is moved to
    """
    super().__init__()
    self.embedding = torch.nn.Embedding.from_pretrained(matrix).float().to(device)
    # batch_first=True: forward() feeds (batch, seq, emb) tensors and indexes
    # the output as (batch, seq, hidden). Without it the LSTM would treat the
    # batch axis as the time axis.
    self.rnn = torch.nn.LSTM(input_size=matrix.size(1), hidden_size=hidden, dropout=0.15, num_layers=2, bidirectional=False, batch_first=True)
    self.lin1 = torch.nn.Linear(hidden, hidden)
    self.lin2 = torch.nn.Linear(hidden, 1)
    self.loss = torch.nn.BCELoss()
    self.optimizer = torch.optim.Adam(self.parameters(), lr=0.0005)
    self.device = device

  def forward(self, X, xlength, y=None):
    """
    args:
    X (LongTensor): (batch, seq) padded token indices
    xlength (LongTensor): (batch,) unpadded length of every sequence
    y (FloatTensor, optional): (batch,) gold labels in {0., 1.}

    returns: dict with 'logits' and 'pred' (both of shape (batch,)), plus
    'loss' when `y` is given.
    """
    # Follow the weights rather than assuming CUDA.
    weight_device = self.embedding.weight.device
    X = X.to(weight_device)
    xlength = xlength.to(weight_device)

    embedding_out = self.embedding(X)
    recurrent_out = self.rnn(embedding_out)[0]  # (batch, seq, hidden)

    # Pick, for every sequence, the output at its last real (non-padded) token.
    rows = torch.arange(recurrent_out.size(0), device=recurrent_out.device)
    summary_vectors = recurrent_out[rows, xlength - 1]

    out = torch.relu(self.lin1(summary_vectors))
    logits = self.lin2(out).squeeze(1)
    pred = torch.sigmoid(logits)
    result = {'logits': logits, 'pred': pred}

    if y is not None:
      result['loss'] = self.loss(pred, y.to(pred.device))
    # Always return the dict, including at inference time (y is None).
    return result

    



In [None]:
def training_loop(model: nn.Module, optimizer: torch.optim.Optimizer,  adapt: Callable[[torch.Tensor, torch.Tensor], Tuple[torch.Tensor, torch.Tensor]], epochs: int = 2, dataloader: Optional[Iterable] = None):
    """Train `model` for `epochs` passes over the training batches.

    args:
    model: module called as model(x, xlength, y) and returning a dict with a 'loss' entry
    optimizer: optimizer over the model parameters
    adapt: function applied to (x, y) before the forward pass (e.g. to cast the labels)
    epochs: number of passes over the data
    dataloader: iterable of (x, xlength, y) batches; defaults to the
      notebook-level `train_dataloader`
    """
    if dataloader is None:
        dataloader = train_dataloader

    # Batches are moved to wherever the model's parameters live.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else None

    # Make sure dropout is active even if the model was left in eval mode.
    model.train()

    for epoch in range(epochs):
        total = len(dataloader) if hasattr(dataloader, "__len__") else None
        progress_bar = tqdm(total=total)
        try:
            # batches of the training set
            for x, xlength, y in dataloader:
                if target_device is not None:
                    # Tensor.to() is not in-place: keep the returned tensors.
                    x = x.to(target_device)
                    y = y.to(target_device)
                    xlength = xlength.to(target_device)
                x, y = adapt(x, y)
                optimizer.zero_grad()
                batch_out = model(x, xlength, y)
                loss = batch_out['loss']

                # computes the gradient of the loss
                loss.backward()
                # updates parameters based on the gradient information
                optimizer.step()

                progress_bar.update()
                progress_bar.set_postfix(epoch=epoch, loss=loss.item())
        finally:
            # Close the bar even when a batch raises.
            progress_bar.close()

In [None]:
# Reuse the preprocesser's embedding matrix as the pretrained embedding table.
embeddings = preprocesser.matrix
# Hidden size passed to the classifier (LSTM hidden size and first linear layer width).
hidden = 30

model = Lstm_Classifier(embeddings, hidden, device).to(device)



In [None]:
# Train with the default number of epochs; `adapt` casts the labels to float
# because the model's loss is BCELoss.
training_loop(model, model.optimizer, adapt=lambda x, y : (x, y.float()))

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [None]:
n = 0  # number of correct classifications
d = 0  # number of predictions
adapt=lambda x, y : (x, y.float())

# Evaluate with dropout disabled, then restore whatever mode the model was in.
was_training = model.training
model.eval()
try:
    with torch.no_grad():
        # for each batch in the dev set
        for x, xlength, y in dev_dataloader:
            x, y = adapt(x, y)
            # classify the batch
            batch_out = model(x, xlength, y)
            pred = torch.round(batch_out['pred'])

            # number of predictions (corresponding to number of batch items to predict)
            d += pred.shape[0]
            # number of correct classifications within the batch (kept as a Python int)
            n += (y == pred).sum().item()
finally:
    model.train(was_training)

# An empty loader yields nan instead of a ZeroDivisionError.
accuracy = n / d if d else float("nan")
print(f'# accuracy: {accuracy:.2f}')

# accuracy: 0.51
