In [0]:
# Vincent notebook
# Mount drive
import os
from google.colab import drive
%cd /content
drive.mount('/content/gdrive', force_remount=True)
%cd '/content/gdrive/My Drive/NLP_Project/'
%ls -l
print(os.listdir())

In [3]:
import pandas as pd
import numpy as np
import random
import re
import time
from string import punctuation
import scipy.io
import matplotlib.pyplot as plt

import nltk
import scipy.sparse as sp
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.stem import SnowballStemmer
nltk.download('punkt')
from nltk.stem import WordNetLemmatizer 
nltk.download('wordnet')

# from gensim.test.utils import datapath, get_tmpfile
# from gensim.models import KeyedVectors
# from gensim.scripts.glove2word2vec import glove2word2vec

import torch
import torch.nn as nn
from torchtext.data import Field, LabelField, TabularDataset, Iterator, BucketIterator
from torchtext import vocab
import torch.optim as optim

import transform_dataset
import compute_embeddings

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Load embeddings

In [0]:
# for google news, need to download and save in an appropriate format
import gensim.downloader as api
wv = api.load('word2vec-google-news-300')


In [6]:
wv.save_word2vec_format('embedding/word2vec-google-news-300')

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [8]:
from torchtext.vocab import Vectors
vectors = Vectors(name='word2vec-google-news-300', cache='embedding')

# Glove and FastText are easier to manage (download on the fly in the right format)
# vectors = vocab.GloVe(name='600B', dim=300)
# vectors = vocab.FastText(language='en')

  0%|          | 0/3000000 [00:00<?, ?it/s]Skipping token b'3000000' with 1-dimensional vector [b'300']; likely a header
100%|█████████▉| 2999358/3000000 [18:28<00:00, 7616.65it/s]

In [0]:
# alternative with wiki simple (small embedding)
# ! wget https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.simple.vec -P ../embeddings/
# TEXT.vocab.load_vectors('../embeddings/wiki.simple.vec')
# vectors = Vectors('../embeddings/wiki.simple.vec')

--2020-05-13 10:28:59--  https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.simple.vec
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 104.22.74.142, 104.22.75.142, 2606:4700:10::6816:4a8e, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|104.22.74.142|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 293187541 (280M) [binary/octet-stream]
Saving to: ‘../embeddings/wiki.simple.vec’


2020-05-13 10:29:25 (11.3 MB/s) - ‘../embeddings/wiki.simple.vec’ saved [293187541/293187541]



### Import data

In [0]:
path = "data/train.csv"
tokenizer = nltk.word_tokenize
# tokenize = lambda x: x.split()
# tokenizer = text_to_wordlist

data = pd.read_csv(path)
questions1 = data['question1'].astype('str').tolist()
questions2 = data['question2'].astype('str').tolist()
is_duplicates = data['is_duplicate'].tolist()

TEXT = Field(
        sequential=True,
        tokenize = tokenizer,
        pad_first = True,
        dtype = torch.long,
        fix_length = 20,
        lower = True,
        batch_first = True,
        )
TARGET = LabelField(use_vocab = False, dtype = torch.float)

In [10]:
dataset = TabularDataset(path, 'csv', [('id', None),
                                       ('qid1', None),
                                       ('qid2', None),
                                       ('question1', TEXT),
                                       ('question2', TEXT),
                                       ('is_duplicate', TARGET)],
                         skip_header=True)

train_data, valid_data = dataset.split(
                  split_ratio=0.9,
                  random_state=random.seed(42)
                  )

print(f'Number of training examples: {len(train_data)}')
print(f'Number of validation examples: {len(valid_data)}')

# TEXT.build_vocab(train_data, vectors = vectors)
TEXT.build_vocab(train_data)
TARGET.build_vocab(train_data)

Number of training examples: 363861
Number of validation examples: 40429


In [0]:
TEXT.vocab.set_vectors(vectors.stoi, vectors.vectors, vectors.dim)

In [0]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
train_iter, valid_iter = BucketIterator.splits(
        (train_data, valid_data), # we pass in the datasets we want the iterator to draw data from
        batch_size=64,
        device=device, # if you want to use the GPU, specify the GPU number here
        sort_key=lambda x: len(x.question1)+len(x.question2), # the BucketIterator needs to be told what function it should use to group the data.
        sort_within_batch=False
        # repeat=False # we pass repeat=False because we want to wrap this Iterator layer.
)

### Modeling

In [0]:
class siameseNet(nn.Module):
  def __init__(self, vocab_size, embedding_dim, hidden_dim, vectors = TEXT.vocab.vectors):
    super().__init__()
    # self.embedding = nn.EmbeddingBag(vocab_size, embedding_dim, mode='mean')
    self.embedding = nn.Embedding(vocab_size, embedding_dim)
    self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True, bidirectional=True)
    self.pdist = nn.PairwiseDistance(p=1.0)

  def forward_one(self, x):
    output = self.embedding(x)
    output = self.lstm(output)
    return output

  def forward(self, input1, input2):
    emb1 = self.embedding(input1)
    emb2 = self.embedding(input2)
    output1, _ = self.lstm(emb1)
    output2, _ = self.lstm(emb2)
    output = torch.exp(-self.pdist(output1[:,-1,:], output2[:,-1,:]))
    # output = nn.CosineSimilarity()(output1[:,-1,:], output2[:,-1,:])
    return output

In [0]:
emb_shape = TEXT.vocab.vectors.shape
INPUT_DIM = emb_shape[0]
EMBEDDING_DIM = emb_shape[1]
HIDDEN_DIM = 50
OUTPUT_DIM = 1

model = siameseNet(vocab_size=INPUT_DIM, embedding_dim=EMBEDDING_DIM, hidden_dim=HIDDEN_DIM)

pretrained_embeddings = TEXT.vocab.vectors
model.embedding.from_pretrained(pretrained_embeddings, freeze=True)

optimizer = optim.Adam(model.parameters())
# criterion = nn.BCEWithLogitsLoss()
criterion = torch.nn.MSELoss()
# criterion = nn.BCELoss()

model = model.to(device)
criterion = criterion.to(device)

### Learning and evaluation

In [0]:
def compute_accuracy(predictions, labels, thresh=0.5):
    predicted_labels = predictions >= thresh
    accuracy = (predicted_labels == labels).sum()/len(predictions)
    return accuracy.item()

def train(model, iterator, optimizer, criterion, thresh=0.5):
    # Track the loss
    epoch_loss = 0
    epoch_TP_FN = 0
    epoch_FP_TN = 0
    
    model.train()
    
    for batch in iterator:
        optimizer.zero_grad()
        
        predictions = model(batch.question1, batch.question2)
        loss = criterion(predictions, batch.is_duplicate)
        
        loss.backward()
        optimizer.step()

        predicted_labels = predictions >= thresh
        epoch_TP_FN += (predicted_labels==batch.is_duplicate).sum().item()
        epoch_FP_TN += (predicted_labels!=batch.is_duplicate).sum().item()
        
        epoch_loss += loss.item()
        
    return epoch_loss / len(iterator), epoch_TP_FN/(epoch_TP_FN+epoch_FP_TN)


def evaluate(model, iterator, criterion, thresh=0.5):
    epoch_loss = 0
    epoch_TP_FN = 0
    epoch_FP_TN = 0
    
    model.eval()
    
    with torch.no_grad():
        for batch in iterator:
            predictions = model(batch.question1, batch.question2)
            loss = criterion(predictions, batch.is_duplicate)

            predicted_labels = predictions >= thresh
            epoch_TP_FN += (predicted_labels==batch.is_duplicate).sum().item()
            epoch_FP_TN += (predicted_labels!=batch.is_duplicate).sum().item()

            epoch_loss += loss.item()
        
    return epoch_loss / len(iterator), epoch_TP_FN/(epoch_TP_FN+epoch_FP_TN)

In [16]:
N_EPOCHS = 11

# Track time taken
start_time = time.time()

for epoch in range(N_EPOCHS):
    epoch_start_time = time.time()
    
    train_loss, train_acc = train(model, train_iter, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iter, criterion)
    
    print(f'| Epoch: {epoch+1:02} '
          f'| Train Loss: {train_loss:.3f} '
          f'| Train Accuracy: {train_acc:.3f} '
          f'| Val. Loss: {valid_loss:.3f} '
          f'| Val. Accuracy: {valid_acc:.3f} '
          f'| Time taken: {time.time() - epoch_start_time:.2f}s'
          f'| Time elapsed: {time.time() - start_time:.2f}s')

| Epoch: 01 | Train Loss: 0.176 | Train Accuracy: 0.757 | Val. Loss: 0.158 | Val. Accuracy: 0.787 | Time taken: 95.27s| Time elapsed: 95.27s
| Epoch: 02 | Train Loss: 0.143 | Train Accuracy: 0.811 | Val. Loss: 0.148 | Val. Accuracy: 0.801 | Time taken: 98.24s| Time elapsed: 193.51s
| Epoch: 03 | Train Loss: 0.126 | Train Accuracy: 0.838 | Val. Loss: 0.145 | Val. Accuracy: 0.805 | Time taken: 106.98s| Time elapsed: 300.52s
| Epoch: 04 | Train Loss: 0.114 | Train Accuracy: 0.857 | Val. Loss: 0.142 | Val. Accuracy: 0.811 | Time taken: 105.72s| Time elapsed: 406.24s
| Epoch: 05 | Train Loss: 0.105 | Train Accuracy: 0.872 | Val. Loss: 0.140 | Val. Accuracy: 0.815 | Time taken: 105.64s| Time elapsed: 511.89s
| Epoch: 06 | Train Loss: 0.097 | Train Accuracy: 0.884 | Val. Loss: 0.142 | Val. Accuracy: 0.811 | Time taken: 106.04s| Time elapsed: 617.93s
| Epoch: 07 | Train Loss: 0.091 | Train Accuracy: 0.893 | Val. Loss: 0.140 | Val. Accuracy: 0.815 | Time taken: 93.25s| Time elapsed: 711.18s
| E

In [0]:
model.eval()
predictions_list = []
labels_list = []

with torch.no_grad():
    for batch in valid_iter:
        predictions = model(batch.question1, batch.question2)
        
        predictions_list += predictions.tolist()
        labels_list += batch.is_duplicate.tolist()
        
        # predictions_list += predictions

predictions_list = np.array(predictions_list)
labels_list = np.array(labels_list)

In [18]:
from sklearn.metrics import log_loss, accuracy_score

print(log_loss(labels_list, predictions_list))
print(accuracy_score(labels_list, predictions_list>=0.5))

1.1362893153164952
0.8143411907294269
