In [96]:
import torch
import torch.nn as nn

from torchtext.data.utils import get_tokenizer
from torchtext.vocab import vocab
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader

import numpy as np

import random
import math
import time
import pickle
import re
from collections import Counter
import scipy

In [2]:
with open('data/base_text_data.pkl', 'rb') as f:
    train_data = pickle.load(f)
with open('data/Multi-30k/train.en') as f:
    data = [x.strip() for x in f.readlines()]
dev_data = data[-8000:]
print(len(train_data), len(dev_data))

60265 8000


In [27]:
punctuations = [r'\.', r'\.{2,}',
                             r'\!+', r'\:+', r'\;+', r'\"+', r"\'+", r'\?+', r'\,+', r'\(|\)|\[|\]|\{|\}|\<|\>']
def clean(line):
    for pattern in punctuations:
        line = re.sub(pattern, '', line)
    line = re.sub(r'[^a-z]', ' ', line.lower())
    return line

for i, dat in enumerate(train_data):
    train_data[i] = clean(dat)
    
for i, dat in enumerate(dev_data):
    dev_data[i] = clean(dat)

In [28]:
tokenizer = get_tokenizer('spacy', language='en_core_web_sm')

In [29]:
def build_vocab(data, tokenizer):
    counter = Counter()
    for sent in data:
        counter.update(tokenizer(sent))

    vocab_obj = vocab(counter, specials=['<unk>', '<pad>', '<bos>', '<eos>'])
    vocab_obj.set_default_index(vocab_obj['<unk>'])

    return vocab_obj

In [30]:
vocabulary = build_vocab(train_data, tokenizer)

In [31]:
def data_process(data):
    _data = []
    for raw in data:
        tokens = tokenizer(raw)
        inp_tensor = torch.tensor([vocabulary[token] for token in tokens[:-1]], dtype=torch.long)
        trg_tensor = torch.tensor([vocabulary[token] for token in tokens[1:]], dtype=torch.long)
        _data.append((inp_tensor, trg_tensor))

    return _data

In [32]:
tok_train_data = data_process(train_data)
tok_dev_data = data_process(dev_data)

In [33]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [35]:
BATCH_SIZE = 32
PAD_IDX = vocabulary['<pad>']
BOS_IDX = vocabulary['<bos>']
EOS_IDX = vocabulary['<eos>']

In [36]:
def generate_batch(data_batch):
    src_batch, trg_batch = [], []
    for (src_item, trg_item) in data_batch:
        src_batch.append(torch.cat([torch.tensor([BOS_IDX]), src_item, torch.tensor([EOS_IDX])], dim=0))
        trg_batch.append(torch.cat([torch.tensor([BOS_IDX]), trg_item, torch.tensor([EOS_IDX])], dim=0))
    src_batch = pad_sequence(src_batch, padding_value=PAD_IDX)
    trg_batch = pad_sequence(trg_batch, padding_value=PAD_IDX)

    return src_batch, trg_batch

In [37]:
train_dataloader = DataLoader(tok_train_data, batch_size=BATCH_SIZE, shuffle=True, collate_fn=generate_batch, drop_last=True)
dev_dataloader = DataLoader(tok_dev_data, batch_size=BATCH_SIZE, shuffle=True, collate_fn=generate_batch, drop_last=True)

In [38]:
class EncoderModel(nn.Module):
    def __init__(self, vocab_len, device, dropout_prob=0.2, bidirectional=True):
        super(EncoderModel, self).__init__()
        self.rnn_size = 512
        self.embedding_dim = 128
        self.num_layers = 2
        self.num_directions = 2 if bidirectional else 1
        self.device = device

        self.embedding = nn.Embedding(
            num_embeddings = vocab_len,
            embedding_dim = self.embedding_dim,
        )
        self.rnn = nn.LSTM(
            input_size = self.embedding_dim,
            hidden_size = self.rnn_size,
            num_layers = self.num_layers,
            bidirectional=bidirectional,
            dropout = dropout_prob
        )
        self.dropout = nn.Dropout(dropout_prob)
        self.fc = nn.Linear(self.num_directions * self.rnn_size, vocab_len)

    def forward(self, x):
        batch_size = x.shape[1]
        hidden_state = self.init_state(batch_size)
        
        embed = self.embedding(x)
        output, state = self.rnn(embed, hidden_state)
        output = self.dropout(output)
        logits = self.fc(output)
        return logits, state

    def init_state(self, batch_size):
        return torch.zeros(self.num_directions * self.num_layers, batch_size, self.rnn_size).to(self.device)

In [42]:
model = EncoderModel(len(vocabulary), device=device)
model = model.to(device)

In [40]:
def train_evaluate(model, num_epochs=3):
    loss_fn = nn.CrossEntropyLoss(ignore_index=PAD_IDX)
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

    for epoch in range(num_epochs):
        # Training
        model.train()
        print(f"Epoch: {epoch}")
        loss_val = 0 
        for x, y in train_dataloader:
            x = x.to(device)
            y = y.to(device)
            
            optimizer.zero_grad()

            pred, state_h = model(x)
            loss = loss_fn(pred.transpose(1, 2), y)

            loss.backward()
            optimizer.step()
            
            loss_val += loss.item()
        print(f"Train Loss: {loss_val/len(train_dataloader)}, Train PPL: {2**(loss_val/len(train_dataloader))}")

        # Validation
        model.train()
        with torch.no_grad():
            loss_val = 0 
            for x, y in dev_dataloader:
                x = x.to(device)
                y = y.to(device)
                optimizer.zero_grad()

                pred, state_h = model(x)
                loss = loss_fn(pred.transpose(1, 2), y)
                
                loss_val += loss.item()
            print(f"Dev Loss: {loss_val/len(dev_dataloader)}, Dev PPL: {2**(loss_val/len(dev_dataloader))}")

In [46]:
train_evaluate(model, num_epochs=1)

Epoch: 0
Train Loss: 0.4423317674149779, Train PPL: 1.3587987229344551
Dev Loss: 0.6440360325574875, Dev PPL: 1.562694789512324


In [49]:
with open('biGRU_5ep.pkl', 'wb') as f:
    pickle.dump(model, f)

In [44]:
def predict(vocab, tokenizer, model, seed_text, num_to_predict=4):
    model.to(device)
    model.eval()

    print(f"Provided Input -> {seed_text}")
    sent = clean(seed_text)
    probs = []

    for _ in range(num_to_predict):
        seed_sent = sent

        x = torch.tensor([[vocab[token] for token in tokenizer(seed_sent)]], dtype=torch.long).to(device)
        x = x.t()

        y_pred, state_h = model(x)
        y_pred = y_pred.squeeze(1)
        last_word_logits = y_pred[-1].cpu()

        prob_dist = torch.nn.functional.softmax(last_word_logits, dim=0).detach().numpy()
        word_index = np.argmax(prob_dist)
        
        probs.append((vocab.get_itos()[word_index], prob_dist[word_index]))
        sent += f" {vocab.get_itos()[word_index]}"

    return sent, probs

In [51]:
predict(vocabulary, tokenizer, model, "Five people")

Provided Input -> Five people


('five people marching band stool etc',
 [('marching', 0.09523961),
  ('band', 0.09642078),
  ('stool', 0.045894906),
  ('etc', 0.027783073)])

In [48]:
predict(vocabulary, tokenizer, model, "I am")

Provided Input -> I am


('i am delighted it approved date',
 [('delighted', 0.19852068),
  ('it', 0.37268147),
  ('approved', 0.06063961),
  ('date', 0.034026783)])

In [136]:
def get_sent_vector(vocab, tokenizer, model, seed_text):
    model.to(device)
    model.eval()
    sent = clean(seed_text)

    x = torch.tensor([[vocab[token] for token in tokenizer(sent)]], dtype=torch.long).to(device)
    x = x.t()

    _, state_h = model(x)
    vec = torch.mean(state_h.squeeze(1), dim=0).cpu().detach().numpy()

    return vec

In [137]:
get_sent_vector(vocabulary, tokenizer, model, "whatever works")

torch.Size([4, 6, 512])
torch.Size([6, 1024])


In [73]:
def cosine_similarity(vec_1, vec_2):
    return vec_1@vec_2.T/(np.linalg.norm(vec_1) * np.linalg.norm(vec_2))

In [80]:
def output_preds(sent_1, sent_2):
    vec_1 = get_sent_vector(vocabulary, tokenizer, model, sent_1)
    vec_2 = get_sent_vector(vocabulary, tokenizer, model, sent_2)
    
    return 5 * cosine_similarity(vec_1, vec_2)

In [81]:
sent_1 = "People wearing costumes are gathering in a forest and are looking in the same direction"
sent_2 = "A little girl in costume looks like a woman"
output_preds(sent_1, sent_2)

0.8107129484415054

In [86]:
sent_1 = "To explain further vector space models, basically a document is characterized by a vector."
sent_2 = "A document is represented as a vector."
output_preds(sent_1, sent_2)

1.072433516383171

In [106]:
with open('data/test_data.pkl', 'rb') as f:
    test_data = pickle.load(f)
x_test = test_data['x']
y_test = test_data['y']

In [108]:
preds = []
for dat in x_test:
    preds.append(output_preds(dat[0], dat[1]))

In [109]:
from scipy.stats import pearsonr
pearson_score, _ = pearsonr(preds, y_test)
pearson_score

0.38860042516420196