In [1]:
import pickle
from scipy.stats import pearsonr

from src.Vocab import Vocab
from src.utils import cosine

import numpy as np
from copy import deepcopy


In [2]:
with open('data/train_data.pkl', 'rb') as f:
    train_data = pickle.load(f)


In [3]:
with open('data/test_data.pkl', 'rb') as f:
    test_data = pickle.load(f)


In [4]:
with open('data/base_text_data.pkl', 'rb') as f:
    sentences = pickle.load(f)


In [5]:
vocab = Vocab(sentences, remove_stopwords=False)


In [6]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from torch.utils.data import Dataset, DataLoader


In [7]:
train_sequences = [vocab.sequencify(sen, addEOSBOS=True) for sen in sentences]


In [8]:
class EmbeddingDataset(Dataset):

    def __init__(self, sequences):
        self.sequences = sorted(sequences, key=lambda x: len(x))

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        return self.sequences[idx][:-1], self.sequences[idx][1:]


def pad_batch(batch_data):
    X = [list(x[0]) for x in batch_data]
    y = [list(x[1]) for x in batch_data]

    x_seq_len = max(len(x) for x in X)
    y_seq_len = max(len(x) for x in y)

    padded_X = np.zeros((len(X), x_seq_len))
    padded_y = np.zeros((len(y), y_seq_len))

    for i in range(len(X)):
        curr_X = X[i] + ([0] * (x_seq_len - len(X[i])))
        curr_y = y[i] + ([0] * (y_seq_len - len(y[i])))

        padded_X[i] = np.array(curr_X)
        padded_y[i] = np.array(curr_y)

    return torch.tensor(padded_X).long(), torch.tensor(padded_y).long()


In [9]:
BATCH_SIZE = 64
EMBEDDING_DIM = 100

In [10]:
embedding_dataset = EmbeddingDataset(train_sequences)
embedding_dataloader = DataLoader(
    embedding_dataset, batch_size=BATCH_SIZE, drop_last=True, collate_fn=pad_batch)


In [11]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using Device: {device}")


Using Device: cuda


In [12]:
class SiameseLSTM(nn.Module):
    def __init__(self, vocab_size, n_layers, embedding_size=128, n_hidden=256, drop_prob=0.2, freeze_embeddings=False, embedding_state_dict=None):
        super(SiameseLSTM, self).__init__()

        if freeze_embeddings:
            assert embedding_state_dict is not None

        self.vocab_size = vocab_size
        self.embedding_size = embedding_size
        self.drop_prob = drop_prob
        self.n_hidden = n_hidden
        self.n_layers = n_layers

        self.embedding = nn.Embedding(
            self.vocab_size, self.embedding_size, padding_idx=0)

        if freeze_embeddings:
            self.embedding.load_state_dict(torch.load(embedding_state_dict))
            for param in self.embedding.parameters():
                param.requires_grad = False

        self.lstm = nn.LSTM(self.embedding_size, self.n_hidden, self.n_layers,
                            dropout=self.drop_prob, batch_first=True, bidirectional=True)

    def forward_once(self, X):
        embedding = self.embedding(X)
        _, (h, c) = self.lstm(embedding)
        h = torch.mean(h, dim=0)
        c = torch.mean(c, dim=0)
        out = (h+c)/2
        return out

    def forward(self, X1, X2):
        out1 = self.forward_once(X1)
        out2 = self.forward_once(X2)
        return out1, out2
        

    def init_hidden(self, batch_size, device):

        weight = next(self.parameters()).data

        return (weight.new(2 * self.n_layers, batch_size, self.n_hidden).zero_().to(device),
                weight.new(2 * self.n_layers, batch_size, self.n_hidden).zero_().to(device))


In [13]:
class STSDataset(Dataset):
    def __init__(self, vocab: Vocab, data):
        self.sts_data = []

        for x, y in zip(data['x'], data['y']):
            s1, s2 = x
            s1, s2 = vocab.sequencify(
                s1, addEOSBOS=True), vocab.sequencify(s2, addEOSBOS=True)
            self.sts_data.append(((s1, s2), y))

    def __len__(self):
        return len(self.sts_data)

    def __getitem__(self, idx):
        return self.sts_data[idx]


In [14]:
ft_dataset = STSDataset(vocab, train_data)


In [15]:
ft_dataloader = DataLoader(ft_dataset, batch_size=1,
                           shuffle=True, collate_fn=lambda x: zip(*x))


In [16]:
lstm = SiameseLSTM(len(vocab), 4, EMBEDDING_DIM, 128, freeze_embeddings=True, embedding_state_dict='glove_state_dict.pkl')
lstm.to(device)

SiameseLSTM(
  (embedding): Embedding(12935, 100, padding_idx=0)
  (lstm): LSTM(100, 128, num_layers=4, batch_first=True, dropout=0.2, bidirectional=True)
)

In [17]:
class SimilarityScore(nn.Module):

    def __init__(self,dim=0):
        super(SimilarityScore, self).__init__()
        self.cos = nn.CosineSimilarity(dim=dim)

    def __call__(self, v1, v2):
        return self.cos(v1, v2) * 5

In [18]:
def train(model:SiameseLSTM, dataloader:DataLoader, device:torch.device, n_epochs:int=5, unfreeze_at=None)->None:

    metric = SimilarityScore(dim=0)
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters())
    model.train()

    for param in model.embedding.parameters():
        param.requires_grad = False

    for e in range(n_epochs):
        loss_val = 0

        if unfreeze_at is not None and e + 1 == unfreeze_at:
            for param in model.embedding.parameters():
                param.requires_grad = True
                
        for X, y in dataloader:
            s1, s2 = X[0]
            s1 = torch.tensor(s1).long().to(device).unsqueeze(0)
            s2 = torch.tensor(s2).long().to(device).unsqueeze(0)

            y = torch.tensor(y).float().to(device).squeeze(0)

            optimizer.zero_grad()

            emb1, emb2 = model(s1, s2)
            emb1 = emb1.squeeze(0)
            emb2 = emb2.squeeze(0)
            sim = metric(emb1, emb2)
            loss = criterion(sim, y)
            loss.backward()
            optimizer.step()

            loss_val += loss.item()

        print(f"Epoch {e+1}, Train Loss: {loss_val/len(dataloader)}")
    

In [19]:
train(lstm, ft_dataloader, device, 15)

Epoch 1, Train Loss: 1.7034738931139928
Epoch 2, Train Loss: 1.1022716464957352
Epoch 3, Train Loss: 0.8815826892650231
Epoch 4, Train Loss: 0.7146625629758963
Epoch 5, Train Loss: 0.6060787078924045
Epoch 6, Train Loss: 0.5042802250590922
Epoch 7, Train Loss: 0.4227517997651721
Epoch 8, Train Loss: 0.3530248109038238
Epoch 9, Train Loss: 0.31481209309030234
Epoch 10, Train Loss: 0.2538487521939406
Epoch 11, Train Loss: 0.22423000068961135
Epoch 12, Train Loss: 0.19320417110493843
Epoch 13, Train Loss: 0.16498353537357807
Epoch 14, Train Loss: 0.14819584930385998
Epoch 15, Train Loss: 0.13385338591017432


In [20]:
def get_embeddings(model:SiameseLSTM, device:torch.device, s1:torch.tensor, s2:torch.tensor):

    model.eval()
    s1 = torch.tensor(s1).long().to(device).unsqueeze(0)
    s2 = torch.tensor(s2).long().to(device).unsqueeze(0)
    emb1, emb2 = model(s1, s2)
    return emb1.squeeze(0).detach().cpu().numpy(), emb2.squeeze(0).detach().cpu().numpy()


In [21]:
context_preds = []

for pair in test_data['x']:
    s1, s2 = vocab.sequencify(pair[0]), vocab.sequencify(pair[1])
    
    emb1, emb2 = get_embeddings(lstm, device, s1, s2)

    score = cosine(emb1, emb2) * 5
    context_preds.append(score)


In [22]:
pearson_score, _ = pearsonr(context_preds, test_data['y'])
print(f'Pearson Score for LSTM Model (from hidden state) with Cosine Finetuning: {pearson_score:.4f}')


Pearson Score for LSTM Model (from hidden state) with Cosine Finetuning: 0.7873
