In [1]:
import pickle
from scipy.stats import pearsonr

from src.Vocab import Vocab
from src.utils import cosine

import numpy as np
from copy import deepcopy

In [2]:
with open('data/en_en/train_data.pkl', 'rb') as f:
    train_data = pickle.load(f)


In [3]:
with open('data/en_en/test_data.pkl', 'rb') as f:
    test_data = pickle.load(f)


In [4]:
with open('data/english/base_text_data.pkl', 'rb') as f:
    sentences = pickle.load(f)


In [5]:
vocab = Vocab(sentences, remove_stopwords=False)


In [6]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from torch.utils.data import Dataset, DataLoader


In [7]:
train_sequences = [vocab.sequencify(sen, addEOSBOS=True) for sen in sentences]


In [8]:
class EmbeddingDataset(Dataset):

    def __init__(self, sequences):
        self.sequences = sorted(sequences, key=lambda x: len(x))

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        return self.sequences[idx][:-1], self.sequences[idx][1:]


def pad_batch(batch_data):
    X = [list(x[0]) for x in batch_data]
    y = [list(x[1]) for x in batch_data]

    x_seq_len = max(len(x) for x in X)
    y_seq_len = max(len(x) for x in y)

    padded_X = np.zeros((len(X), x_seq_len))
    padded_y = np.zeros((len(y), y_seq_len))
    
    for i in range(len(X)):
        curr_X = X[i] + ([0] * (x_seq_len - len(X[i])))
        curr_y = y[i] + ([0] * (y_seq_len - len(y[i])))

        padded_X[i] = np.array(curr_X)
        padded_y[i] = np.array(curr_y)

    return torch.tensor(padded_X).long(), torch.tensor(padded_y).long()

In [9]:
BATCH_SIZE = 64
EMBEDDING_DIM = 100

In [10]:
embedding_dataset = EmbeddingDataset(train_sequences)
embedding_dataloader = DataLoader(embedding_dataset, batch_size=BATCH_SIZE, drop_last=True, collate_fn=pad_batch)

In [11]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using Device: {device}")


Using Device: cuda


In [12]:
class LSTMLM(nn.Module):
    def __init__(self, vocab_size, n_layers, embedding_size=128, n_hidden=256, drop_prob=0.2):
        super(LSTMLM, self).__init__()

        self.vocab_size = vocab_size
        self.embedding_size = embedding_size
        self.drop_prob = drop_prob
        self.n_hidden = n_hidden
        self.n_layers = n_layers

        self.embedding = nn.Embedding(
            self.vocab_size, self.embedding_size, padding_idx=0)

        self.embedding.load_state_dict(torch.load('glove_state_dict.pkl'))
        self.lstm = nn.LSTM(self.embedding_size, self.n_hidden, self.n_layers,
                            dropout=self.drop_prob, batch_first=True, bidirectional=True)
        self.dropout = nn.Dropout(self.drop_prob)
        self.fc = nn.Linear(2 * self.n_hidden, self.vocab_size)

    def forward(self, x):
        embedding = self.embedding(x)
        out, h = self.lstm(embedding)
        out = self.dropout(out)
        out = self.fc(out)
        return out, h


In [13]:
lstm = LSTMLM(len(vocab), 4, EMBEDDING_DIM, 256)
lstm.to(device)

LSTMLM(
  (embedding): Embedding(12935, 100, padding_idx=0)
  (lstm): LSTM(100, 256, num_layers=4, batch_first=True, dropout=0.2, bidirectional=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (fc): Linear(in_features=512, out_features=12935, bias=True)
)

In [14]:
def train(model:LSTMLM, dataloader:DataLoader, device:torch.device, n_epochs:int=5)->None:

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters())

    for param in model.embedding.parameters():
        param.requires_grad = False
    
    model.train()

    for e in range(n_epochs):
        loss_val = 0

        for X, y in dataloader:
            X = X.to(device)
            y = y.to(device)

            optimizer.zero_grad()
            y_pred, _ = model(X)
            loss = criterion(y_pred.transpose(1, 2), y)
            
            loss.backward()
            optimizer.step()
            loss_val += loss.item()
        
        print(f"Epoch {e+1}, Train Loss: {loss_val/len(dataloader)}")

    for param in model.embedding.parameters():
        param.requires_grad = True


In [15]:
train(lstm, embedding_dataloader, device, n_epochs=15)

Epoch 1, Train Loss: 3.623719723394396
Epoch 2, Train Loss: 1.3306020590754801
Epoch 3, Train Loss: 0.6591009895429094
Epoch 4, Train Loss: 0.3663934275555433
Epoch 5, Train Loss: 0.22253424891218243
Epoch 6, Train Loss: 0.14605610603892208
Epoch 7, Train Loss: 0.10329464575182745
Epoch 8, Train Loss: 0.07565875822335767
Epoch 9, Train Loss: 0.060571553530003264
Epoch 10, Train Loss: 0.048718316512188296
Epoch 11, Train Loss: 0.040799829863431475
Epoch 12, Train Loss: 0.036183806848845874
Epoch 13, Train Loss: 0.030608395506468513
Epoch 14, Train Loss: 0.027711896432660654
Epoch 15, Train Loss: 0.025921025639053484


In [16]:
def get_context_embedding(model: LSTMLM, device:torch.device, seq:np.ndarray):

    model.eval()
    with torch.no_grad():
        X = torch.tensor(seq).to(device).unsqueeze(0)
        _, (h, c) = model(X)

    return ((torch.mean(h, dim=0).squeeze(0) + torch.mean(c, dim=0).squeeze(0))).detach().cpu().numpy()


In [17]:
def get_sentence_embedding(model:LSTMLM, device:torch.device, seq:np.ndarray):

    model.eval()

    with torch.no_grad():
        X = torch.tensor(seq).to(device)
        emb = torch.mean(model.embedding(X), dim=0).detach().cpu().numpy()

    return emb

In [18]:
context_preds = []

for pair in test_data['x']:
    s1, s2 = vocab.sequencify(pair[0]), vocab.sequencify(pair[1])

    emb1 = get_context_embedding(lstm, device, s1)
    emb2 = get_context_embedding(lstm, device, s2)

    score = cosine(emb1, emb2) * 5
    context_preds.append(score)


In [19]:
sentence_preds = []

for pair in test_data['x']:
    s1, s2 = vocab.sequencify(pair[0]), vocab.sequencify(pair[1])

    emb1 = get_sentence_embedding(lstm, device, s1)
    emb2 = get_sentence_embedding(lstm, device, s2)

    score = cosine(emb1, emb2) * 5
    sentence_preds.append(score)

In [20]:
pearson_score, _ = pearsonr(context_preds, test_data['y'])
print(f'Pearson Score for LSTM Model (from hidden state) with Cosine Similarity: {pearson_score:.4f}')

Pearson Score for LSTM Model (from hidden state) with Cosine Similarity: 0.4450


In [21]:
pearson_score, _ = pearsonr(sentence_preds, test_data['y'])
print(f'Pearson Score for LSTM Model (from embedding layer ) with Cosine Similarity: {pearson_score:.4f}')


Pearson Score for LSTM Model (from embedding layer ) with Cosine Similarity: 0.6475


In [22]:
cosine_ft_lstm = deepcopy(lstm)

In [23]:
class STSDataset(Dataset):
    def __init__(self, vocab:Vocab, data):
        self.sts_data = []

        for x, y in zip(data['x'], data['y']):
            s1, s2 = x
            s1, s2 = vocab.sequencify(s1, addEOSBOS=True), vocab.sequencify(s2, addEOSBOS=True)
            self.sts_data.append(((s1, s2), y))

    def __len__(self):
        return len(self.sts_data)

    def __getitem__(self, idx):
        return self.sts_data[idx]

In [24]:
ft_dataset = STSDataset(vocab, train_data)

In [25]:
ft_dataloader = DataLoader(ft_dataset, batch_size=1, shuffle=True, collate_fn=lambda x: zip(*x))

In [26]:
def cosine_ft(model:LSTMLM, device:torch.device, dataloader:DataLoader, n_epochs:int=5):
    criterion = nn.CosineSimilarity(dim = 0)
    optimizer = optim.Adam(model.parameters())
    model.train()
    for e in range(n_epochs):
        loss_val = 0
        for X, y in dataloader:

            s1, s2  = X[0]
            s1 = torch.tensor(s1).long().to(device).unsqueeze(0)
            s2 = torch.tensor(s2).long().to(device).unsqueeze(0)

            y = torch.tensor(y).float().to(device).squeeze(0)

            optimizer.zero_grad()

            _, (h1, c1) = model(s1)
            
            emb1 = (torch.mean(h1, dim=0).squeeze(0) + torch.mean(c1, dim=0).squeeze(0))/2

            _, (h2, c2) = model(s2)

            emb2 = (torch.mean(h2, dim=0).squeeze(0) +torch.mean(c2, dim=0).squeeze(0)) / 2

            sim = criterion(emb1, emb2) * 5
            
            loss = nn.MSELoss()(sim, y)
            loss.backward()
            optimizer.step()

            loss_val += loss.item()

        print(f"Epoch {e+1}, Train Loss: {loss_val/len(dataloader)}")

In [27]:
cosine_ft(cosine_ft_lstm, device, ft_dataloader, 5)

  result = _VF.lstm(input, hx, self._flat_weights, self.bias, self.num_layers,


Epoch 1, Train Loss: 0.9654646384515881
Epoch 2, Train Loss: 0.5156356550053578
Epoch 3, Train Loss: 0.37323116740672124
Epoch 4, Train Loss: 0.28849926342101484
Epoch 5, Train Loss: 0.2401693253231447


In [28]:
cosine_ft_context_preds = []

for pair in test_data['x']:
    s1, s2 = vocab.sequencify(pair[0]), vocab.sequencify(pair[1])

    emb1 = get_context_embedding(cosine_ft_lstm, device, s1)
    emb2 = get_context_embedding(cosine_ft_lstm, device, s2)

    score = cosine(emb1, emb2) * 5
    cosine_ft_context_preds.append(score)


In [29]:
pearson_score, _ = pearsonr(cosine_ft_context_preds, test_data['y'])
print(f'Pearson Score for LSTM Model (from hidden state) with Cosine Finetuning: {pearson_score:.4f}')


Pearson Score for LSTM Model (from hidden state) with Cosine Finetuning: 0.7675
