In [1]:
import pickle
from scipy.stats import pearsonr

from src.utils import cosine
import numpy as np
from tqdm import tqdm

In [2]:
# Read the pickled training split from disk.
# NOTE(review): pickle.load can execute arbitrary code while deserialising,
# so this file should only ever come from a trusted source.
with open('data/en_en/train_data.pkl', mode='rb') as train_file:
    train_data = pickle.load(train_file)


In [3]:
# Read the pickled test split from disk.
# NOTE(review): pickle.load can execute arbitrary code while deserialising,
# so this file should only ever come from a trusted source.
with open('data/en_en/test_data.pkl', mode='rb') as test_file:
    test_data = pickle.load(test_file)


In [4]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from torch.utils.data import Dataset, DataLoader
from flair.embeddings import TransformerWordEmbeddings
from flair.data import Sentence

In [5]:
# Module-level flair word embedder for the 'roberta-base' model.
# STSDataset reads this global when it embeds each sentence, so this cell
# must be executed before any STSDataset is constructed.
embedder = TransformerWordEmbeddings('roberta-base')

In [6]:
class STSDataset(Dataset):
    """Sentence-pair dataset whose token embeddings are computed once, up front.

    Construction embeds both sentences of every pair in ``data['x']`` using the
    module-level ``embedder`` and keeps the resulting matrices in memory;
    ``data['y']`` is stored unchanged as the labels.
    """

    def __init__(self, data):
        """Embed every sentence pair eagerly.

        Args:
            data: Mapping with key ``'x'`` (an iterable of two-item sentence
                pairs) and key ``'y'`` (the labels, indexable and sized).
        """
        self.embeddings = []
        print("Generating Embeddings...")
        for first_text, second_text in tqdm(data['x']):
            pair = (self._token_matrix(first_text), self._token_matrix(second_text))
            self.embeddings.append(pair)
        self.labels = data['y']

    @staticmethod
    def _token_matrix(text):
        """Return one sentence's token embeddings stacked row-wise into a tensor."""
        sentence = Sentence(text)
        # Index 0 of the value returned by ``embed`` is iterated as the tokens
        # (presumably the embedded sentence itself -- confirm against flair).
        embedded = embedder.embed(sentence)[0]
        return torch.vstack([token.embedding for token in embedded])

    def __len__(self):
        """Number of examples, taken from the label container."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return ``((matrix_1, matrix_2), label)`` for position ``idx``."""
        return self.embeddings[idx], self.labels[idx]


In [7]:
# Embed the training pairs once, then serve them one pair at a time in a
# freshly shuffled order on every pass.
# NOTE(review): batch_size=1 presumably sidesteps padding, since each sentence
# matrix has one row per token -- confirm before raising it.
train_dataset = STSDataset(data=train_data)
train_dataloader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=1)

Generating Embeddings...


100%|██████████| 3825/3825 [01:23<00:00, 45.72it/s]


In [8]:
# Embed the test pairs once and serve them one pair at a time in their
# original order, so predictions line up with the stored labels.
test_dataset = STSDataset(data=test_data)
test_dataloader = DataLoader(dataset=test_dataset, shuffle=False, batch_size=1)

Generating Embeddings...


100%|██████████| 675/675 [00:14<00:00, 47.02it/s]


In [9]:
# Input feature size handed to SiameseLSTM as `embedding_size`.
# NOTE(review): presumably the width of the roberta-base token embeddings
# produced by `embedder` -- confirm it matches the embedder's output size.
EMBEDDING_DIM = 768

In [10]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using Device: {device}")


Using Device: cuda


In [11]:
class SiameseLSTM(nn.Module):
    """Siamese sequence encoder built on a single shared bidirectional LSTM.

    Both inputs of a pair go through the same ``nn.LSTM`` instance, so they are
    encoded with identical weights.

    Args:
        n_layers: Number of stacked LSTM layers.
        embedding_size: Feature size of each input timestep.
        n_hidden: Hidden size of the LSTM (per direction).
        drop_prob: Dropout probability applied between stacked LSTM layers.
            It is stored on the module as given, but only passed to the LSTM
            when ``n_layers > 1``.
    """

    def __init__(self, n_layers, embedding_size, n_hidden=256, drop_prob=0.2):
        super().__init__()

        self.embedding_size = embedding_size
        self.drop_prob = drop_prob
        self.n_hidden = n_hidden
        self.n_layers = n_layers

        # nn.LSTM applies dropout only between stacked layers; handing it a
        # non-zero value with a single layer has no effect and makes PyTorch
        # emit a UserWarning, so disable it in that case.
        inter_layer_dropout = self.drop_prob if self.n_layers > 1 else 0.0

        self.lstm = nn.LSTM(self.embedding_size, self.n_hidden, self.n_layers,
                            dropout=inter_layer_dropout, batch_first=True, bidirectional=True)

    def forward_once(self, X):
        """Encode one batch of sequences into fixed-size vectors.

        Args:
            X: Batch-first input of shape ``(batch, seq_len, embedding_size)``.

        Returns:
            Tensor of shape ``(batch, n_hidden)``: the final hidden and final
            cell states are each averaged over the layer/direction axis and
            then averaged with each other.
        """
        _, (h, c) = self.lstm(X)
        # h and c have shape (n_layers * 2, batch, n_hidden); collapse axis 0.
        h = torch.mean(h, dim=0)
        c = torch.mean(c, dim=0)
        out = (h + c) / 2
        return out

    def forward(self, X1, X2):
        """Encode both members of a pair with the shared LSTM.

        Returns:
            Tuple ``(out1, out2)`` holding the encodings of ``X1`` and ``X2``.
        """
        out1 = self.forward_once(X1)
        out2 = self.forward_once(X2)
        return out1, out2


In [12]:
# Seed PyTorch's global RNG before the model is built so that weight
# initialisation (and anything later drawn from the same RNG) is repeatable
# across Restart & Run All. GPU kernels may still introduce small run-to-run
# differences.
SEED = 42
torch.manual_seed(SEED)

# Two stacked layers, 256 hidden units per direction.
lstm = SiameseLSTM(n_layers=2, embedding_size=EMBEDDING_DIM, n_hidden=256)
lstm.to(device)

SiameseLSTM(
  (lstm): LSTM(768, 256, num_layers=2, batch_first=True, dropout=0.2, bidirectional=True)
)

In [13]:
class SimilarityScore(nn.Module):
    """Cosine similarity multiplied by a constant factor.

    Args:
        dim: Dimension along which the cosine similarity is computed.
        scale: Multiplier applied to the cosine value. The default of 5 maps
            the cosine range [-1, 1] onto [-5, 5].
    """

    def __init__(self, dim=0, scale=5):
        super().__init__()
        self.cos = nn.CosineSimilarity(dim=dim)
        self.scale = scale

    def forward(self, v1, v2):
        """Return ``scale * cos(v1, v2)`` computed along ``dim``.

        Implemented as ``forward`` rather than by overriding ``__call__`` so
        that ``nn.Module``'s own call path, including registered hooks, stays
        intact; ``instance(v1, v2)`` keeps working exactly as before.
        """
        return self.cos(v1, v2) * self.scale


In [14]:
def train(model:SiameseLSTM, dataloader:DataLoader, device:torch.device, n_epochs:int=5)->None:
    """Fit ``model`` by regressing scaled cosine similarity onto the labels.

    For every batch the two inputs are encoded by ``model``, their encodings
    are compared with ``SimilarityScore`` and the result is pushed towards the
    label with an MSE loss, optimised by Adam with default settings. The mean
    batch loss is printed once per epoch.

    Args:
        model: Siamese encoder; called as ``model(s1, s2)`` and expected to
            return the two encodings.
        dataloader: Yields batches indexable as ``X[0][0]`` / ``X[0][1]`` (the
            two inputs) and ``X[1]`` (the labels).
        device: Device the batch tensors are moved to. The model is not moved
            here and must already live on this device.
        n_epochs: Number of full passes over ``dataloader``.
    """
    # Score along the last (feature) axis so every row of a batch gets its own
    # similarity. Squeezing and comparing along dim 0 is only correct when the
    # batch holds exactly one example.
    metric = SimilarityScore(dim=-1)
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters())
    model.train()

    for e in range(n_epochs):
        loss_val = 0.0
        n_batches = 0

        for X in dataloader:
            s1, s2 = X[0][0].float().to(device), X[0][1].float().to(device)
            # Flatten the labels to one value per example so they line up
            # with the per-example similarity scores.
            y = X[1].float().to(device).reshape(-1)

            optimizer.zero_grad()

            emb1, emb2 = model(s1, s2)
            sim = metric(emb1, emb2).reshape(-1)
            loss = criterion(sim, y)
            loss.backward()
            optimizer.step()

            loss_val += loss.item()
            n_batches += 1

        # Count batches instead of calling len(dataloader): this also works
        # for loaders without a length and avoids dividing by zero when the
        # loader yields nothing.
        mean_loss = loss_val / n_batches if n_batches else float('nan')
        print(f"Epoch {e+1}, Train Loss: {mean_loss}")


In [15]:
# Optimise the model in place for 15 passes over the training loader;
# `train` prints the mean batch loss after every epoch.
train(lstm, train_dataloader, device, n_epochs=15)

Epoch 1, Train Loss: 0.9054229341870733
Epoch 2, Train Loss: 0.46877143403948046
Epoch 3, Train Loss: 0.3316300269227287
Epoch 4, Train Loss: 0.2653651787698175
Epoch 5, Train Loss: 0.20151675954032447
Epoch 6, Train Loss: 0.16784510255437274
Epoch 7, Train Loss: 0.14144984973393895
Epoch 8, Train Loss: 0.11969690452203272
Epoch 9, Train Loss: 0.10623940005046302
Epoch 10, Train Loss: 0.09702819687004856
Epoch 11, Train Loss: 0.08452992241102887
Epoch 12, Train Loss: 0.07857235838796638
Epoch 13, Train Loss: 0.07200607166368136
Epoch 14, Train Loss: 0.06562606786975009
Epoch 15, Train Loss: 0.0620028781504852


In [16]:
# Gold labels and predicted scores, collected in test-loader order.
y_true, y_pred = [], []

lstm.eval()
# No gradients are needed for scoring, so nothing has to be detached below.
with torch.no_grad():
    for X in test_dataloader:
        (s1, s2), y = X
        s1 = s1.float().to(device)
        s2 = s2.float().to(device)
        y = y.float().to(device).squeeze(0)

        y_true.append(y.cpu().numpy().item())

        emb1, emb2 = lstm(s1, s2)
        emb1 = emb1.squeeze().cpu().numpy()
        emb2 = emb2.squeeze().cpu().numpy()

        # Same x5 factor that SimilarityScore applies during training.
        score = cosine(emb1, emb2) * 5
        y_pred.append(score)


In [17]:
# Correlate predicted scores with the gold labels; only the coefficient
# (first element of the result) is kept, the p-value is ignored.
pearson_score = pearsonr(y_pred, y_true)[0]
print(f'Pearson Score for LSTM Model (with ROBERTA Embeddings): {pearson_score:.4f}')


Pearson Score for LSTM Model (with ROBERTA Embeddings): 0.8601
