In [1]:
import os
import pickle

import numpy as np
from scipy.stats import pearsonr
from tqdm import tqdm

from src.utils import cosine


In [2]:
# Training split for the cross-lingual task; later handed to
# BiLingualSTSDataset, which reads its 'x' and 'y' entries.
# NOTE(review): pickle.load can execute arbitrary code from the file --
# only load pickles that come from a trusted source.
with open('data/en_es/train_data_cross.pkl', mode='rb') as train_file:
    train_data = pickle.load(train_file)


In [3]:
# Held-out split for the cross-lingual task; later handed to
# BiLingualSTSDataset, which reads its 'x' and 'y' entries.
# NOTE(review): pickle.load can execute arbitrary code from the file --
# only load pickles that come from a trusted source.
with open('data/en_es/test_data_cross.pkl', mode='rb') as test_file:
    test_data = pickle.load(test_file)


In [4]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from torch.utils.data import Dataset, DataLoader
from flair.embeddings import TransformerWordEmbeddings
from flair.data import Sentence


In [5]:
# Token-level embedder applied to the first sentence of every pair
# (BiLingualSTSDataset reads this module-level name directly).
en_embedder = TransformerWordEmbeddings('roberta-base')

In [6]:
# Token-level embedder applied to the second sentence of every pair
# (BiLingualSTSDataset reads this module-level name directly).
es_embedder = TransformerWordEmbeddings('bertin-project/bertin-roberta-base-spanish')


In [7]:
class BiLingualSTSDataset(Dataset):
    """Sentence pairs turned into per-token embedding matrices, plus labels.

    Every pair in ``data['x']`` is embedded once, at construction time: the
    first sentence with the module-level ``en_embedder`` and the second with
    the module-level ``es_embedder``. ``data['y']`` is kept as the labels and
    determines the dataset length.
    """

    @staticmethod
    def _token_matrix(embedder, text):
        """Embed ``text`` with ``embedder`` and stack its token vectors row-wise."""
        embedded_sentence = embedder.embed(Sentence(text))[0]
        return torch.vstack([token.embedding for token in embedded_sentence])

    def __init__(self, data):
        print("Generating Embeddings...")

        pairs = []
        for first_text, second_text in tqdm(data['x']):
            first_matrix = self._token_matrix(en_embedder, first_text)
            second_matrix = self._token_matrix(es_embedder, second_text)
            pairs.append((first_matrix, second_matrix))

        self.embeddings = pairs
        self.labels = data['y']

    def __len__(self):
        """Number of labelled pairs."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return ``((first_matrix, second_matrix), label)`` for position ``idx``."""
        return self.embeddings[idx], self.labels[idx]


In [8]:
# Embed the whole training split once, then iterate over it in random order.
# One pair per batch: each item is a pair of per-token matrices, and the
# loader is not given any collate function that would pad them.
train_dataset = BiLingualSTSDataset(data=train_data)
train_dataloader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=1)


Generating Embeddings...


100%|██████████| 8623/8623 [03:13<00:00, 44.59it/s]


In [9]:
# Embed the whole test split once and iterate over it in its stored order
# (no shuffling), one pair per batch.
test_dataset = BiLingualSTSDataset(data=test_data)
test_dataloader = DataLoader(dataset=test_dataset, shuffle=False, batch_size=1)


Generating Embeddings...


100%|██████████| 250/250 [00:05<00:00, 45.16it/s]


In [10]:
# Input feature size handed to DualLSTM as `embedding_size`.
# NOTE(review): presumably the per-token vector width produced by both
# transformer embedders -- confirm if either embedder is swapped.
EMBEDDING_DIM = 768


In [11]:
# Prefer the GPU when one is visible to PyTorch, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using Device: {device}")


Using Device: cuda


In [12]:
class DualLSTM(nn.Module):
    """Two independent bidirectional LSTM encoders, one per language.

    Each encoder maps a batch-first sequence of token embeddings to a single
    fixed-size vector of width ``n_hidden``.

    Args:
        n_layers: Number of stacked LSTM layers in each encoder.
        embedding_size: Feature size of every input token vector.
        n_hidden: Hidden size of each LSTM direction.
        drop_prob: Dropout probability between stacked LSTM layers. It is
            stored unchanged in ``self.drop_prob`` but only passed on to
            ``nn.LSTM`` when ``n_layers > 1``.
    """

    def __init__(self, n_layers, embedding_size, n_hidden=256, drop_prob=0.2):
        super().__init__()

        self.embedding_size = embedding_size
        self.drop_prob = drop_prob
        self.n_hidden = n_hidden
        self.n_layers = n_layers

        # nn.LSTM applies dropout only between stacked layers; with a single
        # layer a non-zero value has no effect and triggers a UserWarning,
        # so hand it 0 in that case.
        lstm_dropout = self.drop_prob if self.n_layers > 1 else 0.0

        self.en_lstm = nn.LSTM(self.embedding_size, self.n_hidden, self.n_layers,
                               dropout=lstm_dropout, batch_first=True, bidirectional=True)

        self.es_lstm = nn.LSTM(self.embedding_size, self.n_hidden, self.n_layers,
                               dropout=lstm_dropout, batch_first=True, bidirectional=True)

    def forward_once(self, X, lang):
        """Encode one batch of sequences with the encoder selected by ``lang``.

        Args:
            X: Batch-first input for ``nn.LSTM``.
            lang: ``'en'`` selects the English encoder; any other value
                selects the Spanish encoder.

        Returns:
            The element-wise average of the final hidden state and the final
            cell state, each first averaged over its leading
            (layers * directions) axis.
        """
        if lang == 'en':
            _, (h, c) = self.en_lstm(X)
        else:
            _, (h, c) = self.es_lstm(X)
        # h and c come back as (num_layers * num_directions, batch, n_hidden);
        # collapse that leading axis so each sequence yields one vector.
        h = torch.mean(h, dim=0)
        c = torch.mean(c, dim=0)
        out = (h + c) / 2
        return out

    def forward(self, X1, X2):
        """Encode ``X1`` with the English encoder and ``X2`` with the Spanish one.

        Returns:
            Tuple ``(out1, out2)`` of the two encoded representations.
        """
        out1 = self.forward_once(X1, 'en')
        out2 = self.forward_once(X2, 'es')
        return out1, out2


In [13]:
# Two stacked layers per encoder, 128 hidden units per direction; the module
# is moved to the selected device in place and its repr is shown as output.
dual_lstm = DualLSTM(n_layers=2, embedding_size=EMBEDDING_DIM, n_hidden=128)
dual_lstm.to(device)

DualLSTM(
  (en_lstm): LSTM(768, 128, num_layers=2, batch_first=True, dropout=0.2, bidirectional=True)
  (es_lstm): LSTM(768, 128, num_layers=2, batch_first=True, dropout=0.2, bidirectional=True)
)

In [14]:
class SimilarityScore(nn.Module):
    """Cosine similarity multiplied by a constant factor.

    Args:
        dim: Dimension along which the cosine similarity is computed.
        scale: Factor applied to the cosine value (5 by default), so the
            score lies in ``[-scale, scale]``.
    """

    def __init__(self, dim=0, scale=5.0):
        super().__init__()
        self.cos = nn.CosineSimilarity(dim=dim)
        self.scale = scale

    # Defined as `forward` rather than by overriding `__call__`, so that
    # nn.Module's own call path (including registered hooks) stays active.
    def forward(self, v1, v2):
        """Return ``cosine_similarity(v1, v2) * scale``."""
        return self.cos(v1, v2) * self.scale


In [15]:
def train(model: DualLSTM, dataloader: DataLoader, device: torch.device, n_epochs: int = 5) -> None:
    """Fit ``model`` so the scaled cosine similarity of its two outputs matches the labels.

    Each batch from ``dataloader`` is expected to unpack as
    ``((first_seq, second_seq), label)``. The two sequences are encoded by
    ``model``, compared with ``SimilarityScore`` and regressed onto the label
    with mean-squared error, optimised by Adam with default settings. The
    mean batch loss is printed after every epoch.

    Args:
        model: Network returning a pair of embeddings for a pair of inputs.
            It is assumed to already live on ``device``.
        dataloader: Source of training batches.
        device: Device the batch tensors are moved to.
        n_epochs: Number of passes over ``dataloader``.

    Raises:
        ValueError: If ``dataloader`` yields no batches in an epoch.
    """
    # Compare along the feature axis so every row of a batch gets its own
    # score; for a single-pair batch this equals the 1-D cosine similarity.
    metric = SimilarityScore(dim=-1)
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters())

    for e in range(n_epochs):
        loss_val = 0.0
        n_batches = 0

        model.train()
        for (s1, s2), y in dataloader:
            s1 = s1.float().to(device)
            s2 = s2.float().to(device)
            # One target per pair, flattened so it lines up with `sim`.
            y = y.float().to(device).reshape(-1)

            optimizer.zero_grad()

            emb1, emb2 = model(s1, s2)
            sim = metric(emb1, emb2).reshape(-1)
            loss = criterion(sim, y)
            loss.backward()
            optimizer.step()

            loss_val += loss.item()
            n_batches += 1

        if n_batches == 0:
            # Avoids an opaque ZeroDivisionError when computing the mean loss.
            raise ValueError("dataloader yielded no batches; nothing to train on")

        print(f"Epoch {e+1}, Train Loss: {loss_val/n_batches}")

In [16]:
# Two passes over the training pairs; per-epoch mean loss is printed below.
train(model=dual_lstm, dataloader=train_dataloader, device=device, n_epochs=2)


Epoch 1, Train Loss: 2.0630524527706418
Epoch 2, Train Loss: 1.8628994389545726


In [17]:
# Collect gold labels and predicted scores over the test pairs.
y_true, y_pred = [], []

dual_lstm.eval()
with torch.no_grad():
    for (first_seq, second_seq), gold in test_dataloader:
        first_seq = first_seq.float().to(device)
        second_seq = second_seq.float().to(device)
        gold = gold.float().to(device).squeeze(0)
        y_true.append(gold.item())

        # Encode both sides, then drop to plain numpy vectors for `cosine`.
        first_vec, second_vec = (
            encoded.squeeze().detach().cpu().numpy()
            for encoded in dual_lstm(first_seq, second_seq)
        )

        # Same x5 scaling as the training-time similarity score.
        y_pred.append(cosine(first_vec, second_vec) * 5)




In [18]:
# Correlation between predicted and gold scores; only the coefficient
# (first element of the result) is reported, the p-value is not used.
pearson_result = pearsonr(y_pred, y_true)
pearson_score = pearson_result[0]
print(f'Pearson Score for BiLSTM (with RoBERTa Encodings): {pearson_score:.4f}')


Pearson Score for BiLSTM (with RoBERTa Encodings): 0.2191


In [19]:
# Persist only the learned parameters (state_dict) of the trained model.
# Create the target directory first: torch.save does not create missing
# folders, and failing here would throw away the training run.
os.makedirs('models', exist_ok=True)
torch.save(dual_lstm.state_dict(), 'models/cross_bilstm.pt')