In [1]:
import pickle
from scipy.stats import pearsonr

from src.Vocab import Vocab
from src.utils import cosine
from src.Word2Vec import Word2Vec
import numpy as np
from copy import deepcopy


In [2]:
with open('data/en_es/train_data_cross.pkl', 'rb') as f:
    train_data = pickle.load(f)

with open('data/en_es/test_data_cross.pkl', 'rb') as f:
    test_data = pickle.load(f)


In [3]:
en_sentences = [x[0] for x in train_data['x']]
es_sentences = [x[1] for x in train_data['x']]

In [4]:
en_vocab = Vocab(en_sentences,lang='english', remove_stopwords=False)
es_vocab = Vocab(es_sentences,lang='spanish', remove_stopwords=False)

In [5]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from torch.utils.data import Dataset, DataLoader


In [6]:
en_train_sequences = [en_vocab.sequencify(sen, addEOSBOS=True) for sen in en_sentences]
es_train_sequences = [es_vocab.sequencify(sen, addEOSBOS=True) for sen in es_sentences]

In [7]:
class EmbeddingDataset(Dataset):

    def __init__(self, sequences, context_size, skipgram=False):
        self.skipgram = skipgram
        self.context_size = context_size

        self.contexts = []
        self.targets = []

        for seq in sequences:
            for i in range(self.context_size, len(seq) - self.context_size):
                target = seq[i]
                context = []
                for j in range(i - self.context_size, i + self.context_size + 1):
                    if i == j:
                        continue
                    context.append(seq[j])
                self.targets.append(target)
                self.contexts.append(context)

    def __len__(self):
        return len(self.targets)

    def __getitem__(self, idx):
        data = (self.targets[idx], np.array(self.contexts[idx]))
        if self.skipgram:
            return data
        else:
            return data[::-1]


In [8]:
BATCH_SIZE = 64
CONTEXT_SIZE = 2
EMBEDDING_DIM = 128


In [9]:
en_embedding_dataset = EmbeddingDataset(en_train_sequences, CONTEXT_SIZE)
en_embedding_dataloader = DataLoader(
    en_embedding_dataset, shuffle=True, batch_size=BATCH_SIZE)


es_embedding_dataset = EmbeddingDataset(es_train_sequences, CONTEXT_SIZE)
es_embedding_dataloader = DataLoader(
    es_embedding_dataset, shuffle=True, batch_size=BATCH_SIZE)


In [10]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using Device: {device}")


Using Device: cuda


In [11]:
en_w2v = Word2Vec(len(en_vocab), CONTEXT_SIZE, embedding_dim=EMBEDDING_DIM)
en_w2v.to(device)

Word2Vec(
  (embedding): Embedding(3819, 128)
  (layers): Sequential(
    (0): Linear(in_features=512, out_features=128, bias=True)
    (1): ReLU()
    (2): Linear(in_features=128, out_features=3819, bias=True)
  )
)

In [12]:
es_w2v = Word2Vec(len(es_vocab), CONTEXT_SIZE, embedding_dim=EMBEDDING_DIM)
es_w2v.to(device)


Word2Vec(
  (embedding): Embedding(4030, 128)
  (layers): Sequential(
    (0): Linear(in_features=512, out_features=128, bias=True)
    (1): ReLU()
    (2): Linear(in_features=128, out_features=4030, bias=True)
  )
)

In [13]:
def train(model: Word2Vec, dataloader: DataLoader, device: torch.device, n_epochs: int = 5) -> None:
    criterion = nn.NLLLoss()
    optimizer = optim.Adam(model.parameters())
    model.train()

    assert model.skipgram == dataloader.dataset.skipgram, "Mismatching Model and Data Formats"

    for e in range(n_epochs):
        loss_val = 0

        for X, y in dataloader:
            X = X.long().to(device)
            y = y.long().to(device)

            optimizer.zero_grad()
            output = model(X)
            loss = criterion(output, y)
            loss.backward()
            optimizer.step()

            loss_val += loss.item()

        print(f"Epoch {e+1}, Train Loss: {loss_val/len(dataloader)}")

In [14]:
train(es_w2v, es_embedding_dataloader, device, n_epochs=10)

Epoch 1, Train Loss: 4.869018103026141
Epoch 2, Train Loss: 3.5933262315484087
Epoch 3, Train Loss: 2.8450885117758533
Epoch 4, Train Loss: 2.255736900747433
Epoch 5, Train Loss: 1.8260701839335798
Epoch 6, Train Loss: 1.51958656550581
Epoch 7, Train Loss: 1.288433305854587
Epoch 8, Train Loss: 1.107692725927084
Epoch 9, Train Loss: 0.9663792578275979
Epoch 10, Train Loss: 0.8440504751870537


In [15]:
train(en_w2v, en_embedding_dataloader, device, n_epochs=10)

Epoch 1, Train Loss: 5.554893219459868
Epoch 2, Train Loss: 4.357083235960193
Epoch 3, Train Loss: 3.552917606010686
Epoch 4, Train Loss: 2.8358067097446127
Epoch 5, Train Loss: 2.28806072123564
Epoch 6, Train Loss: 1.8998558322403438
Epoch 7, Train Loss: 1.6161003148900164
Epoch 8, Train Loss: 1.3954997729277456
Epoch 9, Train Loss: 1.2182653352465456
Epoch 10, Train Loss: 1.0676211829176598


In [16]:
class BiLingualSTSDataset(Dataset):
    def __init__(self, vocab_en: Vocab, vocab_es:Vocab, data):
        self.sts_data = []
        for x, y in zip(data['x'], data['y']):
            s1, s2 = x
            s1, s2 = vocab_en.sequencify(
                s1, addEOSBOS=True), vocab_es.sequencify(s2, addEOSBOS=True)
            self.sts_data.append(((s1, s2), y))

    def __len__(self):
        return len(self.sts_data)

    def __getitem__(self, idx):
        return self.sts_data[idx]


In [17]:
ft_dataset = BiLingualSTSDataset(en_vocab, es_vocab, train_data)
ft_dataloader = DataLoader(ft_dataset, batch_size=1, shuffle=True, collate_fn=lambda x: zip(*x))

In [18]:
class SpaceTransformer(nn.Module):

    def __init__(self, in_dim, out_dim):
        super(SpaceTransformer, self).__init__()
        self.transformation = nn.Sequential(
            nn.Linear(in_dim, 256),
            nn.ReLU(),
            nn.Linear(256, out_dim)
        )

    def forward(self, X):
        return self.transformation(X)

In [19]:
space_transformer = SpaceTransformer(EMBEDDING_DIM, EMBEDDING_DIM)
space_transformer.to(device)

SpaceTransformer(
  (transformation): Sequential(
    (0): Linear(in_features=128, out_features=256, bias=True)
    (1): ReLU()
    (2): Linear(in_features=256, out_features=128, bias=True)
  )
)

In [20]:
class SimilarityScore(nn.Module):
    def __init__(self, dim=0):
        super(SimilarityScore, self).__init__()
        self.cos = nn.CosineSimilarity(dim=dim)

    def __call__(self, v1, v2):
        return self.cos(v1, v2) * 5


In [21]:
def learn_transformation(space_transformer:SpaceTransformer, en_w2v:Word2Vec, es_w2v:Word2Vec, device:torch.device, dataloader:DataLoader, n_epochs=5):

    metric = SimilarityScore(dim=0)
    criterion = nn.MSELoss()
    optimizer = optim.Adam(space_transformer.parameters())

    for param in en_w2v.parameters():
        param.requires_grad = False

    for param in es_w2v.parameters():
        param.requires_grad = False

    for e in range(n_epochs):
        loss_val = 0
        for X, y in dataloader:
            s1, s2 = X[0]
            s1 = torch.tensor(s1).long().to(device)
            s2 = torch.tensor(s2).long().to(device)
            y = torch.tensor(y).float().to(device).squeeze(0)
            optimizer.zero_grad()

            emb1 = torch.mean(en_w2v.embedding(s1), dim=0)
            emb2 = torch.mean(es_w2v.embedding(s2), dim=0)

            emb2 = space_transformer(emb2)
            sim = metric(emb1, emb2)
            loss = criterion(sim, y)
            loss.backward()
            optimizer.step()

            loss_val += loss.item()
        print(f"Epoch {e+1}, Train Loss: {loss_val/len(dataloader)}")


In [22]:
learn_transformation(space_transformer, en_w2v, es_w2v, device, ft_dataloader, n_epochs=5)

Epoch 1, Train Loss: 2.329406357288242
Epoch 2, Train Loss: 1.9524453576933054
Epoch 3, Train Loss: 1.6321869113770031
Epoch 4, Train Loss: 1.3589374521561024
Epoch 5, Train Loss: 1.1600778687387856


In [23]:
from itertools import chain

In [24]:
def cosine_ft(en_w2v:Word2Vec, es_w2v:Word2Vec, space_transformer:torch.tensor, device, dataloader, n_epochs=5):
    metric = SimilarityScore(dim=0)
    criterion = nn.MSELoss()
    optimizer = optim.Adam(chain(en_w2v.parameters(), es_w2v.parameters(), space_transformer.parameters()))

    for param in en_w2v.parameters():
        param.requires_grad = True

    for param in es_w2v.parameters():
        param.requires_grad = True

    for e in range(n_epochs):
        loss_val = 0

        for X, y in dataloader:
            s1, s2 = X[0]
            s1 = torch.tensor(s1).long().to(device)
            s2 = torch.tensor(s2).long().to(device)
            y = torch.tensor(y).float().to(device).squeeze(0)

            optimizer.zero_grad()

            emb1 = torch.mean(en_w2v.embedding(s1), dim=0)
            emb2 = torch.mean(es_w2v.embedding(s2), dim=0)
            emb2 = space_transformer(emb2)
            sim = metric(emb1, emb2)
            loss = criterion(sim, y)
            loss.backward()
            optimizer.step()

            loss_val += loss.item()

        print(f"Epoch {e+1}, Train Loss: {loss_val/len(dataloader)}")


In [25]:
cosine_ft(en_w2v, es_w2v, space_transformer, device, ft_dataloader, 10 )

Epoch 1, Train Loss: 0.9210498603057984
Epoch 2, Train Loss: 0.566173892177482
Epoch 3, Train Loss: 0.3768685667987289
Epoch 4, Train Loss: 0.27137094958709795
Epoch 5, Train Loss: 0.20912560572058003
Epoch 6, Train Loss: 0.17476388164707837
Epoch 7, Train Loss: 0.14469916134695335
Epoch 8, Train Loss: 0.12713880567708413
Epoch 9, Train Loss: 0.11250049899757789
Epoch 10, Train Loss: 0.10049569369833879


In [26]:
def get_sim_score(s1, s2, device):
    s1 = en_vocab.sequencify(s1)
    s2 = es_vocab.sequencify(s2)

    s1 = torch.tensor(s1).long().to(device)
    s2 = torch.tensor(s2).long().to(device)
    
    with torch.no_grad():
        v1 = torch.mean(en_w2v.embedding(s1), dim=0).detach().cpu().numpy()
        v2 = torch.mean(es_w2v.embedding(s2), dim=0)
        v2 = space_transformer(v2).detach().cpu().numpy()

    return cosine(v1, v2) * 5

In [27]:
preds = []

for pairs in test_data['x']:
    s1 = pairs[0]
    s2 = pairs[1]

    score = get_sim_score(s1, s2, device)
    preds.append(score)

In [28]:
pearson_score, _ = pearsonr(preds, test_data['y'])
print(f'Pearson Score for Word2Vec with Cosine Finetuning: {pearson_score:.4f}')


Pearson Score for Word2Vec with Cosine Finetuning: 0.1583


In [29]:
torch.save(en_w2v.state_dict(),'models/cross_en_w2v.pt')
torch.save(es_w2v.state_dict(),'models/cross_es_w2v.pt')
torch.save(space_transformer.state_dict(),'models/cross_es_space_transformer.pt')