In [1]:
import pickle
from scipy.stats import pearsonr

from src.Vocab import Vocab
from src.Vocab_es import Vocab_es
from src.utils import cosine
from src.Word2Vec import Word2Vec
import numpy as np
from copy import deepcopy

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
with open('../data/train_data_cross.pkl', 'rb') as f:
    train_data = pickle.load(f)

In [6]:
with open('../data/test_data_cross.pkl', 'rb') as f:
    test_data = pickle.load(f)


In [7]:
with open('../data/base_text_data.pkl', 'rb') as f:
    en_sentences = pickle.load(f)
with open('../data/spanish_base_text_data.pkl', 'rb') as f:
    es_sentences = pickle.load(f)

In [8]:
vocab_en = Vocab(en_sentences, remove_stopwords=False)
vocab_es = Vocab(es_sentences, remove_stopwords=False)

In [9]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from torch.utils.data import Dataset, DataLoader


In [10]:
en_train_sequences = [vocab_en.sequencify(sen, addEOSBOS=True) for sen in en_sentences]
es_train_sequences = [vocab_en.sequencify(sen, addEOSBOS=True) for sen in en_sentences]

In [11]:
class EmbeddingDataset(Dataset):

    def __init__(self, sequences, context_size, skipgram=False):
        self.skipgram = skipgram
        self.context_size = context_size

        self.contexts = []
        self.targets = []

        for seq in sequences:
            for i in range(self.context_size, len(seq) - self.context_size):
                target = seq[i]
                context = []
                for j in range(i - self.context_size, i+self.context_size + 1):
                    if i == j:
                        continue
                    context.append(seq[j])
                self.targets.append(target)
                self.contexts.append(context)

    def __len__(self):
        return len(self.targets)

    def __getitem__(self, idx):
        data = (self.targets[idx], np.array(self.contexts[idx]))
        if self.skipgram:
            return data
        else:
            return data[::-1]

In [12]:
BATCH_SIZE = 64
CONTEXT_SIZE = 2
EMBEDDING_DIM = 128

In [13]:
en_embedding_dataset = EmbeddingDataset(en_train_sequences, CONTEXT_SIZE)
en_embedding_dataloader = DataLoader(
    en_embedding_dataset, shuffle=True, batch_size=BATCH_SIZE)

es_embedding_dataset = EmbeddingDataset(es_train_sequences, CONTEXT_SIZE)
es_embedding_dataloader = DataLoader(
    es_embedding_dataset, shuffle=True, batch_size=BATCH_SIZE)

In [14]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using Device: {device}")


Using Device: cuda


In [15]:
w2v_en = Word2Vec(len(vocab_en), CONTEXT_SIZE, embedding_dim=EMBEDDING_DIM)
w2v_en.to(device)

Word2Vec(
  (embedding): Embedding(12935, 128)
  (layers): Sequential(
    (0): Linear(in_features=512, out_features=128, bias=True)
    (1): ReLU()
    (2): Linear(in_features=128, out_features=12935, bias=True)
  )
)

In [16]:
def train(model:Word2Vec, dataloader:DataLoader, device:torch.device, n_epochs:int = 5) -> None:
    criterion = nn.NLLLoss()
    optimizer = optim.Adam(model.parameters())
    model.train()

    assert model.skipgram == dataloader.dataset.skipgram, "Mismatching Model and Data Formats"

    for e in range(n_epochs):
        loss_val = 0

        for X, y in dataloader:
            X = X.long().to(device)
            y = y.long().to(device)

            optimizer.zero_grad()
            output = model(X)
            loss = criterion(output, y)
            loss.backward()
            optimizer.step()

            loss_val += loss.item()

        print(f"Epoch {e+1}, Train Loss: {loss_val/len(dataloader)}")


In [17]:
train(w2v_en, en_embedding_dataloader, device, n_epochs=15)

Epoch 1, Train Loss: 5.406969225668374
Epoch 2, Train Loss: 4.578141788583999
Epoch 3, Train Loss: 4.2595872003723985
Epoch 4, Train Loss: 4.087196238814253
Epoch 5, Train Loss: 3.979096287079063
Epoch 6, Train Loss: 3.9039239155525203
Epoch 7, Train Loss: 3.8484516735894676
Epoch 8, Train Loss: 3.8049890902433496
Epoch 9, Train Loss: 3.7718125420013733
Epoch 10, Train Loss: 3.7453961081512723
Epoch 11, Train Loss: 3.721790452090392
Epoch 12, Train Loss: 3.706480112652099
Epoch 13, Train Loss: 3.693569019029431
Epoch 14, Train Loss: 3.6820282528196966
Epoch 15, Train Loss: 3.6725668499513215


In [18]:
w2v_es = Word2Vec(len(vocab_es), CONTEXT_SIZE, embedding_dim=EMBEDDING_DIM)
w2v_es.to(device)

Word2Vec(
  (embedding): Embedding(32689, 128)
  (layers): Sequential(
    (0): Linear(in_features=512, out_features=128, bias=True)
    (1): ReLU()
    (2): Linear(in_features=128, out_features=32689, bias=True)
  )
)

In [19]:
train(w2v_es, es_embedding_dataloader, device, n_epochs=15)

Epoch 1, Train Loss: 5.497272349184174
Epoch 2, Train Loss: 4.633380799621418
Epoch 3, Train Loss: 4.315388902910702
Epoch 4, Train Loss: 4.144333707244315
Epoch 5, Train Loss: 4.0398765696704935
Epoch 6, Train Loss: 3.9663001262855215
Epoch 7, Train Loss: 3.9121746915178846
Epoch 8, Train Loss: 3.8720273638966662
Epoch 9, Train Loss: 3.8412193288315204
Epoch 10, Train Loss: 3.816815501589763
Epoch 11, Train Loss: 3.797781426079923
Epoch 12, Train Loss: 3.7787266285733536
Epoch 13, Train Loss: 3.766381328991848
Epoch 14, Train Loss: 3.7603669885573745
Epoch 15, Train Loss: 3.7529972961053635


In [20]:
def get_sentence_embedding(model:Word2Vec, sentence, device, lang='en'):
    seq = vocab_en.sequencify(sentence) if lang == 'en' else vocab_es.sequencify(sentence)

    embeddings = np.zeros(model.embedding.embedding_dim)

    for idx in seq:
        idx = torch.tensor(idx).long().to(device)
        emb = model.embedding(idx).cpu().detach().numpy()
        embeddings += emb
    
    return embeddings / len(seq)

In [23]:
preds = []

for pairs in test_data['x']:
    s1 = pairs[0]
    s2 = pairs[1]
    v1 = get_sentence_embedding(w2v_en, s1, device)
    v2 = get_sentence_embedding(w2v_es, s2, device, lang='es')

    score = cosine(v1, v2) * 5
    preds.append(score)

In [24]:
pearson_score, _ = pearsonr(preds, test_data['y'])
print(f'Pearson Score for Word2Vec Model with Cosine Similarity: {pearson_score:.4f}')

Pearson Score for Word2Vec Model with Cosine Similarity: 0.0556


In [26]:
class STSDataset(Dataset):
    def __init__(self, vocab_en:Vocab, vocab_es:Vocab, data):
        self.sts_data = []
        for x, y in zip(data['x'], data['y']):
            s1, s2 = x
            s1, s2 = vocab_en.sequencify(s1, addEOSBOS=True), vocab_es.sequencify(s2, addEOSBOS=True)
            self.sts_data.append(((s1, s2), y))

    def __len__(self):
        return len(self.sts_data)

    def __getitem__(self, idx):
        return self.sts_data[idx]


In [27]:
ft_dataset = STSDataset(vocab_en, vocab_es, train_data)

In [27]:
cosine_ft_w2v_en = deepcopy(w2v_en)
cosine_ft_w2v_es = deepcopy(w2v_es)

In [28]:
def cosine_ft(model_en:Word2Vec, model_es:Word2Vec, device:torch.device, dataloader:DataLoader, n_epochs:int=5):
    criterion = nn.CosineSimilarity(dim=0)
    optimizer = optim.Adam(list(model_en.parameters()) + list(model_es.parameters()))

    for e in range(n_epochs):
        loss_val = 0

        for X, y in dataloader:
            s1, s2 = X[0]
            s1 = torch.tensor(s1).long().to(device)
            s2 = torch.tensor(s2).long().to(device)
            y = torch.tensor(y).float().to(device).squeeze(0)
            optimizer.zero_grad()

            emb1 = torch.mean(model_en.embedding(s1), dim=0)
            emb2 = torch.mean(model_es.embedding(s2), dim=0)

            sim = criterion(emb1, emb2) * 5
            loss = nn.MSELoss()(sim, y)
            loss.backward()
            optimizer.step()

            loss_val += loss.item()
        print(f"Epoch {e+1}, Train Loss: {loss_val/len(dataloader)}")


In [30]:
ft_dataloader = DataLoader(ft_dataset, batch_size=1, shuffle=True, collate_fn=lambda x: zip(*x))

In [31]:
cosine_ft(cosine_ft_w2v_en, cosine_ft_w2v_es, device, ft_dataloader, n_epochs=15)

Epoch 1, Train Loss: 10.470523593199633
Epoch 2, Train Loss: 4.038824695902157
Epoch 3, Train Loss: 2.3871585206719725
Epoch 4, Train Loss: 1.7057296704688507
Epoch 5, Train Loss: 1.3281767872885943
Epoch 6, Train Loss: 1.0680211524986984
Epoch 7, Train Loss: 0.8596332850730188
Epoch 8, Train Loss: 0.6861751152271408
Epoch 9, Train Loss: 0.5442517368970222
Epoch 10, Train Loss: 0.4306720170169178
Epoch 11, Train Loss: 0.34014435868559373
Epoch 12, Train Loss: 0.26942635434259504
Epoch 13, Train Loss: 0.2157809375968393
Epoch 14, Train Loss: 0.17400872588769292
Epoch 15, Train Loss: 0.14150360114059593


In [32]:
cosine_ft_preds = []

for pairs in test_data['x']:
    s1 = pairs[0]
    s2 = pairs[1]
    v1 = get_sentence_embedding(cosine_ft_w2v_en, s1, device)
    v2 = get_sentence_embedding(cosine_ft_w2v_es, s2, device)

    score = cosine(v1, v2) * 5
    cosine_ft_preds.append(score)


In [33]:
pearson_score, _ = pearsonr(cosine_ft_preds, test_data['y'])
print(f'Pearson Score for Word2Vec Model with Cosine Finetuning: {pearson_score:.4f}')

Pearson Score for Word2Vec Model with Cosine Finetuning: 0.0264


In [53]:
mlp_ft_w2v_en = deepcopy(w2v_en)
mlp_ft_w2v_es = deepcopy(w2v_es)

In [54]:
class ScoringHead(nn.Module):

    def __init__(self, w2v_en:Word2Vec, w2v_es:Word2Vec, input_dim:int):
        super(ScoringHead, self).__init__()
        self.w2v_en = w2v_en
        self.w2v_es = w2v_es
        self.input_dim = input_dim
        self.hidden_dim_1 = 2048
        self.hidden_dim_2 = 1024
        self.hidden_dim_3 = 512
        self.hidden_dim_4 = 256
        self.linear_stack = nn.Sequential(
            nn.Linear(self.input_dim, self.hidden_dim_1),
            nn.ReLU(),
            nn.Linear(self.hidden_dim_1, self.hidden_dim_2),
            nn.ReLU(),
            nn.Linear(self.hidden_dim_2, self.hidden_dim_3),
            nn.ReLU(),
            nn.Linear(self.hidden_dim_3, self.hidden_dim_4),
            nn.ReLU(),
            nn.Linear(self.hidden_dim_4, 1),
            nn.Sigmoid()
        )

    
    def forward(self, s1:torch.tensor, s2:torch.tensor):
        
        emb1 = torch.mean(self.w2v_en.embedding(s1), dim=0)
        emb2 = torch.mean(self.w2v_es.embedding(s2), dim=0)

        emb = torch.cat((emb1, emb2), dim=-1)
        # diff = torch.abs(emb1 - emb2)
        out = self.linear_stack(emb)
        return out * 5

In [55]:
scoring_head = ScoringHead(mlp_ft_w2v_en, mlp_ft_w2v_es, 2 * EMBEDDING_DIM)
scoring_head.to(device)

ScoringHead(
  (w2v_en): Word2Vec(
    (embedding): Embedding(12935, 128)
    (layers): Sequential(
      (0): Linear(in_features=512, out_features=128, bias=True)
      (1): ReLU()
      (2): Linear(in_features=128, out_features=12935, bias=True)
    )
  )
  (w2v_es): Word2Vec(
    (embedding): Embedding(32689, 128)
    (layers): Sequential(
      (0): Linear(in_features=512, out_features=128, bias=True)
      (1): ReLU()
      (2): Linear(in_features=128, out_features=32689, bias=True)
    )
  )
  (linear_stack): Sequential(
    (0): Linear(in_features=256, out_features=2048, bias=True)
    (1): ReLU()
    (2): Linear(in_features=2048, out_features=1024, bias=True)
    (3): ReLU()
    (4): Linear(in_features=1024, out_features=512, bias=True)
    (5): ReLU()
    (6): Linear(in_features=512, out_features=256, bias=True)
    (7): ReLU()
    (8): Linear(in_features=256, out_features=1, bias=True)
    (9): Sigmoid()
  )
)

In [56]:
def mlp_ft(model:ScoringHead, device:torch.device, dataloader:DataLoader, n_epochs=5):
    loss_fn = nn.MSELoss()
    optimizer = optim.Adam(model.parameters())
    model.train()


    for e in range(n_epochs):
        loss_val = 0
        for X, y in dataloader:
            s1, s2 = X[0]
            s1 = torch.tensor(s1).long().to(device)
            s2 = torch.tensor(s2).long().to(device)
            y = torch.tensor(y).float().to(device)

            optimizer.zero_grad()
            output = model(s1, s2)

            loss = loss_fn(output, y)
            loss.backward()
            optimizer.step()

            loss_val += loss.item()

        print(f"Epoch {e+1}, Train Loss: {loss_val/len(dataloader)}")



In [57]:
mlp_ft(scoring_head, device, ft_dataloader, n_epochs=15)

Epoch 1, Train Loss: 1.8015419692480092
Epoch 2, Train Loss: 1.3434802942550084
Epoch 3, Train Loss: 0.7439725593557464
Epoch 4, Train Loss: 0.37852614745298
Epoch 5, Train Loss: 0.3172271911095775
Epoch 6, Train Loss: 0.23393089628287006
Epoch 7, Train Loss: 0.18910121809238284
Epoch 8, Train Loss: 0.19297547649383928
Epoch 9, Train Loss: 0.16098641081626627
Epoch 10, Train Loss: 0.11604744388777243
Epoch 11, Train Loss: 0.10586673323077365
Epoch 12, Train Loss: 0.11498946858394639
Epoch 13, Train Loss: 0.08536283282378697
Epoch 14, Train Loss: 0.06902282667681682
Epoch 15, Train Loss: 0.06107542514372912


In [58]:
def mlp_score_predict(model, s1, s2):
    model.eval()
    with torch.no_grad():
        pred = model(s1, s2)

    return pred.cpu().detach().item()

In [34]:
mlp_ft_preds = []

for pairs in test_data['x']:
    s1 = pairs[0]
    s2 = pairs[1]
    s1, s2 = vocab_en.sequencify(s1, addEOSBOS=True), vocab_es.sequencify(s2, addEOSBOS=True)
    s1 = torch.tensor(s1).to(device)
    s2 = torch.tensor(s2).to(device)

    score = mlp_score_predict(scoring_head, s1, s2)

    mlp_ft_preds.append(score)


In [35]:
pearson_score, _ = pearsonr(mlp_ft_preds, test_data['y'])
print(f'Pearson Score for Word2Vec with MLP Scoring: {pearson_score:.4f}')


Pearson Score for Word2Vec with MLP Scoring: 0.0477


In [59]:
pearson_score, _ = pearsonr(mlp_ft_preds, test_data['y'])
print(f'Pearson Score for Word2Vec with MLP Scoring (Concat): {pearson_score:.4f}')

Pearson Score for Word2Vec with MLP Scoring (Concat): 0.0477


In [36]:
# torch.save(w2v, 'models/w2v.pt')
# torch.save(cosine_ft_w2v, 'models/cosine_ft_w2v.pt')
# torch.save(scoring_head, 'models/mlp_ft_scoringhead_w2v.pt')