In [1]:
import pickle
from scipy.stats import pearsonr

from src.Vocab import Vocab
from src.utils import cosine
from src.Word2Vec import Word2Vec
import numpy as np
from copy import deepcopy


In [2]:
# Load the training split; later cells read its 'x' (sentence pairs) and 'y' (scores) entries.
# NOTE(review): pickle.load can execute arbitrary code - only load files from a trusted source.
with open('data/train_data.pkl', 'rb') as f:
    train_data = pickle.load(f)


In [3]:
# Load the held-out split; test_data['x'] holds sentence pairs and test_data['y'] the gold scores
# used for the Pearson evaluations below.
# NOTE(review): pickle.load can execute arbitrary code - only load files from a trusted source.
with open('data/test_data.pkl', 'rb') as f:
    test_data = pickle.load(f)


In [4]:
# Load the base text corpus that is used to build the vocabulary and the embedding training sequences.
# NOTE(review): pickle.load can execute arbitrary code - only load files from a trusted source.
with open('data/base_text_data.pkl', 'rb') as f:
    sentences = pickle.load(f)


In [5]:
# Build the vocabulary from the base corpus with stopword removal disabled.
vocab = Vocab(sentences, remove_stopwords=False)


In [6]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from torch.utils.data import Dataset, DataLoader


In [7]:
# Convert every corpus sentence into a token-index sequence.
# addEOSBOS=True presumably adds sentence-boundary tokens - confirm in Vocab.sequencify.
train_sequences = [vocab.sequencify(sen, addEOSBOS=True) for sen in sentences]

In [8]:
class EmbeddingDataset(Dataset):
    """Sliding-window (target, context) examples built from token-index sequences.

    For every position that has ``context_size`` tokens available on both
    sides, the centre token becomes the target and the surrounding
    ``2 * context_size`` tokens become its context. Sequences shorter than
    ``2 * context_size + 1`` contribute no examples.

    Args:
        sequences: iterable of indexable token-index sequences.
        context_size: number of context tokens taken on each side of the target.
        skipgram: if True items are ``(target, context)``; otherwise (CBOW
            layout) items are ``(context, target)``.
    """

    def __init__(self, sequences, context_size, skipgram=False):
        self.skipgram = skipgram
        self.context_size = context_size

        self.contexts = []
        self.targets = []

        window = self.context_size
        for tokens in sequences:
            # Only centres with a complete window on both sides are used.
            for centre in range(window, len(tokens) - window):
                centre_token = tokens[centre]
                neighbours = [
                    tokens[pos]
                    for pos in range(centre - window, centre + window + 1)
                    if pos != centre
                ]
                self.targets.append(centre_token)
                self.contexts.append(neighbours)

    def __len__(self):
        """Number of (target, context) examples."""
        return len(self.targets)

    def __getitem__(self, idx):
        """Return one example, ordered according to ``self.skipgram``."""
        target = self.targets[idx]
        context = np.array(self.contexts[idx])
        if self.skipgram:
            return (target, context)
        return (context, target)

In [9]:
# Hyper-parameters shared by the cells below.
# Mini-batch size of the embedding DataLoader.
BATCH_SIZE = 64
# Number of context tokens taken on each side of the target token.
CONTEXT_SIZE = 2
# Width of each word vector; also passed to ScoringHead as its input size.
EMBEDDING_DIM = 128

In [10]:
# skipgram is left at its default (False), so each item is (context array, target index).
embedding_dataset = EmbeddingDataset(train_sequences, CONTEXT_SIZE)
# Shuffled mini-batches for embedding pre-training.
embedding_dataloader = DataLoader(
    embedding_dataset, shuffle=True, batch_size=BATCH_SIZE)


In [11]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using Device: {device}")


Using Device: cuda


In [12]:
# Instantiate the embedding model sized to the vocabulary.
w2v = Word2Vec(len(vocab), CONTEXT_SIZE, embedding_dim=EMBEDDING_DIM)
# Move the parameters to the selected device; as the cell's last expression this also
# displays the module summary.
w2v.to(device)

Word2Vec(
  (embedding): Embedding(12935, 128)
  (layers): Sequential(
    (0): Linear(in_features=512, out_features=128, bias=True)
    (1): ReLU()
    (2): Linear(in_features=128, out_features=12935, bias=True)
  )
)

In [13]:
def train(model:Word2Vec, dataloader:DataLoader, device:torch.device, n_epochs:int = 5) -> None:
    """Train ``model`` on ``dataloader`` and print the mean batch loss per epoch.

    Uses Adam with default settings and ``nn.NLLLoss``. NLLLoss expects
    log-probabilities, so the model's forward pass presumably ends in a
    log-softmax - confirm in ``Word2Vec``.

    Args:
        model: network to optimise; put into training mode by this call.
        dataloader: yields ``(inputs, labels)`` batches; its dataset must expose
            a ``skipgram`` flag equal to ``model.skipgram``.
        device: device every batch is moved to.
        n_epochs: number of full passes over ``dataloader``.

    Raises:
        AssertionError: if the model and dataset disagree on ``skipgram``.
    """
    criterion = nn.NLLLoss()
    optimizer = optim.Adam(model.parameters())
    model.train()

    assert model.skipgram == dataloader.dataset.skipgram, "Mismatching Model and Data Formats"

    for epoch in range(1, n_epochs + 1):
        running_loss = 0

        for inputs, labels in dataloader:
            inputs = inputs.long().to(device)
            labels = labels.long().to(device)

            optimizer.zero_grad()
            batch_loss = criterion(model(inputs), labels)
            batch_loss.backward()
            optimizer.step()

            # .item() detaches the scalar so no graph is kept alive across batches.
            running_loss += batch_loss.item()

        print(f"Epoch {epoch}, Train Loss: {running_loss/len(dataloader)}")


In [14]:
# Pre-train the embeddings for 15 epochs (the function's default is 5).
train(w2v, embedding_dataloader, device, n_epochs=15)

Epoch 1, Train Loss: 5.396146960933049
Epoch 2, Train Loss: 4.571523395593045
Epoch 3, Train Loss: 4.247083069079161
Epoch 4, Train Loss: 4.071893863003414
Epoch 5, Train Loss: 3.959844770467074
Epoch 6, Train Loss: 3.882691705483687
Epoch 7, Train Loss: 3.8219380388846553
Epoch 8, Train Loss: 3.778209960203874
Epoch 9, Train Loss: 3.7432921994739856
Epoch 10, Train Loss: 3.713076945465265
Epoch 11, Train Loss: 3.6900313506466156
Epoch 12, Train Loss: 3.6698004051783704
Epoch 13, Train Loss: 3.6548119341181446
Epoch 14, Train Loss: 3.6416723964475066
Epoch 15, Train Loss: 3.6345605269481847


In [15]:
def get_sentence_embedding(model:Word2Vec, sentence, device):
    """Return the mean word embedding of ``sentence`` as a float64 NumPy vector.

    The sentence is converted to token indices with the notebook-level
    ``vocab`` and all indices are looked up in ``model.embedding`` in a single
    batched call (previously one lookup and one device-to-host copy per token).

    Args:
        model: model exposing an ``embedding`` layer with ``embedding_dim``.
        sentence: raw sentence accepted by ``vocab.sequencify``.
        device: device on which ``model`` lives.

    Returns:
        ``np.ndarray`` of shape ``(embedding_dim,)``. A sentence that maps to
        no tokens yields the zero vector instead of the NaN vector a 0/0
        division would produce.
    """
    seq = vocab.sequencify(sentence)

    # Guard the empty case explicitly: there is nothing to average.
    if len(seq) == 0:
        return np.zeros(model.embedding.embedding_dim)

    # Inference only - no autograd graph is needed.
    with torch.no_grad():
        indices = torch.tensor(seq).long().to(device)
        token_embeddings = model.embedding(indices).cpu().numpy()

    # Accumulate in float64, matching the original np.zeros accumulator.
    return token_embeddings.astype(np.float64).sum(axis=0) / len(seq)


In [16]:
# Baseline STS predictions from the pre-trained (not fine-tuned) embeddings.
preds = []

for pairs in test_data['x']:
    s1 = pairs[0]
    s2 = pairs[1]
    # Each sentence is represented by the mean of its word embeddings.
    v1 = get_sentence_embedding(w2v, s1, device)
    v2 = get_sentence_embedding(w2v, s2, device)

    # Cosine similarity multiplied by 5 - presumably to match the gold score range; confirm.
    score = cosine(v1, v2) * 5
    preds.append(score)

In [17]:
# Pearson correlation between baseline predictions and gold scores; the second return value is discarded.
pearson_score, _ = pearsonr(preds, test_data['y'])
print(f'Pearson Score for Word2Vec Model with Cosine Similarity: {pearson_score:.4f}')

Pearson Score for Word2Vec Model with Cosine Similarity: 0.6527


In [18]:
def sim_words(model:Word2Vec, word, device, top_k:int = 10):
    """Return the ``top_k`` vocabulary words most similar to ``word``.

    Similarity is the project ``cosine`` helper applied to rows of
    ``model.embedding``. Every vocabulary word is looked up in one batched
    call instead of one device round-trip per word. The query word itself is
    part of the vocabulary scan and is therefore included in the ranking.

    Args:
        model: model exposing an ``embedding`` layer; switched to eval mode.
        word: query word, resolved through the notebook-level ``vocab``.
        device: device on which ``model`` lives.
        top_k: number of neighbours to return (default 10, the previous
            hard-coded value).

    Returns:
        List of ``(word, similarity)`` tuples sorted by descending similarity.
    """
    model.eval()

    # Materialise the vocabulary once so words and indices stay aligned.
    vocab_words = list(vocab)
    vocab_indices = [vocab.get_idx(candidate) for candidate in vocab_words]

    with torch.no_grad():
        query_idx = torch.tensor(vocab.get_idx(word)).long().to(device)
        src = model.embedding(query_idx).cpu().numpy()

        all_idx = torch.tensor(vocab_indices).long().to(device)
        candidates = model.embedding(all_idx).cpu().numpy()

    # A distinct loop name avoids shadowing the `word` argument.
    word_pairs = [
        (candidate, cosine(trg, src))
        for candidate, trg in zip(vocab_words, candidates)
    ]

    word_pairs = sorted(word_pairs, key=lambda x: -x[1])
    return word_pairs[:top_k]

In [19]:
# Qualitative sanity check: nearest neighbours of 'boy' in the pre-trained embedding space.
sim_words(w2v, 'boy', device)

[('boy', 1.0000000596046448),
 ('girl', 0.8020584285259247),
 ('man', 0.7542853951454163),
 ('child', 0.7368017733097076),
 ('person', 0.7338158488273621),
 ('woman', 0.7258056551218033),
 ('snowboarder', 0.7229256331920624),
 ('player', 0.7159172147512436),
 ('baby', 0.7051761299371719),
 ('kid', 0.7037726491689682)]

In [20]:
class STSDataset(Dataset):
    """Sentence-pair similarity examples stored as token-index sequences.

    Each item is ``((seq1, seq2), score)``, where the two sequences come from
    ``vocab.sequencify(..., addEOSBOS=True)`` and ``score`` is the matching
    entry of ``data['y']``.

    Args:
        vocab: vocabulary used to convert sentences to index sequences.
        data: mapping with parallel entries ``'x'`` (sentence pairs) and
            ``'y'`` (scores).
    """

    def __init__(self, vocab:Vocab, data):
        self.sts_data = [
            (
                (
                    vocab.sequencify(first, addEOSBOS=True),
                    vocab.sequencify(second, addEOSBOS=True),
                ),
                label,
            )
            for (first, second), label in zip(data['x'], data['y'])
        ]

    def __len__(self):
        """Number of sentence pairs."""
        return len(self.sts_data)

    def __getitem__(self, idx):
        """Return ``((seq1, seq2), score)`` for the pair at ``idx``."""
        return self.sts_data[idx]


In [21]:
# Sentence-pair dataset built from the training split; used by both fine-tuning stages below.
ft_dataset = STSDataset(vocab, train_data)

In [22]:
# Fine-tune an independent copy so the pre-trained w2v weights are not modified.
cosine_ft_w2v = deepcopy(w2v)

In [23]:
def cosine_ft(model:Word2Vec, device:torch.device, dataloader:DataLoader, n_epochs:int=5):
    """Fine-tune ``model.embedding`` so scaled cosine similarity matches gold scores.

    For every sentence pair the mean word embeddings are compared with cosine
    similarity, multiplied by 5, and regressed onto the gold score with MSE.

    Every pair of a batch is scored. The previous version read only ``X[0]``,
    so any ``batch_size > 1`` silently dropped the remaining pairs and
    broadcast all targets against a single prediction. With ``batch_size=1``
    the loss and gradients are unchanged.

    Args:
        model: model exposing an ``embedding`` layer; put into training mode.
        device: device on which ``model`` lives.
        dataloader: yields ``(X, y)`` where ``X`` is a sequence of
            ``(seq1, seq2)`` index lists and ``y`` the matching scores (the
            layout produced by ``collate_fn=lambda x: zip(*x)``).
        n_epochs: number of passes over ``dataloader``.
    """
    similarity = nn.CosineSimilarity(dim=0)
    # Built once instead of on every optimisation step.
    loss_fn = nn.MSELoss()
    optimizer = optim.Adam(model.parameters())
    # Consistent with train() and mlp_ft(): make the training mode explicit.
    model.train()

    for e in range(n_epochs):
        loss_val = 0

        for X, y in dataloader:
            targets = torch.tensor(y).float().to(device).reshape(-1)
            optimizer.zero_grad()

            sims = []
            for s1, s2 in X:
                s1 = torch.tensor(s1).long().to(device)
                s2 = torch.tensor(s2).long().to(device)

                # Sentence vector = mean of its token embeddings.
                emb1 = torch.mean(model.embedding(s1), dim=0)
                emb2 = torch.mean(model.embedding(s2), dim=0)
                sims.append(similarity(emb1, emb2))

            # One prediction per pair, scaled by the same factor of 5 used at evaluation.
            preds = torch.stack(sims) * 5
            loss = loss_fn(preds, targets)
            loss.backward()
            optimizer.step()

            loss_val += loss.item()
        print(f"Epoch {e+1}, Train Loss: {loss_val/len(dataloader)}")


In [24]:
# One pair per batch; sentences are index lists of differing length, which presumably is why
# the default stacking collate is bypassed.
# The collate_fn transposes the batch: [((s1, s2), y)] -> ((s1, s2),), (y,).
ft_dataloader = DataLoader(ft_dataset, batch_size=1, shuffle=True, collate_fn=lambda x: zip(*x))

In [25]:
# Fine-tune the copied embeddings against the gold similarity scores for 15 epochs.
cosine_ft(cosine_ft_w2v, device, ft_dataloader, n_epochs=15)

Epoch 1, Train Loss: 0.6823128048377483
Epoch 2, Train Loss: 0.45681999826893227
Epoch 3, Train Loss: 0.3302942856766977
Epoch 4, Train Loss: 0.25560188880442875
Epoch 5, Train Loss: 0.20488578198731108
Epoch 6, Train Loss: 0.1688497619827834
Epoch 7, Train Loss: 0.14156667873783454
Epoch 8, Train Loss: 0.12002876869153267
Epoch 9, Train Loss: 0.10307753202717057
Epoch 10, Train Loss: 0.08999764502303989
Epoch 11, Train Loss: 0.0781947272629363
Epoch 12, Train Loss: 0.06957320787498535
Epoch 13, Train Loss: 0.06215618388200053
Epoch 14, Train Loss: 0.05620273006193558
Epoch 15, Train Loss: 0.05107094679864024


In [26]:
# Same scoring procedure as the baseline, but using the cosine-fine-tuned embeddings.
# NOTE(review): fine-tuning sequencified with addEOSBOS=True (STSDataset), while
# get_sentence_embedding calls vocab.sequencify without that argument - confirm both
# produce the same token sequences.
cosine_ft_preds = []

for pairs in test_data['x']:
    s1 = pairs[0]
    s2 = pairs[1]
    v1 = get_sentence_embedding(cosine_ft_w2v, s1, device)
    v2 = get_sentence_embedding(cosine_ft_w2v, s2, device)

    # Same factor of 5 that cosine_ft applies to the similarity during training.
    score = cosine(v1, v2) * 5
    cosine_ft_preds.append(score)


In [27]:
# Pearson correlation for the cosine-fine-tuned model; overwrites the baseline pearson_score variable.
pearson_score, _ = pearsonr(cosine_ft_preds, test_data['y'])
print(f'Pearson Score for Word2Vec Model with Cosine Finetuning: {pearson_score:.4f}')

Pearson Score for Word2Vec Model with Cosine Finetuning: 0.8554


In [28]:
# Second independent copy of the pre-trained model, used as the encoder inside the MLP scoring head.
mlp_ft_w2v = deepcopy(w2v)

In [29]:
class ScoringHead(nn.Module):
    """MLP regressor that scores a sentence pair from its embedding difference.

    Each sentence is encoded as the mean of its word embeddings from
    ``w2v.embedding``. The element-wise absolute difference of the two vectors
    goes through a five-layer ReLU MLP ending in a sigmoid, and the result is
    multiplied by 5.

    Args:
        w2v: model providing the ``embedding`` layer; registered as a
            sub-module, so its parameters are included in ``parameters()``.
        input_dim: width of the difference vector fed to the first linear layer.
    """

    def __init__(self, w2v:Word2Vec, input_dim:int):
        super().__init__()
        self.w2v = w2v
        self.input_dim = input_dim
        self.hidden_dim_1, self.hidden_dim_2 = 2048, 1024
        self.hidden_dim_3, self.hidden_dim_4 = 512, 256

        # Layer widths from the input through the last hidden layer.
        widths = [
            self.input_dim,
            self.hidden_dim_1,
            self.hidden_dim_2,
            self.hidden_dim_3,
            self.hidden_dim_4,
        ]
        stack = []
        for fan_in, fan_out in zip(widths, widths[1:]):
            stack.append(nn.Linear(fan_in, fan_out))
            stack.append(nn.ReLU())
        # Single output unit squashed into (0, 1); forward rescales it.
        stack.append(nn.Linear(self.hidden_dim_4, 1))
        stack.append(nn.Sigmoid())
        self.linear_stack = nn.Sequential(*stack)

    def forward(self, s1:torch.tensor, s2:torch.tensor):
        """Return the predicted score for token-index tensors ``s1`` and ``s2``."""
        first, second = (
            self.w2v.embedding(tokens).mean(dim=0) for tokens in (s1, s2)
        )

        # Alternative kept from the original: emb = torch.cat((emb1, emb2), dim=-1)
        gap = (first - second).abs()
        return 5 * self.linear_stack(gap)

In [30]:
# Wrap the copied model in a ScoringHead whose input width equals the embedding dimension
# (forward feeds it the absolute difference of two sentence vectors).
scoring_head = ScoringHead(mlp_ft_w2v, EMBEDDING_DIM)
# Move all parameters to the device; as the last expression this also displays the module summary.
scoring_head.to(device)

ScoringHead(
  (w2v): Word2Vec(
    (embedding): Embedding(12935, 128)
    (layers): Sequential(
      (0): Linear(in_features=512, out_features=128, bias=True)
      (1): ReLU()
      (2): Linear(in_features=128, out_features=12935, bias=True)
    )
  )
  (linear_stack): Sequential(
    (0): Linear(in_features=128, out_features=2048, bias=True)
    (1): ReLU()
    (2): Linear(in_features=2048, out_features=1024, bias=True)
    (3): ReLU()
    (4): Linear(in_features=1024, out_features=512, bias=True)
    (5): ReLU()
    (6): Linear(in_features=512, out_features=256, bias=True)
    (7): ReLU()
    (8): Linear(in_features=256, out_features=1, bias=True)
    (9): Sigmoid()
  )
)

In [31]:
def mlp_ft(model:ScoringHead, device:torch.device, dataloader:DataLoader, n_epochs=5):
    """Train the scoring head (and its wrapped embeddings) with MSE on gold scores.

    Every pair of a batch is scored. The previous version read only ``X[0]``,
    so any ``batch_size > 1`` silently dropped the remaining pairs and
    broadcast all targets against a single prediction. With ``batch_size=1``
    the loss and gradients are unchanged.

    Args:
        model: scoring model called as ``model(seq1, seq2)``; put into
            training mode.
        device: device on which ``model`` lives.
        dataloader: yields ``(X, y)`` where ``X`` is a sequence of
            ``(seq1, seq2)`` index lists and ``y`` the matching scores (the
            layout produced by ``collate_fn=lambda x: zip(*x)``).
        n_epochs: number of passes over ``dataloader``.
    """
    loss_fn = nn.MSELoss()
    optimizer = optim.Adam(model.parameters())
    model.train()

    for e in range(n_epochs):
        loss_val = 0
        for X, y in dataloader:
            targets = torch.tensor(y).float().to(device).reshape(-1)

            optimizer.zero_grad()

            outputs = []
            for s1, s2 in X:
                s1 = torch.tensor(s1).long().to(device)
                s2 = torch.tensor(s2).long().to(device)
                # Flatten so the per-pair predictions concatenate into one vector.
                outputs.append(model(s1, s2).reshape(-1))
            output = torch.cat(outputs)

            loss = loss_fn(output, targets)
            loss.backward()
            optimizer.step()

            loss_val += loss.item()

        print(f"Epoch {e+1}, Train Loss: {loss_val/len(dataloader)}")



In [32]:
# Train the MLP scoring head together with its copied embeddings for 15 epochs.
mlp_ft(scoring_head, device, ft_dataloader, n_epochs=15)

Epoch 1, Train Loss: 0.8144275890690776
Epoch 2, Train Loss: 0.49398384255347133
Epoch 3, Train Loss: 0.2955798331155108
Epoch 4, Train Loss: 0.19108123085568932
Epoch 5, Train Loss: 0.1362320179472898
Epoch 6, Train Loss: 0.11129868440946206
Epoch 7, Train Loss: 0.09715406092106922
Epoch 8, Train Loss: 0.08442426689808673
Epoch 9, Train Loss: 0.0777957329676715
Epoch 10, Train Loss: 0.07426489540505626
Epoch 11, Train Loss: 0.06806227564148051
Epoch 12, Train Loss: 0.0630297456150563
Epoch 13, Train Loss: 0.06190243160014222
Epoch 14, Train Loss: 0.05616168147227822
Epoch 15, Train Loss: 0.055930369491746644


In [33]:
def mlp_score_predict(model, s1, s2):
    """Score one sentence pair with ``model`` and return a Python float.

    The model is switched to eval mode (and left there) and called without
    gradient tracking. Its output must hold exactly one element.
    """
    model.eval()
    with torch.no_grad():
        return model(s1, s2).item()

In [34]:
# Test-set predictions from the trained MLP scoring head.
mlp_ft_preds = []

for pairs in test_data['x']:
    s1 = pairs[0]
    s2 = pairs[1]
    # Same addEOSBOS=True setting that STSDataset uses for the fine-tuning data.
    s1, s2 = vocab.sequencify(s1, addEOSBOS=True), vocab.sequencify(s2, addEOSBOS=True)
    s1 = torch.tensor(s1).to(device)
    s2 = torch.tensor(s2).to(device)

    # The head already multiplies its output by 5, so no extra scaling is applied here.
    score = mlp_score_predict(scoring_head, s1, s2)

    mlp_ft_preds.append(score)


In [35]:
# Pearson correlation for the MLP scoring head; overwrites the previous pearson_score variable.
pearson_score, _ = pearsonr(mlp_ft_preds, test_data['y'])
print(f'Pearson Score for Word2Vec with MLP Scoring: {pearson_score:.4f}')


Pearson Score for Word2Vec with MLP Scoring: 0.7953


In [36]:
# torch.save(w2v, 'models/w2v.pt')
# torch.save(cosine_ft_w2v, 'models/cosine_ft_w2v.pt')
# torch.save(scoring_head, 'models/mlp_ft_scoringhead_w2v.pt')