In [1]:
import pickle
from scipy.stats import pearsonr

from src.Vocab import Vocab
from src.TFIDFVectorizer import TFIDFVectorizer
from src.utils import cosine
import numpy as np

In [2]:
# Load the pickled training split; later cells read its 'x' and 'y' entries.
# NOTE: pickle.load can execute arbitrary code, so only open trusted files.
with open('data/train_data.pkl', 'rb') as f:
    train_data = pickle.load(f)

In [3]:
# Load the pickled test split; later cells read its 'x' and 'y' entries.
# NOTE: pickle.load can execute arbitrary code, so only open trusted files.
with open('data/test_data.pkl', 'rb') as f:
    test_data = pickle.load(f)


In [4]:
# Flatten the training pairs into one list of sentences, keeping the
# (first, second) order of every pair. The two-name unpacking still
# rejects any entry that is not exactly a pair.
train_sentences = [
    sentence
    for first, second in train_data['x']
    for sentence in (first, second)
]

In [5]:
# Build the vocabulary from the training sentences only, with stop-word
# removal switched on.
vocab = Vocab(train_sentences, remove_stopwords=True)

In [6]:
# Fit the TF-IDF vectorizer on the training sentences only; the test pairs
# are transformed later with this already-fitted instance.
vec = TFIDFVectorizer(vocab)
vec.fit(train_sentences)

In [72]:
# Baseline: score every test pair by the cosine similarity of its two
# TF-IDF vectors.
preds = []

for sentence_pair in test_data['x']:
    vectors = vec.transform(sentence_pair)
    # The factor 5 presumably maps the similarity onto the 0-5 gold score
    # scale (TODO confirm); Pearson correlation is unaffected by it.
    preds.append(cosine(vectors[0], vectors[1]) * 5)

In [74]:
# Pearson correlation between the cosine-baseline predictions and the gold
# test scores. The result is indexed rather than unpacked into `_`, because
# assigning `_` at notebook top level shadows IPython's last-output variable.
pearson_score = pearsonr(preds, test_data['y'])[0]
print(f'Pearson Score for TFIDF Model with Cosine Similarity: {pearson_score:.4f}')


Pearson Score for TFIDF Model with Cosine Similarity: 0.6859


In [75]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from torch.utils.data import Dataset, DataLoader

In [76]:
class STSDataset(Dataset):
    """Dataset of (feature, score) examples built from sentence pairs.

    Each entry of ``data['x']`` is passed to ``tfidf_vectorizer.transform``;
    the feature is the element-wise absolute difference between the first
    and second items of the result. It is stored together with the matching
    entry of ``data['y']``. All examples are computed eagerly in
    ``__init__`` and kept in ``self.sts_data``.
    """

    def __init__(self, tfidf_vectorizer, data):
        self.sts_data = [
            (self._abs_difference(tfidf_vectorizer.transform(sentences)), score)
            for sentences, score in zip(data['x'], data['y'])
        ]

    @staticmethod
    def _abs_difference(vectors):
        """Return |vectors[0] - vectors[1]| element-wise."""
        return np.abs(vectors[0] - vectors[1])

    def __len__(self):
        """Number of stored examples."""
        return len(self.sts_data)

    def __getitem__(self, idx):
        """Return the (feature, score) tuple stored at position ``idx``."""
        return self.sts_data[idx]

In [77]:
# Featurise both splits with the same vectorizer, which was fitted on the
# training sentences only.
train_dataset = STSDataset(vec, train_data)
test_dataset = STSDataset(vec, test_data)

In [113]:
# Mini-batch size shared by the train and test DataLoaders.
BATCH_SIZE = 32

In [114]:
# Only the training loader shuffles; the test loader keeps dataset order so
# that predictions stay aligned with test_data['y'].
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

In [115]:
class ScoringHead(nn.Module):
    """Feed-forward regressor that turns a feature vector into one score.

    Architecture: four hidden ``Linear`` + ``ReLU`` blocks of widths
    2048, 1024, 512 and 256, followed by a single-unit ``Linear`` and a
    ``Sigmoid``. The sigmoid output is multiplied by 5 in ``forward``, so
    every score lies between 0 and 5.

    Args:
        input_dim: size of the last dimension of the input features.
    """

    def __init__(self, input_dim):
        super().__init__()
        self.input_dim = input_dim
        self.hidden_dim_1, self.hidden_dim_2 = 2048, 1024
        self.hidden_dim_3, self.hidden_dim_4 = 512, 256

        widths = [
            self.input_dim,
            self.hidden_dim_1,
            self.hidden_dim_2,
            self.hidden_dim_3,
            self.hidden_dim_4,
        ]
        # Layers are created in the same order as a hand-written stack, so
        # sub-module names ("0" .. "9") and initialisation order match.
        layers = []
        for fan_in, fan_out in zip(widths, widths[1:]):
            layers.append(nn.Linear(fan_in, fan_out))
            layers.append(nn.ReLU())
        layers.append(nn.Linear(self.hidden_dim_4, 1))
        layers.append(nn.Sigmoid())
        self.linear_stack = nn.Sequential(*layers)

    def forward(self, x):
        """Return one score per row of ``x`` as a 1-D tensor of shape (batch,)."""
        unit_scores = self.linear_stack(x)
        # Drop the trailing singleton dimension, then rescale to 0-5.
        return unit_scores.squeeze(1) * 5

In [116]:
# Prefer the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using Device: {device}")

Using Device: cuda


In [117]:
# The network input width is taken from the vocabulary size.
# Assumes each TF-IDF vector has one entry per vocabulary item (TODO confirm).
input_dim = len(vec.vocab)

# NOTE(review): no random seed is set in the visible cells before the weights
# are initialised, so training results may differ from run to run.
scoring_head = ScoringHead(input_dim)
# As the last expression of the cell, this also echoes the module structure.
scoring_head.to(device)

ScoringHead(
  (linear_stack): Sequential(
    (0): Linear(in_features=1258, out_features=2048, bias=True)
    (1): ReLU()
    (2): Linear(in_features=2048, out_features=1024, bias=True)
    (3): ReLU()
    (4): Linear(in_features=1024, out_features=512, bias=True)
    (5): ReLU()
    (6): Linear(in_features=512, out_features=256, bias=True)
    (7): ReLU()
    (8): Linear(in_features=256, out_features=1, bias=True)
    (9): Sigmoid()
  )
)

In [118]:
def train(model, device, n_epochs=5, dataloader=None, lr=0.01):
    """Train ``model`` with mini-batch SGD on a mean-squared-error objective.

    Prints the mean batch loss after every epoch and returns nothing. The
    model is left in training mode.

    Args:
        model: ``nn.Module`` mapping a float feature batch to one prediction
            per row. It is expected to already live on ``device``.
        device: device every batch is moved to before the forward pass.
        n_epochs: number of full passes over the data.
        dataloader: sized iterable of ``(X, y)`` batches. When omitted, the
            notebook-level ``train_dataloader`` is used, so existing calls
            keep working unchanged.
        lr: SGD learning rate.

    Raises:
        ValueError: if the dataloader contains no batches (the mean loss
            would otherwise be a division by zero).
    """
    if dataloader is None:
        # Backward-compatible fallback to the notebook global.
        dataloader = train_dataloader

    n_batches = len(dataloader)
    if n_batches == 0:
        raise ValueError("train() needs a dataloader with at least one batch")

    loss_fn = nn.MSELoss()
    optimizer = optim.SGD(model.parameters(), lr=lr)
    model.train()
    for e in range(n_epochs):
        loss_val = 0.0
        for X, y in dataloader:
            # Cast explicitly: features/targets may arrive as float64 (or
            # integer) tensors, while the model weights are float32.
            X = X.float().to(device)
            y = y.float().to(device)
            optimizer.zero_grad()

            output = model(X)
            loss = loss_fn(output, y)
            loss.backward()
            optimizer.step()

            loss_val += loss.item()

        print(f"Epoch {e+1}, Train Loss: {loss_val/n_batches}")

In [119]:
# Train for 40 epochs (overriding the default of 5); the mean loss of each
# epoch is printed as training progresses.
train(scoring_head, device, n_epochs=40)

Epoch 1, Train Loss: 1.1516302535931269
Epoch 2, Train Loss: 0.9949590548872947
Epoch 3, Train Loss: 0.8742782056331635
Epoch 4, Train Loss: 0.7225486196577549
Epoch 5, Train Loss: 0.5587408152719339
Epoch 6, Train Loss: 0.43546702762444817
Epoch 7, Train Loss: 0.36149752624332904
Epoch 8, Train Loss: 0.31689101743201414
Epoch 9, Train Loss: 0.2881363751987616
Epoch 10, Train Loss: 0.26263212809960046
Epoch 11, Train Loss: 0.24338593557476998
Epoch 12, Train Loss: 0.22946036296586195
Epoch 13, Train Loss: 0.220094510478278
Epoch 14, Train Loss: 0.21370398631940285
Epoch 15, Train Loss: 0.20732189025729894
Epoch 16, Train Loss: 0.19873877341548601
Epoch 17, Train Loss: 0.19909343048930167
Epoch 18, Train Loss: 0.1896939021224777
Epoch 19, Train Loss: 0.18519287910312415
Epoch 20, Train Loss: 0.18495314220587414
Epoch 21, Train Loss: 0.18025015868867436
Epoch 22, Train Loss: 0.17905330744882425
Epoch 23, Train Loss: 0.17612534655878942
Epoch 24, Train Loss: 0.17305853419626752
Epoch 25, 

In [120]:
def predict(model, device, dataloader=None):
    """Run ``model`` over every batch and return all predictions on the CPU.

    The model is switched to evaluation mode and gradients are disabled for
    the whole pass.

    Args:
        model: ``nn.Module`` mapping a float feature batch to predictions.
            It is expected to already live on ``device``.
        device: device every batch is moved to before the forward pass.
        dataloader: iterable of ``(X, y)`` batches; the targets are ignored.
            When omitted, the notebook-level ``test_dataloader`` is used, so
            existing calls keep working unchanged.

    Returns:
        A CPU tensor with the per-batch outputs concatenated along dim 0, in
        iteration order. An empty tensor is returned when there are no batches.
    """
    if dataloader is None:
        # Backward-compatible fallback to the notebook global.
        dataloader = test_dataloader

    model.eval()
    outputs = []
    with torch.no_grad():
        for X, _ in dataloader:
            X = X.float().to(device)
            out = model(X)
            # Move each batch off the accelerator right away so results can
            # be handed to NumPy/SciPy afterwards.
            outputs.append(out.cpu())

    if not outputs:
        # torch.cat rejects an empty list; return an empty result instead.
        return torch.empty(0)
    return torch.cat(outputs)

In [121]:
# MLP predictions for the test split.
# NOTE: this rebinds `preds`, which earlier held the cosine-baseline scores.
preds = predict(scoring_head, device)

In [122]:
# Pearson correlation between the MLP predictions and the gold test scores.
# The result is indexed rather than unpacked into `_`, because assigning `_`
# at notebook top level shadows IPython's last-output variable.
pearson_score = pearsonr(preds, test_data['y'])[0]
print(f'Pearson Score for TFIDF Model with MLP Scoring: {pearson_score:.4f}')


Pearson Score for TFIDF Model with MLP Scoring: 0.7807
