<h1> Doc2Vec embeddings </h1>
<p>Doc2Vec model is trained to generate representative embeddings of sentences and with these embeddings following approaches are applied to generate STS scores:</p>
<li>Normalized cosine similarity score </li>
<li>BiLSTM Regression neural network model</li>
<li>BiGRU Regression neural network model</li>

In [1]:
import pandas as pd
import numpy as np
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from scipy import spatial
from scipy.stats import pearsonr
import torch
import torch.nn as nn
import torch.utils.data as data
from sklearn.linear_model import LinearRegression
import logging
import warnings
import sys

logging.disable(logging.WARNING)
warnings.filterwarnings('ignore')
np.set_printoptions(threshold=sys.maxsize)

In [2]:
def pearson_corr(y_true, y_pred):
    corr, _ = pearsonr(y_true, y_pred)
    return corr

## Mono-Lingual Semantic Similarity

In [3]:
train_data = pd.read_csv('../utils2/data/train.csv')
val_data = pd.read_csv('../utils2/data/val.csv')
test_data = pd.read_csv('../utils2/data/test.csv')

train_data['s1'] = train_data['s1'].apply(eval)
train_data['s2'] = train_data['s2'].apply(eval)
val_data['s1'] = val_data['s1'].apply(eval)
val_data['s2'] = val_data['s2'].apply(eval)
test_data['s1'] = test_data['s1'].apply(eval)
test_data['s2'] = test_data['s2'].apply(eval)

In [4]:
total_sents_unk = list(train_data['s1'])
total_sents_unk.extend(list(train_data['s2']))
total_sents_unk

[['sudan', 'block', 'youtub', 'antiislam', 'film'],
 ['man', 'ride', 'white', 'hors'],
 ['mr',
  'mors',
  'charg',
  'assault',
  'mr',
  'darvish',
  'charg',
  'file',
  'fals',
  'report'],
 ['girl', 'play', 'pile', 'color', 'ball'],
 ['person', 'black', 'jacket', 'trick', 'motorbik'],
 ['man', 'woman', 'drive', 'street', 'jeep'],
 ['girl', 'ride', 'hors'],
 ['man', 'elegantli', 'dress', 'black', 'wear', 'elabor', 'black', 'mask'],
 ['woman', 'snowboard', 'rail', 'snow'],
 ['cluster', 'four', 'brown', 'dog', 'play', 'field', 'brown', 'grass'],
 ['two', 'men', 'talk'],
 ['urg',
  'patienc',
  'american',
  'eager',
  'servic',
  'intend',
  'block',
  'num',
  'percent',
  'telemarket',
  'call'],
 ['black',
  'white',
  'photo',
  'live',
  'room',
  'larg',
  'window',
  'sofa',
  'chair'],
 ['greec', 'bond', 'exchang', 'largest', 'debt', 'restructur', 'histori'],
 ['peopl', 'sit', 'bench', 'front', 'restaur'],
 ['death', 'toll', 'philippin', 'earthquak', 'rise', 'num'],
 ['two', 

In [5]:
documents = []
for idx, sent in enumerate(total_sents_unk):
    documents.append(TaggedDocument(sent, [idx]))

In [6]:
model = Doc2Vec(documents, vector_size=25, window=6, min_count=1, workers=1, epochs=30, alpha=0.1, min_alpha=0.001, hs=1)

In [7]:
model.random.seed(42)
train_data['s1'] = train_data['s1'].apply(lambda x: model.infer_vector(x))
train_data['s2'] = train_data['s2'].apply(lambda x: model.infer_vector(x))

In [9]:
y_pred = train_data.apply(lambda x: (1 - spatial.distance.cosine(x['s1'], x['s2'])+1)/2, axis=1)
y_train = train_data['score']
corr = pearson_corr(y_train, y_pred)
print("Pearson correlation coefficient: {:.2f}".format(corr))

Pearson correlation coefficient: 0.47


In [10]:
val_data['s1'] = val_data['s1'].apply(lambda x: model.infer_vector(x))
val_data['s2'] = val_data['s2'].apply(lambda x: model.infer_vector(x))

In [11]:
y_pred = val_data.apply(lambda x: (1 - spatial.distance.cosine(x['s1'], x['s2'])+1)/2, axis=1)
y_train = val_data['score']
corr = pearson_corr(y_train, y_pred)
print("Pearson correlation coefficient: {:.2f}".format(corr))

Pearson correlation coefficient: 0.56


In [12]:
test_data['s1'] = test_data['s1'].apply(lambda x: model.infer_vector(x))
test_data['s2'] = test_data['s2'].apply(lambda x: model.infer_vector(x))

In [13]:
y_pred = test_data.apply(lambda x: ((1 - spatial.distance.cosine(x['s1'], x['s2']))+1)*2.5, axis=1)
y_train = test_data['score']
corr = pearson_corr(y_train, y_pred)
print("Pearson correlation coefficient: {:.2f}".format(corr))

Pearson correlation coefficient: 0.47


### BiLSTM Model

In [14]:
train_embeddings1 = torch.Tensor(list(train_data['s1']))
train_embeddings2 = torch.Tensor(list(train_data['s2']))
train_score = torch.Tensor(list(train_data['score']))

val_embeddings1 = torch.Tensor(list(val_data['s1']))
val_embeddings2 = torch.Tensor(list(val_data['s2']))
val_score = torch.Tensor(list(val_data['score']))

test_embeddings1 = torch.Tensor(list(test_data['s1']))
test_embeddings2 = torch.Tensor(list(test_data['s2']))
test_score = torch.Tensor(list(test_data['score']))

In [24]:
input_dim = 25
hidden_dim = 25
lr = 0.001
num_epochs = 10
batch_size = 10

In [25]:
class BiLSTMRegression(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_layers):
        super().__init__()
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.bilstm = nn.LSTM(input_dim, hidden_dim, num_layers=num_layers, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_dim*2, 1)

    def forward(self, x1, x2):
        x = torch.cat((x1, x2), dim=1)
        x = x.view(len(x), 1, -1)
        h0 = torch.zeros(self.num_layers*2, len(x), self.hidden_dim).to(x.device)
        c0 = torch.zeros(self.num_layers*2, len(x), self.hidden_dim).to(x.device)
        out, _ = self.bilstm(x, (h0, c0))
        out = self.fc(out[:, -1, :])
        return out

In [26]:
model = BiLSTMRegression(input_dim*2, hidden_dim, num_layers=2)
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
criterion = nn.MSELoss()

class SentenceSimilarityDataset(data.Dataset):
    def __init__(self, embeddings1, embeddings2, scores):
        self.embeddings1 = embeddings1
        self.embeddings2 = embeddings2
        self.scores = scores

    def __len__(self):
        return len(self.embeddings1)

    def __getitem__(self, index):
        return self.embeddings1[index], self.embeddings2[index], self.scores[index]
    
train_dataset = SentenceSimilarityDataset(train_embeddings1, train_embeddings2, train_score)
val_dataset = SentenceSimilarityDataset(val_embeddings1, val_embeddings2, val_score)

train_dataloader = data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = data.DataLoader(val_dataset, batch_size=batch_size, shuffle=False)


for epoch in range(num_epochs):
    model.train()
    train_loss = 0
    for batch in train_dataloader:
        optimizer.zero_grad()
        embeddings1_batch, embeddings2_batch, scores_batch = batch
        output = model(embeddings1_batch, embeddings2_batch)
        loss = criterion(output.squeeze(), scores_batch)
        loss.backward()
        optimizer.step()
        train_loss += loss.item() * len(embeddings1_batch)
    train_loss /= len(train_embeddings1)

    val_loss = 0
    with torch.no_grad():
        for batch in val_dataloader:
            embeddings1_batch, embeddings2_batch, scores_batch = batch
            val_output = model(embeddings1_batch, embeddings2_batch)
            val_loss += criterion(val_output.squeeze(), scores_batch).item() * len(embeddings1_batch)
    val_loss /= len(val_embeddings1)

    print(f"Epoch = {epoch}\tTraining Loss = {train_loss}\tValidation Loss = {val_loss}")

Epoch = 0	Training Loss = 0.07198671058086226	Validation Loss = 0.0752607791886463
Epoch = 1	Training Loss = 0.05243354639869979	Validation Loss = 0.0716338195199785
Epoch = 2	Training Loss = 0.046353306964072895	Validation Loss = 0.06969077409883323
Epoch = 3	Training Loss = 0.04230629820090699	Validation Loss = 0.07281764294590697
Epoch = 4	Training Loss = 0.03813186941010449	Validation Loss = 0.07748702112129414
Epoch = 5	Training Loss = 0.035115631875488204	Validation Loss = 0.08006021806358041
Epoch = 6	Training Loss = 0.0326171828130736	Validation Loss = 0.08548914877554545
Epoch = 7	Training Loss = 0.030110111669932996	Validation Loss = 0.08600907015835645
Epoch = 8	Training Loss = 0.028265988875324468	Validation Loss = 0.08848732217928147
Epoch = 9	Training Loss = 0.026940590660268108	Validation Loss = 0.08746444982300688


In [27]:
model.eval()
output = model(train_embeddings1, train_embeddings2)
y_pred = output.squeeze().tolist()
y_train = train_score
corr = pearson_corr(y_train, y_pred)
print("Pearson correlation coefficient: {:.2f}".format(corr))

Pearson correlation coefficient: 0.82


In [28]:
model.eval()
output = model(val_embeddings1, val_embeddings2)
y_pred = output.squeeze().tolist()
y_val = val_score
corr = pearson_corr(y_val, y_pred)
print("Pearson correlation coefficient: {:.2f}".format(corr))

Pearson correlation coefficient: 0.41


In [29]:
model.eval()
output = model(test_embeddings1, test_embeddings2)
y_pred = output.squeeze().tolist()
y_test = test_score
corr = pearson_corr(y_test, y_pred)
print("Pearson correlation coefficient: {:.2f}".format(corr))

Pearson correlation coefficient: 0.41


In [30]:
df_raw = pd.read_csv('../../train.csv')
id = df_raw['id'].values
s1_raw = df_raw['s1'].values
s2_raw = df_raw['s2'].values
y_raw = df_raw['score'].values

model.eval()
with torch.no_grad():
    print(len(train_embeddings1))
    output = model(train_embeddings1, train_embeddings2)
    y_pred = output.squeeze()
    # y_pred = model(s1_train, s2_train)
    # y_pred = y_pred.numpy()
    # y_pred = np.squeeze(y_pred)
    # y_pred = np.clip(y_pred, 0, 5)
    print(id.shape, s1_raw.shape, y_raw.shape, len(y_pred))
    results = pd.DataFrame({'id': id, 'sentence1': s1_raw, 'sentence2': s2_raw, 'similarity': y_raw, 'predicted_similarity': y_pred})
    results.to_csv(f'Results/train.csv', index=False)
    print(f'Predictions saved in results.csv')

correlation = pd.Series(y_raw).corr(pd.Series(y_pred))
print('Correlation between expected and predicted similarity scores:', correlation)


12833
(12833,) (12833,) (12833,) 12833
Predictions saved in results.csv
Correlation between expected and predicted similarity scores: 0.8204012146124229


In [31]:
df_raw = pd.read_csv('../../test.csv')
id = df_raw['id'].values
s1_raw = df_raw['s1'].values
s2_raw = df_raw['s2'].values
y_raw = df_raw['score'].values

model.eval()
with torch.no_grad():
    output = model(test_embeddings1, test_embeddings2)
    y_pred = output.squeeze()
    # y_pred = model(s1_train, s2_train)
    # y_pred = y_pred.numpy()
    # y_pred = np.squeeze(y_pred)
    # y_pred = np.clip(y_pred, 0, 5)
    print(id.shape, s1_raw.shape, y_raw.shape, len(y_pred))
    results = pd.DataFrame({'id': id, 'sentence1': s1_raw, 'sentence2': s2_raw, 'similarity': y_raw, 'predicted_similarity': y_pred})
    results.to_csv(f'Results/test.csv', index=False)
    print(f'Predictions saved in results.csv')

correlation = pd.Series(y_raw).corr(pd.Series(y_pred))
print('Correlation between expected and predicted similarity scores:', correlation)


(3347,) (3347,) (3347,) 3347
Predictions saved in results.csv
Correlation between expected and predicted similarity scores: 0.4123440842953832
