# Siamese BiLSTM Neural Network with Attention

In [2]:
import sys
import torch
import logging
import warnings
import numpy as np
import pandas as pd
import torch.nn as nn
from torch.utils.data import TensorDataset, Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from scipy.stats import pearsonr
from gensim.models import KeyedVectors
from tqdm.notebook import tqdm
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

logging.disable(logging.WARNING)
warnings.filterwarnings('ignore')
np.set_printoptions(threshold=sys.maxsize)

## Mono-Lingual Semantic Similarity

In [3]:
# Load all the data
train_path = '../utils2/data/train.csv'
test_path = '../utils2/data/test.csv'
val_path = '../utils2/data/val.csv'

train_data = pd.read_csv(train_path)
test_data = pd.read_csv(test_path)
val_data = pd.read_csv(val_path)

In [4]:
modelpath = "GoogleNews-vectors-negative300.bin"
model = KeyedVectors.load_word2vec_format(modelpath, binary=True)
word2idx = {word: i for i, word in enumerate(model.index_to_key)}

In [5]:
vocab = {}
j = 0
sentences_1 = train_data['s1'].apply(eval)
sentences_2 = train_data['s2'].apply(eval)

for i in range(len(sentences_1)):
    for word in sentences_1[i]:
        if word not in vocab and word in model.key_to_index:
            vocab[word] = j
            j += 1
    for word in sentences_2[i]:
        if word not in vocab and word in model.key_to_index:
            vocab[word] = j
            j += 1


word2idx_dataset = {}
for i in list(vocab.keys()):
    word2idx_dataset[vocab[i]] = word2idx[i]
word2idx_dataset['unk'] = len(word2idx_dataset)
word_indices = word2idx_dataset.values()
dataset_embed_matrix = model.vectors[np.array(list(word_indices))]

In [6]:
def pearson_corr(y_true, y_pred):
    corr, _ = pearsonr(y_true, y_pred)
    return corr

In [7]:
class CustomDataset(Dataset):
    def __init__(self, sentences1, sentences2, scores, word2idx):
        self.sentences1 = sentences1
        self.sentences2 = sentences2
        self.scores = scores
        self.word2idx = word2idx

    def __len__(self):
        return max(len(self.sentences1), len(self.sentences2))

    def __getitem__(self, idx):
        unk_token = self.word2idx['unk']
        sentence1 = self.sentences1[idx]
        sentence2 = self.sentences2[idx]
        score = self.scores[idx]
        seq1 = [self.word2idx[word] if word in self.word2idx else unk_token for word in sentence1]
        seq2 = [self.word2idx[word] if word in self.word2idx else unk_token for word in sentence2]
        return seq1, seq2, score

    def collate_fn(self, batch):
        sequences1, sequences2, scores = zip(*batch)
        padded_seqs1 = pad_sequence([torch.LongTensor(seq) for seq in sequences1], batch_first=True, padding_value=0)
        padded_seqs2 = pad_sequence([torch.LongTensor(seq) for seq in sequences2], batch_first=True, padding_value=0)
        return padded_seqs1, padded_seqs2, torch.LongTensor(scores)

In [8]:
train_data['s1'] = train_data['s1'].apply(eval)
train_data['s2'] = train_data['s2'].apply(eval)
val_data['s1'] = val_data['s1'].apply(eval)
val_data['s2'] = val_data['s2'].apply(eval)
test_data['s1'] = test_data['s1'].apply(eval)
test_data['s2'] = test_data['s2'].apply(eval)

In [9]:
batch_size = 16

train_dataset = CustomDataset(train_data['s1'], train_data['s2'], train_data['score'], word2idx_dataset)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=train_dataset.collate_fn)
val_dataset = CustomDataset(val_data['s1'], val_data['s2'], val_data['score'], word2idx_dataset)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True, collate_fn=val_dataset.collate_fn)
test_dataset = CustomDataset(test_data['s1'], test_data['s2'], test_data['score'], word2idx_dataset)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True, collate_fn=test_dataset.collate_fn)

In [10]:
class SiameseBiLSTM(nn.Module):
    def __init__(self, hidden_size, num_layers, embedding_dim, embd_matrix, dropout=0.2):
        super(SiameseBiLSTM, self).__init__()

        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.embedding_dim = embedding_dim
        self.embd_matrix = embd_matrix

        self.word_embeddings = nn.Embedding(len(embd_matrix), embedding_dim)
        self.word_embeddings.weight = nn.Parameter(torch.from_numpy(self.embd_matrix))
        self.word_embeddings.weight.requires_grad = False

        self.bilstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_size, num_layers=num_layers, batch_first=True, bidirectional=True)
        self.dropout = nn.Dropout(dropout)
        self.attention_fc = nn.Linear(hidden_size * 2, 1)
        self.attention_softmax = nn.Softmax(dim=1)
        self.fc = nn.Linear(hidden_size * 4, 1)  # 4 because we concatenate forward and backward hidden states of both LSTMs


    def forward_once(self, sentence):
        embeds = self.word_embeddings(sentence)
        lstm_out, _ = self.bilstm(embeds)
        lstm_out = self.dropout(lstm_out)
        attention_weights = self.attention_softmax(self.attention_fc(lstm_out))
        lstm_out = lstm_out * attention_weights
        lstm_out = lstm_out.sum(dim=1)
        return lstm_out
    

    def forward(self, sentence1, sentence2):
        output1 = self.forward_once(sentence1)
        output2 = self.forward_once(sentence2)
        concatenated = torch.cat((output1, output2), dim=1)
        similarity_score = torch.sigmoid(self.fc(concatenated))
        return similarity_score

In [11]:
# Define model and optimizer
model1 = SiameseBiLSTM(hidden_size=50, num_layers=2, embedding_dim=300, embd_matrix = dataset_embed_matrix)
optimizer = torch.optim.Adam(model1.parameters(), lr=1e-3)
criterion = nn.MSELoss()
num_epochs = 10
model1.train()

for epoch in range(num_epochs):
    train_loss = 0
    for sentence1, sentence2, score in train_loader:
        sentence1_tensor = sentence1
        sentence2_tensor = sentence2
        score_tensor = torch.tensor(score, dtype=torch.float)
        optimizer.zero_grad()
        output = model1(sentence1_tensor, sentence2_tensor)
        loss = criterion(output.squeeze(), score_tensor.squeeze())
        loss.backward()
        optimizer.step()
        train_loss += loss.item()

    print(f"Epoch = {epoch}\tTraining Loss = {train_loss/len(train_data)}")
    
    val_loss = 0
    with torch.no_grad():
        for sentence1, sentence2, score in val_loader:
            sentence1_tensor = sentence1
            sentence2_tensor = sentence2
            score_tensor = torch.tensor(score, dtype=torch.float)
            outputs = model1(sentence1_tensor, sentence2_tensor)
            val_loss = criterion(outputs.squeeze(), score_tensor.squeeze())
            val_loss += val_loss.item()
    
    print(f"Epoch = {epoch}\tValidation Loss = {val_loss/len(val_data)}")

Epoch = 0	Training Loss = 0.0023099445907160043
Epoch = 0	Validation Loss = 1.4196899655871675e-06
Epoch = 1	Training Loss = 0.00218376952635891
Epoch = 1	Validation Loss = 8.835387461658684e-07
Epoch = 2	Training Loss = 0.002180990727094291
Epoch = 2	Validation Loss = 0.00015478802379220724
Epoch = 3	Training Loss = 0.0021825124550626965
Epoch = 3	Validation Loss = 1.0324259847038775e-06
Epoch = 4	Training Loss = 0.0021819813249525743
Epoch = 4	Validation Loss = 5.2707004215335473e-05
Epoch = 5	Training Loss = 0.002177644091278232
Epoch = 5	Validation Loss = 5.24942806805484e-05
Epoch = 6	Training Loss = 0.002182439253220819
Epoch = 6	Validation Loss = 6.484144137175463e-07
Epoch = 7	Training Loss = 0.002182599949604528
Epoch = 7	Validation Loss = 9.159506362266256e-07
Epoch = 8	Training Loss = 0.0021794152317392962
Epoch = 8	Validation Loss = 0.0001029971317620948
Epoch = 9	Training Loss = 0.002180796370925275
Epoch = 9	Validation Loss = 1.238501113220991e-06


In [13]:
train_predictions = []
train_labels = []

df_raw = pd.read_csv('../../train.csv')
ids = df_raw['id'].values
s1_raw = df_raw['s1'].values
s2_raw = df_raw['s2'].values
scores_raw = df_raw['score'].values

model1.eval()

for train_sentence1, train_sentence2, train_score in train_loader:
    train_sentence1_tensor = train_sentence1
    train_sentence2_tensor = train_sentence2
    train_score_tensor = torch.tensor(train_score, dtype=torch.float)
    train_output = model1(train_sentence1_tensor, train_sentence2_tensor)
    train_predictions.extend(train_output.tolist())
    train_labels.extend(train_score)

train_predictions = np.array(train_predictions)
train_labels = np.array(train_labels)

results = pd.DataFrame({
    'id': ids,
    'sentence1': s1_raw,
    'sentence2': s2_raw,
    'similarity': scores_raw,
    'predicted_similarity': np.clip(np.squeeze(train_predictions), 0, 5)
})
results.to_csv('Results/train.csv', index=False)
print('Predictions saved in Results/train.csv')

train_mse = mean_squared_error(train_labels, train_predictions)
print('Train MSE: {:.4f}'.format(train_mse))


Predictions saved in Results/train.csv
Train MSE: 0.0348


In [12]:
val_predictions = []
val_labels = []
model1.eval()
for val_sentence1, val_sentence2, val_score in val_loader:
    val_sentence1_tensor = val_sentence1
    val_sentence2_tensor = val_sentence2
    val_score_tensor = torch.tensor(val_score, dtype=torch.float)
    val_output = model1(val_sentence1_tensor, val_sentence2_tensor)
    val_predictions.extend(val_output.tolist())
    val_labels.extend(val_score)
val_predictions = np.array(val_predictions)
val_labels = np.array(val_labels)
val_mse = mean_squared_error(val_labels, val_predictions)
print('Val MSE: {:.4f}'.format(val_mse))

Val MSE: 0.0337


In [14]:
test_predictions = []
test_labels = []

df_test_raw = pd.read_csv('../../test.csv')
ids = df_test_raw['id'].values
s1_raw = df_test_raw['s1'].values
s2_raw = df_test_raw['s2'].values
scores_raw = df_test_raw['score'].values

model1.eval()

for test_sentence1, test_sentence2, test_score in test_loader:
    test_sentence1_tensor = test_sentence1
    test_sentence2_tensor = test_sentence2
    test_score_tensor = torch.tensor(test_score, dtype=torch.float)
    test_output = model1(test_sentence1_tensor, test_sentence2_tensor)
    test_predictions.extend(test_output.tolist())
    test_labels.extend(test_score)

test_predictions = np.array(test_predictions)
test_labels = np.array(test_labels)

results = pd.DataFrame({
    'id': ids,
    'sentence1': s1_raw,
    'sentence2': s2_raw,
    'similarity': scores_raw,
    'predicted_similarity': np.clip(np.squeeze(test_predictions), 0, 5)
})
results.to_csv('Results/test.csv', index=False)
print('Predictions saved in Results/test.csv')

test_mse = mean_squared_error(test_labels, test_predictions)
print('Test MSE: {:.4f}'.format(test_mse))


Predictions saved in Results/test.csv
Test MSE: 0.0442


In [14]:
print("train_predictions", train_predictions)

train_predictions [[0.03369996]
 [0.0372941 ]
 [0.03340741]
 [0.03614156]
 [0.03454901]
 [0.03340741]
 [0.03614156]
 [0.03829429]
 [0.03596756]
 [0.03614156]
 [0.03843369]
 [0.03946495]
 [0.03483744]
 [0.03946495]
 [0.03843369]
 [0.03259823]
 [0.03806125]
 [0.03686852]
 [0.03752419]
 [0.03573013]
 [0.03536608]
 [0.03745716]
 [0.03799329]
 [0.03629313]
 [0.03560502]
 [0.03622907]
 [0.03745716]
 [0.03074697]
 [0.03693433]
 [0.03806125]
 [0.03752419]
 [0.03673816]
 [0.03354117]
 [0.03704426]
 [0.03641138]
 [0.03415825]
 [0.03947566]
 [0.03593656]
 [0.03561808]
 [0.03623614]
 [0.03704426]
 [0.03301614]
 [0.03864471]
 [0.03797559]
 [0.03745807]
 [0.03718775]
 [0.03962496]
 [0.03797559]
 [0.03683695]
 [0.03940668]
 [0.04023412]
 [0.03303181]
 [0.03724293]
 [0.03638209]
 [0.03325319]
 [0.03511736]
 [0.03915045]
 [0.03824116]
 [0.03824116]
 [0.03915045]
 [0.03435464]
 [0.03661447]
 [0.03811998]
 [0.03638209]
 [0.03807731]
 [0.03728911]
 [0.03620885]
 [0.03728911]
 [0.03718315]
 [0.03974644]
 [

In [15]:
corr = pearson_corr(train_labels, train_predictions.ravel())
corr

0.010992589163266488

In [16]:
corr = pearson_corr(val_labels, val_predictions.ravel())
corr

0.05982164374990365

In [17]:
corr = pearson_corr(test_labels, test_predictions.ravel())
corr

0.07833904283232998