In [None]:
!pip install transformers



In [None]:
import sys
import torch
import logging
import warnings
import numpy as np
import pandas as pd
import torch.nn as nn
import torch.optim as optim
from scipy.stats import pearsonr
from torch.utils.data import TensorDataset, DataLoader
from transformers import BertTokenizer, BertConfig, BertForSequenceClassification
from tqdm.notebook import tqdm

logging.disable(logging.WARNING)
warnings.filterwarnings('ignore')
np.set_printoptions(threshold=sys.maxsize)

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
raw_dataset_path = 'data'

In [None]:
# Load all the data
trainpath = f'{raw_dataset_path}/train.csv'
testpath = f'{raw_dataset_path}/test.csv'
valpath = f'{raw_dataset_path}/val.csv'

traindata = pd.read_csv(trainpath)
testdata = pd.read_csv(testpath)
valdata = pd.read_csv(valpath)

# traindata['score'] = traindata['score'].apply(lambda x: (x)/5.0)
# testdata['score'] = testdata['score'].apply(lambda x: (x)/5.0)
# valdata['score'] = valdata['score'].apply(lambda x: (x)/5.0)

In [None]:
def pearson_corr(y_true, y_pred):
    corr, _ = pearsonr(y_true, y_pred)
    return corr

In [None]:
# compute the sequence length using 95% samples logic
lengths = []
for _, row in traindata.iterrows():
    lengths.append(len(row['s1']))
    lengths.append(len(row['s2']))

lengths.sort()
MAX_LEN = lengths[int(0.95*len(lengths))]
print(MAX_LEN)

103


In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def convert_sentences_to_features(sentences, tokenizer, max_len):
    input_ids = []
    attention_masks = []
    token_type_ids = []

    for i in range(0, len(sentences), 2):
        encoded_dict = tokenizer.encode_plus(sentences[i], sentences[i+1], add_special_tokens=True, max_length=max_len, truncation=True, padding='max_length', return_attention_mask=True, return_tensors='pt', truncation_strategy='longest_first')
        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])
        token_type_ids.append(encoded_dict['token_type_ids'])

    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)
    token_type_ids = torch.cat(token_type_ids, dim=0)

    return input_ids.to(device), attention_masks.to(device), token_type_ids.to(device)

### Fine-Tune BERT Model

In [None]:
# Hyperparameters
EPOCHS = 10
BATCH_SIZE = 8
LEARN_RATE = 1e-5

In [24]:
x_train = []
for _, row in traindata.iterrows():
    x_train.append(row['s1'])
    x_train.append(row['s2'])

input_ids, attention_masks, token_type_ids = convert_sentences_to_features(x_train, tokenizer, MAX_LEN)
y_train = torch.tensor(traindata['score'], dtype=torch.float)

trainset = TensorDataset(input_ids, attention_masks, token_type_ids, y_train)
trainloader = DataLoader(trainset, batch_size=BATCH_SIZE, shuffle=True)

config = BertConfig.from_pretrained(
    'bert-base-uncased',
    num_labels=1,
    output_attentions=False,
    output_hidden_states=False,
)

model = BertForSequenceClassification.from_pretrained('bert-base-uncased', config=config)
model.to(device)
optimizer = optim.Adam(model.parameters(), lr=LEARN_RATE, betas=[0.5, 0.99])
loss_fn = nn.MSELoss()

for epoch in tqdm(range(EPOCHS), desc='Epochs'):
    model.train()
    t_loss = 0
    for _, batch in tqdm(enumerate(trainloader), total=len(trainloader), leave=False, desc='Batches'):
        input_ids, attention_masks, _, labels = tuple(t for t in batch)
        optimizer.zero_grad()
        outputs = model(input_ids, token_type_ids=None, attention_mask=attention_masks, labels=labels.to(device))
        loss = outputs[0]
        loss.backward()
        optimizer.step()
        t_loss += loss.item()
    print(f'Epoch: {epoch}\tLoss: {t_loss / len(trainloader)}')

Epochs:   0%|          | 0/10 [00:00<?, ?it/s]

Batches:   0%|          | 0/1605 [00:00<?, ?it/s]

Epoch: 0	Loss: 0.0405695942453924


Batches:   0%|          | 0/1605 [00:00<?, ?it/s]

Epoch: 1	Loss: 0.02640886183090909


Batches:   0%|          | 0/1605 [00:00<?, ?it/s]

Epoch: 2	Loss: 0.02084193215843267


Batches:   0%|          | 0/1605 [00:00<?, ?it/s]

Epoch: 3	Loss: 0.01721104238838004


Batches:   0%|          | 0/1605 [00:00<?, ?it/s]

Epoch: 4	Loss: 0.014451975767381744


Batches:   0%|          | 0/1605 [00:00<?, ?it/s]

Epoch: 5	Loss: 0.012421689305100531


Batches:   0%|          | 0/1605 [00:00<?, ?it/s]

Epoch: 6	Loss: 0.010768821553872591


Batches:   0%|          | 0/1605 [00:00<?, ?it/s]

Epoch: 7	Loss: 0.00944712881261419


Batches:   0%|          | 0/1605 [00:00<?, ?it/s]

Epoch: 8	Loss: 0.00858722903128899


Batches:   0%|          | 0/1605 [00:00<?, ?it/s]

Epoch: 9	Loss: 0.007735375683671995


In [26]:
torch.save(model, f='model.pt')

In [34]:
model_untrained = BertForSequenceClassification.from_pretrained('bert-base-uncased', config=config)
model_untrained.to(device)

x_val = []
for _, row in valdata.iterrows():
    x_val.append(row['s1'])
    x_val.append(row['s2'])

input_ids, attention_masks, token_type_ids = convert_sentences_to_features(x_val, tokenizer, MAX_LEN)
y_val = torch.tensor(valdata['score'], dtype=torch.float)
valset = TensorDataset(input_ids, attention_masks, token_type_ids, y_val)
valloader = DataLoader(valset, batch_size=BATCH_SIZE, shuffle=False)

y_true, y_pred = [], []
with torch.no_grad():
    for _, batch in enumerate(valloader):
        input_ids, attention_masks, _, labels = tuple(t for t in batch)
        outputs = model_untrained(input_ids, token_type_ids=None, attention_mask=attention_masks, labels=labels.to(device))
        y_true.extend(labels.tolist())
        y_pred.extend([row[0] for row in outputs[1].tolist()])

corr = pearson_corr(y_true, y_pred)
print("Pearson correlation coefficient: {:.2f}".format(corr))

Pearson correlation coefficient: 0.13


In [30]:
x_val = []
for _, row in valdata.iterrows():
    x_val.append(row['s1'])
    x_val.append(row['s2'])

input_ids, attention_masks, token_type_ids = convert_sentences_to_features(x_val, tokenizer, MAX_LEN)
y_val = torch.tensor(valdata['score'], dtype=torch.float)
valset = TensorDataset(input_ids, attention_masks, token_type_ids, y_val)
valloader = DataLoader(valset, batch_size=BATCH_SIZE, shuffle=False)

y_true, y_pred = [], []
with torch.no_grad():
    for _, batch in tqdm(enumerate(valloader), total=len(valloader)):
        input_ids, attention_masks, _, labels = tuple(t for t in batch)
        outputs = model(input_ids, token_type_ids=None, attention_mask=attention_masks, labels=labels.to(device))
        y_true.extend(labels.tolist())
        y_pred.extend([row[0] for row in outputs[1].tolist()])

corr = pearson_corr(y_true, y_pred)
print("Pearson correlation coefficient: {:.2f}".format(corr))

  0%|          | 0/286 [00:00<?, ?it/s]

Pearson correlation coefficient: 0.82


In [32]:
model_untrained = BertForSequenceClassification.from_pretrained('bert-base-uncased', config=config)
model_untrained.to(device)

x_test = []
for _, row in testdata.iterrows():
    x_test.append(row['s1'])
    x_test.append(row['s2'])

input_ids, attention_masks, token_type_ids = convert_sentences_to_features(x_test, tokenizer, MAX_LEN)
y_test = torch.tensor(testdata['score'], dtype=torch.float)

testset = TensorDataset(input_ids, attention_masks, token_type_ids, y_test)
testloader = DataLoader(valset, batch_size=BATCH_SIZE, shuffle=False)

y_true, y_pred = [], []
with torch.no_grad():
    for _, batch in tqdm(enumerate(testloader), total=len(testloader)):
        input_ids, attention_masks, _, labels = tuple(t for t in batch)
        outputs = model_untrained(input_ids, token_type_ids=None, attention_mask=attention_masks, labels=labels.to(device))
        y_true.extend(labels.tolist())
        y_pred.extend([row[0] for row in outputs[1].tolist()])

corr = pearson_corr(y_true, y_pred)
print("Pearson correlation coefficient: {:.2f}".format(corr))

  0%|          | 0/286 [00:00<?, ?it/s]

Pearson correlation coefficient: 0.27


In [36]:
x_test = []
for _, row in testdata.iterrows():
    x_test.append(row['s1'])
    x_test.append(row['s2'])

input_ids, attention_masks, token_type_ids = convert_sentences_to_features(x_test, tokenizer, MAX_LEN)
y_test = torch.tensor(testdata['score'], dtype=torch.float)

testset = TensorDataset(input_ids, attention_masks, token_type_ids, y_test)
testloader = DataLoader(valset, batch_size=BATCH_SIZE, shuffle=False)

y_true, y_pred = [], []
with torch.no_grad():
    for _, batch in enumerate(testloader):
        input_ids, attention_masks, _, labels = tuple(t for t in batch)
        outputs = model(input_ids, token_type_ids=None, attention_mask=attention_masks, labels=labels.to(device))
        y_true.extend(labels.tolist())
        y_pred.extend([row[0] for row in outputs[1].tolist()])

corr = pearson_corr(y_true, y_pred)
print("Pearson correlation coefficient: {:.2f}".format(corr))

Pearson correlation coefficient: 0.83


In [37]:
model_untrained = BertForSequenceClassification.from_pretrained('bert-base-uncased', config=config)
model_untrained.to(device)

x_train = []
for _, row in traindata.iterrows():
    x_train.append(row['s1'])
    x_train.append(row['s2'])

input_ids, attention_masks, token_type_ids = convert_sentences_to_features(x_train, tokenizer, MAX_LEN)
y_test = torch.tensor(traindata['score'], dtype=torch.float)

trainset = TensorDataset(input_ids, attention_masks, token_type_ids, y_test)
trainloader = DataLoader(trainset, batch_size=BATCH_SIZE, shuffle=False)

y_true, y_pred = [], []
with torch.no_grad():
    for _, batch in tqdm(enumerate(trainloader), total=len(trainloader)):
        input_ids, attention_masks, _, labels = tuple(t for t in batch)
        outputs = model_untrained(input_ids, token_type_ids=None, attention_mask=attention_masks, labels=labels.to(device))
        y_true.extend(labels.tolist())
        y_pred.extend([row[0] for row in outputs[1].tolist()])

corr = pearson_corr(y_true, y_pred)
print("Pearson correlation coefficient: {:.2f}".format(corr))

  0%|          | 0/1605 [00:00<?, ?it/s]

Pearson correlation coefficient: -0.15


In [38]:
x_train = []
for _, row in traindata.iterrows():
    x_train.append(row['s1'])
    x_train.append(row['s2'])

input_ids, attention_masks, token_type_ids = convert_sentences_to_features(x_train, tokenizer, MAX_LEN)
y_test = torch.tensor(traindata['score'], dtype=torch.float)

trainset = TensorDataset(input_ids, attention_masks, token_type_ids, y_test)
trainloader = DataLoader(trainset, batch_size=BATCH_SIZE, shuffle=False)

y_true, y_pred = [], []
with torch.no_grad():
    for _, batch in tqdm(enumerate(trainloader), total=len(trainloader)):
        input_ids, attention_masks, _, labels = tuple(t for t in batch)
        outputs = model(input_ids, token_type_ids=None, attention_mask=attention_masks, labels=labels.to(device))
        y_true.extend(labels.tolist())
        y_pred.extend([row[0] for row in outputs[1].tolist()])

corr = pearson_corr(y_true, y_pred)
print("Pearson correlation coefficient: {:.2f}".format(corr))

  0%|          | 0/1605 [00:00<?, ?it/s]

Pearson correlation coefficient: 0.95


In [41]:
x_train = []
for _, row in traindata.iterrows():
    x_train.append(row['s1'])
    x_train.append(row['s2'])

input_ids, attention_masks, token_type_ids = convert_sentences_to_features(x_train, tokenizer, MAX_LEN)
y_train = torch.tensor(traindata['score'], dtype=torch.float)

trainset = TensorDataset(input_ids, attention_masks, token_type_ids, y_train)
trainloader = DataLoader(trainset, batch_size=BATCH_SIZE, shuffle=False)

df_raw = pd.read_csv('./raw_data/train.csv')
id = df_raw['id'].values
s1_raw = df_raw['s1'].values
s2_raw = df_raw['s2'].values
y_raw = df_raw['score'].values

y_pred = []

model.eval()
with torch.no_grad():
    for _, batch in tqdm(enumerate(trainloader), total=len(trainloader)):
        input_ids, attention_masks, _, labels = tuple(t for t in batch)
        outputs = model(input_ids, token_type_ids=None, attention_mask=attention_masks, labels=labels.to(device))
        y_true.extend(labels.tolist())
        y_pred.extend([row[0] for row in outputs[1].tolist()])
    # print(len(train_embeddings1))
    # output = model(train_embeddings1, train_embeddings2)
    # y_pred = output.squeeze()
    # # y_pred = model(s1_train, s2_train)
    # # y_pred = y_pred.numpy()
    # # y_pred = np.squeeze(y_pred)
    # # y_pred = np.clip(y_pred, 0, 5)
    # print(id.shape, s1_raw.shape, y_raw.shape, len(y_pred))
    results = pd.DataFrame({'id': id, 'sentence1': s1_raw, 'sentence2': s2_raw, 'similarity': y_raw, 'predicted_similarity': y_pred})
    results.to_csv(f'Results/train.csv', index=False)
    print(f'Predictions saved in results.csv')

correlation = pd.Series(y_raw).corr(pd.Series(y_pred))
print('Correlation between expected and predicted similarity scores:', correlation)


  0%|          | 0/1605 [00:00<?, ?it/s]

Predictions saved in results.csv
Correlation between expected and predicted similarity scores: 0.9635309027564102


In [42]:
x_test = []
for _, row in testdata.iterrows():
    x_test.append(row['s1'])
    x_test.append(row['s2'])

input_ids, attention_masks, token_type_ids = convert_sentences_to_features(x_test, tokenizer, MAX_LEN)
y_test = torch.tensor(testdata['score'], dtype=torch.float)

testset = TensorDataset(input_ids, attention_masks, token_type_ids, y_test)
testloader = DataLoader(testset, batch_size=BATCH_SIZE, shuffle=False)

df_raw = pd.read_csv('./raw_data/test.csv')
id = df_raw['id'].values
s1_raw = df_raw['s1'].values
s2_raw = df_raw['s2'].values
y_raw = df_raw['score'].values

y_pred = []

model.eval()
with torch.no_grad():
    for _, batch in tqdm(enumerate(testloader), total=len(testloader)):
        input_ids, attention_masks, _, labels = tuple(t for t in batch)
        outputs = model(input_ids, token_type_ids=None, attention_mask=attention_masks, labels=labels.to(device))
        y_true.extend(labels.tolist())
        y_pred.extend([row[0] for row in outputs[1].tolist()])
    # print(len(train_embeddings1))
    # output = model(train_embeddings1, train_embeddings2)
    # y_pred = output.squeeze()
    # # y_pred = model(s1_train, s2_train)
    # # y_pred = y_pred.numpy()
    # # y_pred = np.squeeze(y_pred)
    # # y_pred = np.clip(y_pred, 0, 5)
    # print(id.shape, s1_raw.shape, y_raw.shape, len(y_pred))
    results = pd.DataFrame({'id': id, 'sentence1': s1_raw, 'sentence2': s2_raw, 'similarity': y_raw, 'predicted_similarity': y_pred})
    results.to_csv(f'Results/test.csv', index=False)
    print(f'Predictions saved in results.csv')

correlation = pd.Series(y_raw).corr(pd.Series(y_pred))
print('Correlation between expected and predicted similarity scores:', correlation)


  0%|          | 0/419 [00:00<?, ?it/s]

Predictions saved in results.csv
Correlation between expected and predicted similarity scores: 0.7902926801810988
