### Importing the necessary libraries

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import torch
import torch.nn.functional as F
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.utils.data import RandomSampler
from torch.nn.utils import clip_grad_norm_
from transformers import AutoModel, AutoTokenizer, AdamW, BertTokenizer, BertModel
from transformers.optimization import get_cosine_schedule_with_warmup, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import os, pickle

### Reading the dataset using pandas

In [None]:
LANG = "eng"
train_data = pd.read_csv("{}_train.csv".format(LANG))
dev_data = pd.read_csv("{}_dev.csv".format(LANG))
train_data.head(10)

Unnamed: 0,PairID,Text,Score
0,ENG-dev-0000,The story is gripping and interesting.\r\nIt's...,0.64
1,ENG-dev-0001,The majority of Southeast Alaska 's area is pa...,0.61
2,ENG-dev-0002,and from your post i think you are to young to...,0.31
3,ENG-dev-0003,The film 's success also made Dreamworks Anima...,0.59
4,ENG-dev-0004,I am still confused about how I feel about thi...,0.5
5,ENG-dev-0005,Connor is the exception to his families rules....,0.42
6,ENG-dev-0006,Ellegarden is a Japanese rock group formed in ...,0.7
7,ENG-dev-0007,It was listed on the National Register of Hist...,0.7
8,ENG-dev-0008,an hour and a half from the evening service pa...,0.59
9,ENG-dev-0009,One of the major disorders is Gender Identity ...,0.63


In [None]:
pilot_data = pd.read_csv("sem_text_rel_ranked.csv")
pilot_data.head(10)

Unnamed: 0,Index,SourceID,SubsetID,PairID,Text,Score
0,0,Formality,Formality_pp,Formality_pp_222,"It that happens, just pull the plug.\r\nif tha...",1.0
1,1,STS,STS,STS_237,A black dog running through water.\r\nA black ...,1.0
2,2,ParaNMT,ParaNMT_pp,ParaNMT_pp_204,I've been searchingthe entire abbey for you.\r...,1.0
3,3,Formality,Formality_pp,Formality_pp_119,If he is good looking and has a good personali...,1.0
4,4,Formality,Formality_pp,Formality_pp_174,"She does not hate you, she is just annoyed wit...",1.0
5,5,STS,STS,STS_211,Actor Gazzara dead at 81\r\nActor Ben Gazzara ...,1.0
6,6,Formality,Formality_pp,Formality_pp_277,"No, I really didn't want New York to win.\r\nN...",1.0
7,7,Formality,Formality_pp,Formality_pp_167,I hae no problems with them.\r\nlol i have no ...,1.0
8,8,Formality,Formality_pp,Formality_pp_123,Your parents do not have to like your boyfrien...,1.0
9,9,Formality,Formality_pp,Formality_pp_194,"I think Taylor is really cute, but I hate his ...",1.0


### Basic preprocessing

The sentences present in each pair are joined using a `[SEP]` token to identify each sentence. The given pair is appended to a `[CLS]` token and terminated with another `[SEP]` token which represents the data format that the BERT model requries for further training. 

In [None]:
def conc(sentence1, sentence2):
    return '[CLS] ' + sentence1 + ' [SEP] ' + sentence2 + ' [SEP]'

def preprocess(file):
    data = pd.read_csv(file)
    split_text = data["Text"].str.split("\r\n", expand=True)
    data["sentence1"] = split_text[0]
    data["sentence2"] = split_text[1]
    # print(data[["sentence1", "sentence2"]].head())
    data["sentence"] = data.apply(lambda x: conc(x["sentence1"], x["sentence2"]), axis=1)
    data = data[["sentence","Score"]]
    train_data = data.iloc[0:int(len(data)*0.8),:]
    val_data = data.iloc[len(train_data):, :]
    return train_data, val_data

LANG = "eng"
train_data, val_data = preprocess("{}_train.csv".format(LANG))
pilot_data_train, pilot_data_val = preprocess("sem_text_rel_ranked.csv")
train_data = pd.concat([train_data, pilot_data_train], axis = 0)
val_data = pd.concat([val_data, pilot_data_val], axis = 0)
train_data.head(10)
print(train_data.shape)

(4600, 2)


In [None]:
train_data

Unnamed: 0,sentence,Score
0,[CLS] The story is gripping and interesting. [...,0.64
1,[CLS] The majority of Southeast Alaska 's area...,0.61
2,[CLS] and from your post i think you are to yo...,0.31
3,[CLS] The film 's success also made Dreamworks...,0.59
4,[CLS] I am still confused about how I feel abo...,0.50
...,...,...
4395,[CLS] A group of men and women eating and drin...,0.30
4396,[CLS] Numbered participants race past a man dr...,0.30
4397,[CLS] This book made me feel stupid by reading...,0.30
4398,[CLS] @HomeOfUncleSam @ScotsFyre @RWNutjob1 @S...,0.30


In [None]:
val_data.head(10)
print(val_data.shape)

(1150, 2)


In [None]:
# creationg the tokenizer and model
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
bert_model = BertModel.from_pretrained("bert-base-uncased")

### Creating a custom Pytorch Dataset

In [None]:
class STRDataset(Dataset):

  def __init__(self, input_list: list, scores_list: list) -> None:
    self.input_data = input_list
    self.scores_data = scores_list
    # self.target_data = target_list

  def __len__(self):
    return len(self.input_data)

  def __getitem__(self, index):
    return (self.input_data[index], self.scores_data[index])

# padding function

def collate_fn(batch):

  input_data = [item[0] for item in batch]
  scores_data = [item[1] for item in batch]
  # target_data = [item[2] for item in batch]
  input_tensors = tokenizer(input_data, padding = "longest", return_tensors = "pt", truncation = True, add_special_tokens = False)
  scores_tensors = torch.tensor(scores_data)
  return (input_tensors, scores_tensors)


In [None]:
BATCH_SIZE = 32
WIDTH = 256
FUNNEL = 4
DROPOUT_RATE = 0.2
LEARNING_RATE = 0.0001
EPOCHS = 15
device = ("cuda" if torch.cuda.is_available() else "cpu")

### Writing the CustomModel architecture
A BERT based architecture containing a feed forward neural network which essentially approximates the scoring mechanism between two sentences in the range `[0,1]`

In [None]:
class CustomModel(nn.Module):
    def __init__(self, hidden_dim):
        super(CustomModel, self).__init__()
        self.embedding_model = bert_model
        self.fc_layers = nn.Sequential(
            nn.Linear(hidden_dim, BATCH_SIZE),  # hidden_dim * 2 because we concatenate the difference and product
            nn.GELU(),
            nn.Dropout(0.2),
            nn.Linear(BATCH_SIZE, 1),
            nn.Sigmoid()
        )

    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None):
        outputs = self.embedding_model(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        sentence1_embedding = outputs[1][:, :768//2]  # assuming BERT-base with 768 hidden size
        sentence2_embedding = outputs[1][:, 768//2:]
        abs_difference = torch.abs(sentence1_embedding - sentence2_embedding)
        elementwise_product = sentence1_embedding * sentence2_embedding
        comparison_features = torch.cat([abs_difference, elementwise_product], dim=1)
        result = self.fc_layers(comparison_features)

        return result


### Helper functions for initiating the model with the necessary parameters, ptimizers and loss function

In [None]:
def loadmodel(train_dataloader, hidden_dim, epochs = EPOCHS, learning_rate = LEARNING_RATE):
  model = CustomModel(hidden_dim)
  optimizer = AdamW(model.parameters(), lr = learning_rate)
  total_steps = len(train_dataloader) * epochs
  scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps = 0, num_training_steps = total_steps)
  return model, optimizer, scheduler

def save_checkpoint(path, model, optimizer, scheduler, fold_i, epoch, loss):
    file_name = f'{path}/model_{fold_i}.ckpt.{epoch}'
    torch.save(
        {
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'scheduler_state_dict': scheduler.state_dict(),
            'loss': loss
        }, file_name)

    print(f"Saving epoch {epoch} checkpoint at {file_name}\n")


### Validation Loop

In [None]:
from sklearn.metrics import mean_squared_error

def validate(model, valloader):
    model.eval()
    total_loss, total_acc = 0, 0

    for batch_idx, (input, score) in enumerate(valloader):
        # Move input to the device
        input = {key: value.to(device) for key, value in input.items()}
        score = score.to(device)
        # print(score)
        with torch.no_grad():
            output = model(**input).squeeze()

        loss = criterion(output, score)
        total_loss += loss.item()

        pred_score = output.squeeze().cpu().numpy()
        # print(pred_score)
        mse = mean_squared_error(score.cpu().numpy(), pred_score)

        total_acc += mse

    total_loss /= (batch_idx + 1)
    total_acc /= (batch_idx + 1)

    return total_loss, total_acc

### Training Loop

In [None]:
def train(model, trainloader, valloader, epochs=EPOCHS, path="v1"):
    model.train()
    model.to(device)

    for epoch in range(epochs):
        epoch_mean_losses = []
        total_loss, batch_loss, batch_count = 0, 0, 0

        for batch_idx, (input, score) in enumerate(trainloader):
            # Move input to the device
            input = {key: value.to(device) for key, value in input.items()}
            score = score.to(device)
            batch_count += 1
            model.zero_grad()
            output = model(**input).squeeze()

            loss = criterion(output, score)
            batch_loss += loss.item()
            total_loss += loss.item()

            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=2)
            optimizer.step()
            scheduler.step()

            if (batch_idx % 100 == 0 and batch_idx != 0):
                learning_rate = optimizer.param_groups[0]['lr']
                print(f"Step : {batch_idx}, LR : {learning_rate:.8f}, Avg Loss : {batch_loss / batch_count:.4f}")
                batch_count, batch_loss = 0, 0

        print(f"Epoch {epoch} Total Mean Loss : {total_loss/(batch_idx + 1):.4f}")
        epoch_mean_losses.append(total_loss/(batch_idx + 1))

        if valloader is not None:
            print(f'Validation')
            valid_loss, valid_acc = validate(model, valloader)
            print(f"Epoch {epoch}, loss : {valid_loss:.4f}, accuracy : {valid_acc:.4f}\n")

    print("Train Completed")
    return epoch_mean_losses

In [None]:
train_dataset = STRDataset(train_data["sentence"].to_list(), train_data["Score"].to_list())
val_dataset = STRDataset(val_data["sentence"].to_list(), val_data["Score"].to_list())
trainloader = DataLoader(train_dataset, batch_size = BATCH_SIZE, shuffle = True, collate_fn = collate_fn)
valloader = DataLoader(val_dataset, batch_size = BATCH_SIZE, shuffle = True, collate_fn = collate_fn)
print(len(train_dataset))
print(len(val_dataset))

4600
1150


In [None]:
version = 'v01'
os.makedirs(version, exist_ok=True)
os.makedirs('checkpoint', exist_ok=True)

In [None]:
criterion = nn.MSELoss()
model, optimizer, scheduler = loadmodel(trainloader, hidden_dim = 768, epochs = EPOCHS, learning_rate = LEARNING_RATE)



In [None]:
print(model)

CustomModel(
  (embedding_model): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elemen

In [None]:
epoch_mean_losses = train(model, trainloader, valloader, epochs=EPOCHS, path=version)

Step : 100, LR : 0.00009532, Avg Loss : 0.0219
Epoch 0 Total Mean Loss : 0.0204
Validation
Epoch 0, loss : 0.0699, accuracy : 0.0699

Step : 100, LR : 0.00008866, Avg Loss : 0.0095
Epoch 1 Total Mean Loss : 0.0094
Validation
Epoch 1, loss : 0.0461, accuracy : 0.0461

Step : 100, LR : 0.00008199, Avg Loss : 0.0041
Epoch 2 Total Mean Loss : 0.0041
Validation
Epoch 2, loss : 0.0448, accuracy : 0.0448

Step : 100, LR : 0.00007532, Avg Loss : 0.0019
Epoch 3 Total Mean Loss : 0.0019
Validation
Epoch 3, loss : 0.0422, accuracy : 0.0422

Step : 100, LR : 0.00006866, Avg Loss : 0.0009
Epoch 4 Total Mean Loss : 0.0009
Validation
Epoch 4, loss : 0.0474, accuracy : 0.0474

Step : 100, LR : 0.00006199, Avg Loss : 0.0004
Epoch 5 Total Mean Loss : 0.0004
Validation
Epoch 5, loss : 0.0466, accuracy : 0.0466

Step : 100, LR : 0.00005532, Avg Loss : 0.0002
Epoch 6 Total Mean Loss : 0.0002
Validation
Epoch 6, loss : 0.0470, accuracy : 0.0470

Step : 100, LR : 0.00004866, Avg Loss : 0.0001
Epoch 7 Total M

### Evaluation function on test dataset

In [1]:
def test_model(model, sentences):
    # Tokenize the sentences
    inputs = tokenizer(sentences, return_tensors="pt", padding=True, truncation=True)
    inputs = {key: value.to(device) for key, value in inputs.items()}

    # Model inference
    with torch.no_grad():
        model.eval()
        predictions = model(**inputs)

    return predictions.cpu().numpy()

# Example usage:
# combined_data = pd.concat([pilot_data_train, pilot_data_val, train_data, val_data], axis = 0)
combined_data = pd.concat([train_data, val_data], axis = 0)
sentences = combined_data["sentence"].to_list()
# print(sentences)
predictions = test_model(model, sentences)
score_df = pd.DataFrame({ "actual" : combined_data["Score"].to_list(), "pred" : predictions})
# print(predictions)

In [None]:
predictions[100]

array([0.4188895], dtype=float32)