In [None]:
# Installing Required Dependencies

!pip install transformers
!pip install evaluate
!pip install datasets

# Importing dependencies

import torch
import torch.nn as nn
import transformers
from transformers import AutoTokenizer, AutoModel
import torch.nn.functional as F
from scipy.stats import pearsonr, spearmanr
from datasets import load_dataset
import pandas as pd
from sklearn.model_selection import train_test_split
import torch.optim as optim

In [None]:
# Common schema: every corpus is exposed with these three column names
new_column_names = ['sentence1', 'sentence2', 'score']

# News and Sports pairs are read from local CSV files; their columns are
# relabelled positionally so both frames follow the common schema
df_news = pd.read_csv('sts_news.csv').set_axis(new_column_names, axis=1)
df_sports = pd.read_csv('sts_sports.csv').set_axis(new_column_names, axis=1)

# Biomedical pairs are loaded through Hugging Face Datasets (only the "test" split is requested)
biosses_sts = 'mteb/biosses-sts'
dataset_biosses_sts = load_dataset(biosses_sts, split="test")

# Name of the BERT checkpoint used by the rest of the notebook
model_name_bert = "bert-base-uncased"

In [None]:
# News: 80/20 split with a fixed seed; both parts get a fresh 0..n-1 index
train_df_news, test_df_news = (
    part.reset_index(drop=True)
    for part in train_test_split(df_news, test_size=0.2, random_state=21)
)

# Biosses: same split settings, applied to the Hugging Face dataset object
train_bio, test_bio = train_test_split(dataset_biosses_sts, test_size=0.2, random_state=21)
# The Biosses pieces come back as dictionaries, so each one is wrapped in a DataFrame
train_df_bio, test_df_bio = pd.DataFrame(train_bio), pd.DataFrame(test_bio)

# Sports: 80/20 split with a fixed seed; both parts get a fresh 0..n-1 index
train_df_sports, test_df_sports = (
    part.reset_index(drop=True)
    for part in train_test_split(df_sports, test_size=0.2, random_state=21)
)


In [None]:
class BertForSTS(nn.Module):
    """Score a sentence pair by the cosine similarity of two BERT sentence vectors.

    Both sentences are passed through the same pretrained encoder. The token
    embeddings are mean-pooled over the positions selected by the attention
    mask, projected by one shared linear layer, and the two projections are
    compared with cosine similarity.

    Args:
        bert_model_name: Checkpoint name or path given to ``AutoModel.from_pretrained``.
        num_classes: Accepted for interface compatibility; not used by this model.
        hidden_size: Size of the encoder's token embeddings; also the input and
            output size of the projection layer.
    """

    def __init__(self, bert_model_name, num_classes, hidden_size):
        super(BertForSTS, self).__init__()
        # Use the pretrained BERT model (shared by both sentences)
        self.bert = AutoModel.from_pretrained(bert_model_name)
        # Add a Linear layer on top of the BERT model for fine tuning
        self.fc1 = nn.Linear(hidden_size, hidden_size)
        # Cosine similarity is computed along the feature dimension (dim=1)
        self.cosine_similarity = nn.CosineSimilarity(dim=1)

    def _mean_pool(self, token_embeddings, attention_mask):
        """Average the token embeddings over positions where ``attention_mask`` is non-zero."""
        mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        summed = torch.sum(token_embeddings * mask, 1)
        # Clamp the count so an all-zero mask cannot cause a division by zero
        counts = torch.clamp(mask.sum(1), min=1e-9)
        return summed / counts

    def _encode(self, input_ids, attention_mask):
        """Run the encoder on one sentence batch, mean-pool it and apply the projection."""
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # First element of the encoder output contains all token embeddings
        return self.fc1(self._mean_pool(outputs[0], attention_mask))

    def forward(self, input_ids1, attention_mask1, input_ids2, attention_mask2):
        """Return the cosine similarity between the two encoded sentences.

        Args:
            input_ids1: Token ids of the first sentence(s).
            attention_mask1: Attention mask matching ``input_ids1``.
            input_ids2: Token ids of the second sentence(s).
            attention_mask2: Attention mask matching ``input_ids2``.

        Returns:
            Tensor of cosine similarities computed along dim 1 of the projected
            sentence vectors (assumes inputs are (batch, seq_len) tensors, giving
            one score per pair - TODO confirm against callers).
        """
        sentence_vector1 = self._encode(input_ids1, attention_mask1)
        sentence_vector2 = self._encode(input_ids2, attention_mask2)
        return self.cosine_similarity(sentence_vector1, sentence_vector2)


In [None]:
def getSpearmanCorr(dataset, modelName, lr=2e-5):
  """Fine-tune a fresh ``BertForSTS`` on ``dataset`` for one pass and return the mean loss.

  Despite its name, this function computes no correlation: it loads the
  tokenizer and model for ``modelName``, makes a single pass over the data one
  sentence pair at a time (batch size 1), and minimises the MSE between the
  rescaled cosine similarity and the target score.

  Fix over the previous version: the prediction used to be re-wrapped in
  ``torch.tensor(..., requires_grad=True)``, which creates a new leaf tensor
  that is disconnected from the model. ``loss.backward()`` therefore never
  reached the model parameters and ``optimizer.step()`` changed nothing. The
  loss is now computed directly on the model output so gradients flow.

  Args:
    dataset: Table-like object with "sentence1", "sentence2" and "score"
      columns that supports ``len(dataset)`` and ``dataset[column][i]``.
    modelName: Checkpoint name used for both the tokenizer and the encoder.
    lr: Adam learning rate. The earlier hard-coded 0.001 never took effect
      because no gradients reached the model; it is too large for fine-tuning
      a pretrained transformer, so the default is 2e-5. Pass ``lr=0.001`` to
      use the old value.

  Returns:
    Tuple ``(mean_loss, model)``: the average per-pair MSE over the pass and
    the trained model (left in training mode).

  Raises:
    ValueError: If ``dataset`` is empty.

  NOTE(review): predictions are rescaled to (0, 1), so targets are presumably
  expected on that scale as well - confirm the score range of each dataset.
  """
  num_examples = len(dataset)
  if num_examples == 0:
    raise ValueError("dataset must contain at least one sentence pair")

  # Load pre-trained model and tokenizer
  tokenizer = AutoTokenizer.from_pretrained(modelName)
  # hidden_size=768 matches BERT-base checkpoints
  model = BertForSTS(modelName, num_classes=1, hidden_size=768)
  # from_pretrained returns the encoder in eval mode; switch dropout back on for training
  model.train()
  criterion = nn.MSELoss()
  optimizer = torch.optim.Adam(model.parameters(), lr=lr)

  # Look the columns up once instead of on every iteration
  sentences1 = dataset["sentence1"]
  sentences2 = dataset["sentence2"]
  target_scores = dataset["score"]

  total_loss = 0
  for i in range(num_examples):
    # Tokenize each sentence of the pair as a batch of one
    encoded_input1 = tokenizer([sentences1[i]], padding=True, truncation=True, max_length=128, return_tensors='pt')
    encoded_input2 = tokenizer([sentences2[i]], padding=True, truncation=True, max_length=128, return_tensors='pt')

    # Clear gradients left over from the previous pair
    optimizer.zero_grad()

    # Computing Similarity Score
    similarity_score = model(encoded_input1['input_ids'], encoded_input1['attention_mask'],
                             encoded_input2['input_ids'], encoded_input2['attention_mask'])
    # Rescale the score from (-1,1) to (0,1) to match the targets.
    rescaled_similarity_score = (similarity_score + 1) / 2

    # The target is a constant: it needs no gradient
    target_score_tensor = torch.tensor([target_scores[i]], dtype=torch.float32)

    # Loss is taken on the model output itself so backward() reaches the parameters
    loss = criterion(rescaled_similarity_score, target_score_tensor)
    total_loss += loss.item()

    loss.backward()
    optimizer.step()

  return total_loss / num_examples, model

In [None]:
def testgetSpearmanCorr(dataset, func_model, tokenizer_name='bert-base-uncased'):
  """Score every sentence pair with ``func_model`` and return the Spearman correlation.

  Each pair is tokenized and scored on its own (batch size 1). The cosine
  similarity is rescaled from (-1, 1) to (0, 1) and the rescaled predictions
  are rank-correlated with the "score" column.

  The model's train/eval mode is not changed here; the calling cells switch
  the model to ``.eval()`` before invoking this function.

  Args:
    dataset: Table-like object with "sentence1", "sentence2" and "score"
      columns that supports ``len(dataset)`` and ``dataset[column][i]``.
    func_model: Model called as ``func_model(ids1, mask1, ids2, mask2)`` that
      returns a single-element similarity tensor for a single pair.
    tokenizer_name: Checkpoint whose tokenizer is used; defaults to the
      previously hard-coded 'bert-base-uncased'.

  Returns:
    The Spearman rank correlation coefficient between true and predicted scores.
  """
  model = func_model
  tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

  # Look the columns up once instead of on every iteration
  sentences1 = dataset["sentence1"]
  sentences2 = dataset["sentence2"]
  true_scores = dataset["score"]

  predicted_scores = []
  # Inference only: do not record operations for autograd
  with torch.no_grad():
    for i in range(len(dataset)):
      # Tokenize each sentence of the pair as a batch of one
      encoded_input1 = tokenizer([sentences1[i]], padding=True, truncation=True, max_length=128, return_tensors='pt')
      encoded_input2 = tokenizer([sentences2[i]], padding=True, truncation=True, max_length=128, return_tensors='pt')

      similarity_score = model(encoded_input1['input_ids'], encoded_input1['attention_mask'],
                               encoded_input2['input_ids'], encoded_input2['attention_mask'])
      # Rescale from (-1,1) to (0,1) and store a plain float rather than a 1-element array
      predicted_scores.append(((similarity_score + 1) / 2).item())

  # Calculate Spearman rank correlation coefficient
  spearman_corr, _ = spearmanr(true_scores, predicted_scores)
  return spearman_corr

### Training Custom Model Specific for News Data

In [None]:
# Run getSpearmanCorr on the News training split; it returns the mean loss and the model instance
loss, model_news = getSpearmanCorr(dataset=train_df_news, modelName=model_name_bert)

In [None]:
# Report the mean per-pair loss returned by the News run above
print("Average Loss for News Domain Data is: ",loss)

Average Loss for News Domain Data is:  0.18390363676371635


### Testing Custom Model Specific for News Data

In [None]:
# eval() switches the module to evaluation mode and returns the module itself,
# so the mode change is done inline while passing the model to the evaluator
corr_for_news = testgetSpearmanCorr(dataset=test_df_news, func_model=model_news.eval())
print("Spearman Coorelation for scores given by Domain Specific(News) Model is: ", corr_for_news)

Spearman Coorelation for scores given by Domain Specific(News) Model is:  0.37811589973119136


### Training Custom Model Specific for Biomedical Data

In [None]:
# Run getSpearmanCorr on the Biomedical training split; it returns the mean loss and the model instance
loss, model_bio = getSpearmanCorr(dataset=train_df_bio, modelName=model_name_bert)

In [None]:
# Report the mean per-pair loss returned by the Biomedical run above
print("Average Loss for Biomedical Domain Data is: ",loss)

Average Loss for Biomedical Domain Data is:  3.03235981304897


### Testing Custom Model Specific for Biomedical Data

In [None]:
# Put the Biomedical model in evaluation mode, then score the held-out Biomedical pairs
model_bio.eval()
corr_for_bio = testgetSpearmanCorr(test_df_bio, model_bio)
# Label corrected: this cell reports the Biomedical model, not the News one
print("Spearman Coorelation for scores given by Domain Specific(Biomedical) Model is: ", corr_for_bio)

Spearman Coorelation for scores given by Domain Specific(News) Model is:  0.5851763367832985


### Training Custom Model Specific for Sports Data

In [None]:
# Run getSpearmanCorr on the Sports training split; it returns the mean loss and the model instance
loss, model_sports = getSpearmanCorr(dataset=train_df_sports, modelName=model_name_bert)

In [None]:
# Report the mean per-pair loss returned by the Sports run above
print("Average Loss for Sports Domain Data is: ",loss)

Average Loss for Sports Domain Data is:  0.13146289361627125


### Testing Custom Model Specific for Sports Data

In [None]:
# Put the Sports model in evaluation mode, then score the held-out Sports pairs
model_sports.eval()
corr_for_sports = testgetSpearmanCorr(test_df_sports, model_sports)
# Label corrected: this cell reports the Sports model, not the News one
print("Spearman Coorelation for scores given by Domain Specific(Sports) Model is: ", corr_for_sports)

Spearman Coorelation for scores given by Domain Specific(News) Model is:  0.25019823829630605
