In [None]:
# Installing required dependencies

!pip install transformers
!pip install evaluate
!pip install datasets

# Importing dependencies

import torch
import torch.nn as nn
import transformers
from transformers import AutoTokenizer, AutoModel
import torch.nn.functional as F
from scipy.stats import pearsonr, spearmanr
from datasets import load_dataset
import pandas as pd

In [None]:
# Read the local News and Sports sentence-pair CSV files (in that order)
df_news, df_sports = (
    pd.read_csv(csv_path) for csv_path in ('sts_news.csv', 'sts_sports.csv')
)

# Load the test split of the biomedical STS data from Hugging Face Datasets
biosses_sts = 'mteb/biosses-sts'
dataset_biosses_sts = load_dataset(biosses_sts, split="test")

# Rename the columns of both CSV frames so every dataset exposes the same
# 'sentence1' / 'sentence2' / 'score' fields.
# NOTE(review): this assumes each CSV has exactly three columns in this
# order - confirm against the files.
new_column_names = ['sentence1', 'sentence2', 'score']
df_news.columns = new_column_names
df_sports.columns = new_column_names

# Checkpoint name of the model evaluated in the cells below
model_name_bert = "bert-base-uncased"


Downloading readme:   0%|          | 0.00/22.0 [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/11.1k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [None]:
def getSpearmanCorr(dataset,modelName):
  """Score a pretrained encoder on a sentence-similarity dataset.

  Each sentence of every pair is embedded on its own by mean-pooling the
  model's token embeddings (weighted by the attention mask). The cosine
  similarity of the two embeddings is the predicted score, and the
  Spearman rank correlation between predicted and reference scores is
  returned.

  Args:
    dataset: Object whose "sentence1", "sentence2" and "score" entries are
      equally long iterable columns. In this notebook that is either a
      pandas DataFrame or a Hugging Face dataset split.
    modelName: Checkpoint name or path handed to
      AutoTokenizer.from_pretrained and AutoModel.from_pretrained.

  Returns:
    The Spearman rank correlation coefficient between the reference
    scores and the predicted similarities.
  """
  # Pull every column out exactly once as a plain list. Indexing
  # dataset["sentence1"][i] inside the loop is label-based for a pandas
  # Series (it fails or picks the wrong row when the index is not 0..n-1)
  # and re-fetches the whole column on every iteration for a Hugging Face
  # dataset.
  first_sentences = list(dataset["sentence1"])
  second_sentences = list(dataset["sentence2"])
  true_scores = list(dataset["score"])

  # Load pre-trained model and tokenizer
  tokenizer = AutoTokenizer.from_pretrained(modelName)
  model = AutoModel.from_pretrained(modelName)

  def mean_pooling(model_output,attention_mask):
    """Average the token embeddings, counting only unmasked positions."""
    # First element of model_output contains all token embeddings
    token_embeddings = model_output[0]
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    sum_embeddings = torch.sum(token_embeddings * input_mask_expanded,1)
    # Clamp so an all-zero mask cannot cause a division by zero
    sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
    return sum_embeddings/sum_mask

  def embed(sentence):
    """Return the mean-pooled embedding of a single sentence."""
    encoded_input = tokenizer([sentence], padding=True, truncation=True, max_length=128, return_tensors='pt')
    # Inference only: no gradients are needed
    with torch.no_grad():
      model_output = model(**encoded_input)
    return mean_pooling(model_output, encoded_input['attention_mask'])

  # Predicted similarity for every sentence pair, in dataset order
  predicted_scores = []
  for first, second in zip(first_sentences, second_sentences):
    similarity_score = F.cosine_similarity(embed(first), embed(second)).item()
    # Rescale the score from (-1,1) to (0,1) to match the targets. The
    # mapping is monotonic, so it does not change the rank correlation.
    predicted_scores.append((similarity_score + 1)/2)

  # Calculate Spearman rank correlation coefficient
  spearman_corr, _ = spearmanr(true_scores, predicted_scores)
  return spearman_corr


In [None]:
# Spearman correlation obtained with the BERT checkpoint on each dataset,
# keyed by domain name. Entries are evaluated in the order news, sports, biosses.
SpearmanCorrBertDomain = {
  'news': getSpearmanCorr(df_news, model_name_bert),
  'sports': getSpearmanCorr(df_sports, model_name_bert),
  'biosses': getSpearmanCorr(dataset_biosses_sts, model_name_bert),
}


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [None]:
# Report the correlation for every domain that was evaluated. Iterating over
# the keys themselves (rather than a fixed range(3)) keeps this cell correct
# if domains are added to or removed from the dictionary.
key = list(SpearmanCorrBertDomain.keys())
for domain in key:
  print("Spearman Coorelation for", domain, "dataset is", SpearmanCorrBertDomain[domain])

Spearman Coorelation for news dataset is 0.25574683339074133
Spearman Coorelation for sports dataset is 0.49926544728141753
Spearman Coorelation for biosses dataset is 0.5469823428818151
