In [None]:
# Installing required dependencies

!pip install transformers
!pip install evaluate
!pip install datasets

# Import Libraries

import torch
import torch.nn as nn
import transformers
from transformers import AutoTokenizer, AutoModel
import torch.nn.functional as F
from scipy.stats import pearsonr, spearmanr
from datasets import load_dataset

In [3]:
# Hugging Face Hub dataset IDs for the STS benchmarks evaluated in this notebook

sts12_sts, sts13_sts, sts14_sts = 'mteb/sts12-sts', 'mteb/sts13-sts', 'mteb/sts14-sts'
sts15_sts, sts16_sts, sickr_sts = 'mteb/sts15-sts', 'mteb/sts16-sts', 'mteb/sickr-sts'

# Load the "test" split of every benchmark; the unpacking order mirrors the ID order
(
  dataset_sts12_sts,
  dataset_sts13_sts,
  dataset_sts14_sts,
  dataset_sts15_sts,
  dataset_sts16_sts,
  dataset_sickr_sts,
) = [
  load_dataset(dataset_id, split="test")
  for dataset_id in (sts12_sts, sts13_sts, sts14_sts, sts15_sts, sts16_sts, sickr_sts)
]

# Checkpoint names of the pre-trained models to evaluate

model_name_bert, model_name_xlnet = "bert-base-uncased", 'xlnet-base-cased'
model_name_gpt, model_name_roberta = "gpt2", 'roberta-base'

Downloading readme:   0%|          | 0.00/22.0 [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/132k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/118k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Downloading readme:   0%|          | 0.00/22.0 [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/60.2k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating test split: 0 examples [00:00, ? examples/s]

Downloading readme:   0%|          | 0.00/22.0 [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/145k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating test split: 0 examples [00:00, ? examples/s]

Downloading readme:   0%|          | 0.00/22.0 [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/116k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating test split: 0 examples [00:00, ? examples/s]

Downloading readme:   0%|          | 0.00/22.0 [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/46.2k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating test split: 0 examples [00:00, ? examples/s]

Downloading readme:   0%|          | 0.00/22.0 [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/175k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [4]:
def _mean_pooling(model_output, attention_mask):
  """Average token embeddings per sequence, weighting each position by its attention mask.

  Args:
    model_output: Model output whose first element holds the token embeddings.
    attention_mask: Mask tensor; it is broadcast over the embedding dimension
      so that positions with mask 0 do not contribute to the average.

  Returns:
    One pooled embedding per input sequence.
  """
  token_embeddings = model_output[0]
  mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
  summed = torch.sum(token_embeddings * mask, 1)
  # Clamp the denominator so a fully masked sequence cannot divide by zero.
  counts = torch.clamp(mask.sum(1), min=1e-9)
  return summed / counts


def getSpearmanCorr(dataset,modelName):
  """Score a pre-trained model on a sentence-similarity dataset.

  Every sentence pair is embedded with mean pooling over the model's token
  embeddings, compared with cosine similarity, and the resulting similarities
  are rank-correlated with the dataset's reference scores.

  Args:
    dataset: Object exposing the columns "sentence1", "sentence2" and "score"
      through ``dataset[column_name]``.
    modelName: Checkpoint name passed to ``AutoTokenizer.from_pretrained`` and
      ``AutoModel.from_pretrained``.

  Returns:
    The Spearman rank correlation between reference and predicted scores.
  """
  tokenizer = AutoTokenizer.from_pretrained(modelName)
  model = AutoModel.from_pretrained(modelName)
  # Inference only: make sure dropout and similar layers are disabled.
  model.eval()

  # Some tokenizers (e.g. gpt2 and its variants) ship without a padding token,
  # which makes `padding=True` fail; reuse the end-of-sequence token instead.
  if tokenizer.pad_token is None and tokenizer.eos_token is not None:
    tokenizer.pad_token = tokenizer.eos_token

  # Read each column once; indexing the dataset by column name inside the loop
  # would fetch the whole column again on every iteration.
  true_scores = dataset["score"]
  first_sentences = dataset["sentence1"]
  second_sentences = dataset["sentence2"]

  def embed(sentence):
    """Return the mean-pooled embedding of a single sentence."""
    encoded = tokenizer([sentence], padding=True, truncation=True, max_length=128, return_tensors='pt')
    with torch.no_grad():
      output = model(**encoded)
    return _mean_pooling(output, encoded['attention_mask'])

  predicted_scores = []
  for first, second in zip(first_sentences, second_sentences):
    similarity_score = F.cosine_similarity(embed(first), embed(second)).item()
    predicted_scores.append(similarity_score)

  # Only the correlation coefficient is needed; the p-value is discarded.
  spearman_corr, _ = spearmanr(true_scores, predicted_scores)
  return spearman_corr


In [None]:
# Spearman correlation of the BERT model on every STS dataset, keyed by dataset ID

SpearmanCorrBert = {}
for sts_name, sts_dataset in (
  (sts12_sts, dataset_sts12_sts),
  (sts13_sts, dataset_sts13_sts),
  (sts14_sts, dataset_sts14_sts),
  (sts15_sts, dataset_sts15_sts),
  (sts16_sts, dataset_sts16_sts),
  (sickr_sts, dataset_sickr_sts),
):
  SpearmanCorrBert[sts_name] = getSpearmanCorr(sts_dataset, model_name_bert)
print("Spearman Rank Correlation Coefficient for BERT Model: ", SpearmanCorrBert)

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Spearman Rank Correlation Coefficient for BERT Model:  {'mteb/sts12-sts': 0.3086968482730365, 'mteb/sts13-sts': 0.5989485045425808, 'mteb/sts14-sts': 0.4772789570305685, 'mteb/sts15-sts': 0.6028566106730275, 'mteb/sts16-sts': 0.6373272659220811, 'mteb/sickr-sts': 0.5864510311349127}


In [None]:
# Spearman correlation of the XLNet model on the STS12-STS16 datasets, keyed by dataset ID

SpearmanCorrXlnet = {}
for sts_name, sts_dataset in (
  (sts12_sts, dataset_sts12_sts),
  (sts13_sts, dataset_sts13_sts),
  (sts14_sts, dataset_sts14_sts),
  (sts15_sts, dataset_sts15_sts),
  (sts16_sts, dataset_sts16_sts),
):
  SpearmanCorrXlnet[sts_name] = getSpearmanCorr(sts_dataset, model_name_xlnet)

print("Spearman Rank Correlation Coefficient for Xlnet Model: ", SpearmanCorrXlnet)

config.json:   0%|          | 0.00/760 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/798k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.38M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/467M [00:00<?, ?B/s]

In [9]:
# Evaluate XLNet on the SICK-R dataset and store the result under its dataset ID
SpearmanCorrXlnet[sickr_sts] = getSpearmanCorr(dataset_sickr_sts,model_name_xlnet)

In [10]:
# Display the XLNet correlation scores collected so far
SpearmanCorrXlnet

{'mteb/sts12-sts': 0.32319444939345443,
 'mteb/sts13-sts': 0.24680655988172265,
 'mteb/sts14-sts': 0.21413849134947405,
 'mteb/sts15-sts': 0.37107162009515887,
 'mteb/sts16-sts': 0.359946971011225,
 'mteb/sickr-sts': 0.3813603020152468}

In [8]:
# Spearman correlation of the GPT-2 model on every STS dataset, keyed by dataset ID

SpearmanCorrGPT = {}
for sts_name, sts_dataset in (
  (sts12_sts, dataset_sts12_sts),
  (sts13_sts, dataset_sts13_sts),
  (sts14_sts, dataset_sts14_sts),
  (sts15_sts, dataset_sts15_sts),
  (sts16_sts, dataset_sts16_sts),
  (sickr_sts, dataset_sickr_sts),
):
  SpearmanCorrGPT[sts_name] = getSpearmanCorr(sts_dataset, model_name_gpt)
print("Spearman Rank Correlation Coefficient for GPT Model: ", SpearmanCorrGPT)

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

Spearman Rank Correlation Coefficient for GPT Model:  {'mteb/sts12-sts': 0.25843876125039716, 'mteb/sts13-sts': 0.2890846823220784, 'mteb/sts14-sts': 0.2620676968723734, 'mteb/sts15-sts': 0.34739826851881866, 'mteb/sts16-sts': 0.3569589761588685, 'mteb/sickr-sts': 0.43828255389445936}


In [11]:
# Spearman correlation of the RoBERTa model on every STS dataset, keyed by dataset ID

SpearmanCorrRoberta = {}
for sts_name, sts_dataset in (
  (sts12_sts, dataset_sts12_sts),
  (sts13_sts, dataset_sts13_sts),
  (sts14_sts, dataset_sts14_sts),
  (sts15_sts, dataset_sts15_sts),
  (sts16_sts, dataset_sts16_sts),
  (sickr_sts, dataset_sickr_sts),
):
  SpearmanCorrRoberta[sts_name] = getSpearmanCorr(sts_dataset, model_name_roberta)
# The label names the model that was actually evaluated (it previously said "GPT").
print("Spearman Rank Correlation Coefficient for RoBERTa Model: ", SpearmanCorrRoberta)

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['robert

Spearman Rank Correlation Coefficient for GPT Model:  {'mteb/sts12-sts': 0.32108683046603553, 'mteb/sts13-sts': 0.563290192541859, 'mteb/sts14-sts': 0.45219669860203976, 'mteb/sts15-sts': 0.6134479901277357, 'mteb/sts16-sts': 0.6197607518742416, 'mteb/sickr-sts': 0.6296137063824311}
