In [None]:
!pip install -q sentence-transformers

In [None]:
#required libraries
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.stats import pearsonr
from transformers import RobertaTokenizer, RobertaModel
import torch
import tensorflow as tf
import tensorflow_hub as hub
import pandas as pd
import numpy as np
import os

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
#unzip the folder
zip_file = "/content/drive/My Drive/NLP/sts2016-english-with-gs-v1.0.zip"
!unzip "/content/drive/My Drive/NLP/sts2016-english-with-gs-v1.0.zip" -d "/content/data"

Archive:  /content/drive/My Drive/NLP/sts2016-english-with-gs-v1.0.zip
  inflating: /content/data/sts2016-english-with-gs-v1.0/README.txt  
  inflating: /content/data/sts2016-english-with-gs-v1.0/STS2016.input.question-question.txt  
  inflating: /content/data/sts2016-english-with-gs-v1.0/STS2016.input.postediting.txt  
  inflating: /content/data/sts2016-english-with-gs-v1.0/STS2016.input.headlines.txt  
  inflating: /content/data/sts2016-english-with-gs-v1.0/STS2016.input.question-question.ascii  
  inflating: /content/data/sts2016-english-with-gs-v1.0/STS2016.input.headlines.ascii  
  inflating: /content/data/sts2016-english-with-gs-v1.0/STS2016.input.plagiarism.txt  
  inflating: /content/data/sts2016-english-with-gs-v1.0/LICENSE.txt  
  inflating: /content/data/sts2016-english-with-gs-v1.0/STS2016.gs.question-question.txt  
  inflating: /content/data/sts2016-english-with-gs-v1.0/STS2016.input.plagiarism.ascii  
  inflating: /content/data/sts2016-english-with-gs-v1.0/STS2016.input.a

In [None]:
txt_folder_path = "/content/data/sts2016-english-with-gs-v1.0"

**Preprocessing**

In [None]:
sentence_pairs = []
gold_scores = []

for filename in os.listdir(txt_folder_path):
    if filename.endswith(".ascii"):
      task_name = filename.split(".")[2]
      gs_filename = f"STS2016.gs.{task_name}.txt"

      with open(os.path.join(txt_folder_path, filename), 'r', encoding='ascii') as file:
        for i, line in enumerate(file):
          parts = line.strip().split("\t")
          if len(parts) >= 2:
              sentence_pairs.append((parts[0], parts[1]))
          else:
              print(f"Invalid line in {filename} at index {i + 1}: {repr(line.strip())}")

        with open(os.path.join(txt_folder_path, gs_filename), 'r', encoding='ascii') as file:
          for line in file:
            lines = file.readlines()
            for line in lines:
               if line.strip():
                try:
                  gold_scores.append(float(line.strip()))
                except ValueError:
                  print(f"Skipping invalid score line: {repr(line.strip())}") # Handle invalid lines
               else:  # If line is empty
                gold_scores.append(np.nan)
            gold_scores.append(np.nan)

if len(sentence_pairs) != len(gold_scores):
    raise ValueError("Mismatch between sentence pairs and gold scores.")

print(f"Total sentence pairs: {len(sentence_pairs)}")
print(f"Total gold scores: {len(gold_scores)}")

Invalid line in STS2016.input.question-question.ascii at index 1556: ''
Invalid line in STS2016.input.answer-answer.ascii at index 1573: ''
Invalid line in STS2016.input.headlines.ascii at index 1499: ''
Invalid line in STS2016.input.plagiarism.ascii at index 1272: ''
Invalid line in STS2016.input.postediting.ascii at index 3288: ''
Total sentence pairs: 9183
Total gold scores: 9183


**SBERT**

In [None]:
model = SentenceTransformer('all-MiniLM-L6-v2')
predicted_scores = []
for pair in sentence_pairs:
    embedding1 = model.encode(pair[0], convert_to_tensor=True)
    embedding2 = model.encode(pair[1], convert_to_tensor=True)

# Move embeddings to CPU and convert to NumPy arrays before calculating similarity
    embedding1 = embedding1.cpu().numpy()
    embedding2 = embedding2.cpu().numpy()

    # Compute cosine similarity
    similarity = cosine_similarity([embedding1], [embedding2])[0][0]
    predicted_scores.append(similarity)

# Compute Pearson correlation
scores_data={'Predicted':predicted_scores,'Gold':gold_scores}
scores_data=pd.DataFrame(scores_data)
pearson_corr = scores_data['Predicted'].corr(scores_data['Gold'],method='pearson')
print(f"Pearson Correlation: {pearson_corr}")
results = pd.DataFrame({
    "Sentence1": [pair[0] for pair in sentence_pairs],
    "Sentence2": [pair[1] for pair in sentence_pairs],
    "Gold_Score": gold_scores,
    "Predicted_Score": predicted_scores
})
results.to_csv("sts2016_results.csv", index=False)
print("Results saved to 'sts2016_results.csv'.")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Pearson Correlation: 0.11076901907744913
Results saved to 'sts2016_results.csv'.


**RoBERTa**

In [None]:
tokenizer = RobertaTokenizer.from_pretrained("roberta-large")
model = RobertaModel.from_pretrained("roberta-large")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

predicted_scores = []
for pair in sentence_pairs:
    inputs1 = tokenizer(pair[0], return_tensors="pt").to(device)
    inputs2 = tokenizer(pair[1], return_tensors="pt").to(device)

    with torch.no_grad():
      embedding1 = model(**inputs1).last_hidden_state[:, 0, :]
      embedding2 = model(**inputs2).last_hidden_state[:, 0, :]

      embedding1 = embedding1.squeeze(0).cpu().numpy()
      embedding2 = embedding2.squeeze(0).cpu().numpy()

      # Compute cosine similarity
      similarity = cosine_similarity([embedding1], [embedding2])[0][0]
      predicted_scores.append(similarity)

# Compute Pearson correlation
scores_data={'Predicted':predicted_scores,'Gold':gold_scores}
scores_data=pd.DataFrame(scores_data)
pearson_corr = scores_data['Predicted'].corr(scores_data['Gold'],method='pearson')
print(f"Pearson Correlation: {pearson_corr}")
results_roberta = pd.DataFrame({
    "Sentence1": [pair[0] for pair in sentence_pairs],
    "Sentence2": [pair[1] for pair in sentence_pairs],
    "Gold_Score": gold_scores,
    "Predicted_Score": predicted_scores
})
results_roberta.to_csv("sts2016_results_rob.csv", index=False)
print("Results saved to 'sts2016_results_rob.csv'.")

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Pearson Correlation: 0.004678339327469852
Results saved to 'sts2016_results_rob.csv'.


**USE**

In [None]:
module_url = "https://tfhub.dev/google/universal-sentence-encoder/4"
use_model = hub.load(module_url)
predicted_scores = []

for pair in sentence_pairs:
    embeddings = use_model([pair[0], pair[1]])
    embedding1 = embeddings[0]
    embedding2 = embeddings[1]

    # Compute cosine similarity
    similarity = cosine_similarity([embedding1], [embedding2])[0][0]
    predicted_scores.append(similarity)

# Compute Pearson correlation
scores_data={'Predicted':predicted_scores,'Gold':gold_scores}
scores_data=pd.DataFrame(scores_data)
pearson_corr = scores_data['Predicted'].corr(scores_data['Gold'],method='pearson')
print(f"Pearson Correlation: {pearson_corr}")
results_use = pd.DataFrame({
    "Sentence1": [pair[0] for pair in sentence_pairs],
    "Sentence2": [pair[1] for pair in sentence_pairs],
    "Gold_Score": gold_scores,
    "Predicted_Score": predicted_scores
})
results_use.to_csv("sts2016_results_use.csv", index=False)
print("Results saved to 'sts2016_results_use.csv'.")

Pearson Correlation: 0.09578407250835982
Results saved to 'sts2016_results.csv'.
