In [1]:
import os
import csv
import math
import torch
import logging
import pandas as pd
from datetime import datetime
from torch.utils.data import DataLoader, Dataset
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sentence_transformers import SentenceTransformer, LoggingHandler, losses, util, InputExample

from sentence_transformers import SentenceTransformer,util
import torch
import scipy
from scipy.stats import pearsonr

# Select the compute device: CUDA when PyTorch reports it as available,
# otherwise the CPU.
#
# A named logger is used here instead of the module-level ``logging.info``
# helper. The helper implicitly calls ``logging.basicConfig()`` when the root
# logger has no handlers yet, and that would turn the explicit
# ``logging.basicConfig(...)`` call made later in this notebook into a no-op
# (leaving the root level at WARNING and hiding every INFO message).
_device_logger = logging.getLogger(__name__)

if torch.cuda.is_available():
    device = torch.device("cuda")
    _device_message = f'Using GPU: {torch.cuda.get_device_name()}'
else:
    device = torch.device("cpu")
    _device_message = 'Using CPU'

# The message is also printed so it is visible even before logging is configured.
_device_logger.info(_device_message)
print(_device_message)

  from .autonotebook import tqdm as notebook_tqdm


Using GPU: NVIDIA GeForce RTX 3060 Laptop GPU


In [2]:
# Directory prefix (trailing slash included) prepended to the data file names.
PATH = "data/"

In [3]:


class TSDataset(Dataset):
    """In-memory sentence-pair dataset read from a tab-separated file.

    The file must start with a header row containing the columns
    ``sentence1``, ``sentence2`` and ``score``. Every data row becomes one
    ``InputExample`` holding the two sentences and the label
    ``float(score) / 5``.
    """

    def __init__(self, file_path):
        """Parse ``file_path`` and keep all examples in ``self.samples``.

        Args:
            file_path: Path to a UTF-8 encoded, tab-delimited file with a
                header row.

        Raises:
            KeyError: If one of the required columns is missing.
            ValueError: If a ``score`` cell cannot be parsed as a float.
            TypeError: If a row is too short and its ``score`` is ``None``.
        """
        # newline="" hands line-ending handling to the csv module, as its
        # documentation requires; without it, line breaks embedded in quoted
        # fields are rewritten by Python's universal-newline translation.
        with open(file_path, encoding="utf8", newline="") as f:
            reader = csv.DictReader(f, delimiter="\t")
            self.samples = [
                InputExample(
                    texts=[row["sentence1"], row["sentence2"]],
                    # Dividing by 5 presumably maps a 0-5 gold score to [0, 1]
                    # -- TODO confirm the score range of the data.
                    label=float(row["score"]) / 5,
                )
                for row in reader
            ]

    def __len__(self):
        """Return the number of loaded examples."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Return the example stored at position ``idx``."""
        return self.samples[idx]


In [20]:
# Training hyper-parameters.
# NOTE(review): the saved output of the evaluation cell further down prints a
# BertModel with 384-dimensional pooling, which may not correspond to this
# checkpoint name -- presumably the outputs come from an earlier run with a
# different value; re-run the notebook from a fresh kernel to confirm.
num_epochs = 4
train_batch_size = 32
model_name = "all-mpnet-base-v2"  # checkpoint identifier passed to SentenceTransformer(...)

In [5]:
# Route INFO-level (and above) records through sentence-transformers'
# LoggingHandler with a timestamped format.
# force=True replaces any handlers already attached to the root logger.
# Without it this call silently does nothing when something has logged
# earlier (the first notebook cell calls logging.info, which implicitly
# installs a default handler), and no INFO message would ever be shown.
logging.basicConfig(
    format="%(asctime)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    level=logging.INFO,
    handlers=[LoggingHandler()],
    force=True,
)

In [6]:
# Output directory for the fine-tuned model, derived from the checkpoint name.
model_save_path = f"output/training-{model_name}"


In [21]:
# Load the checkpoint named by model_name and move it to the selected device
# (Module.to returns the module itself, so the call can be chained).
model = SentenceTransformer(model_name).to(device)
logging.info("Read train dataset")

modules.json: 100%|██████████| 349/349 [00:00<?, ?B/s] 
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
config_sentence_transformers.json: 100%|██████████| 116/116 [00:00<?, ?B/s] 
README.md: 100%|██████████| 10.7k/10.7k [00:00<00:00, 10.7MB/s]
sentence_bert_config.json: 100%|██████████| 53.0/53.0 [00:00<?, ?B/s]
config.json: 100%|██████████| 612/612 [00:00<00:00, 611kB/s]
pytorch_model.bin: 100%|██████████| 90.9M/90.9M [00:04<00:00, 20.2MB/s]
tokenizer_config.json: 100%|██████████| 350/350 [00:00<?, ?B/s] 
vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 552kB/s]
tokenizer.json: 100%|██████████| 466k/466k [00:00<00:00, 745kB/s]
special_tokens_map.json: 100%|██████████| 112/112 [00:00<00:00, 112kB/s]
1_Pooling/config.json: 100%|██████████| 190/190 [00:00<?, ?B/s] 


In [22]:
# Training examples, served in shuffled batches of train_batch_size.
train_dataset = TSDataset(PATH + "train.csv")
train_dataloader = DataLoader(
    dataset=train_dataset,
    batch_size=train_batch_size,
    shuffle=True,
)

# Loss object bound to the model instance that will be fine-tuned.
train_loss = losses.CosineSimilarityLoss(model=model)


In [23]:
logging.info("Read dev dataset")
dev_dataset = TSDataset(PATH + "dev.csv")

# Build the evaluator straight from the dataset's InputExample items; its
# results are reported under the name "dev".
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
    dev_dataset,
    name="dev",
)


In [24]:
# Warm-up length: 10% of all training steps (batches per epoch times the
# number of epochs), rounded up to a whole step.
total_train_steps = len(train_dataloader) * num_epochs
warmup_steps = math.ceil(total_train_steps * 0.1)
logging.info(f"Warmup-steps: {warmup_steps}")

In [25]:
# Fine-tune the model on the training pairs; the evaluator is handed to fit()
# and the resulting model is written to model_save_path.
# NOTE(review): evaluation_steps=500 is larger than the 179 iterations per
# epoch shown in the saved progress output, so mid-epoch evaluation presumably
# never triggers -- confirm this is intended.
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=num_epochs,
    warmup_steps=warmup_steps,
    evaluator=evaluator,
    evaluation_steps=500,
    output_path=model_save_path,
)

Iteration: 100%|██████████| 179/179 [00:11<00:00, 15.51it/s]
Iteration: 100%|██████████| 179/179 [00:11<00:00, 15.98it/s]
Iteration: 100%|██████████| 179/179 [00:11<00:00, 16.02it/s]
Iteration: 100%|██████████| 179/179 [00:11<00:00, 15.99it/s]
Epoch: 100%|██████████| 4/4 [00:51<00:00, 12.88s/it]


In [26]:
# test_dataset = TSDataset(f"{PATH}test.csv")

In [27]:
# Baseline: the checkpoint as loaded by name, without the fine-tuning above.
untrained_model = SentenceTransformer(model_name).to(device)

# Fine-tuned model, reloaded from the directory fit() wrote to.
loaded_model = SentenceTransformer(model_save_path).to(device)

# Evaluated in this order: baseline first, fine-tuned second.
models = [untrained_model, loaded_model]

In [28]:
# Load the dev split as a DataFrame and discard every row that has a missing
# value in any column.
validation_data = pd.read_csv(PATH + "dev.csv", sep="\t").dropna()

In [29]:
# Pull both sentence columns out as NumPy arrays (row i of each forms pair i).
sentence_set_1, sentence_set_2 = (
    validation_data[column].to_numpy() for column in ("sentence1", "sentence2")
)

# Displayed as the cell result: both arrays should have the same length.
sentence_set_1.shape, sentence_set_2.shape

((1468,), (1468,))

In [30]:
# Score every dev sentence pair with each model and report the Pearson
# correlation between predicted and gold scores.
#
# The loop variable is deliberately NOT called `model`: that name is the
# global training model that `train_loss` is bound to, and rebinding it here
# would silently change what earlier cells see when they are re-run.
for eval_model in models:
    print(f"Model: {eval_model}")
    encode_sentence_set_1 = eval_model.encode(sentence_set_1)
    encode_sentence_set_2 = eval_model.encode(sentence_set_2)

    # Only the similarity of pair i with pair i is needed, so compute the
    # row-wise cosine directly instead of building the full N x N matrix and
    # reading its diagonal.
    pair_cosines = util.pairwise_cos_sim(encode_sentence_set_1, encode_sentence_set_2).tolist()

    # Map the cosine from [-1, 1] to [0, 1], then scale by 5.
    # NOTE(review): training labels are score / 5 and are fitted against the
    # raw cosine, whereas here the cosine is shifted first. Pearson's r is
    # unaffected by this affine map, but the absolute predicted_score values
    # written to the CSV are -- confirm which scale is intended.
    predicted_scores = [(cos + 1) / 2 * 5 for cos in pair_cosines]

    # Pearson correlation between the two embedding vectors of each pair.
    pairwise_pearson = [
        pearsonr(vec_1, vec_2)[0]
        for vec_1, vec_2 in zip(encode_sentence_set_1, encode_sentence_set_2)
    ]

    data = {
        'pairwise_pearson': pairwise_pearson,
        'predicted_score': predicted_scores,
        'sentence1': sentence_set_1,
        'sentence2': sentence_set_2
    }
    final_dataframe = pd.DataFrame(data)
    # Written once per model to the same path, so after the loop the file
    # holds the predictions of the last model in `models`.
    final_dataframe.to_csv("task_1_B.csv")

    score_val = validation_data['score']
    score_pred_val = final_dataframe['predicted_score']

    print(f"Pearson's Coeffecient = {pearsonr(score_val,score_pred_val)[0]}")

Model: SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)
Pearson's Coeffecient = 0.8631423812646389
Model: SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)
Pearson's Coeffecient = 0.8935878127304003


In [31]:
# Hand cached, currently unused GPU memory back from PyTorch's allocator;
# there is nothing to release when CUDA is not available.
if torch.cuda.is_available():
    torch.cuda.empty_cache()