In [None]:
! pip install torch
! pip install transformers
! pip install tqdm
! pip install pandas
! pip install torchmetrics
! pip install -U sentence-transformers

In [4]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from tqdm import tqdm, trange
import pandas as pd
from torchmetrics.regression import PearsonCorrCoef
from sentence_transformers import SentenceTransformer, util, InputExample, losses, models, evaluation

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
def load_data(file_path):
    """Read a tab-separated sentence-pair file and repair rows whose sentences were merged.

    The file is read with ``pd.read_table``. The second and third columns are
    treated as the two sentences. A row whose third column is missing is
    assumed to carry both sentences in the second column, joined by a tab;
    such a row is split back into the two sentence columns. Rows with a
    missing third column that cannot be repaired (the second column is not a
    string, or it does not split into exactly two parts) are dropped.

    Args:
        file_path: Location of the file, passed straight to ``pd.read_table``.

    Returns:
        The DataFrame with repaired rows. Original index labels are kept, so
        they can be non-contiguous when rows were dropped.

    Side effects:
        Prints the per-column count of missing values found before the repair.
    """
    data = pd.read_table(file_path)
    # Report missing values per column before any repair is attempted.
    print(data.isnull().sum())
    key = data.keys()
    first_sentence_col, second_sentence_col = key[1], key[2]

    unrepairable = []
    # Only rows with at least one missing value can need fixing.
    for index, row in data[data.isnull().any(axis=1)].iterrows():
        if not pd.isnull(row[second_sentence_col]):
            continue
        merged = row[first_sentence_col]
        # A missing first sentence is NaN (a float), which has no .split();
        # treat it as unrepairable instead of raising AttributeError.
        parts = merged.split('\t') if isinstance(merged, str) else []
        if len(parts) != 2:
            unrepairable.append(index)
            continue
        data.at[index, first_sentence_col] = parts[0]
        data.at[index, second_sentence_col] = parts[1]

    # Drop all unrepairable rows in one pass.
    return data.drop(index=unrepairable)

In [6]:
# Load the test split. load_data reads it with pd.read_table (tab-delimited by
# default, despite the .csv extension) and repairs or drops rows whose
# sentence2 value is missing.
test_data = load_data('test.csv')

id           0
sentence1    0
sentence2    0
dtype: int64


### Defining Device Variable

In [14]:
# Run on the first CUDA GPU when one is visible to PyTorch, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print("Using device:", device)

Using device: cuda:0


### Task 1C

In [81]:
from sentence_transformers import SentenceTransformer, models
# Instantiate the pretrained 'distilbert-base-nli-mean-tokens' Sentence Transformer.
# Its weights are overwritten later in the notebook by a state dict loaded from a checkpoint.
model = SentenceTransformer('distilbert-base-nli-mean-tokens')

In [82]:
class ValidationDataset(torch.utils.data.Dataset):
    """Map-style dataset over a sentence-pair DataFrame.

    Columns are addressed by position: column 0 holds the id, columns 1 and 2
    hold the two sentences of the pair.
    """

    def __init__(self, data):
        """Keep a reference to the DataFrame; rows are read on demand."""
        self.data = data

    def __len__(self):
        """Return the number of rows (sentence pairs)."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``([sentence1, sentence2], id_tensor)`` for the row at position ``idx``."""
        frame = self.data
        pair = [frame.iloc[idx, 1], frame.iloc[idx, 2]]
        # Wrap the id in a tensor so the default collate function can stack it.
        pair_id = torch.tensor(frame.iloc[idx, 0])
        return pair, pair_id

# Wrap the test frame and iterate it in its original order, eight pairs per batch.
test_dataset = ValidationDataset(test_data)
test_loader = torch.utils.data.DataLoader(
    dataset=test_dataset,
    batch_size=8,
    shuffle=False,
)

In [83]:
# Move the model to the selected device; as the cell's last expression, the
# returned module is also displayed.
model.to(device)

SentenceTransformer(
  (0): Transformer({'max_seq_length': 128, 'do_lower_case': False}) with Transformer model: DistilBertModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
)

In [15]:
scores = []
# Load the saved state dict into the model. map_location remaps tensors that
# were saved from another device (e.g. a GPU) onto the device selected for this
# run, so the checkpoint also loads on a CPU-only machine.
# NOTE(review): torch.load unpickles the file; only load checkpoints from a trusted source.
# NOTE(review): '/1C_model.pth' is an absolute path at the filesystem root; confirm that is intended.
checkpoint = torch.load('/1C_model.pth', map_location=device)
model.load_state_dict(checkpoint)
def validation(model, valid_loader):
    """Score every sentence pair yielded by ``valid_loader`` with cosine similarity.

    Each batch is expected to unpack as ``(sentences, targets)`` where
    ``sentences[0]`` and ``sentences[1]`` are equal-length sequences holding
    the first and second sentence of every pair in the batch.

    Both sides of a batch are embedded with one ``model.encode`` call each
    instead of one call per sentence, and the similarity is computed row-wise.
    NOTE(review): batched encoding pads sentences to a common length, so scores
    may differ from per-sentence encoding at floating-point rounding level.

    Args:
        model: Object exposing ``eval()`` and
            ``encode(sentences, convert_to_tensor=True)``.
        valid_loader: Iterable of ``(sentences, targets)`` batches.

    Returns:
        list[float]: One cosine similarity per pair, in loader order.
    """
    model.eval()
    all_scores = []
    with torch.no_grad():
        for batch in tqdm(valid_loader):
            sentences, _ = batch
            # Passing a list (not a str) makes encode return a 2-D tensor with
            # one embedding per row, even for a single-pair batch.
            first_embeddings = model.encode(list(sentences[0]), convert_to_tensor=True)
            second_embeddings = model.encode(list(sentences[1]), convert_to_tensor=True)
            # Row i of each tensor belongs to pair i; compare them row-wise.
            batch_scores = torch.nn.functional.cosine_similarity(
                first_embeddings, second_embeddings, dim=1
            )
            all_scores.extend(batch_scores.tolist())
    return all_scores

In [80]:
# Run the model over the whole test loader; validation returns one cosine
# similarity per sentence pair, in loader order.
output1c = validation(model, test_loader)

100%|██████████| 1/1 [00:00<00:00,  3.16it/s]

[0.9926149845123291, 0.7718663811683655, 0.6741741299629211, 0.9912800788879395, 0.7612790465354919, 0.9965555667877197]





In [7]:
# Rescale each cosine similarity by a factor of 5 before comparing.
scores = [similarity * 5 for similarity in output1c]
pearson = PearsonCorrCoef()
predicted = torch.tensor(scores)
# NOTE(review): presumably the gold similarity labels for the six test pairs; confirm the source.
reference = torch.tensor([5.000, 4.750, 5.000, 2.400, 2.750, 2.615])
print(predicted)
print(reference)
# Last expression of the cell: the Pearson correlation between the two tensors.
pearson(predicted, reference)

In [13]:
# Attach the scaled scores to the test frame as a new column.
test_data['scores'] = scores
# Output column order: id, scores, sentence1, sentence2.
column_order = ['id', 'scores', 'sentence1', 'sentence2']
test_data = test_data[column_order]
# Write a tab-separated file without the DataFrame index.
test_data.to_csv('test_scores.csv', sep='\t', index=False)