All imports are done here :)

In [2]:
import random
import pandas as pd

from torch.utils.data import DataLoader
from sentence_transformers import SentenceTransformer, InputExample, losses, evaluation, util

  from .autonotebook import tqdm as notebook_tqdm


# Data preprocessing

This step preprocesses the data.
The idea is to take all duplicate pairs and pad the dataset up to 7500 pairs with randomly chosen non-duplicate pairs. This way we get a small but balanced dataset.

In [None]:
# Load data. Seeding `random` makes the non-duplicate sample below reproducible.
random.seed(4)
raw_data = pd.read_csv('train_normalised.csv', sep='\t',  index_col='pair_id')

# Labels are stored as floats; they are passed as-is to the training examples.
raw_data[['is_duplicate']] = raw_data[['is_duplicate']].astype(float)

TARGET_SIZE = 7500      # total number of pairs to keep (duplicates + non-duplicates)
TRAIN_FRACTION = 0.9    # 90% - train, 10% - test

# Index labels of all duplicate pairs and of everything else.
is_dup = raw_data['is_duplicate'] == 1
duplicate_idx = raw_data.index[is_dup].tolist()
non_duplicate_idx = raw_data.index[~is_dup].tolist()

# Pad with randomly chosen non-duplicate pairs up to TARGET_SIZE.
# Sampling is done from the labels that actually exist in the frame (so .loc
# below cannot hit a missing pair_id) and without replacement (so no pair is
# picked twice and ends up in both the train and the test part).
n_random = min(max(TARGET_SIZE - len(duplicate_idx), 0), len(non_duplicate_idx))
random_idx = random.sample(non_duplicate_idx, n_random)

# Train/test split: each class is cut at TRAIN_FRACTION of its OWN length,
# so both classes are split 90/10 even when their sizes differ.
dup_cut = int(len(duplicate_idx) * TRAIN_FRACTION)
rand_cut = int(len(random_idx) * TRAIN_FRACTION)
train_idx = duplicate_idx[:dup_cut] + random_idx[:rand_cut]
eval_idx = duplicate_idx[dup_cut:] + random_idx[rand_cut:]

train_data = raw_data.loc[train_idx].copy()
eval_data = raw_data.loc[eval_idx].copy()

# Build the training examples (one per pair) and wrap them in a shuffling dataloader.
train_examples = [
    InputExample(texts=[name_1, name_2], label=float(label))
    for name_1, name_2, label in zip(train_data['name_1'],
                                     train_data['name_2'],
                                     train_data['is_duplicate'])
]
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=64)

#evaluator = evaluation.EmbeddingSimilarityEvaluator(eval_data['name_1'].tolist(), eval_data['name_2'].tolist(), eval_data['is_duplicate'].tolist())

# Training

In [None]:
# Load the pretrained 'all-MiniLM-L6-v2' checkpoint, using the local 'model' folder as cache.
model = SentenceTransformer(model_name_or_path='all-MiniLM-L6-v2', cache_folder='model')
# Training loss bound to the model above.
train_loss = losses.CosineSimilarityLoss(model=model)

In [None]:
# Fine-tune on a single (dataloader, loss) objective and save the result to output_path.
# No evaluator is passed: the evaluator / evaluation_steps arguments stay commented out.
training_objectives = [(train_dataloader, train_loss)]
model.fit(
    train_objectives=training_objectives,
    epochs=50,
    warmup_steps=100,
    #evaluator=evaluator,
    #evaluation_steps=500,
    output_path='model_norm/sentence-transformers_all-MiniLM-L6-v2',
)

# Testing

In [None]:
# Testing: score every evaluation pair and classify it with a fixed threshold.
name1 = eval_data['name_1'].to_list()
name2 = eval_data['name_2'].to_list()
dup = eval_data['is_duplicate'].to_list()

emb1 = model.encode(name1, convert_to_tensor=True)
emb2 = model.encode(name2, convert_to_tensor=True)

# Only the similarity of name1[i] with name2[i] is needed, so compute the
# pairwise scores directly instead of the full len(name1) x len(name2) matrix.
pair_scores = util.pairwise_cos_sim(emb1, emb2).tolist()

# A pair is predicted "duplicate" when its score is strictly above the threshold;
# everything else (including a score exactly equal to it) is predicted "non duplicate",
# so every pair lands in exactly one confusion-matrix cell.
thres = 0.8
count_tp = count_fp = count_tn = count_fn = 0
for score, label in zip(pair_scores, dup):
    predicted_dup = score > thres
    actual_dup = label == 1.0  # any other label value counts as non duplicate
    if predicted_dup and actual_dup:
        count_tp += 1
    elif predicted_dup:
        count_fp += 1
    elif actual_dup:
        count_fn += 1
    else:
        count_tn += 1

# Accuracy in percent; guarded so an empty evaluation set does not divide by zero.
total = count_tp + count_fp + count_tn + count_fn
acc = round((count_tp + count_tn) / total * 100, 3) if total else 0.0

print("Total number of duplicate pairs in test set: {}".format(len(eval_data[eval_data['is_duplicate']==1.0])))
print("Total number of non duplicate pairs in test set: {}".format(len(eval_data[eval_data['is_duplicate']==0.0])))
print("Accuracy: {}%".format(acc))
print("True positive: {}\nFalse positive: {}\nTrue negative: {}\nFalse negative: {}".format(count_tp, count_fp, count_tn, count_fn))