In [21]:
import numpy as np
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer,InputExample,losses,evaluation,util
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
from tqdm import tqdm

In [22]:
# Notebook configuration.
# model_name: pretrained sentence-transformers checkpoint used as the starting point.
model_name = 'nli-distilroberta-base-v2'
# data_dir: folder that holds pos.txt and neg.txt (the trailing slash is required
# because file names are appended to it with plain string concatenation).
data_dir = 'data/rt-polaritydata/'
# device: use the GPU when one is available, otherwise fall back to the CPU so the
# notebook still runs (slowly) on machines without CUDA instead of failing on .to(device).
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [23]:
# Instantiate the pretrained sentence-embedding model named by `model_name`.
model = SentenceTransformer(model_name)
# Bare last expression: display the model's module structure as the cell output.
model

SentenceTransformer(
  (0): Transformer({'max_seq_length': 75, 'do_lower_case': False}) with Transformer model: RobertaModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
)

In [24]:
# Read each polarity file into a list with one whitespace-stripped entry per line.
with open(data_dir + 'pos.txt', 'r', encoding='latin-1') as pos_file:
    pos = [raw_line.strip() for raw_line in pos_file]
with open(data_dir + 'neg.txt', 'r', encoding='latin-1') as neg_file:
    neg = [raw_line.strip() for raw_line in neg_file]
print(len(pos), len(neg))

# Hold out roughly one third of each class as the test split (fixed random_state=42).
train_pos, test_pos = train_test_split(pos, test_size=0.33333, random_state=42)
train_neg, test_neg = train_test_split(neg, test_size=0.33333, random_state=42)
print(len(train_pos), len(test_pos), len(train_neg), len(test_neg))

5331 5331
3554 1777 3554 1777


In [25]:
# Build every sentence pair of the TRAINING split, labelled with a target similarity:
#   label 0. -> one positive review paired with one negative review
#   label 1. -> two reviews of the same polarity (each unordered pair exactly once)
# Sentences are taken from train_pos / train_neg (the lists returned by
# train_test_split) so the pairs contain only training sentences. Indexing the
# unsplit pos / neg lists with these ranges would instead select the first
# len(train_*) lines of each file, which is not the training split.
train_examples = []
for pos_text in tqdm(train_pos):
    for neg_text in train_neg:
        train_examples.append(InputExample(texts=[pos_text, neg_text], label=0.))
for i in tqdm(range(len(train_pos))):
    # j starts at i + 1: skip self-pairs and the mirrored (j, i) duplicates.
    for j in range(i + 1, len(train_pos)):
        train_examples.append(InputExample(texts=[train_pos[i], train_pos[j]], label=1.))
for i in tqdm(range(len(train_neg))):
    for j in range(i + 1, len(train_neg)):
        train_examples.append(InputExample(texts=[train_neg[i], train_neg[j]], label=1.))

100%|██████████| 3554/3554 [00:38<00:00, 92.11it/s] 
100%|██████████| 3554/3554 [00:21<00:00, 164.61it/s]
100%|██████████| 3554/3554 [00:04<00:00, 769.08it/s] 


In [26]:
# Build every sentence pair of the TEST split, labelled with a target similarity:
#   label 0. -> one positive review paired with one negative review
#   label 1. -> two reviews of the same polarity (each unordered pair exactly once)
# Sentences are taken from test_pos / test_neg (the held-out lists returned by
# train_test_split). Indexing the unsplit pos / neg lists with these ranges would
# instead select the first len(test_*) lines of each file, which is not the
# held-out split and would make the evaluation pairs unreliable.
test_examples = []
for pos_text in tqdm(test_pos):
    for neg_text in test_neg:
        test_examples.append(InputExample(texts=[pos_text, neg_text], label=0.))
for i in tqdm(range(len(test_pos))):
    # j starts at i + 1: skip self-pairs and the mirrored (j, i) duplicates.
    for j in range(i + 1, len(test_pos)):
        test_examples.append(InputExample(texts=[test_pos[i], test_pos[j]], label=1.))
for i in tqdm(range(len(test_neg))):
    for j in range(i + 1, len(test_neg)):
        test_examples.append(InputExample(texts=[test_neg[i], test_neg[j]], label=1.))

100%|██████████| 1777/1777 [00:02<00:00, 771.80it/s]
100%|██████████| 1777/1777 [00:01<00:00, 1555.97it/s]
100%|██████████| 1777/1777 [00:19<00:00, 90.38it/s]


In [27]:
# Sanity check: number of training pairs and number of test pairs.
print(len(train_examples),len(test_examples))

25258278 6313681


Computing cosine similarities before fine-tuning (using the pretrained model)

In [28]:
# Embed the positive and then the negative test sentences with the current `model`,
# requesting tensors so they can be passed straight to util.cos_sim.
test_pos_embeddings, test_neg_embeddings = (
    model.encode(sentences, show_progress_bar=True, convert_to_tensor=True)
    for sentences in (test_pos, test_neg)
)

HBox(children=(FloatProgress(value=0.0, description='Batches', max=56.0, style=ProgressStyle(description_width…




HBox(children=(FloatProgress(value=0.0, description='Batches', max=56.0, style=ProgressStyle(description_width…




In [29]:
# Report the mean cosine similarity for each class pairing of the test embeddings.
# Each score matrix is copied to the CPU as a NumPy array and averaged over all entries.
for description, left, right in (
    ("Test Pos-Pos Similarity", test_pos_embeddings, test_pos_embeddings),
    ("Test Pos-Neg Similarity", test_pos_embeddings, test_neg_embeddings),
    ("Test Neg-Neg Similarity", test_neg_embeddings, test_neg_embeddings),
):
    cosine_scores = util.cos_sim(left, right).cpu().numpy()
    print(description, np.mean(cosine_scores))

Test Pos-Pos Similarity 0.29552466
Test Pos-Neg Similarity 0.25853947
Test Neg-Neg Similarity 0.28099316


In [30]:
# Embed the positive and then the negative training sentences with the current `model`,
# requesting tensors so they can be passed straight to util.cos_sim.
train_pos_embeddings, train_neg_embeddings = (
    model.encode(sentences, show_progress_bar=True, convert_to_tensor=True)
    for sentences in (train_pos, train_neg)
)

HBox(children=(FloatProgress(value=0.0, description='Batches', max=112.0, style=ProgressStyle(description_widt…




HBox(children=(FloatProgress(value=0.0, description='Batches', max=112.0, style=ProgressStyle(description_widt…




In [31]:
# Report the mean cosine similarity for each class pairing of the training embeddings.
# Each score matrix is copied to the CPU as a NumPy array and averaged over all entries.
for description, left, right in (
    ("Train Pos-Pos Similarity", train_pos_embeddings, train_pos_embeddings),
    ("Train Pos-Neg Similarity", train_pos_embeddings, train_neg_embeddings),
    ("Train Neg-Neg Similarity", train_neg_embeddings, train_neg_embeddings),
):
    cosine_scores = util.cos_sim(left, right).cpu().numpy()
    print(description, np.mean(cosine_scores))

Train Pos-Pos Similarity 0.29575583
Train Pos-Neg Similarity 0.2550434
Train Neg-Neg Similarity 0.2744917


In [32]:
# Training batches: the pair examples, reshuffled, 64 per batch.
train_dataloader = DataLoader(train_examples,shuffle=True,batch_size=64)
# Loss attached to the model being fine-tuned. Presumably it regresses the cosine
# similarity of each pair's embeddings onto the pair's float label (0. / 1.) --
# confirm against the sentence-transformers documentation.
train_loss = losses.CosineSimilarityLoss(model=model)
# Evaluator built from the test pairs and their labels, with a progress bar enabled.
evaluator = evaluation.EmbeddingSimilarityEvaluator.from_input_examples(test_examples,show_progress_bar=True)

In [33]:
# Move the model to the configured device.
model = model.to(device)

Uncomment the following block to run fine-tuning on the MR dataset; the fine-tuned model is saved to the `model` directory.

In [34]:
# Fine-tuning is disabled by default: remove the leading "# " from every line of the
# call below to run it. It trains on train_dataloader with train_loss, runs the
# evaluator every `evaluation_steps` steps, and writes the model to output_path.
# model.fit(train_objectives=[(train_dataloader, train_loss)],
#           evaluator=evaluator,
#           epochs=20,
#           evaluation_steps=100000,
#           warmup_steps=100,
#           output_path='model'
#           )

In [35]:
# Load the fine-tuned model from ./model/, replacing the pretrained `model`.
model = SentenceTransformer('./model/')
# Move it to the configured device; as the last expression this also displays the module.
model.to(device)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


SentenceTransformer(
  (0): Transformer({'max_seq_length': 75, 'do_lower_case': False}) with Transformer model: RobertaModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
)

Computing cosine similarities after fine-tuning

In [36]:
# Embed the positive and then the negative test sentences with the current `model`,
# requesting tensors so they can be passed straight to util.cos_sim.
test_pos_embeddings, test_neg_embeddings = (
    model.encode(sentences, show_progress_bar=True, convert_to_tensor=True)
    for sentences in (test_pos, test_neg)
)

HBox(children=(FloatProgress(value=0.0, description='Batches', max=56.0, style=ProgressStyle(description_width…




HBox(children=(FloatProgress(value=0.0, description='Batches', max=56.0, style=ProgressStyle(description_width…




In [37]:
# Report the mean cosine similarity for each class pairing of the test embeddings.
# Each score matrix is copied to the CPU as a NumPy array and averaged over all entries.
for description, left, right in (
    ("Test Pos-Pos Similarity", test_pos_embeddings, test_pos_embeddings),
    ("Test Pos-Neg Similarity", test_pos_embeddings, test_neg_embeddings),
    ("Test Neg-Neg Similarity", test_neg_embeddings, test_neg_embeddings),
):
    cosine_scores = util.cos_sim(left, right).cpu().numpy()
    print(description, np.mean(cosine_scores))

Test Pos-Pos Similarity 0.9077267
Test Pos-Neg Similarity 0.090817936
Test Neg-Neg Similarity 0.89808726


In [38]:
# Embed the positive and then the negative training sentences with the current `model`,
# requesting tensors so they can be passed straight to util.cos_sim.
train_pos_embeddings, train_neg_embeddings = (
    model.encode(sentences, show_progress_bar=True, convert_to_tensor=True)
    for sentences in (train_pos, train_neg)
)

HBox(children=(FloatProgress(value=0.0, description='Batches', max=112.0, style=ProgressStyle(description_widt…




HBox(children=(FloatProgress(value=0.0, description='Batches', max=112.0, style=ProgressStyle(description_widt…




In [39]:
# Report the mean cosine similarity for each class pairing of the training embeddings.
# Each score matrix is copied to the CPU as a NumPy array and averaged over all entries.
for description, left, right in (
    ("Train Pos-Pos Similarity", train_pos_embeddings, train_pos_embeddings),
    ("Train Pos-Neg Similarity", train_pos_embeddings, train_neg_embeddings),
    ("Train Neg-Neg Similarity", train_neg_embeddings, train_neg_embeddings),
):
    cosine_scores = util.cos_sim(left, right).cpu().numpy()
    print(description, np.mean(cosine_scores))

Train Pos-Pos Similarity 0.9011581
Train Pos-Neg Similarity 0.08809084
Train Neg-Neg Similarity 0.91211754
