In [15]:
import torch
from transformers import (AutoTokenizer, AutoConfig,
AutoModelForSequenceClassification)
from transformers import Trainer, TrainingArguments
from datasets import Dataset, DatasetDict
import numpy as np
import pandas as pd
from scipy.special import expit as sigmoid
from sklearn.metrics import classification_report
import random
from collections import defaultdict
from sentence_transformers import SentenceTransformer, SentencesDataset
from sentence_transformers.losses import TripletLoss
from sentence_transformers.readers import LabelSentenceReader, InputExample
from torch.utils.data import DataLoader

In [16]:
# Load the preprocessed sentence table.
# The label array ('processed-files/new_labels.npy') is read by the following
# cell right before it is encoded, so it is intentionally not loaded here too:
# the value would be overwritten before any use.
df = pd.read_csv('processed-files/df.csv')

In [17]:
# Read the raw label array and replace every distinct value by its integer id
# (the inverse index returned by np.unique).
labels = np.unique(
    np.load('processed-files/new_labels.npy', allow_pickle=True),
    return_inverse=True,
)[1]

# Attach the ids to the frame, converted to a float tensor first.
df['label'] = torch.tensor(labels, dtype=torch.float)

In [18]:
# Keep a seeded 5% random subset of the rows.
# NOTE(review): this rebinds `df`, so running the cell twice shrinks the data
# again — re-run the loading cells first if that is not intended.
df = df.sample(
    random_state=42,
    frac=0.05,
)

In [19]:
# Display the sampled frame (notebook rich repr) as a quick sanity check.
df

Unnamed: 0,Word A,Word B,Relation,Sentence,label
1426106,simulation,verification,20,"For verification, system-level finite element ...",12.0
1391092,parameter,voltage,15,When the bias voltage is chosen to be a contro...,0.0
1152842,complexity,process,3,The solution is compared with the other approa...,2.0
1025554,graph,minimum spanning tree,12,We propose such an inference algorithm for fir...,9.0
111566,noise,signal,12,For incident photons in the 800 to 1300 nm ran...,7.0
...,...,...,...,...,...
398260,optimization problem,pso,28,"To this end, we develop a mixed-integer nonlin...",17.0
1316580,computer,monitoring,0,We employed Intel Edison and Raspberry Pi as F...,0.0
45006,tracking,tracking performance,8,The PS2 design has evolved incrementally based...,4.0
685149,id,neural network,28,"In this paper, we propose Attention and CL los...",18.0


In [24]:
def triplets_from_labeled_dataset(df, n_triplets=10000, random_state=None, max_attempts=None):
    """Build unique (anchor, positive, negative) sentence triplets.

    An anchor row is drawn uniformly at random.  The positive is a random row
    carrying the same ``label`` and the negative is a random row carrying a
    different ``label``.  A candidate is rejected when the positive has the
    same sentence text as the anchor, or when the exact text triplet has
    already been produced.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide a ``'Sentence'`` column and a ``'label'`` column.  Rows
        whose label is missing are never used as anchor or positive.
    n_triplets : int, default 10000
        Number of distinct triplets to return.
    random_state : int, numpy.random.Generator or None, default None
        Passed to ``numpy.random.default_rng``; give a seed for reproducible
        triplets.
    max_attempts : int or None, default None
        Upper bound on the number of anchor draws.  ``None`` means
        ``100 * n_triplets``.

    Returns
    -------
    list of InputExample
        Each example is created with ``texts=[anchor, positive, negative]``.

    Raises
    ------
    ValueError
        If ``df`` contains fewer than two distinct labels (no negative exists).
    RuntimeError
        If ``n_triplets`` distinct triplets could not be found within
        ``max_attempts`` draws (instead of looping forever).
    """
    if n_triplets <= 0:
        return []

    sentences = df['Sentence'].to_numpy()
    # Integer code per row (-1 for a missing label); computed once instead of
    # re-filtering the whole frame on every iteration.
    codes, distinct_labels = pd.factorize(df['label'])
    if len(distinct_labels) < 2:
        raise ValueError("need at least two distinct labels to build triplets")

    # Row positions grouped by label code, for O(1) positive sampling.
    rows_by_code = {}
    for position, code in enumerate(codes.tolist()):
        if code >= 0:
            rows_by_code.setdefault(code, []).append(position)

    rng = np.random.default_rng(random_state)
    attempt_budget = 100 * n_triplets if max_attempts is None else max_attempts
    n_rows = len(sentences)

    # Text triplets already emitted.  The membership test must run against the
    # raw text tuples: comparing a tuple with InputExample objects never matches.
    seen = set()
    triplets = []
    attempts = 0
    while len(triplets) < n_triplets:
        if attempts >= attempt_budget:
            raise RuntimeError(
                "could not build %d unique triplets in %d attempts (found %d)"
                % (n_triplets, attempt_budget, len(triplets))
            )
        attempts += 1

        anchor = int(rng.integers(n_rows))
        anchor_code = int(codes[anchor])
        if anchor_code < 0:
            continue  # anchor has no label, so no positive can be chosen

        same_label_rows = rows_by_code[anchor_code]
        positive = same_label_rows[int(rng.integers(len(same_label_rows)))]
        if sentences[positive] == sentences[anchor]:
            continue  # positive must differ from the anchor sentence

        # Rejection-sample a row with a different label; this terminates
        # because at least two distinct labels exist.
        negative = int(rng.integers(n_rows))
        while codes[negative] == anchor_code:
            negative = int(rng.integers(n_rows))

        key = (sentences[anchor], sentences[positive], sentences[negative])
        if key in seen:
            continue
        seen.add(key)
        triplets.append(InputExample(texts=list(key)))
    return triplets
    




In [25]:
# Pretrained sentence encoder to fine-tune, and the triplet objective bound to it.
sbert_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
loss = TripletLoss(model=sbert_model)

# Training examples: (anchor, positive, negative) sentence triplets from `df`.
triplets = triplets_from_labeled_dataset(df)

# Wrap the triplets for the model and feed them in shuffled batches of 16.
finetune_data = SentencesDataset(examples=triplets, model=sbert_model)
finetune_dataloader = DataLoader(
    finetune_data,
    batch_size=16,
    shuffle=True,
)



In [26]:
# Fine-tune on the triplet objective for 4 epochs, with 'sbert_model' as the output path.
sbert_model.fit(
    train_objectives=[(finetune_dataloader, loss)],
    output_path='sbert_model',
    epochs=4,
)

Iteration: 100%|██████████| 625/625 [1:38:59<00:00,  9.50s/it]
Epoch:  25%|██▌       | 1/4 [1:38:59<4:56:57, 5939.11s/it]

In [None]:
# Persist the fine-tuned encoder to the 'sbert_model' directory.
# NOTE(review): fit() was already given output_path='sbert_model'; this explicit
# save may be redundant — confirm before removing.
sbert_model.save(path='sbert_model')