In [1]:
import os
import csv
import math
import torch
import logging
from datetime import datetime
from torch.utils.data import DataLoader, Dataset
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sentence_transformers import SentenceTransformer, LoggingHandler, losses, util, InputExample


device=""
if torch.cuda.is_available():
    device = torch.device("cuda")
    logging.info(f'Using GPU: {torch.cuda.get_device_name()}')
    print(f'Using GPU: {torch.cuda.get_device_name()}')
else:
    device = torch.device("cpu")
    logging.info('Using CPU')
    print('Using CPU')

  from .autonotebook import tqdm as notebook_tqdm


Using GPU: NVIDIA GeForce RTX 3060 Laptop GPU


In [2]:
PATH="data/"

In [3]:


class TSDataset(Dataset):
    def __init__(self, file_path):
        data = []
        with open(file_path, encoding="utf8") as f:
            reader = csv.DictReader(f, delimiter="\t")
            for row in reader:
                data.append(InputExample(texts=[row["sentence1"], row["sentence2"]], label=(float(row["score"]))/5))

        self.samples = data

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        sample = self.samples[idx]
        return sample


In [4]:
model_name = "paraphrase-MiniLM-L3-v2"
train_batch_size = 16
num_epochs = 2

In [5]:
logging.basicConfig(
    format="%(asctime)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S", level=logging.INFO, handlers=[LoggingHandler()]
)

In [6]:
model_save_path = (
    "output/training-" + model_name + "-" + datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
)


In [7]:
model = SentenceTransformer(model_name)
model.to(device)
logging.info("Read train dataset")

In [8]:
train_dataset = TSDataset(f"{PATH}train.csv")
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=train_batch_size)
train_loss = losses.CosineSimilarityLoss(model=model)


In [9]:
logging.info("Read dev dataset")
dev_dataset = TSDataset(f"{PATH}dev.csv")

# dev_input_examples = [InputExample(texts=[sentence1, sentence2], label=score) for sentence1, sentence2, score in dev_dataset]

# evaluator = EmbeddingSimilarityEvaluator.from_input_examples(dev_input_examples, name="dev")
# evaluator = EmbeddingSimilarityEvaluator(dev_dataset)
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(dev_dataset, name="dev")


In [10]:
warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1)  # 10% of train data for warm-up
logging.info("Warmup-steps: {}".format(warmup_steps))

In [11]:
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    evaluator=evaluator,
    epochs=num_epochs,
    evaluation_steps=500,
    warmup_steps=warmup_steps,
    output_path=model_save_path,
    # device=device
)

Iteration: 100%|██████████| 357/357 [00:10<00:00, 34.74it/s]
Iteration: 100%|██████████| 357/357 [00:10<00:00, 35.07it/s]
Epoch: 100%|██████████| 2/2 [00:23<00:00, 11.64s/it]


In [12]:
# test_dataset = TSDataset(f"{PATH}test.csv")

In [13]:

model = SentenceTransformer(model_save_path)
test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(dev_dataset, name="test")
test_evaluator(model, output_path=model_save_path)

0.8779326698957277