In [11]:
from torch.utils.data import DataLoader
import math
from sentence_transformers import SentenceTransformer, LoggingHandler, losses, util, InputExample
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
import logging
from datetime import datetime
import os
import gzip
import csv
import pandas as pd
import random

In [12]:
#### Just some code to print debug information to stdout
logging.basicConfig(
    format="%(asctime)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S", level=logging.INFO, handlers=[LoggingHandler()]
)
#### /print debug information to stdout

In [13]:
# Read the dataset
model_name = "snunlp/KR-SBERT-V40K-klueNLI-augSTS"
train_batch_size = 16
num_epochs = 4
model_save_path = (
    "output/scientific_continue_training_kr_sbert_v40k_klueNLI_augSTS"
)
# Load a pre-trained sentence transformer model
model = SentenceTransformer(model_name)

2024-05-02 11:54:46 - Load pretrained SentenceTransformer: snunlp/KR-SBERT-V40K-klueNLI-augSTS
2024-05-02 11:54:46 - Use pytorch device: cuda


In [14]:
data_path = 'data'
train_df = pd.read_csv(os.path.join(data_path,'train_ir.csv'))
val_df = pd.read_csv(os.path.join(data_path,'val_ir.csv'))
test_df = pd.read_csv(os.path.join(data_path,'test_ir.csv'))

In [15]:
train_df.shape, val_df.shape, test_df.shape

((2912, 3), (434, 3), (926, 3))

In [16]:
# Convert the dataset to a DataLoader ready for training
logging.info("Converting train dataset to DataLoader")

train_samples = []
val_samples = []
test_samples = []

for idx, (answer, question, domain) in train_df.iterrows():
    train_samples.append(InputExample(texts=[question, answer], label=random.uniform(0.8, 1)))

for idx, (answer, question, domain) in val_df.iterrows():
    val_samples.append(InputExample(texts=[question, answer], label=random.uniform(0.8, 1)))

for idx, (answer, question, domain) in test_df.iterrows():
    test_samples.append(InputExample(texts=[question, answer], label=random.uniform(0.8, 1)))
train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=train_batch_size)

2024-05-02 11:54:49 - Converting train dataset to DataLoader


In [19]:
train_loss = losses.CosineSimilarityLoss(model=model)
# Development set: Measure correlation between cosine score and gold labels
logging.info("Read Scientific val dataset")
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(val_samples, name="scientific-val")

2024-05-02 11:54:52 - Read Scientific val dataset


In [20]:
# Configure the training. We skip evaluation in this example
warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1)  # 10% of train data for warm-up
logging.info("Warmup-steps: {}".format(warmup_steps))

2024-05-02 11:54:52 - Warmup-steps: 292


In [21]:
# Train the model
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    evaluator=evaluator,
    epochs=num_epochs,
    evaluation_steps=1000,
    warmup_steps=warmup_steps,
    output_path=model_save_path,
)


##############################################################################
#
# Load the stored model and evaluate its performance on Scientific dataset
#
##############################################################################

model = SentenceTransformer(model_save_path)
test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(test_samples, name="scientific-test")
test_evaluator(model, output_path=model_save_path)

Epoch:   0%|          | 0/4 [00:00<?, ?it/s]

Iteration:   0%|          | 0/728 [00:00<?, ?it/s]

2024-05-02 11:55:25 - EmbeddingSimilarityEvaluator: Evaluating the model on scientific-val dataset after epoch 0:
2024-05-02 11:55:25 - Cosine-Similarity :	Pearson: 0.0144	Spearman: 0.0113
2024-05-02 11:55:25 - Manhattan-Distance:	Pearson: -0.0064	Spearman: -0.0019
2024-05-02 11:55:25 - Euclidean-Distance:	Pearson: -0.0048	Spearman: -0.0000
2024-05-02 11:55:25 - Dot-Product-Similarity:	Pearson: 0.0619	Spearman: 0.0569
2024-05-02 11:55:25 - Save model to output/scientific_continue_training_kr_sbert_v40k_klueNLI_augSTS


Iteration:   0%|          | 0/728 [00:00<?, ?it/s]

2024-05-02 11:55:58 - EmbeddingSimilarityEvaluator: Evaluating the model on scientific-val dataset after epoch 1:
2024-05-02 11:55:59 - Cosine-Similarity :	Pearson: 0.0187	Spearman: 0.0141
2024-05-02 11:55:59 - Manhattan-Distance:	Pearson: 0.0082	Spearman: 0.0050
2024-05-02 11:55:59 - Euclidean-Distance:	Pearson: 0.0057	Spearman: 0.0021
2024-05-02 11:55:59 - Dot-Product-Similarity:	Pearson: 0.0708	Spearman: 0.0769
2024-05-02 11:55:59 - Save model to output/scientific_continue_training_kr_sbert_v40k_klueNLI_augSTS


Iteration:   0%|          | 0/728 [00:00<?, ?it/s]

2024-05-02 11:56:31 - EmbeddingSimilarityEvaluator: Evaluating the model on scientific-val dataset after epoch 2:
2024-05-02 11:56:32 - Cosine-Similarity :	Pearson: 0.0155	Spearman: 0.0193
2024-05-02 11:56:32 - Manhattan-Distance:	Pearson: 0.0135	Spearman: 0.0100
2024-05-02 11:56:32 - Euclidean-Distance:	Pearson: 0.0130	Spearman: 0.0076
2024-05-02 11:56:32 - Dot-Product-Similarity:	Pearson: 0.0582	Spearman: 0.0622


Iteration:   0%|          | 0/728 [00:00<?, ?it/s]

2024-05-02 11:57:04 - EmbeddingSimilarityEvaluator: Evaluating the model on scientific-val dataset after epoch 3:
2024-05-02 11:57:05 - Cosine-Similarity :	Pearson: 0.0386	Spearman: 0.0364
2024-05-02 11:57:05 - Manhattan-Distance:	Pearson: 0.0397	Spearman: 0.0296
2024-05-02 11:57:05 - Euclidean-Distance:	Pearson: 0.0362	Spearman: 0.0280
2024-05-02 11:57:05 - Dot-Product-Similarity:	Pearson: 0.0755	Spearman: 0.0887
2024-05-02 11:57:05 - Save model to output/scientific_continue_training_kr_sbert_v40k_klueNLI_augSTS
2024-05-02 11:57:05 - Load pretrained SentenceTransformer: output/scientific_continue_training_kr_sbert_v40k_klueNLI_augSTS
2024-05-02 11:57:05 - Use pytorch device: cuda
2024-05-02 11:57:05 - EmbeddingSimilarityEvaluator: Evaluating the model on scientific-test dataset:
2024-05-02 11:57:07 - Cosine-Similarity :	Pearson: -0.0433	Spearman: -0.0293
2024-05-02 11:57:07 - Manhattan-Distance:	Pearson: -0.0380	Spearman: -0.0261
2024-05-02 11:57:07 - Euclidean-Distance:	Pearson: -0.0

-0.02609918354261913