In [23]:
from torch.utils.data import DataLoader
import math
from sentence_transformers import SentenceTransformer, LoggingHandler, losses, util, InputExample
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
import logging
from datetime import datetime
import os
import gzip
import csv
import pandas as pd
import random

In [24]:
# Configure the root logger at INFO level so progress messages from this notebook
# and from the training loop are shown, each prefixed with a timestamp.
# Records are emitted through sentence-transformers' LoggingHandler.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    handlers=[LoggingHandler()],
)

In [25]:
# Training hyperparameters.
train_batch_size = 16
num_epochs = 4

# Name of the pre-trained checkpoint that training continues from, and the
# directory the fine-tuned model is written to.
model_name = "snunlp/KR-SBERT-V40K-klueNLI-augSTS"
model_save_path = "output/scientific_continue_training_kr_sbert_v40k_klueNLI_augSTS"

# Instantiate the pre-trained sentence transformer that will be fine-tuned.
model = SentenceTransformer(model_name)

2024-05-02 17:50:32 - Load pretrained SentenceTransformer: snunlp/KR-SBERT-V40K-klueNLI-augSTS
2024-05-02 17:50:32 - Use pytorch device: cuda


In [26]:
# Directory (relative to the working directory) that holds the pre-split CSV files.
data_path = 'data'

# All three splits carry the same export timestamp in their file names, so the
# paths are built from one template and read in train / val / test order.
split_stamp = '2024-05-02_15-46'
train_df, val_df, test_df = (
    pd.read_csv(os.path.join(data_path, f'{split}_ir_{split_stamp}.csv'))
    for split in ('train', 'val', 'test')
)

In [27]:
# Append the test and validation rows to the training frame.
# NOTE(review): val_df and test_df are left untouched and are still used later to
# build the evaluation samples, so every score reported on those splits is measured
# on rows the model was also trained on. Confirm this is intentional (e.g. a final
# "train on everything" run) before trusting the validation/test metrics.
# NOTE(review): this cell is not idempotent - re-running it without reloading the
# CSVs appends the two splits to train_df a second time.
train_df = pd.concat([train_df, test_df, val_df])

In [28]:
# Sanity check: (rows, columns) of each frame. At this point train_df is already
# the concatenation of all three splits, so its row count includes val and test.
train_df.shape, val_df.shape, test_df.shape

((140726, 6), (14292, 6), (30508, 6))

In [29]:
# Preview the first rows to check the column layout used when building samples.
train_df.head()

Unnamed: 0,answer,id,question,domain,data_id,posneg
0,건강한 사람이 에너지 균형을 평형 상태로 유지하는 것은 중요합니다. 에너지 균형은 ...,42508ee0-c543-4338-878e-d98c6babee66,"에너지 균형이란 무엇이며, 왜 건강에 중요한가요?",nutrition,test,1.0
1,"수소, 산소, 질소 가스의 혼합물에서 평균 속도가 가장 빠른 분자는 수소입니다. 수...",4a437e7f-16c1-4c62-96b9-f173d44f4339,수소 분자가 다른 분자들보다 더 빠르게 움직이는 이유는 무엇인가요?,conceptual_physics,test,1.0
2,종이와 플라스틱은 재활용 가능한 자원입니다. 중학교 과학 수업에서 우리는 종이와 플...,d3c68be5-9cb1-4d6e-ba18-5f81cf89affb,종이와 플라스틱이 재활용 가능한 이유는 무엇인가요?,ARC_Challenge,test,1.0
3,마이애미파랑나비는 남부 플로리다에서 멸종 위기에 처한 종입니다. 이 나비의 개체수 ...,910107a6-2a42-41a2-b337-fbf22d6440fe,마이애미파랑나비의 주택 건설 증가에 대한 영향은 어떻게 나타나고 있나요?,ARC_Challenge,test,1.0
4,"AIDS에 직면한 민족문화 공동체 연구에 따르면, 한 해 동안 섹스 파트너가 한 명...",80feb7f2-1b24-4a9d-87a3-4976e9304e74,성 건강 및 성 문화에 대한 연구에서 발견된 주요 결과는 무엇인가요?,human_sexuality,test,1.0


In [30]:
# Convert every split into sentence-transformers InputExample objects and wrap the
# training examples in a shuffling DataLoader.
logging.info("Converting train dataset to DataLoader")

# Dedicated, seeded generator for the jittered soft labels so that they are
# identical on every Restart & Run All. The module-level `random` state is not touched.
LABEL_SEED = 42
_label_rng = random.Random(LABEL_SEED)


def build_samples(df, rng):
    """Turn a frame of question/answer pairs into a list of InputExample.

    Args:
        df: DataFrame with at least the columns ``question``, ``answer`` and
            ``posneg``. Columns are read by name, so their order does not matter.
        rng: ``random.Random`` instance used to draw the soft labels.

    Returns:
        One ``InputExample(texts=[question, answer], label=...)`` per usable row.
        Rows whose question or answer is not a string (e.g. a missing value) are
        skipped. The label is drawn uniformly from [0.8, 1] when ``posneg == 1.0``
        and from [0, 0.2] otherwise.
    """
    samples = []
    # Iterating the three columns directly avoids building a Series per row
    # (as iterrows does) and avoids binding a loop variable named `id`, which
    # would shadow the builtin for the rest of the notebook.
    for question, answer, posneg in zip(df["question"], df["answer"], df["posneg"]):
        if not (isinstance(question, str) and isinstance(answer, str)):
            continue
        label = rng.uniform(0.8, 1) if posneg == 1.0 else rng.uniform(0, 0.2)
        samples.append(InputExample(texts=[question, answer], label=label))
    return samples


# Draw order (train, val, test) is fixed so the seeded labels stay stable.
train_samples = build_samples(train_df, _label_rng)
# NOTE(review): the val/test gold labels are jittered exactly like the training
# labels. Random noise inside each class likely depresses the rank correlation
# reported by the evaluator - consider fixed 1.0 / 0.0 labels for evaluation.
val_samples = build_samples(val_df, _label_rng)
test_samples = build_samples(test_df, _label_rng)

train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=train_batch_size)

2024-05-02 17:50:35 - Converting train dataset to DataLoader


In [31]:
# Training objective: CosineSimilarityLoss attached to the model being fine-tuned.
train_loss = losses.CosineSimilarityLoss(model=model)
# Development set: measure correlation between the embedding similarity scores and
# the gold labels (the training log reports Pearson and Spearman for cosine,
# Manhattan, Euclidean and dot-product).
# NOTE(review): val_samples come from val_df, whose rows were also concatenated into
# train_df, and their gold labels are random draws - interpret these scores with care.
logging.info("Read Scientific val dataset")
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(val_samples, name="scientific-val")

2024-05-02 17:50:37 - Read Scientific val dataset


In [32]:
# Warm-up length: 10% of all training steps (batches per epoch x epochs), rounded up.
total_training_steps = len(train_dataloader) * num_epochs
warmup_steps = math.ceil(total_training_steps * 0.1)
logging.info("Warmup-steps: {}".format(warmup_steps))

2024-05-02 17:50:37 - Warmup-steps: 3511


In [33]:
# Train the model: a single objective (train_dataloader + train_loss) for num_epochs
# epochs, running the validation evaluator every 1000 steps and writing the model
# to model_save_path.
# NOTE(review): the log shows a "Save model" line after the step-1000 evaluation but
# not after step 2000, which is consistent with fit() keeping only the best-scoring
# checkpoint - confirm against the sentence-transformers documentation.
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    evaluator=evaluator,
    epochs=num_epochs,
    evaluation_steps=1000,
    warmup_steps=warmup_steps,
    output_path=model_save_path,
)


##############################################################################
#
# Load the stored model and evaluate its performance on Scientific dataset
#
##############################################################################

# Reload from disk so the evaluation runs on the checkpoint that was saved,
# not on the in-memory weights left at the end of training.
model = SentenceTransformer(model_save_path)
# NOTE(review): test_samples come from test_df, whose rows were also concatenated
# into train_df, so this is not a held-out evaluation.
test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(test_samples, name="scientific-test")
# Last expression of the cell: runs the evaluation, passing model_save_path as the
# output directory, and displays the returned score.
test_evaluator(model, output_path=model_save_path)

Epoch:   0%|          | 0/4 [00:00<?, ?it/s]

Iteration:   0%|          | 0/8777 [00:00<?, ?it/s]

2024-05-02 17:51:46 - EmbeddingSimilarityEvaluator: Evaluating the model on scientific-val dataset in epoch 0 after 1000 steps:
2024-05-02 17:52:08 - Cosine-Similarity :	Pearson: 0.8366	Spearman: 0.2427
2024-05-02 17:52:08 - Manhattan-Distance:	Pearson: 0.8151	Spearman: 0.2482
2024-05-02 17:52:08 - Euclidean-Distance:	Pearson: 0.8173	Spearman: 0.2489
2024-05-02 17:52:08 - Dot-Product-Similarity:	Pearson: 0.8270	Spearman: 0.2412
2024-05-02 17:52:08 - Save model to output/scientific_continue_training_kr_sbert_v40k_klueNLI_augSTS
2024-05-02 17:53:16 - EmbeddingSimilarityEvaluator: Evaluating the model on scientific-val dataset in epoch 0 after 2000 steps:
2024-05-02 17:53:37 - Cosine-Similarity :	Pearson: 0.8585	Spearman: 0.2414
2024-05-02 17:53:37 - Manhattan-Distance:	Pearson: 0.8418	Spearman: 0.2468
2024-05-02 17:53:37 - Euclidean-Distance:	Pearson: 0.8435	Spearman: 0.2476
2024-05-02 17:53:37 - Dot-Product-Similarity:	Pearson: 0.8493	Spearman: 0.2399
2024-05-02 17:54:46 - EmbeddingSimi

Iteration:   0%|          | 0/8777 [00:00<?, ?it/s]

2024-05-02 18:05:04 - EmbeddingSimilarityEvaluator: Evaluating the model on scientific-val dataset in epoch 1 after 1000 steps:
2024-05-02 18:05:26 - Cosine-Similarity :	Pearson: 0.9215	Spearman: 0.2551
2024-05-02 18:05:26 - Manhattan-Distance:	Pearson: 0.8876	Spearman: 0.2520
2024-05-02 18:05:26 - Euclidean-Distance:	Pearson: 0.8885	Spearman: 0.2505
2024-05-02 18:05:26 - Dot-Product-Similarity:	Pearson: 0.9075	Spearman: 0.2537
2024-05-02 18:06:34 - EmbeddingSimilarityEvaluator: Evaluating the model on scientific-val dataset in epoch 1 after 2000 steps:
2024-05-02 18:06:56 - Cosine-Similarity :	Pearson: 0.9214	Spearman: 0.2520
2024-05-02 18:06:56 - Manhattan-Distance:	Pearson: 0.8887	Spearman: 0.2520
2024-05-02 18:06:56 - Euclidean-Distance:	Pearson: 0.8897	Spearman: 0.2511
2024-05-02 18:06:56 - Dot-Product-Similarity:	Pearson: 0.9073	Spearman: 0.2508
2024-05-02 18:08:04 - EmbeddingSimilarityEvaluator: Evaluating the model on scientific-val dataset in epoch 1 after 3000 steps:
2024-05-

Iteration:   0%|          | 0/8777 [00:00<?, ?it/s]

2024-05-02 18:18:18 - EmbeddingSimilarityEvaluator: Evaluating the model on scientific-val dataset in epoch 2 after 1000 steps:
2024-05-02 18:18:40 - Cosine-Similarity :	Pearson: 0.9409	Spearman: 0.2538
2024-05-02 18:18:40 - Manhattan-Distance:	Pearson: 0.9132	Spearman: 0.2540
2024-05-02 18:18:40 - Euclidean-Distance:	Pearson: 0.9144	Spearman: 0.2544
2024-05-02 18:18:40 - Dot-Product-Similarity:	Pearson: 0.9293	Spearman: 0.2520
2024-05-02 18:19:49 - EmbeddingSimilarityEvaluator: Evaluating the model on scientific-val dataset in epoch 2 after 2000 steps:
2024-05-02 18:20:11 - Cosine-Similarity :	Pearson: 0.9410	Spearman: 0.2508
2024-05-02 18:20:11 - Manhattan-Distance:	Pearson: 0.9137	Spearman: 0.2533
2024-05-02 18:20:11 - Euclidean-Distance:	Pearson: 0.9149	Spearman: 0.2533
2024-05-02 18:20:11 - Dot-Product-Similarity:	Pearson: 0.9299	Spearman: 0.2494
2024-05-02 18:21:19 - EmbeddingSimilarityEvaluator: Evaluating the model on scientific-val dataset in epoch 2 after 3000 steps:
2024-05-

Iteration:   0%|          | 0/8777 [00:00<?, ?it/s]

2024-05-02 18:31:38 - EmbeddingSimilarityEvaluator: Evaluating the model on scientific-val dataset in epoch 3 after 1000 steps:
2024-05-02 18:32:00 - Cosine-Similarity :	Pearson: 0.9476	Spearman: 0.2529
2024-05-02 18:32:00 - Manhattan-Distance:	Pearson: 0.9265	Spearman: 0.2535
2024-05-02 18:32:00 - Euclidean-Distance:	Pearson: 0.9278	Spearman: 0.2549
2024-05-02 18:32:00 - Dot-Product-Similarity:	Pearson: 0.9387	Spearman: 0.2519
2024-05-02 18:33:09 - EmbeddingSimilarityEvaluator: Evaluating the model on scientific-val dataset in epoch 3 after 2000 steps:
2024-05-02 18:33:31 - Cosine-Similarity :	Pearson: 0.9479	Spearman: 0.2546
2024-05-02 18:33:31 - Manhattan-Distance:	Pearson: 0.9269	Spearman: 0.2563
2024-05-02 18:33:31 - Euclidean-Distance:	Pearson: 0.9280	Spearman: 0.2569
2024-05-02 18:33:31 - Dot-Product-Similarity:	Pearson: 0.9388	Spearman: 0.2534
2024-05-02 18:34:40 - EmbeddingSimilarityEvaluator: Evaluating the model on scientific-val dataset in epoch 3 after 3000 steps:
2024-05-