In [27]:
from torch.utils.data import DataLoader
import math
from sentence_transformers import SentenceTransformer, LoggingHandler, losses, util, InputExample
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
import logging
from datetime import datetime
import os
import gzip
import csv
import pandas as pd
import random

In [28]:
# Print timestamped, INFO-level log messages through the sentence-transformers
# LoggingHandler so progress information is visible while the notebook runs.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    handlers=[LoggingHandler()],
)

In [29]:
# Training configuration: base checkpoint, batch size, number of epochs and
# the directory the fine-tuned model is written to.
model_name = "snunlp/KR-SBERT-V40K-klueNLI-augSTS"
train_batch_size = 16
num_epochs = 4
model_save_path = "output/scientific_continue_training_kr_sbert_v40k_klueNLI_augSTS"

# Instantiate the pre-trained sentence transformer that will be trained further.
model = SentenceTransformer(model_name)

2024-05-02 15:55:09 - Load pretrained SentenceTransformer: snunlp/KR-SBERT-V40K-klueNLI-augSTS
2024-05-02 15:55:09 - Use pytorch device: cuda


In [30]:
# Resolve the CSV path of each split inside the data directory, then load the
# train / validation / test splits into separate dataframes.
data_path = 'data'
train_csv, val_csv, test_csv = (
    os.path.join(data_path, file_name)
    for file_name in (
        'train_ir_2024-05-02_15-46.csv',
        'val_ir_2024-05-02_15-46.csv',
        'test_ir_2024-05-02_15-46.csv',
    )
)
train_df = pd.read_csv(train_csv)
val_df = pd.read_csv(val_csv)
test_df = pd.read_csv(test_csv)

In [31]:
# (rows, columns) of the train, validation and test splits, in that order.
tuple(frame.shape for frame in (train_df, val_df, test_df))

((95926, 6), (14292, 6), (30508, 6))

In [32]:
# Preview the first five training rows to sanity-check the loaded columns.
train_df.head(n=5)

Unnamed: 0,answer,id,question,domain,data_id,posneg
0,건강한 사람이 에너지 균형을 평형 상태로 유지하는 것은 중요합니다. 에너지 균형은 ...,42508ee0-c543-4338-878e-d98c6babee66,"에너지 균형이란 무엇이며, 왜 건강에 중요한가요?",nutrition,test,1.0
1,"수소, 산소, 질소 가스의 혼합물에서 평균 속도가 가장 빠른 분자는 수소입니다. 수...",4a437e7f-16c1-4c62-96b9-f173d44f4339,수소 분자가 다른 분자들보다 더 빠르게 움직이는 이유는 무엇인가요?,conceptual_physics,test,1.0
2,종이와 플라스틱은 재활용 가능한 자원입니다. 중학교 과학 수업에서 우리는 종이와 플...,d3c68be5-9cb1-4d6e-ba18-5f81cf89affb,종이와 플라스틱이 재활용 가능한 이유는 무엇인가요?,ARC_Challenge,test,1.0
3,마이애미파랑나비는 남부 플로리다에서 멸종 위기에 처한 종입니다. 이 나비의 개체수 ...,910107a6-2a42-41a2-b337-fbf22d6440fe,마이애미파랑나비의 주택 건설 증가에 대한 영향은 어떻게 나타나고 있나요?,ARC_Challenge,test,1.0
4,"AIDS에 직면한 민족문화 공동체 연구에 따르면, 한 해 동안 섹스 파트너가 한 명...",80feb7f2-1b24-4a9d-87a3-4976e9304e74,성 건강 및 성 문화에 대한 연구에서 발견된 주요 결과는 무엇인가요?,human_sexuality,test,1.0


In [48]:
# Convert the three dataframes into InputExample lists and wrap the training
# split in a DataLoader ready for training.
logging.info("Converting train dataset to DataLoader")


def _build_samples(df, rng):
    """Turn the question/answer rows of ``df`` into ``InputExample`` objects.

    Rows whose ``question`` or ``answer`` is not a string are skipped
    (presumably NaN values coming from empty CSV cells). Every kept pair gets
    a soft similarity label: uniform in [0.8, 1] when ``posneg == 1.0``,
    otherwise uniform in [0, 0.2].

    Args:
        df: DataFrame providing ``question``, ``answer`` and ``posneg`` columns.
        rng: ``random.Random`` instance used to draw the labels.

    Returns:
        A list of ``InputExample`` with ``texts=[question, answer]``.
    """
    samples = []
    # Select the needed columns by name instead of unpacking whole rows by
    # position: this survives column reordering, avoids shadowing the builtin
    # ``id`` and is much faster than ``DataFrame.iterrows``.
    for question, answer, posneg in zip(df["question"], df["answer"], df["posneg"]):
        if not isinstance(question, str) or not isinstance(answer, str):
            continue
        label = rng.uniform(0.8, 1) if posneg == 1.0 else rng.uniform(0, 0.2)
        samples.append(InputExample(texts=[question, answer], label=label))
    return samples


# Dedicated, seeded generator so the soft labels (and therefore the training
# targets and evaluation scores) are reproducible across kernel restarts.
label_rng = random.Random(42)

train_samples = _build_samples(train_df, label_rng)
val_samples = _build_samples(val_df, label_rng)
test_samples = _build_samples(test_df, label_rng)

train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=train_batch_size)

2024-05-02 16:02:39 - Converting train dataset to DataLoader


In [49]:
# Cosine-similarity loss wrapped around the model being fine-tuned.
train_loss = losses.CosineSimilarityLoss(model=model)

# Development set: the evaluator measures the correlation between the cosine
# score of each pair and its gold label.
logging.info("Read Scientific val dataset")
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
    examples=val_samples,
    name="scientific-val",
)

2024-05-02 16:02:42 - Read Scientific val dataset


In [50]:
# Configure the training: warm up over the first 10% of all optimisation
# steps (batches per epoch times number of epochs), rounded up.
total_training_steps = len(train_dataloader) * num_epochs
warmup_steps = math.ceil(total_training_steps * 0.1)
logging.info(f"Warmup-steps: {warmup_steps}")

2024-05-02 16:02:42 - Warmup-steps: 2394


In [51]:
# Train the model on the (dataloader, loss) objective. The validation
# evaluator is passed with evaluation_steps=1000 and the output goes to
# model_save_path.
training_objective = (train_dataloader, train_loss)
model.fit(
    train_objectives=[training_objective],
    evaluator=evaluator,
    epochs=num_epochs,
    evaluation_steps=1000,
    warmup_steps=warmup_steps,
    output_path=model_save_path,
)

# Load the stored model back from disk and evaluate its performance on the
# Scientific test samples.
model = SentenceTransformer(model_save_path)
test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
    test_samples, name="scientific-test"
)
test_evaluator(model, output_path=model_save_path)

Epoch:   0%|          | 0/4 [00:00<?, ?it/s]

Iteration:   0%|          | 0/5984 [00:00<?, ?it/s]