In [19]:
""" for domain adaption (Embedding Model Fine-Tunning) """

from sentence_transformers import SentenceTransformer
import os
import json

In [20]:
import torch
from torch.utils.data import DataLoader
from sentence_transformers import InputExample
from sentence_transformers import losses
from sentence_transformers.evaluation import InformationRetrievalEvaluator

In [21]:
# set path
# os.chdir('langchain')

In [22]:
# Locations of the domain-adaptation train / validation splits (JSON files).
train_dataset_path, val_dataset_path = (
    './da_train_dataset.json',
    './da_val_dataset.json',
)

# Number of (query, passage) pairs per training batch.
batch_size = 8

In [23]:
# Load the training split. The file is only read, so open it read-only
# ('r') rather than read/write ('r+'): this also works when the file or its
# directory is write-protected and cannot modify the data by accident.
with open(train_dataset_path, 'r', encoding='utf-8') as f:
    train_dataset = json.load(f)

# Load the validation split the same way.
with open(val_dataset_path, 'r', encoding='utf-8') as f:
    val_dataset = json.load(f)

In [24]:
# Pull the three sections of the training split into their own names.
corpus, queries, relevant_docs = (
    train_dataset[section] for section in ('corpus', 'queries', 'relevant_docs')
)

# One (query, passage) training pair per query: the query text is paired with
# the corpus entry referenced by the first id listed in relevant_docs.
examples = [
    InputExample(texts=[question, corpus[relevant_docs[qid][0]]])
    for qid, question in queries.items()
]

In [25]:
# Batches of training pairs for the ranking loss.
# shuffle=True re-orders the pairs every epoch so that the in-batch negatives
# differ between epochs and neighbouring entries of the dataset are not
# always grouped into the same batch.
loader = DataLoader(
    examples,
    batch_size=batch_size,
    shuffle=True,
)

In [26]:
### pretrained embedding model config
# No trailing slash on the identifier: with the slash the string is only
# usable as a local directory path, without it the same local directory is
# still picked up if present and the name is otherwise a valid Hub model id.
model = SentenceTransformer('BM-K/KoSimCSE-roberta-multitask')

In [27]:
# Training objective: multiple-negatives ranking loss bound to the model
# that is being fine-tuned.
loss = losses.MultipleNegativesRankingLoss(model=model)

In [28]:
# Build the retrieval evaluator from the validation split.
# NOTE: corpus / queries / relevant_docs are rebound here, so from this point
# on they refer to the validation data rather than the training data.
corpus = val_dataset['corpus']
queries = val_dataset['queries']
relevant_docs = val_dataset['relevant_docs']

evaluator = InformationRetrievalEvaluator(
    queries=queries,
    corpus=corpus,
    relevant_docs=relevant_docs,
)

In [29]:
# Number of training epochs passed to model.fit.
EPOCHS = 3

In [None]:
# Warm up over the first 10% of all training steps
# (batches per epoch * number of epochs).
warmup_steps = int(len(loader) * EPOCHS * 0.1)

# Use the GPU when one is available and fall back to the CPU otherwise,
# instead of failing on a hard-coded 'cuda' device.
# NOTE(review): the original comment here read "memory allocation error";
# if CUDA memory runs out, lowering batch_size is presumably the first
# thing to try — confirm.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)

# Fine-tune: one objective (loader + loss), validated with the retrieval
# evaluator every 50 training steps; results are written to ./da_finetune.
model.fit(
    train_objectives=[(loader, loss)],
    epochs=EPOCHS,
    warmup_steps=warmup_steps,
    output_path='./da_finetune',
    show_progress_bar=True,
    evaluator=evaluator,
    evaluation_steps=50,
)