<a href="https://colab.research.google.com/github/ttogle918/NLU_3-/blob/main/%EA%B9%80%EC%97%B0%EC%8B%9D_sts.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **NLU - 문장 유사도 계산 (STS)**

* 과제 목표
  + 두 개의 한국어 문장을 입력받아 두 문장의 의미적 유사도를 출력
  + regression task (0 ≤ target ≤ 5)

* 학습 데이터 셋 (다운로드 가능 & 제공 예정)
  + KLUE-STS
    - AIRBNB (리뷰)
    - policy (뉴스)
    - paraKQC (스마트홈 쿼리)

* 과제 결과물
  + 학습된 모델 (모델 자유 선택) (train set만 사용해 학습)
  + 학습 방식 보고서
    - 어떤 모델을 선택했나
    - 어떻게 파라미터를 튜닝했나
    - 어떤 훈련 과정을 거쳤는가
  + dev set score (F1)
  + 문장 유사도를 출력하는 API (프레임워크 자유 선택)


In [None]:
!pip install pytorch-transformers
!pip install transformers
!pip install datasets
!pip install sentence-transformers datasets

In [None]:
import os
import sys
import pandas as pd
import numpy as np 
import math
import torch
import logging


from torch.utils.data import DataLoader
from datetime import datetime
from sentence_transformers import SentenceTransformer,  LoggingHandler, losses, models, util
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sentence_transformers.readers import InputExample
from datasets import load_dataset

In [None]:
# Seed every random number generator the pipeline may draw from so that runs
# are repeatable. Seeding torch alone leaves Python's `random` module and
# NumPy unseeded.
import random

seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

# Pick the compute device: the GPU when CUDA is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
    print(f"# available GPUs : {torch.cuda.device_count()}")
    print(f"GPU name : {torch.cuda.get_device_name()}")
else:
    device = torch.device("cpu")
print(device)

In [None]:
# Configure the root logger: INFO and above, routed through the
# sentence-transformers LoggingHandler, each message prefixed with a timestamp.
logging.basicConfig(
    level=logging.INFO,
    handlers=[LoggingHandler()],
    datefmt="%Y-%m-%d %H:%M:%S",
    format="%(asctime)s - %(message)s",
)

In [None]:
# Identifier of the pretrained encoder; it is passed to `models.Transformer`
# and also embedded in the output directory name.
model_name = "klue/roberta-large"

In [None]:
# Training hyperparameters.
num_epochs = 4
train_batch_size = 32

# Output directory: the model id with "/" replaced by "-", followed by the
# launch timestamp so that separate runs get separate directories.
model_save_path = (
    f"output/training_klue_sts_{model_name.replace('/', '-')}"
    f"-{datetime.now():%Y-%m-%d_%H-%M-%S}"
)

In [None]:
# Wrap the pretrained transformer named by `model_name` as the first module of
# the sentence-embedding pipeline.
embedding_model = models.Transformer(model_name)

In [None]:
# Pooling layer sized to the encoder's word-embedding dimension. Only mean
# pooling over tokens is enabled; CLS-token and max pooling are switched off.
pooler = models.Pooling(
    word_embedding_dimension=embedding_model.get_word_embedding_dimension(),
    pooling_mode_cls_token=False,
    pooling_mode_max_tokens=False,
    pooling_mode_mean_tokens=True,
)

In [None]:
# Chain the transformer encoder and the pooling layer into a single
# SentenceTransformer model.
model = SentenceTransformer(modules=[embedding_model, pooler])

In [None]:
# Load the `sts` configuration of the `klue` dataset.
from datasets import load_dataset

dataset = load_dataset(path="klue", name="sts")

In [None]:
# Bare expression: the notebook displays the loaded dataset object.
dataset

In [None]:
# Training split of KLUE STS.
train_dt = dataset["train"]

In [None]:
# Validation split of KLUE STS.
val_dt = dataset["validation"]

In [None]:
# Show the first validation record to inspect its fields.
dataset["validation"][0]

In [None]:
# Sequence-length limit; not referenced by any other cell visible here.
max_length = 512

# Show the number of examples in the training and validation splits.
(len(train_dt), len(val_dt))

In [None]:
# Load the `sts` configuration of `kor_nlu` (KorSTS); its test split is
# converted into the test samples below.
testsets = load_dataset(path="kor_nlu", name="sts")

In [None]:
# Sentence-pair containers for training, validation and testing.
train_samples = []
dev_samples = []
test_samples = []

# Convert the KLUE STS train and validation splits into InputExample objects.
for phase in ["train", "validation"]:
    examples = dataset[phase]

    for example in examples:
        # Divide the similarity label by 5 to bring it onto a 0.0 ~ 1.0 scale.
        score = float(example["labels"]["label"]) / 5.0

        inp_example = InputExample(
            texts=[example["sentence1"], example["sentence2"]],
            label=score,
        )

        if phase == "validation":
            dev_samples.append(inp_example)
        else:
            train_samples.append(inp_example)

# Convert the KorSTS test split into InputExample objects.
for example in testsets["test"]:
    # Skip rows where either sentence is missing or empty. Appending
    # unconditionally would re-add the InputExample left over from the
    # previous iteration, silently duplicating a pair in the test set.
    if not (example["sentence1"] and example["sentence2"]):
        continue

    # Same 0.0 ~ 1.0 rescaling as for the KLUE labels.
    score = float(example["score"]) / 5.0

    test_samples.append(
        InputExample(
            texts=[example["sentence1"], example["sentence2"]],
            label=score,
        )
    )

In [None]:
# Serve the training pairs in shuffled mini-batches of `train_batch_size`.
train_dataloader = DataLoader(train_samples, batch_size=train_batch_size, shuffle=True)

# Training objective: sentence-transformers' cosine-similarity loss bound to the model.
train_loss = losses.CosineSimilarityLoss(model)

In [None]:
# Evaluator built from the validation pairs, labelled "sts-dev".
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(dev_samples, name="sts-dev")

In [None]:
# Warm-up length: 10% of all training steps (batches per epoch x epochs),
# rounded up to a whole step.
total_train_steps = len(train_dataloader) * num_epochs
warmup_steps = math.ceil(total_train_steps * 0.1)
logging.info(f"Warmup-steps: {warmup_steps}")

In [None]:
import torch

# Release cached GPU memory before training and report the allocator state.
# Guarded so the cell also runs on a CPU-only machine, and printed so the
# multi-line report is readable instead of being shown as an escaped string.
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    print(torch.cuda.memory_summary(device=None, abbreviated=False))

In [None]:
# Fine-tune the model on the single (dataloader, loss) objective for
# `num_epochs` epochs. The dev evaluator is supplied with evaluation_steps=1000,
# and `model_save_path` is the output location.
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=num_epochs,
    warmup_steps=warmup_steps,
    evaluator=evaluator,
    evaluation_steps=1000,
    output_path=model_save_path,
)

In [None]:
# Reload the model from the training output directory.
model = SentenceTransformer(model_save_path)

# Evaluator built from the KorSTS test pairs, labelled "sts-test".
test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
    test_samples,
    name="sts-test",
)

In [None]:
# Run the test evaluator on the reloaded model. `output_path` is passed so the
# evaluator can store its results in the model directory (presumably as a
# CSV file; confirm against the sentence-transformers documentation).
test_evaluator(model, output_path=model_save_path)