<a href="https://colab.research.google.com/github/ttogle918/NLU_3-/blob/main/%EA%B9%80%EC%97%B0%EC%8B%9D_sts.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **NLU - 문장 유사도 계산 (STS)**

* 과제 목표
  + 두 개의 한국어 문장을 입력받아 두 문장의 의미적 유사도를 출력
  + regression task (0 ≤ target ≤ 5)

* 학습 데이터 셋 (다운로드 가능 & 제공 예정)
  + KLUE-STS
    - AIRBNB (리뷰)
    - policy (뉴스)
    - paraKQC (스마트홈 쿼리)

* 과제 결과물
  + 학습된 모델 (모델 자유 선택) (train set만 사용해 학습)
  + 학습 방식 보고서
    - 어떤 모델을 선택했나
    - 어떻게 파라미터를 튜닝했나
    - 어떤 훈련 과정을 거쳤는가
  + dev set score (F1)
  + 문장 유사도를 출력하는 API (프레임워크 자유 선택)


In [None]:
!pip install pytorch-transformers
!pip install transformers
!pip install datasets
!pip install sentence-transformers datasets

In [None]:
import os
import sys
import pandas as pd
import numpy as np 
import math
import torch
import logging
import re


from torch.utils.data import DataLoader
from datetime import datetime
from sentence_transformers import SentenceTransformer,  LoggingHandler, losses, models, util
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sentence_transformers.readers import InputExample
from datasets import load_dataset

## **1. 데이터 Load & 전처리**

[KLUE](https://github.com/KLUE-benchmark/KLUE)

In [None]:
# Load the "sts" configuration of the "klue" dataset via Hugging Face `datasets`.
from datasets import load_dataset
klue_dt = load_dataset("klue", "sts")

In [None]:
# Notebook cell output: show the object returned by load_dataset("klue", "sts").
klue_dt

[Kor_STS](https://github.com/kakaobrain/KorNLUDatasets)

In [None]:
# KorSTS dataset: load the "sts" configuration of "kor_nlu".
KorSTS_dt = load_dataset("kor_nlu", "sts")

In [None]:
# Notebook cell output: show the object returned by load_dataset("kor_nlu", "sts").
KorSTS_dt

In [None]:
def make_dataset(dataset):
    """Build a DataFrame of cleaned sentence pairs and their real-valued labels.

    Args:
        dataset: Iterable of records, each providing ``sentence1``,
            ``sentence2`` and a nested ``labels['real-label']`` entry.

    Returns:
        A ``pandas.DataFrame`` with the columns ``sentence1``, ``sentence2``
        and ``labels``, holding one row per input record.
    """
    # NOTE(review): `cleaning` is not defined in the visible part of this
    # notebook; it has to exist before this function is called -- confirm.
    records = [
        # The label is read first, then each sentence goes through `cleaning`.
        (
            item['labels']['real-label'],
            cleaning(item['sentence1']),
            cleaning(item['sentence2']),
        )
        for item in dataset
    ]

    return pd.DataFrame({
        'sentence1': [first for _, first, _ in records],
        'sentence2': [second for _, _, second in records],
        'labels': [label for label, _, _ in records],
    })
    

In [None]:
# Convert a KorSTS split into a DataFrame, keeping only the sentences and the score.
def make_dataset_sts(dataset):
    """Build a DataFrame of cleaned sentence pairs and similarity scores.

    Args:
        dataset: Iterable of records providing ``sentence1``, ``sentence2``
            and ``score``. ``score`` is normally a plain number; a nested
            mapping with a ``'real-label'`` entry is also accepted.

    Returns:
        A ``pandas.DataFrame`` with the columns ``sentence1``, ``sentence2``
        and ``labels``, holding one row per input record.
    """
    # NOTE(review): `cleaning` is not defined in the visible part of this
    # notebook; it has to exist before this function is called -- confirm.
    sentence1, sentence2, rlabels = [], [], []

    for data in dataset:
        score = data['score']
        # Subscripting a float score with ['real-label'] raises TypeError, so
        # only unwrap the value when it really is a nested mapping.
        if isinstance(score, dict):
            score = score['real-label']
        rlabels.append(score)
        sentence1.append(cleaning(data['sentence1']))
        sentence2.append(cleaning(data['sentence2']))

    df = pd.DataFrame({'sentence1': sentence1, 'sentence2': sentence2, 'labels': rlabels})

    return df


In [None]:
# Turn the train splits of both corpora into DataFrames.
klue_df_train=make_dataset(klue_dt['train'])
KorSTS_df_train=make_dataset_sts(KorSTS_dt['train'])
# Notebook cell output: show the KorSTS training frame.
KorSTS_df_train

In [None]:
# Fix the PyTorch RNG seed, both for the CPU generator and for all CUDA devices.
seed = 42
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

# Check the device type: use CUDA when it is available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    # Report the GPU count and the name given by torch.cuda.get_device_name().
    print(f"# available GPUs : {torch.cuda.device_count()}")
    print(f"GPU name : {torch.cuda.get_device_name()}")
print(device)

In [None]:
# Configure root logging: timestamped messages at INFO level, emitted through
# the LoggingHandler imported from sentence_transformers.
logging.basicConfig(
    format="%(asctime)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    level=logging.INFO,
    handlers=[LoggingHandler()],
)

In [None]:
# Set the model name
model_name = "klue/roberta-large"
# Transformer module for that checkpoint, configured with max_seq_length=256.
embedding_model = models.Transformer(model_name, max_seq_length=256, do_lower_case=True)

In [None]:
# Pooling module sized to the transformer's word-embedding dimension; only
# mean-token pooling is enabled (CLS-token and max-token pooling are off).
pooler = models.Pooling(
    embedding_model.get_word_embedding_dimension(),
    pooling_mode_mean_tokens=True,
    pooling_mode_cls_token=False,
    pooling_mode_max_tokens=False,
)

In [None]:
# Chain the transformer and pooling modules into one SentenceTransformer.
model = SentenceTransformer(modules=[embedding_model, pooler])

In [None]:
# Training hyper-parameters: batch size and number of epochs.
train_batch_size, num_epochs = 16, 1
# Output directory: model name (with "/" replaced by "-") plus a timestamp,
# so each run writes to its own folder.
model_save_path = (
    f"output/training_klue_sts_{model_name.replace('/', '-')}"
    f"-{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}"
)

In [None]:
# Load KLUE-STS under the name `dataset` (the same call that produced `klue_dt`).
from datasets import load_dataset
dataset = load_dataset("klue", "sts")

In [None]:

# Read every KLUE-STS training record once as
# (sentence1, sentence2, real-valued label).
train_rows = [
    (row['sentence1'], row['sentence2'], row['labels']['real-label'])
    for row in dataset['train']
]

# Split the records into the per-column lists used to build the frame.
sentence1 = [first for first, _, _ in train_rows]
sentence2 = [second for _, second, _ in train_rows]
rlabels = [label for _, _, label in train_rows]

train_df = pd.DataFrame({'sentence1' : sentence1, 'sentence2' : sentence2, 'labels' : rlabels})
# Notebook cell output: first ten rows of the training frame.
train_df.head(10)

In [None]:
# Collect sentence pairs and real-valued labels from the KLUE-STS validation split.
sentence3, sentence4, rlabels2 = [], [], []

# NOTE: the loop variable `data2` remains bound at module level after the loop
# finishes (it then refers to the last validation record).
for data2 in dataset['validation'] :
  sentence3.append(data2['sentence1'])
  sentence4.append(data2['sentence2'])
  rlabels2.append(data2['labels']['real-label'])

# Named `test_df`, but built from the validation split.
test_df= pd.DataFrame({'sentence1' : sentence3, 'sentence2' : sentence4, 'labels' : rlabels2})
# Notebook cell output: first ten rows of that frame.
test_df.head(10)

In [None]:
# Notebook cell output: the `.shape` of the train and validation frames.
train_df.shape, test_df.shape

In [None]:
# Load the "sts" configuration of "kor_nlu" (the same call that produced `KorSTS_dt`).
kor_nlu_data = load_dataset("kor_nlu", "sts")


In [None]:
# Notebook cell output: show the loaded kor_nlu "sts" dataset object.
kor_nlu_data

In [None]:
# Collect KorSTS training sentence pairs together with their similarity scores.
sentence_k, sentence_k2, rlabels_k = [], [], []

for data_k in kor_nlu_data['train']:
    sentence_k.append(data_k['sentence1'])
    sentence_k2.append(data_k['sentence2'])
    # The score must come from the row being iterated (`data_k`); reading it
    # from any other variable would give every row the same unrelated label.
    rlabels_k.append(data_k['score'])

kor_nlu_df = pd.DataFrame({'sentence1': sentence_k, 'sentence2': sentence_k2, 'labels': rlabels_k})
# Notebook cell output: first ten rows of the KorSTS frame.
kor_nlu_df.head(10)

In [None]:
# Stack the KLUE and KorSTS training frames. pd.concat takes the frames as one
# list (a second positional argument is interpreted as `axis`); ignore_index
# renumbers the combined rows instead of repeating each frame's own index.
train_df_1 = pd.concat([train_df, kor_nlu_df], ignore_index=True)

In [None]:
# Alias for the KLUE-STS train split.
train_dt= dataset['train']

In [None]:
# Alias for the KLUE-STS validation split.
val_dt= dataset['validation']

In [None]:
# Notebook cell output: the first validation record, to inspect its fields.
dataset['validation'][0]

In [None]:

# Notebook cell output: number of records in the train and validation splits.
len(train_dt), len(val_dt)

In [None]:
def make_sts_input_example(dataset):
    """Convert STS records into a list of ``InputExample`` objects.

    Args:
        dataset: Iterable of records, each providing ``sentence1``,
            ``sentence2`` and a nested ``labels['label']`` score.

    Returns:
        A list with one ``InputExample`` per record. Its ``texts`` are the two
        sentences and its ``label`` is the record's score divided by 5.0.
    """
    # Dividing by 5.0 rescales the score; presumably the raw scores lie in
    # 0..5, so the labels end up in 0..1 -- confirm against the dataset.
    return [
        InputExample(
            texts=[record['sentence1'], record['sentence2']],
            label=record['labels']['label'] / 5.0,
        )
        for record in dataset
    ]

In [None]:
# Evaluation examples, built from the validation split (`val_dt`).
test_examples_dt = make_sts_input_example(val_dt)

# Training examples, built from the train split (`train_dt`).
train_samples_dt = make_sts_input_example(train_dt)

In [None]:
# Batch the training examples, with shuffling enabled.
train_dataloader = DataLoader(
    train_samples_dt,
    shuffle=True,
    batch_size=train_batch_size,
)
# Training objective: CosineSimilarityLoss bound to the model being trained.
train_loss = losses.CosineSimilarityLoss(model=model)

In [None]:
# Evaluator used during training, built from the validation examples.
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
    test_examples_dt,
    name="sts-dev",
)

In [None]:
# Warm-up length: 10% of (batches in the dataloader x epochs), rounded up.
warmup_steps = math.ceil(len(train_dataloader) * num_epochs  * 0.1)  # 10% of the training steps for warm-up
logging.info(f"Warmup-steps: {warmup_steps}")

In [None]:
import torch

# Release cached, currently unused GPU memory before training starts.
torch.cuda.empty_cache()

# The summary queries the CUDA allocator, so only request it when a GPU is
# present (this notebook otherwise falls back to the CPU). print() renders the
# multi-line report instead of showing its escaped string repr.
if torch.cuda.is_available():
    print(torch.cuda.memory_summary(device=None, abbreviated=False))

In [None]:
# Train the model on the single (dataloader, loss) objective. The evaluator is
# passed with evaluation_steps=1000 and output goes to `model_save_path`.
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    evaluator=evaluator,
    epochs=num_epochs,
    evaluation_steps=1000,
    warmup_steps=warmup_steps,
    output_path=model_save_path,
)

In [None]:
# Reload the model from the training output directory.
model = SentenceTransformer(model_save_path)
# Named 'sts-test', but built from the same validation examples as the
# evaluator used during training.
test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(test_examples_dt, name='sts-test')

In [None]:
# Run the evaluator on the reloaded model, passing the model directory as output_path.
test_evaluator(model, output_path=model_save_path)