In [1]:
import pandas as pd
import numpy as np
import os

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset, DatasetDict

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
# from torch.utils.data import Dataset, DataLoader -> datasets 라이브러리랑 충돌

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
os.chdir("C:/Users/ehddl/Desktop/업무/code/sns-categorizer/")

In [3]:
data = pd.read_csv("tests/data/final_fine-tuning_multi-columns_data.csv", index_col=0)
data.dropna(inplace=True)
data.reset_index(drop=True, inplace=True)

In [4]:
data_copy = data.copy()
data = data.drop(['single_label', 'label_list'], axis=1)

In [5]:
train, test = train_test_split(data, test_size= 0.2, stratify=data['label_id'], random_state=42)

In [6]:
dataset = DatasetDict({
    'train' : Dataset.from_pandas(train),
    'test' : Dataset.from_pandas(test)
})

In [7]:
# model_name = "kykim/bert-kor-base"
model_name = "BM-K/KoSimCSE-roberta"

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

In [8]:
# def tokenize_three_columns(examples):
#     """
#     세 개의 텍스트 컬럼을 [SEP] 토큰으로 연결하여 토큰화하는 함수.
    
#     Args:
#         examples (dict): Hugging Face Dataset의 batch 데이터를 담고 있는 딕셔너리.
#                          `acnt_sub_nm_cleaned`, `intro_txt_cleaned`, `text` 키를 포함
    
#     Returns:
#         dict: 토큰화된 'input_ids', 'attention_mask'를 담고 있는 딕셔너리.
#     """
    
#     # 텍스트 컬럼들을 [SEP] 토큰으로 연결하여 하나의 시퀀스로 만듭니다.
#     # f-string을 사용하여 간결하게 결합합니다.
#     # BERT는 "[CLS] text1 [SEP] text2 [SEP] text3 [SEP]" 형태의 입력을 처리합니다.
#     combined_texts = [
#         f"{acnt} {tokenizer.sep_token} {intro} {tokenizer.sep_token} {txt}"
#         for acnt, intro, txt in zip(
#             examples["acnt_sub_nm_cleaned"],
#             examples["intro_txt_cleaned"],
#             examples["text"]
#         )
#     ]
    
#     return tokenizer(
#         combined_texts,
#         padding="max_length",
#         truncation=True,
#         max_length=512  # BERT의 최대 입력 길이
#     )

In [9]:
def tokenize_three_columns(examples):
    '''
    BM-K/KoSimCSE-roberta
    '''
    combined_texts = [
        f"{acnt} {tokenizer.sep_token} {intro} {tokenizer.sep_token} {txt}"
        for acnt, intro, txt in zip(
            examples["acnt_sub_nm_cleaned"],
            examples["intro_txt_cleaned"],
            examples["text"]
        )
    ]
    
    return tokenizer(
        combined_texts,
        padding="max_length",
        truncation=True,
        max_length=512  # BERT의 최대 입력 길이
    )

In [10]:
tokenized_dataset = dataset.map(tokenize_three_columns, batched=True) # batched=True는 효율적으로 처리

Map: 100%|██████████| 26015/26015 [00:05<00:00, 4653.82 examples/s]
Map: 100%|██████████| 6504/6504 [00:01<00:00, 4449.26 examples/s]


In [11]:
tokenized_dataset = tokenized_dataset.rename_column("label_id", 'label')

In [12]:
# 불필요한 컬럼 제거. 토큰화에 사용된 원본 텍스트 컬럼은 제거
columns_to_remove = ['acnt_sub_nm_cleaned', 'intro_txt_cleaned', 'text']
tokenized_dataset = tokenized_dataset.remove_columns(columns_to_remove)

In [14]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['label', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 26015
    })
    test: Dataset({
        features: ['label', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 6504
    })
})

In [None]:
# 최종 포맷 설정
# tokenized_dataset.set_format(type="torch", columns=['input_ids', 'attention_mask', 'label'])
tokenized_dataset.set_format(type="torch", columns=['input_ids', 'attention_mask', 'token_type_ids', 'label']) # roberta

In [16]:
num_labels = data['label_id'].nunique()
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
    use_safetensors=True # gpu 버전 사용 시 추가
)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at BM-K/KoSimCSE-roberta and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
# setting trainer
# !pip install accelerate>=0.26.0

args = TrainingArguments(
    output_dir="./muli-columns-BM-K/KoSimCSE-roberta", 
    eval_strategy="epoch", 
    save_strategy="epoch",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3, 
    learning_rate=2e-5,
    warmup_ratio=0.06, 
    weight_decay=0.01, 
    logging_dir="./logs",
    logging_steps=100,
    save_total_limit=1,
    load_best_model_at_end=True, 
    metric_for_best_model="accuracy",
    greater_is_better=True,
    remove_unused_columns=False
)

In [18]:
def compute_metrics(p):
    preds = p.predictions.argmax(axis=-1)
    labels = p.label_ids
    return {"accuracy": accuracy_score(labels, preds)}

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    compute_metrics=compute_metrics
)

In [None]:
trainer.train()

Epoch,Training Loss,Validation Loss


In [108]:
trainer.save_model("muli-columns-kykim-bert-kor")
tokenizer.save_pretrained("muli-columns-kykim-bert-kor") 

('muli-columns-kykim-bert-kor\\tokenizer_config.json',
 'muli-columns-kykim-bert-kor\\special_tokens_map.json',
 'muli-columns-kykim-bert-kor\\vocab.txt',
 'muli-columns-kykim-bert-kor\\added_tokens.json',
 'muli-columns-kykim-bert-kor\\tokenizer.json')