# 훈련시 파라미터

learning_rate: 2e-05
train_batch_size: 32
eval_batch_size: 32
seed: 42
gradient_accumulation_steps: 8
total_train_batch_size: 256
optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
lr_scheduler_type: linear
lr_scheduler_warmup_ratio: 0.1
num_epochs: 10.0

In [1]:
import pandas as pd

dataset = pd.read_csv('./data/KoBert_review_train_set_v2.03.csv')

print(dataset.head(10))

                                         Review_Text  Label
0  포장하려고 들어선 순간부터 뭔가 말투가 짜증나있고 싸우자는 태도였지만 일단 참았는데...      0
1  재오픈 하신건지 그동안 못가서 아쉬웠습니다 그리고 아 솔직히 여기 보쌈이랑 전 김치...      1
2  맛과 분위기 좋고 스테이크 가성비 좋습니다 다만 생각보다 많이 익혀서 나옵니다 평소...      1
3                              친절하고 분위기 좋은데맛이없어요 비싸요      0
4    시 분에 왔는데 재료 소진으로 문 닫은 곳은 첨 보네 장사를 하고 싶을 때만 하시나요      0
5  직원분들 불친절하고 가위에 이물질묻어서 교환요청하니까 쓰윽 보시더니 왜이렇게 예민하...      0
6                                       너무맛있어요양도푸짐해요      1
7  밑에 몇 분들이 말씀하신대로 한꺼번에 팀 입장시키는 시스템 진짜 짜증나요 덕분에 시...      0
8                                      정돈은 언제 먹어도 최고      1
9  동료와 험께 돈까스와 김치돈까스뚝배기를 주문두개 메뉴 모두 자주 먹던 메뉴입니다 하...      0


In [3]:
# Test 로 10개만 진행
df = dataset.copy()

In [4]:
from transformers import ElectraForSequenceClassification, ElectraTokenizer
import torch

# 감성 분석용 모델 로드
model_name = "monologg/koelectra-base-v3-discriminator"
tokenizer = ElectraTokenizer.from_pretrained(model_name)
model = ElectraForSequenceClassification.from_pretrained(model_name, num_labels=2)


Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at monologg/koelectra-base-v3-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
# 텍스트 전처리 함수
from transformers import TrainingArguments, Trainer
from sklearn.model_selection import train_test_split
import numpy as np
import re

def preprocess_text(text):
    # 특수문자 제거
    text = re.sub(r'[^가-힣a-zA-Z0-9\s]', '', text)
    # 불필요한 공백 제거
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# 데이터 전처리 및 토큰화
def tokenize_data(texts, tokenizer, max_length=128):
    return tokenizer(
        texts,
        padding=True,
        truncation=True,
        max_length=max_length,
        return_tensors='pt'
    )

# 데이터 전처리 적용
dataset['clean_text'] = dataset['Review_Text'].apply(preprocess_text)

# 이제 train_test_split 진행
train_df, eval_df = train_test_split(dataset, test_size=0.2, random_state=42)

In [6]:
class ReviewDataset(torch.utils.data.Dataset):
    def __init__(self, df, tokenizer, max_length=128):
        self.texts = df['clean_text'].values  # clean_text 사용
        self.labels = df['Label'].values
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):  # 이 메소드 추가
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]

        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )

        return {
            'input_ids': encoding['input_ids'][0],
            'attention_mask': encoding['attention_mask'][0],
            'labels': torch.tensor(label, dtype=torch.long)
        }

In [7]:
# 데이터셋 객체 생성
train_dataset = ReviewDataset(train_df, tokenizer)
eval_dataset = ReviewDataset(eval_df, tokenizer)


In [9]:
# MLflow 설정
import mlflow

mlflow.set_tracking_uri("http://10.196.197.32:30164")  # MLflow 서버 URI 설정
mlflow.set_experiment("sentiment_analysis")  # 실험 이름 설정


2024/12/05 07:01:03 INFO mlflow.tracking.fluent: Experiment with name 'sentiment_analysis' does not exist. Creating a new experiment.


<Experiment: artifact_location='/data/ephemeral/home/mlruns/1', creation_time=1733382063071, experiment_id='1', last_update_time=1733382063071, lifecycle_stage='active', name='sentiment_analysis', tags={}>

In [10]:
# 학습 인자 설정
training_args = TrainingArguments(
    output_dir='./results',
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=10,
    weight_decay=0.01,
    gradient_accumulation_steps=8,
    warmup_ratio=0.1,
    seed=42,
    save_strategy="epoch",
    evaluation_strategy="epoch",
    load_best_model_at_end=True,
)



In [11]:
# 평가 메트릭 정의
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    accuracy = np.mean(labels == preds)
    return {'accuracy': accuracy}

In [12]:
# Trainer 초기화 및 학습 시작
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
)


Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [13]:
# 모델 학습
trainer.train()

The git executable must be specified in one of the following ways:
    - be included in your $PATH
    - be set via $GIT_PYTHON_GIT_EXECUTABLE
    - explicitly set via git.refresh(<full-path-to-git-executable>)

All git commands will error until this is rectified.

This initial message can be silenced or aggravated in the future by setting the
$GIT_PYTHON_REFRESH environment variable. Use one of the following values:
    - quiet|q|silence|s|silent|none|n|0: for no message or exception
    - error|e|exception|raise|r|2: for a raised exception

Example:
    export GIT_PYTHON_REFRESH=quiet



Epoch,Training Loss,Validation Loss,Accuracy
0,No log,0.285979,0.910735
1,No log,0.212622,0.928242
2,No log,0.189446,0.933436
4,No log,0.199275,0.937476
5,No log,0.210076,0.937668
6,0.209300,0.223456,0.936322
8,0.209300,0.23298,0.936899
9,0.209300,0.234677,0.936706


🏃 View run ./results at: http://10.196.197.32:30164/#/experiments/1/runs/f7509293e2844adbb925916b1edfdf24
🧪 View experiment at: http://10.196.197.32:30164/#/experiments/1


TrainOutput(global_step=810, training_loss=0.1558755898181303, metrics={'train_runtime': 1092.0318, 'train_samples_per_second': 190.397, 'train_steps_per_second': 0.742, 'total_flos': 1.363494111086592e+16, 'train_loss': 0.1558755898181303, 'epoch': 9.96923076923077})

In [14]:
# 학습된 모델 저장
model.save_pretrained('./saved_model')
tokenizer.save_pretrained('./saved_model')

('./saved_model/tokenizer_config.json',
 './saved_model/special_tokens_map.json',
 './saved_model/vocab.txt',
 './saved_model/added_tokens.json')