In [2]:
import pandas as pd
import numpy as np
import os

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset, DatasetDict

# import torch
# import torch.nn as nn
# import torch.optim as optim
# import torch.nn.functional as F
# from torch.utils.data import Dataset, DataLoader -> datasets 라이브러리랑 충돌

In [None]:
print(torch.cuda.is_available())
print(torch.cuda.device_count()) 
print(torch.cuda.get_device_name(0))

True
1
NVIDIA GeForce RTX 3050


In [3]:
# pip install torch==2.5.1+cu121 --index-url https://download.pytorch.org/whl/cu121
# poetry run pip install torch==2.5.1+cu121 --index-url https://download.pytorch.org/whl/cu121

In [5]:
os.chdir("C:/Users/ehddl/Desktop/업무/code/sns-categorizer/")

In [6]:
data = pd.read_csv("tests/final_fine_tuning_data.csv", index_col=0)
data

Unnamed: 0,media_cn_cleaned,label_id
0,휴가 돌려죠,19
1,관종들 릴스 릴스타그램 릴스초보,21
2,날이 좋아서,19
3,행복했던 9월 고마워,19
4,협찬 동결건조야채블럭 1개로 13종의 보라야채와 유산균 섭취가능 보라색 안토시아닌이...,8
...,...,...
39925,아빠들의 일사분란함은 가족애 였다 어제 캠핑하는데 전혀 예상치 못했던 비바람 돌풍이...,25
39926,광고 식용유 없이 전을 굽는다고 풀무원 철판수제전 3종세트 철판 바삭감자채전 철판 ...,25
39927,제품제공 이걸 소개할 수 있어 영광입니다 가장 카고스럽게 보다 더 대담하게 카고컨테...,25
39928,제품제공 이제 남편들이 요리 다하겠습니다 아내님들 남편에게 식스볼트 듀라박스만 사주...,25


In [7]:
train, test = train_test_split(data, test_size= 0.2, stratify=data['label_id'], random_state=42)

In [None]:
# model & toknizer loading

'''
kykim/bert-kor-base : BERT-base 70GB 한국어 대용량 말뭉치 
snunlp/KR-Medium : KR-BERT의 medium 버전, 한국어 위키 + 뉴스 + 특허 + 댓글 포함 total 12.37GB -> SNS 데이터 매우 적합
BM-K/KoSimCSE-roberta : -> SNS 데이터 매우 적합
distilbert-base-multilingual-cased : SNS 데이터 적합 -> 실제 활용 논문이 있긴 함
beomi/KcELECTRA-base-v2022 : electra-base
'''
model_name = "beomi/KcELECTRA-base-v2022"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# tokenizer = BertTokenizerFast.from_pretrained(model_name)

## 모델이 bert 계열이고, 속도가 중요하면 BertTkenizerFast
## 모델을 자주 비꾸거나 여러 모델 실험할 계획이면 AutoTokenizer

In [27]:
# huggingface의 datasets.Dataset을 사용하는 방식

dataset = DatasetDict({
    'train' : Dataset.from_pandas(train),
    'test' : Dataset.from_pandas(test)
})

# preprocessing

def tokenize_fn(ex):
    return tokenizer(ex["media_cn_cleaned"], padding="max_length", truncation=True, max_length=128)

dataset = dataset.map(tokenize_fn, batched=True)
dataset = dataset.rename_column("label_id", "label")
dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])


Map: 100%|██████████| 31890/31890 [00:03<00:00, 10111.04 examples/s]
Map: 100%|██████████| 7973/7973 [00:00<00:00, 9756.63 examples/s] 


In [17]:
dataset['train']

Dataset({
    features: ['media_cn_cleaned', 'label', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 31890
})

In [28]:
# setting model

num_labels = data['label_id'].nunique()
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
    use_safetensors=True # gpu 버전 사용 시 추가
)

# torch gpu버전을 pip install torch==2.5.1+cu121 --index-url https://download.pytorch.org/whl/cu121로 설치
# huggingface transformer 라이브러리가 pytorch 2.6 미만 버전은 보안 이슈로 강제로 차단하는데, gpu 버전의 torch는 현재 pip로 설치가 불가능. 따라서 우회하는 user_safetensor를 추가

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at beomi/KcELECTRA-base-v2022 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [29]:
# setting trainer
# !pip install accelerate>=0.26.0

args = TrainingArguments(
    output_dir="finetune-KR-Medium", # 변경
    eval_strategy="epoch", # evaluation_strategy -> eval_strategy
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    learning_rate=2e-5,
    logging_dir="./logs",
    logging_steps=100,
    save_total_limit=1
)

In [13]:
def compute_metrics(p):
    preds = p.predictions.argmax(axis=-1)
    labels = p.label_ids
    return {"accuracy": accuracy_score(labels, preds)}

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    compute_metrics=compute_metrics
)

In [14]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,1.167,1.14286,0.683933
2,0.9174,1.088929,0.698859
3,0.6959,1.101118,0.701367


TrainOutput(global_step=5982, training_loss=1.0130163139820259, metrics={'train_runtime': 2720.5843, 'train_samples_per_second': 35.165, 'train_steps_per_second': 2.199, 'total_flos': 6294314713052160.0, 'train_loss': 1.0130163139820259, 'epoch': 3.0})

In [19]:
# setting trainer
# !pip install accelerate>=0.26.0

args = TrainingArguments(
    output_dir="finetune-BM-K/KoSimCSE-roberta",
    eval_strategy="epoch", # evaluation_strategy -> eval_strategy
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    learning_rate=2e-5,
    logging_dir="./logs",
    logging_steps=100,
    save_total_limit=1
)

In [20]:
def compute_metrics(p):
    preds = p.predictions.argmax(axis=-1)
    labels = p.label_ids
    return {"accuracy": accuracy_score(labels, preds)}

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    compute_metrics=compute_metrics
)

In [21]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,1.1449,1.108449,0.689577
2,0.919,1.051125,0.711777
3,0.6772,1.065571,0.713282


TrainOutput(global_step=5982, training_loss=1.0025360377804964, metrics={'train_runtime': 2802.7899, 'train_samples_per_second': 34.134, 'train_steps_per_second': 2.134, 'total_flos': 6294314713052160.0, 'train_loss': 1.0025360377804964, 'epoch': 3.0})

In [23]:
# setting trainer
# !pip install accelerate>=0.26.0

args = TrainingArguments(
    output_dir="distilbert-base-multilingual-cased",
    eval_strategy="epoch", # evaluation_strategy -> eval_strategy
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    learning_rate=2e-5,
    logging_dir="./logs",
    logging_steps=100,
    save_total_limit=1
)

In [24]:
def compute_metrics(p):
    preds = p.predictions.argmax(axis=-1)
    labels = p.label_ids
    return {"accuracy": accuracy_score(labels, preds)}

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    compute_metrics=compute_metrics
)

In [25]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.6173,1.149738,0.68895
2,0.4877,1.258047,0.697228
3,0.3646,1.355011,0.701743


TrainOutput(global_step=5982, training_loss=0.49658566643585134, metrics={'train_runtime': 2809.2434, 'train_samples_per_second': 34.055, 'train_steps_per_second': 2.129, 'total_flos': 6294314713052160.0, 'train_loss': 0.49658566643585134, 'epoch': 3.0})

In [30]:
# setting trainer
# !pip install accelerate>=0.26.0

args = TrainingArguments(
    output_dir="kcELECTRA-base-v2022",
    eval_strategy="epoch", # evaluation_strategy -> eval_strategy
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    learning_rate=2e-5,
    logging_dir="./logs",
    logging_steps=100,
    save_total_limit=1
)

In [31]:
def compute_metrics(p):
    preds = p.predictions.argmax(axis=-1)
    labels = p.label_ids
    return {"accuracy": accuracy_score(labels, preds)}

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    compute_metrics=compute_metrics
)

In [32]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,1.2337,1.20996,0.671015
2,1.0407,1.135771,0.694218
3,0.843,1.125889,0.700364


TrainOutput(global_step=5982, training_loss=1.1494783550072418, metrics={'train_runtime': 2937.1075, 'train_samples_per_second': 32.573, 'train_steps_per_second': 2.037, 'total_flos': 6294314713052160.0, 'train_loss': 1.1494783550072418, 'epoch': 3.0})

In [None]:
# setting trainer
# !pip install accelerate>=0.26.0

args = TrainingArguments(
    output_dir="skt-kobert-base-v1",
    eval_strategy="epoch", # evaluation_strategy -> eval_strategy
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    learning_rate=2e-5,
    logging_dir="./logs",
    logging_steps=100,
    save_total_limit=1
)

In [None]:
def compute_metrics(p):
    preds = p.predictions.argmax(axis=-1)
    labels = p.label_ids
    return {"accuracy": accuracy_score(labels, preds)}

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    compute_metrics=compute_metrics
)

In [None]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,1.2337,1.20996,0.671015
2,1.0407,1.135771,0.694218
3,0.843,1.125889,0.700364


TrainOutput(global_step=5982, training_loss=1.1494783550072418, metrics={'train_runtime': 2937.1075, 'train_samples_per_second': 32.573, 'train_steps_per_second': 2.037, 'total_flos': 6294314713052160.0, 'train_loss': 1.1494783550072418, 'epoch': 3.0})

Inference

In [None]:
# 모델과 토크나이저 로드
model_path = "src/bert_classification_model"  # 저장된 모델 디렉토리
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)

# 모델을 평가 모드로 전환
model.eval()

In [None]:
def predict_batch(text_list):
    inputs = tokenizer(text_list, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        outputs = model(**inputs)
        preds = torch.argmax(outputs.logits, dim=1)
    return preds.tolist()

import pandas as pd

def predict_from_csv(csv_path):
    df = pd.read_csv(csv_path)
    
    # 학습할 때와 같은 컬럼명을 유지
    text_list = df['media_cn_cleaned'].astype(str).tolist()
    
    preds = predict_batch(text_list)
    
    # 결과를 원래 df에 붙여서 반환
    df['predicted_label'] = preds
    return df


In [None]:
result_df = predict_from_csv("test.csv")
result_df.to_csv("predicted_test.csv", index=False)

##### Using Torch

In [None]:
# # huggingface가 아니라 pytorch library 사용 시 방법
# from torch.utils.data import Dataset, DataLoader

# class TokenDataset(Dataset):
  
#     def __init__(self, dataframe, model_name):
#         # sentence, label 컬럼으로 구성된 데이터프레임 전달
#         self.data = dataframe        
#         # Huggingface 토크나이저 생성
#         # self.tokenizer = BertTokenizerFast.from_pretrained(tokenizer_pretrained)
#         self.tokenizer = AutoTokenizer.from_pretrained(model_name)
  
#     def __len__(self):
#         return len(self.data)
  
#     def __getitem__(self, idx):
#         sentence = self.data.iloc[idx]['document']
#         label = self.data.iloc[idx]['label']

#         # 토큰화 처리
#         tokens = self.tokenizer(
#             sentence,                # 1개 문장 
#             return_tensors='pt',     # 텐서로 반환
#             truncation=True,         # 잘라내기 적용
#             padding='max_length',    # 패딩 적용
#             add_special_tokens=True  # 스페셜 토큰 적용
#         )

#         input_ids = tokens['input_ids'].squeeze(0)           # 2D -> 1D
#         attention_mask = tokens['attention_mask'].squeeze(0) # 2D -> 1D
#         token_type_ids = torch.zeros_like(attention_mask)

#         # input_ids, attention_mask, token_type_ids 이렇게 3가지 요소를 반환하도록 합니다.
#         # input_ids: 토큰
#         # attention_mask: 실제 단어가 존재하면 1, 패딩이면 0 (패딩은 0이 아닐 수 있습니다)
#         # token_type_ids: 문장을 구분하는 id. 단일 문장인 경우에는 전부 0
#         return {
#             'input_ids': input_ids,
#             'attention_mask': attention_mask, 
#             'token_type_ids': token_type_ids,
#         }, torch.tensor(label)

In [None]:
# # train, test 데이터셋 생성
# train_data = TokenDataset(train, model_name)
# test_data = TokenDataset(test, model_name)

# # DataLoader로 이전에 생성한 Dataset를 지정하여, batch 구성, shuffle, num_workers 등을 설정합니다.
# train_loader = DataLoader(train_data, batch_size=8, shuffle=True, num_workers=8)
# test_loader = DataLoader(test_data, batch_size=8, shuffle=True, num_workers=8)

In [None]:
# # 1개의 batch 꺼내기
# inputs, labels = next(iter(train_loader))

# # 데이터셋을 device 설정
# inputs = {k: v.to(device) for k, v in inputs.items()}
# labels.to(device)