<a href="https://colab.research.google.com/github/OhJin-Soo/fintuned-model/blob/main/Kobert_fintuned_restaurant_review.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os

os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

In [2]:
import pandas as pd
import torch
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding, Trainer, TrainingArguments
from sklearn.metrics import f1_score

In [5]:
# 1️⃣ TSV 파일 로드
df = pd.read_csv("/content/kr3.tsv", sep="\t")  # 컬럼: Review, Rating

# 2️⃣ label=2 제거 (모호한 리뷰)
df = df[df["Rating"] != 2].reset_index(drop=True)

# 3️⃣ 텍스트와 라벨 분리
texts = df["Review"].tolist()
labels = df["Rating"].tolist()  # 이제 0, 1만 남음

In [6]:
from imblearn.over_sampling import RandomOverSampler
from collections import Counter

# 예시 데이터
texts = df["Review"].tolist()
labels = df["Rating"].tolist()

# Oversampling 적용
ros = RandomOverSampler(random_state=42)
texts_resampled, labels_resampled = ros.fit_resample(
    np.array(texts).reshape(-1, 1), labels
)

texts_resampled = texts_resampled.flatten().tolist()

print("Before:", Counter(labels))
print("After:", Counter(labels_resampled))

Before: Counter({1: 388111, 0: 70910})
After: Counter({1: 388111, 0: 388111})


In [7]:
# 2️⃣ train/validation/test 분리
train_texts, temp_texts, train_labels, temp_labels = train_test_split(
    texts_resampled,
    labels_resampled,
    test_size=0.3,
    random_state=42,
    stratify=labels_resampled
)

val_texts, test_texts, val_labels, test_labels = train_test_split(
    temp_texts,
    temp_labels,
    test_size=0.5,
    random_state=42,
    stratify=temp_labels
)

In [8]:
import numpy as np
print(np.unique(train_labels))
print(np.unique(val_labels))
print(np.unique(test_labels))

[0 1]
[0 1]
[0 1]


In [9]:
# 3️⃣ 모델 & 토크나이저
model_name = "skt/kobert-base-v1"
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)  # num_labels=2

tokenizer_config.json:   0%|          | 0.00/432 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/371k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/244 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/535 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/369M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at skt/kobert-base-v1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
MAX_LEN = 128  # 또는 256, 절대 512 초과 X

train_encodings = tokenizer(
    train_texts,
    truncation=True,
    padding=True,
    max_length=MAX_LEN,
)

val_encodings = tokenizer(
    val_texts,
    truncation=True,
    padding=True,
    max_length=MAX_LEN,
)

test_encodings = tokenizer(
    test_texts,
    truncation=True,
    padding=True,
    max_length=MAX_LEN,
)

In [11]:
max_id = max([max(ids) for ids in train_encodings['input_ids']])
print(max_id, tokenizer.vocab_size)

8001 8002


In [12]:
# 5️⃣ PyTorch Dataset 생성
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {}

        item["input_ids"] = torch.tensor(
            self.encodings["input_ids"][idx], dtype=torch.long
        )
        item["attention_mask"] = torch.tensor(
            self.encodings["attention_mask"][idx], dtype=torch.long
        )

        # 🔥 핵심: token_type_ids 강제 0
        item["token_type_ids"] = torch.zeros_like(item["input_ids"])

        item["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item


train_dataset = CustomDataset(train_encodings, train_labels)
val_dataset = CustomDataset(val_encodings, val_labels)
test_dataset = CustomDataset(test_encodings, test_labels)

In [13]:
# 6️⃣ 데이터 길이 맞춤
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [14]:
# 7️⃣ TrainingArguments 설정
training_args = TrainingArguments(
    output_dir="kobert-sentiment-analysis-restaurant",
    learning_rate=5e-5,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    num_train_epochs=3,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
    greater_is_better=True,
)

In [15]:
# 8️⃣ Metrics 정의
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    acc = np.mean(predictions == labels)
    f1 = f1_score(labels, predictions, average="macro")
    return {"accuracy": acc, "f1_macro": f1}

In [16]:
# 9️⃣ Trainer 생성
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [17]:
model.config.max_position_embeddings

512

In [18]:
max_len_in_data = max(len(ids) for ids in train_encodings["input_ids"])
print("max sequence length:", max_len_in_data)
print("model max_position_embeddings:", model.config.max_position_embeddings)

max sequence length: 128
model max_position_embeddings: 512


In [19]:
sample = train_dataset[0]
print(sample["token_type_ids"].unique())

tensor([0])


In [20]:
# 10️⃣ 학습 시작
trainer.train()

  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: (1) Create a W&B account
[34m[1mwandb[0m: (2) Use an existing W&B account
[34m[1mwandb[0m: (3) Don't visualize my results
[34m[1mwandb[0m: Enter your choice:[34m[1mwandb[0m: You chose 'Use an existing W&B account'
[34m[1mwandb[0m: Logging into https://api.wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: Find your API key here: https://wandb.ai/authorize?ref=models
[34m[1mwandb[0m: Paste an API key from your profile and hit enter:[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mjin-soo[0m ([33mjin-soo-none[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro
1,0.1134,0.108932,0.96659,0.966589
2,0.0644,0.0721,0.980169,0.980167
3,0.032,0.061723,0.986061,0.986059


TrainOutput(global_step=25470, training_loss=0.08756643762577268, metrics={'train_runtime': 9109.177, 'train_samples_per_second': 178.948, 'train_steps_per_second': 2.796, 'total_flos': 1.072220306138496e+17, 'train_loss': 0.08756643762577268, 'epoch': 3.0})

In [21]:
trainer.push_to_hub("Training complete!")

Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...s-restaurant/spiece.model: 100%|##########|  371kB /  371kB            

  ...taurant/model.safetensors:   0%|          | 11.0kB /  369MB            

  ...90721.6fda3dfd8950.1656.0:   2%|1         |   284B / 17.3kB            

  ...taurant/training_args.bin:   2%|1         |  97.0B / 5.91kB            

CommitInfo(commit_url='https://huggingface.co/jin-soo/kobert-sentiment-analysis-restaurant/commit/ce22668a9d4fdc28ff96f741840859384d22c534', commit_message='Training complete!', commit_description='', oid='ce22668a9d4fdc28ff96f741840859384d22c534', pr_url=None, repo_url=RepoUrl('https://huggingface.co/jin-soo/kobert-sentiment-analysis-restaurant', endpoint='https://huggingface.co', repo_type='model', repo_id='jin-soo/kobert-sentiment-analysis-restaurant'), pr_revision=None, pr_num=None)

In [23]:
trainer.save_model("./best_model")  # best_model 폴더에 저장
tokenizer.save_pretrained("./best_model")  # 토크나이저도 저장

('./best_model/tokenizer_config.json',
 './best_model/special_tokens_map.json',
 './best_model/spiece.model',
 './best_model/added_tokens.json')

In [24]:
# 11️⃣ 테스트 평가
trainer.evaluate(test_dataset)

{'eval_loss': 0.06265444308519363,
 'eval_accuracy': 0.9856141676829792,
 'eval_f1_macro': 0.9856129730288448,
 'eval_runtime': 209.4479,
 'eval_samples_per_second': 555.909,
 'eval_steps_per_second': 8.69,
 'epoch': 3.0}

In [26]:
import os
import shutil
from google.colab import drive

# Google 드라이브 마운트
drive.mount('/content/drive')

# 원본 경로
source_path = '/content'

# 목적지 경로
destination_path = '/content/drive/MyDrive/fintunned-model'

# 목적지 경로가 없으면 생성
if not os.path.exists(destination_path):
    os.makedirs(destination_path)

# '/content' 경로에 있는 모든 파일과 폴더를 이동
for filename in os.listdir(source_path):
    file_path = os.path.join(source_path, filename)
    if os.path.isfile(file_path) or os.path.isdir(file_path):
        shutil.move(file_path, destination_path)

Mounted at /content/drive


Error: Cannot move a directory '/content/drive' into itself '/content/drive/MyDrive/fintunned-model'.