In [None]:
!pip install -q transformers sentencepiece accelerate scikit-learn

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    get_linear_schedule_with_warmup,
)
from torch.optim import AdamW


from sklearn.metrics import accuracy_score, f1_score
from tqdm.auto import tqdm
import numpy as np
import random
import pandas as pd

In [None]:
# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)


def set_seed(seed=42):
    """Seed Python's `random`, NumPy and PyTorch with the same value.

    Args:
        seed: Integer seed applied to every generator (default 42).

    When CUDA is available, the generators of all CUDA devices are seeded too.
    """
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)


set_seed(42)

Device: cuda


# NSMC

In [None]:
!git clone -q https://github.com/e9t/nsmc.git
!ls nsmc

code		  ratings_train.txt  raw	synopses.json
ratings_test.txt  ratings.txt	     README.md


In [None]:
# Load the NSMC train/test rating files from the repository cloned above.
# The clone is relative to the working directory, so the paths are relative too
# instead of assuming the working directory is /content.
train_df = pd.read_table("nsmc/ratings_train.txt")
test_df  = pd.read_table("nsmc/ratings_test.txt")

print(train_df.head())
print(train_df.shape, test_df.shape)

         id                                           document  label
0   9976970                                아 더빙.. 진짜 짜증나네요 목소리      0
1   3819312                  흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나      1
2  10265843                                  너무재밓었다그래서보는것을추천한다      0
3   9045019                      교도소 이야기구먼 ..솔직히 재미는 없다..평점 조정      0
4   6483659  사이몬페그의 익살스런 연기가 돋보였던 영화!스파이더맨에서 늙어보이기만 했던 커스틴 ...      1
(150000, 3) (50000, 3)


In [None]:
# Train split: drop rows with missing values, then remove zero-length reviews.
train_df = train_df.dropna()
train_df = train_df.loc[train_df["document"].str.len().gt(0)]

# Test split: same two-step cleaning.
test_df = test_df.dropna()
test_df = test_df.loc[test_df["document"].str.len().gt(0)]

(train_df.shape, test_df.shape)

((149995, 3), (49997, 3))

In [None]:
class NSMCDataset(Dataset):
    """Map-style dataset that tokenizes one review per item.

    Args:
        texts: Review texts; must expose ``.tolist()`` (e.g. a pandas Series).
        labels: Class labels aligned with ``texts``; must expose ``.tolist()``.
        tokenizer: Callable invoked with the text plus ``truncation``,
            ``padding``, ``max_length`` and ``return_tensors`` keyword
            arguments; its result must provide ``input_ids`` and
            ``attention_mask`` tensors with a leading dimension of size 1.
        max_len: Length every sequence is truncated/padded to.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.max_len = max_len
        # Keep plain Python lists so items are looked up by position.
        self.texts = texts.tolist()
        self.labels = labels.tolist()

    def __len__(self):
        """Return the number of reviews held by the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` for item ``idx``."""
        review = str(self.texts[idx])
        target = int(self.labels[idx])

        encoded = self.tokenizer(
            review,
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt",
        )

        # Drop the leading batch dimension added by return_tensors="pt".
        item = {
            key: encoded[key].squeeze(0)
            for key in ("input_ids", "attention_mask")
        }
        item["labels"] = torch.tensor(target, dtype=torch.long)
        return item

# KoBERT

In [None]:
MODEL_NAME = "monologg/kobert"

# KoBERT ships a custom tokenizer implementation, so trust_remote_code=True is required.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)

# Hyperparameters
MAX_LEN = 128      # sequence length passed to the dataset
BATCH_SIZE = 32    # batch size for the DataLoaders
EPOCHS = 3         # number of training epochs
LR = 5e-5          # learning rate for the optimizer


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/263 [00:00<?, ?B/s]

tokenization_kobert.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/monologg/kobert:
- tokenization_kobert.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


tokenizer_78b3253a26.model:   0%|          | 0.00/371k [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

In [None]:
TRAIN_SIZE = 50000

# Fixed-seed random subset of the training frame used for this run.
small_train_df = train_df.sample(n=TRAIN_SIZE, random_state=42)

train_dataset = NSMCDataset(
    small_train_df["document"], small_train_df["label"], tokenizer, MAX_LEN
)

# NOTE(review): the NSMC test split doubles as the validation set here, so any
# model selection based on it is not an independent test estimate.
valid_dataset = NSMCDataset(
    test_df["document"], test_df["label"], tokenizer, MAX_LEN
)

(len(train_dataset), len(valid_dataset))

(50000, 49997)

In [None]:
# Training batches are reshuffled; pinned memory is requested only on CUDA.
train_loader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=2,
    pin_memory=(device.type == "cuda"),
)

# Validation batches keep the dataset order.
valid_loader = DataLoader(
    valid_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=2,
    pin_memory=(device.type == "cuda"),
)

In [None]:
# Two output classes for NSMC: negative (0) / positive (1).
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)
model.to(device)

optimizer = AdamW(model.parameters(), lr=LR)

# One scheduler step per batch: linear warmup over the first 10% of all
# optimizer steps, then linear decay for the remainder.
total_steps = EPOCHS * len(train_loader)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_training_steps=total_steps,
    num_warmup_steps=int(0.1 * total_steps),
)

config.json:   0%|          | 0.00/426 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/369M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at monologg/kobert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
def train_one_epoch(epoch, model, train_loader, optimizer, scheduler, device):
    """Run one training pass over ``train_loader``.

    Args:
        epoch: Epoch number, used only in the progress-bar label.
        model: Model called with ``input_ids``, ``attention_mask`` and
            ``labels``; its output must expose ``.loss``.
        train_loader: Iterable of dict batches whose values are tensors.
        optimizer: Optimizer stepped once per batch.
        scheduler: Learning-rate scheduler stepped once per batch.
        device: Device every batch tensor is moved to.

    Returns:
        Sum of the per-batch losses divided by ``len(train_loader)``.
    """
    model.train()
    running_loss = 0.0

    progress = tqdm(train_loader, desc=f"[Epoch {epoch}] Train", leave=False)
    for raw_batch in progress:
        inputs = {name: tensor.to(device) for name, tensor in raw_batch.items()}

        loss = model(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            labels=inputs["labels"],
        ).loss

        optimizer.zero_grad()
        loss.backward()
        # Clip the global gradient norm to 1.0 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()

        step_loss = loss.item()
        running_loss += step_loss
        progress.set_postfix({"loss": f"{step_loss:.4f}"})

    return running_loss / len(train_loader)


def evaluate(model, valid_loader, device):
    """Evaluate ``model`` on ``valid_loader`` without tracking gradients.

    Args:
        model: Model called with ``input_ids``, ``attention_mask`` and
            ``labels``; its output must expose ``.loss`` and ``.logits``.
        valid_loader: Iterable of dict batches whose values are tensors.
        device: Device every batch tensor is moved to.

    Returns:
        Tuple ``(avg_loss, acc, f1)``: the summed batch losses divided by
        ``len(valid_loader)``, the accuracy from ``accuracy_score`` and the F1
        from ``f1_score`` called with its default settings.
    """
    model.eval()
    loss_sum = 0.0
    pred_chunks, true_chunks = [], []

    with torch.no_grad():
        progress = tqdm(valid_loader, desc="[Eval]", leave=False)
        for raw_batch in progress:
            inputs = {name: tensor.to(device) for name, tensor in raw_batch.items()}

            result = model(
                input_ids=inputs["input_ids"],
                attention_mask=inputs["attention_mask"],
                labels=inputs["labels"],
            )

            loss_sum += result.loss.item()
            # Predicted class = index of the largest logit.
            pred_chunks.append(result.logits.argmax(dim=-1).cpu())
            true_chunks.append(inputs["labels"].cpu())

    y_pred = torch.cat(pred_chunks).numpy()
    y_true = torch.cat(true_chunks).numpy()

    return (
        loss_sum / len(valid_loader),
        accuracy_score(y_true, y_pred),
        f1_score(y_true, y_pred),
    )

In [None]:
# Highest validation accuracy seen so far; a checkpoint is written whenever it improves.
best_acc = 0.0

for epoch in range(1, EPOCHS + 1):
    train_loss = train_one_epoch(
        epoch, model, train_loader, optimizer, scheduler, device
    )
    val_loss, val_acc, val_f1 = evaluate(model, valid_loader, device)

    print(
        f"===== Epoch {epoch}/{EPOCHS} =====",
        f"Train Loss: {train_loss:.4f}",
        f"Valid Loss: {val_loss:.4f} | Acc: {val_acc:.4f} | F1: {val_f1:.4f}",
        sep="\n",
    )

    # Only a strict improvement overwrites the saved checkpoint.
    if val_acc > best_acc:
        best_acc = val_acc
        model.save_pretrained("kobert_nsmc_exp1")
        print(f"✅ Best model saved (Acc={best_acc:.4f})")

[Epoch 1] Train:   0%|          | 0/1563 [00:00<?, ?it/s]

[Eval]:   0%|          | 0/1563 [00:00<?, ?it/s]

===== Epoch 1/3 =====
Train Loss: 0.2414
Valid Loss: 0.3383 | Acc: 0.8841 | F1: 0.8862
✅ Best model saved (Acc=0.8841)


[Epoch 2] Train:   0%|          | 0/1563 [00:00<?, ?it/s]

[Eval]:   0%|          | 0/1563 [00:00<?, ?it/s]

===== Epoch 2/3 =====
Train Loss: 0.1367
Valid Loss: 0.3680 | Acc: 0.8862 | F1: 0.8879
✅ Best model saved (Acc=0.8862)


[Epoch 3] Train:   0%|          | 0/1563 [00:00<?, ?it/s]

[Eval]:   0%|          | 0/1563 [00:00<?, ?it/s]

===== Epoch 3/3 =====
Train Loss: 0.0864
Valid Loss: 0.3680 | Acc: 0.8862 | F1: 0.8879


# KoELECTRA

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.optim import AdamW

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

# Dedicated names for the KoELECTRA checkpoint and learning rate. Reusing
# MODEL_NAME / LR would overwrite the KoBERT values, so re-running an earlier
# KoBERT cell would silently pick up the KoELECTRA settings.
ELECTRA_MODEL_NAME = "monologg/koelectra-base-v3-discriminator"
LR_ELEC = 3e-5

tokenizer_elec = AutoTokenizer.from_pretrained(
    ELECTRA_MODEL_NAME,
    use_fast=True,          # fast tokenizer
    trust_remote_code=True,
)

# Same values as the KoBERT run; restated so this section is readable on its own.
MAX_LEN = 128
BATCH_SIZE = 32
EPOCHS = 3

# NOTE(review): this run trains on the full training frame, whereas the KoBERT
# run trains on a 50,000-row sample, so the two accuracies are not a
# like-for-like comparison.
train_dataset_elec = NSMCDataset(
    train_df["document"],
    train_df["label"],
    tokenizer_elec,
    MAX_LEN,
)

valid_dataset_elec = NSMCDataset(
    test_df["document"],
    test_df["label"],
    tokenizer_elec,
    MAX_LEN,
)

train_loader_elec = DataLoader(
    train_dataset_elec,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=2,
    pin_memory=device.type == "cuda",
)

valid_loader_elec = DataLoader(
    valid_dataset_elec,
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=2,
    pin_memory=device.type == "cuda",
)

model_elec = AutoModelForSequenceClassification.from_pretrained(
    ELECTRA_MODEL_NAME,
    num_labels=2,   # NSMC: 0 = negative, 1 = positive
)
model_elec.to(device)

optimizer_elec = AdamW(model_elec.parameters(), lr=LR_ELEC)

# One scheduler step per batch: linear warmup over the first 10% of all
# optimizer steps, then linear decay.
total_steps_elec = len(train_loader_elec) * EPOCHS
scheduler_elec = get_linear_schedule_with_warmup(
    optimizer_elec,
    num_warmup_steps=int(0.1 * total_steps_elec),
    num_training_steps=total_steps_elec,
)

Device: cuda


tokenizer_config.json:   0%|          | 0.00/61.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/467 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/452M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/452M [00:00<?, ?B/s]

Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at monologg/koelectra-base-v3-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Highest KoELECTRA validation accuracy seen so far.
best_acc_elec = 0.0

for epoch in range(1, EPOCHS + 1):
    train_loss = train_one_epoch(
        epoch, model_elec, train_loader_elec, optimizer_elec, scheduler_elec, device
    )
    val_loss, val_acc, val_f1 = evaluate(model_elec, valid_loader_elec, device)

    print(
        f"===== [KoELECTRA] Epoch {epoch}/{EPOCHS} =====",
        f"Train Loss: {train_loss:.4f}",
        f"Valid Loss: {val_loss:.4f} | Acc: {val_acc:.4f} | F1: {val_f1:.4f}",
        sep="\n",
    )

    # Only a strict improvement overwrites the saved checkpoint.
    if val_acc > best_acc_elec:
        best_acc_elec = val_acc
        save_dir = "koelectra_nsmc_exp1"
        model_elec.save_pretrained(save_dir)
        print(f"✅ [KoELECTRA] Best model saved to '{save_dir}' (Acc={best_acc_elec:.4f})")


[Epoch 1] Train:   0%|          | 0/4688 [00:00<?, ?it/s]

[Eval]:   0%|          | 0/1563 [00:00<?, ?it/s]

===== [KoELECTRA] Epoch 1/3 =====
Train Loss: 0.3031
Valid Loss: 0.2349 | Acc: 0.9050 | F1: 0.9077
✅ [KoELECTRA] Best model saved to 'koelectra_nsmc_exp1' (Acc=0.9050)


[Epoch 2] Train:   0%|          | 0/4688 [00:00<?, ?it/s]

[Eval]:   0%|          | 0/1563 [00:00<?, ?it/s]

===== [KoELECTRA] Epoch 2/3 =====
Train Loss: 0.1900
Valid Loss: 0.2376 | Acc: 0.9105 | F1: 0.9101
✅ [KoELECTRA] Best model saved to 'koelectra_nsmc_exp1' (Acc=0.9105)


[Epoch 3] Train:   0%|          | 0/4688 [00:00<?, ?it/s]

[Eval]:   0%|          | 0/1563 [00:00<?, ?it/s]

===== [KoELECTRA] Epoch 3/3 =====
Train Loss: 0.1284
Valid Loss: 0.2833 | Acc: 0.9112 | F1: 0.9125
✅ [KoELECTRA] Best model saved to 'koelectra_nsmc_exp1' (Acc=0.9112)


# TEST

In [1]:
!pip install -q transformers sentencepiece scikit-learn accelerate
!apt-get -y install git-lfs

!rm -rf Movie_review_sentiment_analysis
!git clone https://github.com/SolarHO/Movie_review_sentiment_analysis.git
%cd Movie_review_sentiment_analysis

!git lfs install
!git lfs pull

!ls
!ls kobert_nsmc_exp2
!ls koelectra_nsmc_exp1

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
git-lfs is already the newest version (3.0.2-1ubuntu0.3).
0 upgraded, 0 newly installed, 0 to remove and 41 not upgraded.
Cloning into 'Movie_review_sentiment_analysis'...
remote: Enumerating objects: 49, done.[K
remote: Counting objects: 100% (49/49), done.[K
remote: Compressing objects: 100% (41/41), done.[K
remote: Total 49 (delta 13), reused 27 (delta 2), pack-reused 0 (from 0)[K
Receiving objects: 100% (49/49), 480.45 KiB | 8.58 MiB/s, done.
Resolving deltas: 100% (13/13), done.
Filtering content: 100% (2/2), 782.48 MiB | 55.87 MiB/s, done.
/content/Movie_review_sentiment_analysis
Updated git hooks.
Git LFS initialized.
kobert_nsmc_exp1  koelectra_nsmc_exp1  README.md
ls: cannot access 'kobert_nsmc_exp2': No such file or directory
config.json  model.safetensors


In [2]:
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score

# Inference in this section is pinned to the CPU.
device = torch.device("cpu")
print("Device:", device)

Device: cpu


In [3]:
!rm -rf nsmc
!git clone -q https://github.com/e9t/nsmc.git

test_df = pd.read_table("nsmc/ratings_test.txt")
test_df = test_df.dropna()
test_df = test_df[test_df["document"].str.len() > 0]

print(test_df.shape)
test_df.head()

(49997, 3)


Unnamed: 0,id,document,label
0,6270596,굳 ㅋ,1
1,9274899,GDNTOPCLASSINTHECLUB,0
2,8544678,뭐야 이 평점들은.... 나쁘진 않지만 10점 짜리는 더더욱 아니잖아,0
3,6825595,지루하지는 않은데 완전 막장임... 돈주고 보기에는....,0
4,6723715,3D만 아니었어도 별 다섯 개 줬을텐데.. 왜 3D로 나와서 제 심기를 불편하게 하죠??,0


## KoBERT (kobert_nsmc_exp1)

In [6]:
kobert_model_dir = "kobert_nsmc_exp1"

# Load the fine-tuned weights from the local directory, move them to the
# selected device and switch to evaluation mode.
kobert_model = AutoModelForSequenceClassification.from_pretrained(kobert_model_dir)
kobert_model.to(device)
kobert_model.eval()

# The tokenizer is loaded from the hub checkpoint rather than from the local directory.
kobert_tokenizer = AutoTokenizer.from_pretrained("monologg/kobert", trust_remote_code=True)

# Shared by both prediction functions below.
MAX_LEN = 128
id2label = {0: "부정", 1: "긍정"}  # 0 = negative, 1 = positive

tokenizer_config.json:   0%|          | 0.00/263 [00:00<?, ?B/s]

tokenization_kobert.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/monologg/kobert:
- tokenization_kobert.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


tokenizer_78b3253a26.model:   0%|          | 0.00/371k [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

## KoELECTRA (koelectra_nsmc_exp1)

In [5]:
koelectra_model_dir = "koelectra_nsmc_exp1"

# Load the fine-tuned weights from the local directory, move them to the
# selected device and switch to evaluation mode.
koelectra_model = AutoModelForSequenceClassification.from_pretrained(koelectra_model_dir)
koelectra_model.to(device)
koelectra_model.eval()

# The tokenizer is loaded from the hub checkpoint rather than from the local directory.
koelectra_tokenizer = AutoTokenizer.from_pretrained(
    "monologg/koelectra-base-v3-discriminator", use_fast=True, trust_remote_code=True
)

tokenizer_config.json:   0%|          | 0.00/61.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/467 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

In [7]:
def _predict_sentiment(text, model, tokenizer):
    """Classify one sentence with the given model/tokenizer pair.

    Shared implementation behind the per-model entry points, so the
    tokenization settings and post-processing cannot drift apart.

    Args:
        text: Sentence to classify.
        model: Sequence-classification model whose output exposes ``.logits``.
        tokenizer: Tokenizer matching ``model``.

    Returns:
        Tuple ``(label, prob)``: the ``id2label`` entry of the highest-scoring
        class and its softmax probability as a Python float.
    """
    inputs = tokenizer(
        text,
        truncation=True,
        padding="max_length",
        max_length=MAX_LEN,
        return_tensors="pt",
    )
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        logits = model(**inputs).logits
        probs = F.softmax(logits, dim=-1)

    # Single-sentence input: row 0 holds the only prediction.
    pred_id = int(torch.argmax(probs, dim=-1).cpu().item())
    prob = float(probs[0, pred_id].cpu().item())
    return id2label[pred_id], prob


def predict_sentiment_kobert(text: str):
    """Return ``(label, probability)`` for ``text`` using the fine-tuned KoBERT model."""
    return _predict_sentiment(text, kobert_model, kobert_tokenizer)


def predict_sentiment_koelectra(text: str):
    """Return ``(label, probability)`` for ``text`` using the fine-tuned KoELECTRA model."""
    return _predict_sentiment(text, koelectra_model, koelectra_tokenizer)

In [10]:
sentences = [
    "진짜 너무 재밌는 영화였다. 또 보고 싶다.",
    "스토리도 별로고 연기도 어색해서 시간 아까웠다.",
    "그냥저냥 볼 만했지만 다시 볼 정도는 아니다.",
    "돈이 전혀 아깝지 않았고, 시간 가는 줄 몰랐다.",
    "뭘 말하고 싶은지도 모르겠고 전체적으로 난해함",
]

for s in sentences:
    k_label, k_p = predict_sentiment_kobert(s)
    e_label, e_p = predict_sentiment_koelectra(s)
    print(f"[문장] {s}")
    print(f"  KoBERT     → {k_label} (p={k_p:.4f})")
    print(f"  KoELECTRA  → {e_label} (p={e_p:.4f})")
    print()

[문장] 진짜 너무 재밌는 영화였다. 또 보고 싶다.
  KoBERT     → 긍정 (p=0.9945)
  KoELECTRA  → 긍정 (p=0.9910)

[문장] 스토리도 별로고 연기도 어색해서 시간 아까웠다.
  KoBERT     → 부정 (p=0.9969)
  KoELECTRA  → 부정 (p=0.9988)

[문장] 그냥저냥 볼 만했지만 다시 볼 정도는 아니다.
  KoBERT     → 부정 (p=0.9958)
  KoELECTRA  → 부정 (p=0.9978)

[문장] 돈이 전혀 아깝지 않았고, 시간 가는 줄 몰랐다.
  KoBERT     → 긍정 (p=0.9919)
  KoELECTRA  → 긍정 (p=0.9971)

[문장] 뭘 말하고 싶은지도 모르겠고 전체적으로 난해함
  KoBERT     → 부정 (p=0.9967)
  KoELECTRA  → 부정 (p=0.9988)

