In [1]:
# Mount Google Drive at /content/drive; the dataset CSV read later in this
# notebook lives under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install torch transformers gluonnlp pandas
!pip install torch transformers gluonnlp sentencepiece pandas
# !pip install git+https://git@github.com/SKTBrain/KoBERT.git@master
!pip install kobert-transformers


Collecting gluonnlp
  Downloading gluonnlp-0.10.0.tar.gz (344 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m344.5/344.5 kB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: gluonnlp
  Building wheel for gluonnlp (setup.py) ... [?25l[?25hdone
  Created wheel for gluonnlp: filename=gluonnlp-0.10.0-cp310-cp310-linux_x86_64.whl size=661656 sha256=daac01b6d620627f3b04f6981ebd661ca3946dcc5dffb9cede70d24e53acd1e8
  Stored in directory: /root/.cache/pip/wheels/1a/1e/0d/99f55911d90f2b95b9f7c176d5813ef3622894a4b30fde6bd3
Successfully built gluonnlp
Installing collected packages: gluonnlp
Successfully installed gluonnlp-0.10.0
Collecting kobert-transformers
  Downloading kobert_transformers-0.6.0-py3-none-any.whl.metadata (7.3 kB)
Downloading kobert_transformers-0.6.0-py3-none-any.whl (12 kB)
Installing collected packages: kobert-transformers
Successfully installed kobert-transform

In [3]:
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel
from transformers.optimization import AdamW
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np

# Load the KoBERT tokenizer and encoder
# (checkpoint switched from 'skt/kobert-base-v1' to 'monologg/kobert').
# NOTE(review): loading this checkpoint through BertTokenizer emits a warning
# that the checkpoint's tokenizer class is 'KoBertTokenizer' and that
# tokenization may be unexpected. Inspect a few encoded samples (e.g. the share
# of [UNK] ids) before trusting any training result.
tokenizer = BertTokenizer.from_pretrained('monologg/kobert')
bert_model = BertModel.from_pretrained('monologg/kobert')

# Load the CSV file (adjust the path to your own Drive layout).
df = pd.read_csv('/content/drive/My Drive/SookLog/sentiment_dialogues.csv')

# Map the emotion names to integer class ids.
emotion_label_mapping = {
    "분노": 0, "슬픔": 1, "행복": 2, "불안": 3,
    "당황": 4, "혐오": 5, "공포": 6
}
df['label'] = df['감정'].map(emotion_label_mapping)

# Drop rows whose emotion is not in the mapping (label is NaN) and rows with a
# missing utterance; the latter would otherwise reach the tokenizer as the
# literal string "nan". The explicit copy makes the filtered frame independent
# so the column assignment below cannot trigger SettingWithCopyWarning.
df = df.dropna(subset=['label', '발화']).copy()
df['label'] = df['label'].astype(int)  # labels become plain integers

# Hold out 20% of the rows; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(df['발화'], df['label'], test_size=0.2, random_state=42)

# Dataset definition
class SentimentDataset(Dataset):
    """Map-style dataset that pairs utterances with integer emotion labels.

    Every sample is tokenized on access and returned as fixed-length tensors.

    Args:
        texts: Indexable collection of utterances (pandas Series, list, tuple,
            numpy array, ...). Each item is passed through ``str`` before
            tokenizing.
        labels: Indexable collection of integer class ids with the same length
            as ``texts``.
        tokenizer: Callable tokenizer accepting the keyword arguments used in
            ``__getitem__`` and returning a mapping with ``input_ids`` and
            ``attention_mask`` tensors.
        max_len: Length every sample is padded or truncated to.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(texts)} and {len(labels)}"
            )
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    @staticmethod
    def _item_at(values, idx):
        """Return the element at position ``idx`` of ``values``."""
        # pandas objects may carry a non-contiguous index (e.g. after a
        # shuffled split), so they must be addressed by position via .iloc;
        # plain sequences are indexed directly.
        positional = getattr(values, 'iloc', None)
        if positional is not None:
            return positional[idx]
        return values[idx]

    def __getitem__(self, idx):
        text = str(self._item_at(self.texts, idx))
        label = self._item_at(self.labels, idx)

        # Calling the tokenizer directly replaces the deprecated encode_plus().
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        # return_tensors='pt' adds a leading batch axis of size 1; flatten it
        # away so the DataLoader can stack samples itself.
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }

# Hyperparameters
MAX_LEN = 64
BATCH_SIZE = 8

# Wrap each split in a Dataset
train_dataset = SentimentDataset(texts=X_train, labels=y_train, tokenizer=tokenizer, max_len=MAX_LEN)
test_dataset = SentimentDataset(texts=X_test, labels=y_test, tokenizer=tokenizer, max_len=MAX_LEN)

# Batch the datasets; only the training loader shuffles.
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

# Sanity check: print the tensor shapes of the first training batch only
# (input ids, attention mask, labels - in that order).
for batch in train_loader:
    for field in ('input_ids', 'attention_mask', 'label'):
        print(batch[field].shape)
    break


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/263 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/77.8k [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'KoBertTokenizer'. 
The class this function is called from is 'BertTokenizer'.


config.json:   0%|          | 0.00/426 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/369M [00:00<?, ?B/s]

torch.Size([8, 64])
torch.Size([8, 64])
torch.Size([8])


In [4]:
class SentimentClassifier(nn.Module):
    """Classification head on top of a BERT-style encoder.

    The encoder's pooled output goes through dropout (p=0.3) and a single
    linear layer that produces one logit per class.

    Args:
        bert_model: Encoder module exposing ``config.hidden_size`` and whose
            output has a ``pooler_output`` attribute.
        num_labels: Number of output classes.
    """

    def __init__(self, bert_model, num_labels):
        super().__init__()
        hidden_size = bert_model.config.hidden_size
        self.bert = bert_model
        self.drop = nn.Dropout(p=0.3)
        self.out = nn.Linear(hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):
        """Return the class logits for a batch of token ids and masks."""
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        return self.out(self.drop(encoded.pooler_output))

# Build the KoBERT-based emotion classifier and move it to the GPU when one is
# available. The class count is taken from the label mapping rather than
# hard-coded, so the output layer cannot drift out of sync with the labels.
model = SentimentClassifier(bert_model=bert_model, num_labels=len(emotion_label_mapping))
model = model.to('cuda' if torch.cuda.is_available() else 'cpu')


In [None]:
# Loss function and optimizer
loss_fn = nn.CrossEntropyLoss().to(
    'cuda' if torch.cuda.is_available() else 'cpu'
)
optimizer = AdamW(
    model.parameters(),
    lr=2e-5,
    correct_bias=False,
)

# Training function
def train_epoch(model, data_loader, loss_fn, optimizer, device):
    """Run one optimisation pass over ``data_loader``.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``.
        data_loader: Iterable of batches, each a mapping with the keys
            ``input_ids``, ``attention_mask`` and ``label``.
        loss_fn: Callable taking ``(outputs, labels)`` and returning a loss.
        optimizer: Optimizer updating the model's parameters.
        device: Device the batch tensors are moved to.

    Returns:
        The sum of the per-batch losses divided by ``len(data_loader)``.
    """
    model = model.train()
    batch_losses = []

    for batch in data_loader:
        ids, mask, targets = (
            batch[key].to(device)
            for key in ('input_ids', 'attention_mask', 'label')
        )

        logits = model(input_ids=ids, attention_mask=mask)
        loss = loss_fn(logits, targets)

        # Clear the previous step's gradients before back-propagating.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        batch_losses.append(loss.item())

    return sum(batch_losses) / len(data_loader)

# Evaluation function
def eval_model(model, data_loader, loss_fn, device):
    """Evaluate ``model`` on ``data_loader`` without tracking gradients.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``.
        data_loader: Iterable of batches (mappings with ``input_ids``,
            ``attention_mask`` and ``label``) that also exposes ``dataset``.
        loss_fn: Callable taking ``(outputs, labels)`` and returning a loss.
        device: Device the batch tensors are moved to.

    Returns:
        A tuple ``(accuracy, mean_loss)``: the number of correct predictions
        divided by ``len(data_loader.dataset)`` as a double-precision tensor,
        and the summed batch losses divided by ``len(data_loader)``.
    """
    model = model.eval()
    loss_sum = 0
    n_correct = 0

    with torch.no_grad():
        for batch in data_loader:
            ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            targets = batch['label'].to(device)

            logits = model(input_ids=ids, attention_mask=mask)
            loss_sum += loss_fn(logits, targets).item()

            # The predicted class is the index of the largest logit per row.
            predicted = torch.max(logits, dim=1).indices
            n_correct += (predicted == targets).sum()

    accuracy = n_correct.double() / len(data_loader.dataset)
    mean_loss = loss_sum / len(data_loader)
    return accuracy, mean_loss

# Run training and evaluation
EPOCHS = 2
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

for epoch in range(EPOCHS):
    print(f'Epoch {epoch + 1}/{EPOCHS}')

    # One optimisation pass over the training loader.
    train_loss = train_epoch(
        model=model,
        data_loader=train_loader,
        loss_fn=loss_fn,
        optimizer=optimizer,
        device=device,
    )
    print(f'Train loss: {train_loss:.4f}')

    # Score the held-out split after every epoch.
    acc, val_loss = eval_model(
        model=model,
        data_loader=test_loader,
        loss_fn=loss_fn,
        device=device,
    )
    print(f'Validation accuracy: {acc:.4f}, Validation loss: {val_loss:.4f}')




Epoch 1/2


In [None]:
# `pickle` is used below but is not imported in any of the visible cells, so
# this cell would fail with NameError; import it here so the cell runs on its own.
import pickle

# Save the model weights (state_dict only; the SentimentClassifier class is
# needed again to load them).
torch.save(model.state_dict(), 'kobert_sentiment_model.pth')

# Save the tokenizer object.
# NOTE(review): unpickling can execute arbitrary code - only load this file
# from a trusted location.
with open('kobert_tokenizer.pkl', 'wb') as f:
    pickle.dump(tokenizer, f)

print("모델과 토크나이저 저장 완료!")
