In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
# NOTE(review): the data files read later in this notebook are referenced by
# relative file name, not by a path under /content/drive — confirm whether
# this mount is still required.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import re
from collections import Counter
from sklearn.preprocessing import LabelEncoder

import pandas as pd

# Locations of the Excel workbooks for each split
train_path = "Training.xlsx"
val_path = "Validation.xlsx"

# Read each workbook into its own DataFrame (training first, then validation)
train_df, val_df = (pd.read_excel(workbook) for workbook in (train_path, val_path))


# 1. Merge the utterance columns into a single text field
def combine_texts(df):
    """Join the three '사람문장' columns of ``df`` into one space-separated Series.

    Missing values in any of the columns are replaced by empty strings before
    concatenation, so the separating spaces are still emitted when a sentence
    is absent.
    """
    first_column, *remaining_columns = ('사람문장1', '사람문장2', '사람문장3')
    combined = df[first_column].fillna('')
    for column in remaining_columns:
        combined = combined + ' ' + df[column].fillna('')
    return combined

# Attach the merged text column to both splits
train_df['text'], val_df['text'] = combine_texts(train_df), combine_texts(val_df)

# 2. Label encoding: the encoder is fitted on the training labels only and
# then applied to both splits, so both share the same class -> id mapping.
label_encoder = LabelEncoder().fit(train_df['감정_대분류'])
train_df['label'] = label_encoder.transform(train_df['감정_대분류'])
val_df['label'] = label_encoder.transform(val_df['감정_대분류'])

# 3. Tokenizer and vocabulary construction
def tokenize(text):
    """Lower-case ``text`` and return its word tokens as a list of strings.

    A token is a run of word characters delimited by word boundaries;
    punctuation and whitespace are dropped.
    """
    lowered = text.lower()
    return [match.group(0) for match in re.finditer(r'\b\w+\b', lowered)]

# Count token occurrences over the training text only
counter = Counter(
    token
    for sentence in train_df['text']
    for token in tokenize(sentence)
)

# Ids 0 and 1 are reserved for the special tokens, so corpus words are
# numbered from 2 in the order they were first seen.
vocab = {word: word_id for word_id, word in enumerate(counter, start=2)}
vocab.update({'<PAD>': 0, '<UNK>': 1})

# 4. Convert text to token ids
def encode(text, max_len=100):
    """Convert ``text`` into a fixed-length list of vocabulary ids.

    The text is split with ``tokenize`` — the same tokenizer that was used to
    build ``vocab`` — so encoding can never drift from vocabulary
    construction. Tokens missing from ``vocab`` map to the ``<UNK>`` id. The
    result is truncated to ``max_len`` tokens and right-padded with the
    ``<PAD>`` id.

    Args:
        text: Raw input string.
        max_len: Length of the returned list; must be non-negative.

    Returns:
        A list of exactly ``max_len`` integer ids.

    Raises:
        ValueError: If ``max_len`` is negative (a negative slice bound would
            otherwise silently drop tokens from the end).
    """
    if max_len < 0:
        raise ValueError(f"max_len must be non-negative, got {max_len}")
    pad_id, unk_id = vocab['<PAD>'], vocab['<UNK>']
    # Truncate before the lookup so long texts do no unnecessary work.
    ids = [vocab.get(token, unk_id) for token in tokenize(text)[:max_len]]
    return ids + [pad_id] * (max_len - len(ids))

# Fixed sequence length shared by both splits
MAX_LEN = 100
train_df['input_ids'] = train_df['text'].apply(lambda x: encode(x, max_len=MAX_LEN))
val_df['input_ids'] = val_df['text'].apply(lambda x: encode(x, max_len=MAX_LEN))

# The 'text' column was already built from the sentence columns before
# encoding and those columns have not changed since, so it is not recomputed.

# Preview processed data
train_df[['text', '감정_대분류', 'label']].head()

Unnamed: 0,text,감정_대분류,label
0,일은 왜 해도 해도 끝이 없을까? 화가 난다. 그냥 내가 해결하는 게 나아. 남들한...,분노,2
1,이번 달에 또 급여가 깎였어! 물가는 오르는데 월급만 자꾸 깎이니까 너무 화가 나....,분노,2
2,회사에 신입이 들어왔는데 말투가 거슬려. 그런 애를 매일 봐야 한다고 생각하니까 스...,분노,2
3,직장에서 막내라는 이유로 나에게만 온갖 심부름을 시켜. 일도 많은 데 정말 분하고 ...,분노,2
4,얼마 전 입사한 신입사원이 나를 무시하는 것 같아서 너무 화가 나. 상사인 나에게 ...,분노,2


In [None]:
import torch
from torch.utils.data import Dataset

class EmotionDataset(Dataset):
    """Map-style dataset pairing encoded token-id sequences with class labels."""

    def __init__(self, input_ids, labels):
        """Store the two parallel sequences.

        Args:
            input_ids: Indexable collection; element ``i`` is the list of
                token ids for sample ``i``.
            labels: Indexable collection; element ``i`` is the integer class
                id for sample ``i``.

        Raises:
            ValueError: If ``input_ids`` and ``labels`` differ in length.
                Without this check surplus labels would be ignored silently,
                or a missing label would only surface as an IndexError in the
                middle of an epoch.
        """
        if len(input_ids) != len(labels):
            raise ValueError(
                f"input_ids and labels must have the same length, "
                f"got {len(input_ids)} and {len(labels)}"
            )
        self.input_ids = input_ids
        self.labels = labels

    def __len__(self):
        """Return the number of samples."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a ``(token_ids, label)`` pair of int64 tensors."""
        x = torch.tensor(self.input_ids[idx], dtype=torch.long)
        y = torch.tensor(self.labels[idx], dtype=torch.long)
        return x, y


In [None]:
import torch.nn as nn

class BiLSTMAttentionModel(nn.Module):
    """Bidirectional LSTM classifier with additive attention pooling over time.

    Token ids are embedded, run through a BiLSTM, and the per-timestep outputs
    are collapsed into one context vector by a learned softmax attention. The
    context vector is projected to ``output_dim`` logits.

    Padding positions are excluded from the attention softmax, so they receive
    zero weight and do not contribute to the context vector. The parameter set
    is unchanged by the masking, so existing state dicts still load; a
    checkpoint trained with unmasked attention will however produce slightly
    different outputs and is best retrained.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, pad_idx=0):
        """Build the layers.

        Args:
            vocab_size: Number of rows in the embedding table.
            embedding_dim: Size of each token embedding.
            hidden_dim: Hidden size of each LSTM direction.
            output_dim: Number of output classes.
            pad_idx: Token id used for padding; its embedding is kept fixed by
                ``nn.Embedding`` and its positions are masked out of the
                attention. ``None`` disables both.
        """
        super().__init__()
        # Plain attribute (not a parameter/buffer) so the state dict layout is unchanged.
        self.pad_idx = pad_idx
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, bidirectional=True, batch_first=True)
        self.attn = nn.Linear(hidden_dim * 2, 1)  # for attention weights
        self.fc = nn.Linear(hidden_dim * 2, output_dim)

    def forward(self, x):
        """Return class logits of shape [B, output_dim] for token ids ``x`` of shape [B, T]."""
        embedded = self.embedding(x)  # [B, T, E]
        lstm_out, _ = self.lstm(embedded)  # [B, T, 2H]

        # Attention mechanism
        scores = self.attn(lstm_out)  # [B, T, 1]
        if self.pad_idx is not None:
            pad_mask = x.eq(self.pad_idx).unsqueeze(-1)  # [B, T, 1]
            # A row made only of padding keeps its raw scores: masking every
            # position would turn the softmax into NaN.
            all_pad = pad_mask.all(dim=1, keepdim=True)  # [B, 1, 1]
            scores = scores.masked_fill(pad_mask & ~all_pad, float('-inf'))
        attn_weights = torch.softmax(scores, dim=1)  # [B, T, 1]
        context = torch.sum(attn_weights * lstm_out, dim=1)  # [B, 2H]

        return self.fc(context)  # [B, output_dim]


In [None]:
from torch.utils.data import DataLoader
import torch.optim as optim
from sklearn.metrics import classification_report
import pandas as pd

# Load the pre-encoded splits.
# Nothing in this notebook writes processed_train.csv / processed_val.csv, so
# on a fresh runtime they may be missing. When a file is absent, fall back to
# the frame produced by the preprocessing cell, which already carries the
# 'input_ids' and 'label' columns.
import os


def _load_processed_split(csv_path, in_memory_df):
    """Return the split stored at ``csv_path``, or ``in_memory_df`` if the file is absent.

    When read from CSV, the whitespace-separated integers in the
    'input_ids_str' column are parsed back into a list of ints and stored in
    an 'input_ids' column.
    """
    if not os.path.exists(csv_path):
        return in_memory_df
    frame = pd.read_csv(csv_path)
    frame['input_ids'] = frame['input_ids_str'].apply(
        lambda ids_str: [int(token_id) for token_id in ids_str.split()]
    )
    return frame


train_df = _load_processed_split("processed_train.csv", train_df)
val_df = _load_processed_split("processed_val.csv", val_df)

train_dataset = EmotionDataset(train_df['input_ids'].tolist(), train_df['label'].tolist())
val_dataset   = EmotionDataset(val_df['input_ids'].tolist(), val_df['label'].tolist())

# Only the training batches are shuffled; validation keeps file order.
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32)

# Model initialisation
# Seed PyTorch's global RNG first so that weight initialisation and the
# shuffled batch order are repeatable from run to run (GPU kernels may still
# introduce small non-deterministic differences).
torch.manual_seed(42)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BiLSTMAttentionModel(
    vocab_size=len(vocab),
    embedding_dim=128,
    hidden_dim=128,
    output_dim=len(label_encoder.classes_)
).to(device)

# Multi-class objective over the raw logits, optimised with Adam
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

# Training loop: 20 passes over the training loader. The value printed per
# epoch is the sum of the per-batch losses, not their mean.
for epoch in range(20):
    model.train()
    total_loss = 0
    for inputs, targets in train_loader:
        inputs = inputs.to(device)
        targets = targets.to(device)

        optimizer.zero_grad()
        loss = criterion(model(inputs), targets)
        loss.backward()
        optimizer.step()

        total_loss = total_loss + loss.item()
    print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}")

# Evaluation: collect the arg-max prediction and the true label for every
# validation sample, with gradients disabled.
model.eval()
all_preds = []
all_labels = []
with torch.no_grad():
    for inputs, targets in val_loader:
        logits = model(inputs.to(device))
        all_preds += logits.argmax(dim=1).cpu().tolist()
        all_labels += targets.tolist()

# Save the trained weights (state dict only) to disk
checkpoint_path = "bilstm_attention_model.pt"
torch.save(model.state_dict(), checkpoint_path)
print("모델이 저장되었습니다.")

# Offer the saved checkpoint as a browser download from the Colab runtime
from google.colab import files
files.download(checkpoint_path)


# Print per-class performance.
# The full set of encoded label ids is passed explicitly so the report always
# lines up with target_names; without it, a class that is absent from both the
# validation labels and the predictions makes the name count mismatch and
# classification_report raises.
print(classification_report(
    all_labels,
    all_preds,
    labels=list(range(len(label_encoder.classes_))),
    target_names=label_encoder.classes_,
))


Epoch 1, Loss: 2163.6709
Epoch 2, Loss: 1614.8473
Epoch 3, Loss: 1172.2204
Epoch 4, Loss: 680.6844
Epoch 5, Loss: 300.8119
Epoch 6, Loss: 128.9843
Epoch 7, Loss: 61.0640
Epoch 8, Loss: 52.4640
Epoch 9, Loss: 45.5374
Epoch 10, Loss: 51.1412
Epoch 11, Loss: 26.2518
Epoch 12, Loss: 23.8283
Epoch 13, Loss: 27.8046
Epoch 14, Loss: 13.4079
Epoch 15, Loss: 21.3356
Epoch 16, Loss: 25.3039
Epoch 17, Loss: 12.2397
Epoch 18, Loss: 11.2412
Epoch 19, Loss: 20.7171
Epoch 20, Loss: 14.1984
모델이 저장되었습니다.


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

              precision    recall  f1-score   support

          기쁨       0.93      0.84      0.88      1213
          당황       0.61      0.71      0.66      1048
          분노       0.65      0.70      0.67      1257
          불안       0.69      0.67      0.68      1113
          상처       0.62      0.58      0.60      1007
          슬픔       0.68      0.64      0.66      1003

    accuracy                           0.70      6641
   macro avg       0.70      0.69      0.69      6641
weighted avg       0.70      0.70      0.70      6641



In [None]:
# Rebuild the architecture with the same hyperparameters used for training
model = BiLSTMAttentionModel(
    vocab_size=len(vocab),
    embedding_dim=128,
    hidden_dim=128,
    output_dim=len(label_encoder.classes_)
).to(device)

# Load the state dict. map_location remaps the stored tensors onto the current
# device, so a checkpoint saved on a GPU runtime also loads on a CPU-only one.
model.load_state_dict(torch.load("bilstm_attention_model.pt", map_location=device))
model.eval()  # switch to evaluation mode
print("모델이 로드되었습니다.")
