# In [None]:

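# ✅ Install if needed (the PyPI package is `scikit-learn`; `sklearn` is only the import name)
# !pip install konlpy torch scikit-learn pandas tqdm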

import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from tqdm import tqdm
from konlpy.tag import Okt
from collections import Counter
import numpy as np

# Load the data.
# NOTE: dropna() without `subset=` removes a row when ANY of its columns is missing.
df = pd.read_csv("lstm_reviews_dataset.csv")
df = df.dropna()

# Tokenize every review with KoNLPy's Okt tagger (str() guards non-string cells).
tokenizer = Okt()
df['tokens'] = df['review_content'].apply(lambda x: tokenizer.morphs(str(x)))

# Build the vocabulary.
# Flatten with a comprehension instead of sum(lists, []): sum() re-copies the
# growing accumulator on every addition, which is quadratic in the token count.
all_tokens = [token for tokens in df['tokens'] for token in tokens]
vocab_counter = Counter(all_tokens)

# Ids: 0 = <PAD>, 1 = <UNK>, then 2, 3, ... for corpus words in first-seen order.
# The special tokens are placed first and dict.fromkeys() de-duplicates, so the
# ids stay contiguous (0 .. len(vocab) - 1) even if a corpus token happens to be
# spelled exactly like a special token. That keeps len(vocab) a valid
# embedding-table size.
vocab = {
    word: idx
    for idx, word in enumerate(dict.fromkeys(['<PAD>', '<UNK>'] + list(vocab_counter)))
}

# Convert tokens to vocabulary ids, then truncate / right-pad to a fixed length
max_len = 50


def encode(tokens):
    """Return exactly ``max_len`` vocabulary ids for *tokens*.

    Tokens missing from ``vocab`` map to id 1 (the ``<UNK>`` entry). Sequences
    longer than ``max_len`` are cut off at the end; shorter ones are padded on
    the right with id 0 (the ``<PAD>`` entry).
    """
    encoded = [vocab.get(token, 1) for token in tokens][:max_len]
    padding = [0] * (max_len - len(encoded))
    return encoded + padding


df['input_ids'] = df['tokens'].apply(encode)

# Dataset
class ReviewDataset(Dataset):
    """In-memory dataset pairing encoded reviews with their class labels.

    Args:
        input_ids: One sequence of token ids per review. Everything is converted
            with a single ``torch.tensor`` call, so the sequences need a common
            length.
        labels: One integer class label per review.

    Raises:
        ValueError: If ``input_ids`` and ``labels`` hold a different number of
            samples.
    """

    def __init__(self, input_ids, labels):
        self.x = torch.tensor(input_ids, dtype=torch.long)
        self.y = torch.tensor(labels, dtype=torch.long)
        # Fail here rather than with an IndexError in the middle of an epoch
        # (or by silently ignoring surplus labels).
        if len(self.x) != len(self.y):
            raise ValueError(
                f"input_ids and labels must have the same number of samples, "
                f"got {len(self.x)} and {len(self.y)}"
            )

    def __len__(self):
        """Return the number of samples."""
        return len(self.x)

    def __getitem__(self, idx):
        """Return the ``(token_ids, label)`` tensor pair stored at ``idx``."""
        return self.x[idx], self.y[idx]

# Hold out 20% of the reviews for evaluation; the fixed seed makes the split repeatable.
features = df['input_ids'].tolist()
targets = df['label'].tolist()
train_x, test_x, train_y, test_y = train_test_split(
    features, targets, test_size=0.2, random_state=42,
)

train_dataset, test_dataset = ReviewDataset(train_x, train_y), ReviewDataset(test_x, test_y)

# Only the training batches are shuffled; evaluation keeps the dataset order.
batch_size = 64
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# LSTM model
class LSTMClassifier(nn.Module):
    """Embedding -> single-layer LSTM -> linear layer over the final hidden state.

    Id 0 is the padding id (``padding_idx=0``), and padding is assumed to be
    trailing, which is how ``encode`` in this file pads. The LSTM is run on a
    packed sequence so that the hidden state handed to the classifier is the
    one after the last real token, not after the trailing padding.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_dim: Size of each token embedding.
        hidden_dim: Size of the LSTM hidden state.
        output_dim: Number of output classes (logits per sample).
    """

    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Return class logits for a batch of right-padded token-id sequences.

        Args:
            x: Long tensor of token ids, ``(batch, seq_len)``; a single
                unbatched ``(seq_len,)`` sequence is accepted as well.

        Returns:
            Logits of shape ``(batch, output_dim)`` (``(output_dim,)`` for
            unbatched input).
        """
        if x.dim() == 1:
            # Keep supporting a single unbatched sequence.
            return self.forward(x.unsqueeze(0)).squeeze(0)

        embedded = self.embedding(x)
        # Real length of each row = number of non-pad ids. Without this the
        # LSTM keeps stepping through the pad positions and hidden[-1] depends
        # on how much padding follows the text. Rows that are all padding are
        # clamped to length 1 because packing rejects zero-length sequences.
        # pack_padded_sequence requires the lengths on the CPU.
        lengths = (x != self.embedding.padding_idx).sum(dim=1).clamp(min=1).cpu()
        packed = nn.utils.rnn.pack_padded_sequence(
            embedded, lengths, batch_first=True, enforce_sorted=False
        )
        # For packed input, h_n holds each sequence's state at its own last
        # step, returned in the original batch order.
        _, (hidden, _) = self.lstm(packed)
        return self.fc(hidden[-1])

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Three output logits, one per class.
model = LSTMClassifier(
    vocab_size=len(vocab),
    embed_dim=100,
    hidden_dim=128,
    output_dim=3,
)
model = model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# Training
num_epochs = 3
for epoch in range(num_epochs):
    model.train()
    # Sum (not mean) of the per-batch losses over the epoch.
    total_loss = 0
    progress = tqdm(train_loader)
    for inputs, targets in progress:
        inputs = inputs.to(device)
        targets = targets.to(device)

        # Standard step: clear old gradients, forward, backward, update.
        optimizer.zero_grad()
        logits = model(inputs)
        loss = criterion(logits, targets)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}")

# Evaluation
model.eval()
all_preds, all_labels = [], []
with torch.no_grad():  # no gradients are needed for inference
    for x_batch, y_batch in test_loader:
        x_batch = x_batch.to(device)
        outputs = model(x_batch)
        # Predicted class = index of the largest logit; move to CPU for sklearn.
        preds = torch.argmax(outputs, dim=1).cpu().numpy()
        all_preds.extend(preds)
        all_labels.extend(y_batch.numpy())

# Pass `labels` explicitly: without it sklearn infers the classes from the data,
# and raises when fewer than three distinct classes appear in the truth and
# predictions combined, because `target_names` would no longer match in length.
# Class ids 0/1/2 follow from output_dim=3; the id -> name order is taken from
# the original target_names list (confirm against how the CSV was labelled).
print(classification_report(
    all_labels,
    all_preds,
    labels=[0, 1, 2],
    target_names=["부정", "중립", "긍정"],
))
