In [1]:
!pip install kss
import os
import json
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import classification_report
import kss

Collecting kss
  Downloading kss-6.0.4.tar.gz (1.1 MB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.1 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━[0m [32m0.6/1.1 MB[0m [31m18.9 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m21.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting emoji==1.2.0 (from kss)
  Downloading emoji-1.2.0-py3-none-any.whl.metadata (4.3 kB)
Collecting pecab (from kss)
  Downloading pecab-1.0.8.tar.gz (26.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m26.4/26.4 MB[0m [31m60.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting jamo (from kss)
  Downloading jamo-0.4.1-py3-none-any.whl.metadata (2.3 kB)
Collecting hangul-jamo (from kss)
  Downloading hangul_jamo-1.0.1-py3-none-any.whl.

In [3]:
import zipfile

zip_path = '/content/dataset.zip'
extract_path = '/content/dataset'

# Make sure the destination directory exists, then unpack the whole archive into it.
os.makedirs(extract_path, exist_ok=True)
with zipfile.ZipFile(zip_path, mode='r') as zip_ref:
    zip_ref.extractall(path=extract_path)

In [4]:
# ===== Data loading =====
# Root directory containing the 'Training', 'Validation' and 'Test' sub-folders.
base_dir = '/content/dataset/dataset'

def load_json_folder(folder):
    """Collect the ``sourceDataInfo`` records of every ``*.json`` file in ``folder``.

    Each file may hold either a single JSON object or a JSON array of objects;
    in both cases the ``'sourceDataInfo'`` entry of each object becomes one row.
    Files are visited in sorted filename order so the resulting row order is
    reproducible across machines and runs.

    Args:
        folder: Path of the directory to scan (not recursive).

    Returns:
        pandas.DataFrame with one row per ``sourceDataInfo`` record.

    Raises:
        KeyError: If an object has no ``'sourceDataInfo'`` key.
    """
    records = []
    # os.listdir() yields entries in arbitrary order; sort for determinism.
    for fname in sorted(os.listdir(folder)):
        if not fname.endswith('.json'):
            continue
        with open(os.path.join(folder, fname), encoding='utf-8') as f:
            item = json.load(f)
        # The file is closed at this point; only the parsed payload is needed.
        if isinstance(item, dict):
            records.append(item['sourceDataInfo'])
        elif isinstance(item, list):
            records.extend(d['sourceDataInfo'] for d in item)
    return pd.DataFrame(records)

# Load the three dataset splits, in order, from their sub-folders of base_dir.
train_df, valid_df, test_df = (
    load_json_folder(os.path.join(base_dir, split))
    for split in ('Training', 'Validation', 'Test')
)

In [None]:
# ===== HAN preprocessing =====
SENT_MAXLEN = 16  # sentences kept per document
WORD_MAXLEN = 64  # whitespace tokens kept per sentence

# Build the vocabulary from "<title>. <content>" of the training and validation frames.
# Index 0 is reserved for padding and index 1 for unknown tokens; every other
# token receives the next free index in order of first appearance.
vocab = {'<PAD>': 0, '<UNK>': 1}
all_texts = pd.concat([
    frame['newsTitle'] + '. ' + frame['newsContent']
    for frame in (train_df, valid_df)
])
for doc in all_texts:
    for sent in kss.split_sentences(str(doc)):
        for word in sent.split():
            # setdefault only inserts unseen tokens, so existing indices never change.
            vocab.setdefault(word, len(vocab))
VOCAB_SIZE = len(vocab)

def encode_korean(text):
    """Turn a document into a fixed-size grid of vocabulary indices.

    The text is split into sentences with ``kss.split_sentences`` and each
    sentence into whitespace tokens. At most ``SENT_MAXLEN`` sentences and
    ``WORD_MAXLEN`` tokens per sentence are kept; tokens missing from ``vocab``
    map to 1 and all unused slots are filled with 0.

    Args:
        text: Document text; it is passed through ``str()`` before splitting.

    Returns:
        ``torch.long`` tensor of shape ``(SENT_MAXLEN, WORD_MAXLEN)``.
    """
    rows = []
    for sentence in kss.split_sentences(str(text))[:SENT_MAXLEN]:
        tokens = str(sentence).split()[:WORD_MAXLEN]
        ids = [vocab.get(token, 1) for token in tokens]
        # Right-pad the sentence to the fixed width.
        rows.append(ids + [0] * (WORD_MAXLEN - len(ids)))
    # Append all-padding sentences until the document has the fixed height.
    rows += [[0] * WORD_MAXLEN for _ in range(SENT_MAXLEN - len(rows))]
    return torch.tensor(rows, dtype=torch.long)

class HANDNewsDataset(Dataset):
    """Dataset yielding ``(encoded document, label)`` pairs from a news DataFrame.

    The document text is ``newsTitle + '. ' + newsContent``; the label is read
    from the ``useType`` column. Encoding happens lazily in ``__getitem__``.
    """

    def __init__(self, df):
        """Store the joined texts and the labels as plain Python lists.

        Raises:
            KeyError: If ``df`` has no ``'useType'`` column.
        """
        combined = df['newsTitle'] + '. ' + df['newsContent']
        self.texts = combined.tolist()
        if 'useType' not in df.columns:
            raise KeyError("'useType' 컬럼이 데이터프레임에 없습니다.")
        self.labels = df['useType'].tolist()

    def __len__(self):
        """Number of samples (one per label)."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the encoded document tensor and its ``torch.long`` label tensor."""
        return (
            encode_korean(self.texts[idx]),
            torch.tensor(self.labels[idx], dtype=torch.long),
        )

# Wrap each split's DataFrame in a dataset, in train / valid / test order.
trainset, validset, testset = (
    HANDNewsDataset(frame) for frame in (train_df, valid_df, test_df)
)




In [None]:
# ===== HAN 모델 정의 =====
class WordAttention(nn.Module):
    """Bidirectional GRU over word embeddings followed by attention pooling.

    Every GRU output is projected with a linear layer and ``tanh``, scored
    against a learned context vector, and the scores are softmax-normalised
    along dim 1. The result is the attention-weighted sum of the GRU outputs.
    No padding mask is applied: every position along dim 1 takes part in the
    softmax.
    """

    def __init__(self, embed_size, hidden_size):
        super().__init__()
        enc_dim = hidden_size * 2  # forward and backward GRU states concatenated
        self.gru = nn.GRU(
            input_size=embed_size,
            hidden_size=hidden_size,
            batch_first=True,
            bidirectional=True,
        )
        self.fc = nn.Linear(enc_dim, enc_dim)
        self.context = nn.Parameter(torch.randn(enc_dim))

    def forward(self, x):
        """Pool ``x`` (batch-first, last dim ``embed_size``) along dim 1.

        Returns:
            Tensor whose last dimension is ``2 * hidden_size``.
        """
        states, _ = self.gru(x)
        scores = torch.tanh(self.fc(states)) @ self.context
        weights = torch.softmax(scores, dim=1)
        return (states * weights.unsqueeze(-1)).sum(dim=1)

class SentenceAttention(nn.Module):
    """Bidirectional GRU over sentence vectors followed by attention pooling.

    Takes inputs whose last dimension is ``2 * hidden_size`` and returns the
    attention-weighted sum of the GRU outputs along dim 1. The attention
    weights are a softmax over dim 1 of the dot product between a ``tanh``
    projection of each GRU output and a learned context vector. No mask is
    applied to the softmax.
    """

    def __init__(self, hidden_size):
        super().__init__()
        width = hidden_size * 2  # input width and bidirectional output width
        self.gru = nn.GRU(width, hidden_size, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(in_features=width, out_features=width)
        self.context = nn.Parameter(torch.randn(width))

    def forward(self, x):
        """Pool ``x`` (batch-first) along dim 1 into one vector per batch item."""
        encoded = self.gru(x)[0]
        projected = torch.tanh(self.fc(encoded))
        alpha = F.softmax(projected.matmul(self.context), dim=1)
        weighted = encoded * alpha.unsqueeze(-1)
        return weighted.sum(dim=1)

class HAN(nn.Module):
    """Hierarchical attention network: words -> sentence vectors -> document logits.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_size: Width of each word embedding.
        hidden_size: GRU hidden size per direction; encoded vectors are
            ``2 * hidden_size`` wide.
        num_classes: Number of output logits.
    """

    def __init__(self, vocab_size, embed_size=128, hidden_size=64, num_classes=2):
        super().__init__()
        # Index 0 is the padding token; its embedding row is not updated.
        self.embedding = nn.Embedding(vocab_size, embed_size, padding_idx=0)
        self.word_attn = WordAttention(embed_size, hidden_size)
        self.sen_attn = SentenceAttention(hidden_size)
        self.fc = nn.Linear(hidden_size * 2, num_classes)

    def forward(self, x):  # x: (B, S, W)
        """Classify a batch of documents given as token-index grids.

        Args:
            x: Integer tensor of shape ``(B, S, W)`` (batch, sentences, words).

        Returns:
            Logits of shape ``(B, num_classes)``.
        """
        B, S, W = x.shape
        # Encode all B*S sentences in a single call instead of looping over the
        # sentence axis: row b*S + s of the flattened batch is sentence s of
        # document b, so reshaping back gives the same (B, S, 2H) layout that
        # stacking per-sentence results along dim 1 would.
        words = self.embedding(x.reshape(B * S, W))
        sent_vecs = self.word_attn(words)
        s_mat = sent_vecs.reshape(B, S, sent_vecs.size(-1))
        doc_vec = self.sen_attn(s_mat)
        return self.fc(doc_vec)

In [None]:
# ===== Model initialisation and training setup =====
# Prefer the GPU when one is available.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

han_model = HAN(vocab_size=VOCAB_SIZE)
# Replicate the model across GPUs when more than one is visible.
if torch.cuda.device_count() >= 2:
    han_model = nn.DataParallel(han_model)
han_model = han_model.to(device)

# Only the training loader shuffles; validation and test keep dataset order.
train_loader = DataLoader(dataset=trainset, batch_size=32, shuffle=True)
valid_loader = DataLoader(dataset=validset, batch_size=32, shuffle=False)
test_loader = DataLoader(dataset=testset, batch_size=32, shuffle=False)

In [None]:
# ===== 학습 함수 =====
def train(model, loader, optimizer, device, epoch):
    """Run one pass over ``loader``, taking one optimiser step per batch.

    Args:
        model: Module mapping a batch of inputs to class logits.
        loader: Iterable of ``(inputs, labels)`` batches.
        optimizer: Optimiser that updates ``model``'s parameters.
        device: Device the batches are moved to before the forward pass.
        epoch: Epoch number, used only in the completion message.

    Returns:
        None. Prints ``"Epoch {epoch} completed."`` when the pass ends.
    """
    criterion = nn.CrossEntropyLoss()
    model.train()
    for inputs, labels in loader:
        inputs = inputs.to(device)
        labels = labels.to(device)
        # Clear gradients left over from the previous step before backprop.
        optimizer.zero_grad()
        loss = criterion(model(inputs), labels)
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch} completed.")

In [None]:
# ===== 평가 함수 =====
def evaluate(model, loader, device):
    """Print a classification report for ``model`` over every batch in ``loader``.

    The model is switched to eval mode and run without gradient tracking; the
    predicted class is the argmax of the logits over the last dimension.

    Args:
        model: Module mapping a batch of inputs to class logits.
        loader: Iterable of ``(inputs, labels)`` batches.
        device: Device the inputs are moved to before the forward pass.

    Returns:
        None. The report (4 decimal digits) is printed.
    """
    model.eval()
    y_true, y_pred = [], []

    with torch.no_grad():
        for inputs, labels in loader:
            logits = model(inputs.to(device))
            y_pred.extend(torch.argmax(logits, dim=-1).cpu().numpy())
            y_true.extend(labels.cpu().numpy())

    print(classification_report(y_true, y_pred, digits=4))

In [None]:
# ===== Training loop =====
optimizer = torch.optim.Adam(params=han_model.parameters(), lr=1e-5)

# Epochs 1 through 5: one training pass, then a validation report.
for epoch in range(1, 5 + 1):
    train(model=han_model, loader=train_loader, optimizer=optimizer, device=device, epoch=epoch)
    evaluate(model=han_model, loader=valid_loader, device=device)

Epoch 1 completed.
              precision    recall  f1-score   support

           0     0.8223    0.7855    0.8035      4000
           1     0.7947    0.8303    0.8121      4000

    accuracy                         0.8079      8000
   macro avg     0.8085    0.8079    0.8078      8000
weighted avg     0.8085    0.8079    0.8078      8000

Epoch 2 completed.
              precision    recall  f1-score   support

           0     0.8710    0.8085    0.8386      4000
           1     0.8213    0.8802    0.8498      4000

    accuracy                         0.8444      8000
   macro avg     0.8462    0.8444    0.8442      8000
weighted avg     0.8462    0.8444    0.8442      8000

Epoch 3 completed.
              precision    recall  f1-score   support

           0     0.8995    0.8455    0.8716      4000
           1     0.8542    0.9055    0.8791      4000

    accuracy                         0.8755      8000
   macro avg     0.8769    0.8755    0.8754      8000
weighted avg     

In [None]:
# ===== 예측 함수 (뉴스 제목 + 본문 입력) =====
def predict_news(title, content):
    """Predict the class index for a single news article.

    The article is encoded from ``"{title}. {content}"``, the same title/content
    joining used for the datasets, and run through ``han_model`` in eval mode
    without gradient tracking.

    Args:
        title: News title.
        content: News body.

    Returns:
        int: Index of the highest-scoring class.
    """
    han_model.eval()
    with torch.no_grad():
        batch = encode_korean(f"{title}. {content}").unsqueeze(0).to(device)  # (1, S, W)
        logits = han_model(batch)
    return logits.argmax(dim=-1).item()

In [None]:
# ===== 모델 저장 =====
import pickle

# Bundle the weights, vocabulary and hyper-parameters into one dictionary.
# A wrapped model (e.g. nn.DataParallel) is unwrapped through its `.module` attribute.
save_data = dict(
    model_state=getattr(han_model, 'module', han_model).state_dict(),
    vocab=vocab,
    config=dict(
        SENT_MAXLEN=SENT_MAXLEN,
        WORD_MAXLEN=WORD_MAXLEN,
        num_classes=2,
        embed_size=128,
        hidden_size=64,
    ),
)

# Write the bundle to disk.
with open('han_model.pkl', mode='wb') as f:
    pickle.dump(save_data, f)

In [None]:
# ===== CPU 모드로 모델 저장 =====
def save_model_cpu_mode(model, vocab, filename='han_model_cpu.pkl'):
    """Pickle the model weights (moved to CPU), the vocabulary and the config.

    The saved dictionary has the keys ``'model_state'``, ``'vocab'`` and
    ``'config'``. The layer sizes in ``config`` are read from the model's own
    layers so they always match the saved weights; when a layer is missing,
    the defaults 2 / 128 / 64 are used.

    Args:
        model: The model to save; a wrapper exposing ``.module`` (such as
            ``nn.DataParallel``) is unwrapped first.
        vocab: Token-to-index mapping stored alongside the weights.
        filename: Path of the pickle file to write.
    """
    core = model.module if hasattr(model, 'module') else model

    def _dim(path, default):
        # Walk a dotted attribute path on the unwrapped model; fall back to
        # `default` as soon as one attribute along the path is missing.
        obj = core
        for attr in path.split('.'):
            obj = getattr(obj, attr, None)
            if obj is None:
                return default
        return obj

    # Extract the model configuration parameters.
    config = {
        'SENT_MAXLEN': SENT_MAXLEN,
        'WORD_MAXLEN': WORD_MAXLEN,
        'num_classes': _dim('fc.out_features', 2),
        'embed_size': _dim('embedding.embedding_dim', 128),
        'hidden_size': _dim('word_attn.gru.hidden_size', 64)
    }

    # Move every tensor of the state dict to the CPU. state_dict() returns a
    # fresh mapping, so replacing its values does not touch the live model.
    model_state_cpu = core.state_dict()
    for key in model_state_cpu:
        model_state_cpu[key] = model_state_cpu[key].cpu()

    save_data = {
        'model_state': model_state_cpu,
        'vocab': vocab,
        'config': config
    }

    with open(filename, 'wb') as f:
        pickle.dump(save_data, f)

# Run the save (config parameter removed)
save_model_cpu_mode(model=han_model, vocab=vocab)