### [자연어처리] 텍스트 분류 모델 구현

- 데이터 셋 : 200개 한국어 뉴스 기사
- 라 벨 : 정치(0), 경제(1), 사회(2), 생활/문화(3), 세계(4), 기술/IT(5), 연예(6), 스포츠(7)

- 입력 받은 TEXT에 대한 기사 종류 분류 결과 출력까지 완성

- data: '../data/news_data_df.csv'
---
- 25로 padding 한 데이터

In [136]:
import pandas as pd
import numpy as np
import ast

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optima
from torch.utils.data import TensorDataset, DataLoader

from collections import Counter

In [3]:
DATA_PATH = '../data/news_data_df.csv'
LABEL_TRANSLATE = {0:'정치', 1:'경제', 2:'사회', 3:'생활/문화', 4:'세계',
                   5:'기술/IT', 6:'연예', 7:'스포츠'}

In [120]:
# model class
class SentenceClassifier(nn.Module):
    def __init__(self, n_vocab, hidden_dim, embedding_dim, n_layers, dropout=0.5, bidirectional=True, model_type='lstm'):
        super().__init__()
        
        self.embedding = nn.Embedding(
            num_embeddings=n_vocab,
            embedding_dim=embedding_dim,
            padding_idx=0
        )
        if model_type == 'rnn':
            self.model = nn.RNN(
                input_size=embedding_dim,
                hidden_size=hidden_dim,
                num_layers=n_layers,
                bidirectional=bidirectional,
                dropout=dropout,
                batch_first=True
            )
        elif model_type == 'lstm':
            self.model = nn.LSTM(
                input_size=embedding_dim,
                hidden_size=hidden_dim,
                num_layers=n_layers,
                bidirectional=bidirectional,
                dropout=dropout,
                batch_first=True,
                proj_size=0
            )
        if bidirectional:
            self.classifier = nn.Sequential(
                nn.Linear(hidden_dim*2, 30),
                nn.ReLU(),
                nn.Linear(30, 8)
            )
            
        else:
            self.classifier = nn.Sequential(
                nn.Linear(hidden_dim, 30),
                nn.ReLU(),
                nn.Linear(30, 8)
            )
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, inputs):
        embeddings = self.embedding(inputs)
        output, _ = self.model(embeddings)
        # 전체 시퀀스에서 마지막 시점만 선택하거나 다른 방법을 사용
        if len(output.shape) == 3:
            last_output = output[:, -1, :]  # 시퀀스의 마지막 출력을 선택
        else:
            last_output = output  # 2D 텐서인 경우 전체 출력을 사용
        last_output = self.dropout(last_output)
        logits = self.classifier(last_output)
        return logits

In [157]:
# load data
data_df = pd.read_csv(DATA_PATH, encoding='utf-8')

# data_df['padding_25'].astype('Int32')

train_data = data_df.sample(frac=0.9, random_state=9)
test_data = data_df.drop(train_data.index)

print(train_data.head().to_markdown())
print(f"train size: {len(train_data)}")
print(f"test size: {len(test_data)}")

|      | text                                                                                                                                                                                                                                      |   label | del_stop                                                                                                                                                                                                                                                                                 | encoding                                                                                                                                                                             | padding_25                                                                                                                             | padding_15                                                                    | padding_10                                           |
|-----:

In [158]:
train_ids = train_data.padding_25.apply(lambda x: ast.literal_eval(x)).to_list()
test_ids = test_data.padding_25.apply(lambda x: ast.literal_eval(x)).to_list()

train_ids = np.array(train_ids, dtype=np.int32)
test_ids = np.array(test_ids, dtype=np.int32)

print(train_ids[0])
print(test_ids[0])

[   3   18 8902 1607  516 2859 4477 4478 4476 8903  136   70  347 4466
  136  255   49 1569    0    0    0    0    0    0    0]
[4714 1602 4715  232  627 1603 1055  276  232 2113  733 1604  404 1056
    0    0    0    0    0    0    0    0    0    0    0]


In [159]:
# apply dataloader
train_ids = torch.tensor(train_ids)
test_ids = torch.tensor(test_ids)

train_labels = torch.tensor(train_data.label.values, dtype=torch.float32)
test_labels = torch.tensor(test_data.label.values, dtype=torch.float32)

train_datasets = TensorDataset(train_ids, train_labels)
test_datasets = TensorDataset(test_ids, test_labels)

train_loader = DataLoader(train_datasets, batch_size=16, shuffle=True)
test_loader = DataLoader(test_datasets, batch_size=16, shuffle=False)

In [155]:
# model train
def train(model, datasets, criterion, optimizer, interval):
    model.train()
    losses = list()
    
    for step, (input_ids, labels) in enumerate(datasets):
        input_ids = input_ids
        labels = labels.long()
        
        logits = model(input_ids)
        loss = criterion(logits, labels)
        losses.append(loss.item())
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        if step % interval == 0:
            print(f"train loss {step}: {np.mean(losses)}")

In [156]:
# model test
def test(model, datasets, criterion):
    model.eval()
    losses = list()
    corrects = list()
    
    for step, (input_ids, labels) in enumerate(datasets):
        input_ids = input_ids
        labels = labels.long()
        
        logits = model(input_ids)
        loss = criterion(logits, labels)
        losses.append(loss.item())
        yhat = torch.argmax(logits, dim=1)
        corrects.extend(torch.eq(yhat, labels).cpu().tolist())
    print(f"validation loss: {np.mean(losses)}, accuracy: {np.mean(corrects)*100:.6f} %")


In [121]:
# loss function & optimizer
n_vocab = 9783
hidden_dim = 64
embedding_dim = 128
n_layer = 2

device = 'cuda' if torch.cuda.is_available() else 'cpu'
classifier = SentenceClassifier(n_vocab, hidden_dim, embedding_dim, n_layer, dropout=0.25)
criterion = nn.CrossEntropyLoss()
optimizer = optima.RMSprop(classifier.parameters(), lr=0.001)

In [122]:
epochs = 100
interval = 500

for epoch in range(1, epochs+1):
    print(f"[epoch] - {epoch}")
    train(classifier, train_loader, criterion, optimizer, device, interval)
    test(classifier, test_loader, criterion, device)
    print()

[epoch] - 1
train loss 0: 2.0753889083862305
validation loss: 1.9075286388397217, accuracy: 19.469027 %

[epoch] - 2
train loss 0: 1.7629797458648682
validation loss: 1.9197792530059814, accuracy: 19.469027 %

[epoch] - 3
train loss 0: 2.0568859577178955
validation loss: 1.856026824315389, accuracy: 23.893805 %

[epoch] - 4
train loss 0: 1.5607961416244507
validation loss: 1.9669328927993774, accuracy: 23.451327 %

[epoch] - 5
train loss 0: 1.6960220336914062
validation loss: 1.9390289704004924, accuracy: 29.646018 %

[epoch] - 6
train loss 0: 1.2119064331054688
validation loss: 1.8548398852348327, accuracy: 29.646018 %

[epoch] - 7
train loss 0: 1.225199580192566
validation loss: 1.8610042889912923, accuracy: 29.203540 %

[epoch] - 8
train loss 0: 1.2493815422058105
validation loss: 1.8222648700078328, accuracy: 29.203540 %

[epoch] - 9
train loss 0: 1.276947259902954
validation loss: 1.908656402428945, accuracy: 34.070796 %

[epoch] - 10
train loss 0: 1.3227410316467285
validation lo

- 음... 성능 구데기..


In [161]:
predct = classifier(test_ids[10])
predct = F.softmax(predct, dim=1)
predct = torch.argmax(predct, dim=1)
predct = LABEL_TRANSLATE[torch.argmax(predct).item()]
label = LABEL_TRANSLATE[int(test_labels[10])]

print(f"real label: {label}")
print(f"predct label: {predct}")
print(f"news sentence:\n{test_data.text.iloc[10]}")

real label: 정치
predct label: 정치
news sentence:
홍준표 배현진 송파 압승 확신 실제 당선 가능성은자유한국당 배현진 전 앵커가 홍준표 대표를 등에 업고 서울 송파을 국회의원 보궐선거에 공식 출사표를 던졌다 
