In [18]:
# 문장 분류 모델
from torch import nn

class SentenceClassifier(nn.Module):
    """Binary sentence classifier: embedding -> recurrent encoder -> linear head.

    Args:
        n_vocab: Number of rows in the embedding table (vocabulary size).
        hidden_dim: Hidden size of the recurrent layer, per direction.
        embedding_dim: Size of each token embedding vector.
        n_layers: Number of stacked recurrent layers.
        dropout: Dropout probability, used between stacked recurrent layers
            and on the features fed to the final linear layer.
        bidirectional: If True the recurrent layer runs in both directions and
            the linear head takes ``hidden_dim * 2`` input features.
        model_type: ``'rnn'`` selects ``nn.RNN``; ``'lstm'`` selects ``nn.LSTM``.

    Raises:
        ValueError: If ``model_type`` is neither ``'rnn'`` nor ``'lstm'``.
    """

    def __init__(self,
                 n_vocab,
                 hidden_dim,
                 embedding_dim,
                 n_layers,
                 dropout=0.5,
                 bidirectional=True,
                 model_type='lstm'
                 ):
        super().__init__()

        # Index 0 is the padding token; its embedding receives no gradient.
        self.embedding = nn.Embedding(
            num_embeddings=n_vocab,
            embedding_dim=embedding_dim,
            padding_idx=0
        )

        recurrent_layers = {'rnn': nn.RNN, 'lstm': nn.LSTM}
        if model_type not in recurrent_layers:
            # Fail at construction time instead of leaving `self.model` unset
            # and hitting an AttributeError on the first forward pass.
            raise ValueError(
                f"model_type must be one of {sorted(recurrent_layers)}, got {model_type!r}"
            )
        self.model = recurrent_layers[model_type](
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=n_layers,
            bidirectional=bidirectional,
            # Recurrent dropout only acts between stacked layers; passing a
            # non-zero value with a single layer has no effect and only warns.
            dropout=dropout if n_layers > 1 else 0.0,
            batch_first=True,
        )

        # A bidirectional encoder concatenates both directions' features.
        classifier_in = hidden_dim * 2 if bidirectional else hidden_dim
        self.classifier = nn.Linear(classifier_in, 1)
        self.dropout = nn.Dropout(dropout)

    def forward(self, inputs):
        """Return one raw logit per input sequence.

        Args:
            inputs: Integer token ids; the recurrent layer is batch-first, so
                the expected layout is (batch, sequence).

        Returns:
            Tensor of logits with a trailing dimension of size 1.
        """
        embeddings = self.embedding(inputs)
        output, _ = self.model(embeddings)
        # Use the encoder output at the final time step as the sentence feature.
        # NOTE(review): for right-padded batches this position is usually padding.
        last_output = output[:, -1, :]
        last_output = self.dropout(last_output)
        logits = self.classifier(last_output)
        return logits

In [1]:
# 데이터세트 불러오기

import pandas as pd
from Korpora import Korpora

In [2]:
# Load the NSMC (Naver sentiment movie corpus) through Korpora and build a
# DataFrame from its `test` split only.
corpus = Korpora.load('nsmc')
corpus_df = pd.DataFrame(corpus.test)


    Korpora 는 다른 분들이 연구 목적으로 공유해주신 말뭉치들을
    손쉽게 다운로드, 사용할 수 있는 기능만을 제공합니다.

    말뭉치들을 공유해 주신 분들에게 감사드리며, 각 말뭉치 별 설명과 라이센스를 공유 드립니다.
    해당 말뭉치에 대해 자세히 알고 싶으신 분은 아래의 description 을 참고,
    해당 말뭉치를 연구/상용의 목적으로 이용하실 때에는 아래의 라이센스를 참고해 주시기 바랍니다.

    # Description
    Author : e9t@github
    Repository : https://github.com/e9t/nsmc
    References : www.lucypark.kr/docs/2015-pyconkr/#39

    Naver sentiment movie corpus v1.0
    This is a movie review dataset in the Korean language.
    Reviews were scraped from Naver Movies.

    The dataset construction is based on the method noted in
    [Large movie review dataset][^1] from Maas et al., 2011.

    [^1]: http://ai.stanford.edu/~amaas/data/sentiment/

    # License
    CC0 1.0 Universal (CC0 1.0) Public Domain Dedication
    Details in https://creativecommons.org/publicdomain/zero/1.0/

[Korpora] Corpus `nsmc` is already installed at C:\Users\KDP-50\Korpora\nsmc\ratings_train.txt
[Korpora] Corpus `nsmc` is already installed at C:\Users\KD

In [3]:
# Sample 90% of the rows for training (fixed seed for reproducibility) and keep
# the rows whose index was not sampled as the held-out test set.
# NOTE: the names `train` and `test` are rebound to functions in later cells,
# so every cell that reads these DataFrames must run before those definitions.
train = corpus_df.sample(frac=0.9, random_state=42)
test = corpus_df.drop(train.index)

In [5]:
# Preview the first five training rows as a markdown table and report split sizes.
print(train.head(5).to_markdown())
print("Training Data Size :", len(train))
print("Testing Data Size :", len(test))

|       | text                                                                                     |   label |
|------:|:-----------------------------------------------------------------------------------------|--------:|
| 33553 | 모든 편견을 날려 버리는 가슴 따뜻한 영화. 로버트 드 니로, 필립 세이모어 호프만 영원하라. |       1 |
|  9427 | 무한 리메이크의 소재. 감독의 역량은 항상 그 자리에...                                    |       0 |
|   199 | 신날 것 없는 애니.                                                                       |       0 |
| 12447 | 잔잔 격동                                                                                |       1 |
| 39489 | 오랜만에 찾은 주말의 명화의 보석                                                         |       1 |
Training Data Size : 45000
Testing Data Size : 5000


In [6]:
# 데이터 토큰화 및 단어 사전 구축

from konlpy.tag import Okt
from collections import Counter

In [7]:
def build_vocab(corpus, n_vocab, special_tokens):
    """Build a vocabulary list: special tokens first, then the most frequent tokens.

    Args:
        corpus: Iterable of token sequences (one sequence per document).
        n_vocab: Maximum number of corpus tokens to keep, most frequent first.
        special_tokens: Tokens placed at the front of the vocabulary
            (e.g. padding / unknown markers). The argument is not modified.

    Returns:
        A new list holding ``special_tokens`` followed by up to ``n_vocab``
        corpus tokens in descending frequency order.
    """
    counter = Counter()
    for tokens in corpus:
        counter.update(tokens)
    # Copy so the caller's list is never mutated (and any iterable is accepted).
    vocab = list(special_tokens)
    vocab.extend(token for token, _ in counter.most_common(n_vocab))
    return vocab

In [8]:
# Split every review text into morphemes with KoNLPy's Okt tagger.
tokenizer = Okt()
train_tokens = [tokenizer.morphs(review) for review in train.text]
test_tokens = [tokenizer.morphs(review) for review in test.text]

In [9]:
# Vocabulary built from the training tokens only: two special tokens followed
# by the 5000 most frequent morphemes, plus lookup tables in both directions.
vocab = build_vocab(corpus=train_tokens, n_vocab=5000, special_tokens=["<pad>", "<unk>"])
token_to_id = dict(zip(vocab, range(len(vocab))))
id_to_token = dict(enumerate(vocab))

In [10]:
# Show the first ten vocabulary entries and the total vocabulary size.
print(vocab[:10])
print(len(vocab))

['<pad>', '<unk>', '.', '이', '영화', '의', '..', '가', '에', '...']
5002


In [11]:
# 정수 인코딩 및 패딩

import numpy as np

def pad_sequences(sequences, max_length, pad_value):
    """Truncate or right-pad every sequence to exactly ``max_length`` items.

    Args:
        sequences: Iterable of lists of token ids.
        max_length: Target length of each output row.
        pad_value: Value appended to sequences shorter than ``max_length``.

    Returns:
        ``numpy`` array built from the fixed-length rows.
    """
    def _fit(seq):
        # Clip first, then fill whatever room is left with the pad value.
        clipped = seq[:max_length]
        return clipped + [pad_value] * (max_length - len(clipped))

    return np.asarray([_fit(seq) for seq in sequences])

In [12]:
# Map every token to its vocabulary id; tokens missing from the vocabulary
# fall back to the id of "<unk>".
unk_id = token_to_id["<unk>"]
train_ids, test_ids = (
    [[token_to_id.get(token, unk_id) for token in review] for review in reviews]
    for reviews in (train_tokens, test_tokens)
)

In [13]:
# Force every encoded review to a fixed length of 32 ids, truncating longer
# reviews and filling shorter ones with the "<pad>" id.
max_length = 32
pad_id = token_to_id['<pad>']
train_ids = pad_sequences(train_ids, max_length, pad_id)
test_ids = pad_sequences(test_ids, max_length, pad_id)

In [14]:
# Inspect the first padded id sequence of each split.
print(train_ids[0])
print(test_ids[0])

[ 223 1716   10 4036 2095  193  755    4    2 2330 1031  220   26   13
 4839    1    1    1    2    0    0    0    0    0    0    0    0    0
    0    0    0    0]
[3307    5 1997  456    8    1 1013 3906    5    1    1   13  223   51
    3    1 4684    6    0    0    0    0    0    0    0    0    0    0
    0    0    0    0]


In [15]:
# 데이터로더 적용

import torch
from torch.utils.data import TensorDataset, DataLoader

# Convert the padded id arrays to tensors. This rebinds `train_ids`/`test_ids`
# from NumPy arrays to tensors.
train_ids = torch.tensor(train_ids)
test_ids = torch.tensor(test_ids)

# Labels are stored as float32 to match the BCE-with-logits loss used later.
# NOTE(review): `lables` is a misspelling of `labels`; the names are kept as-is.
train_lables = torch.tensor(train.label.values, dtype=torch.float32)
test_lables = torch.tensor(test.label.values, dtype=torch.float32)

# Pair each id sequence with its label.
train_dataset = TensorDataset(train_ids, train_lables)
test_dataset = TensorDataset(test_ids, test_lables)

# Mini-batches of 16; only the training data is shuffled.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

In [19]:
# 손실 함수와 최적화 함수 정의

from torch import optim

# Model hyperparameters; the vocabulary size comes from the token lookup table.
n_vocab = len(token_to_id)
hidden_dim = 64
embedding_dim = 128
n_layers = 2

# Use the GPU when one is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Remaining constructor arguments keep their defaults (bidirectional LSTM, dropout 0.5).
classifier = SentenceClassifier(
    n_vocab=n_vocab, hidden_dim=hidden_dim, embedding_dim=embedding_dim, n_layers=n_layers
).to(device)
# The model outputs raw logits, so the sigmoid is folded into the loss.
criterion = nn.BCEWithLogitsLoss().to(device)
optimizer = optim.RMSprop(classifier.parameters(), lr=0.001)

In [20]:
# 모델 학습 및 테스트
def train(model, datasets, criterion, optimizer, device, interval):
    """Run one optimization pass over ``datasets`` and log the running mean loss.

    NOTE: defining this function rebinds the notebook-level name ``train``,
    which an earlier cell assigns to the training DataFrame.

    Args:
        model: Module called on each batch of input ids.
        datasets: Iterable yielding ``(input_ids, labels)`` batches.
        criterion: Loss callable applied as ``criterion(logits, labels)``.
        optimizer: Optimizer stepped once per batch.
        device: Device the batch tensors are moved to.
        interval: The running mean loss is printed every ``interval`` steps.
    """
    model.train()
    batch_losses = []

    for step, batch in enumerate(datasets):
        token_ids, targets = batch
        token_ids = token_ids.to(device)
        # Add a trailing dimension so the labels line up with the model output.
        targets = targets.to(device).unsqueeze(1)

        loss = criterion(model(token_ids), targets)
        batch_losses.append(loss.item())

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if step % interval != 0:
            continue
        # Mean over every batch seen so far in this pass.
        print(f'Train Loss {step}: {np.mean(batch_losses)}')

In [21]:
def test(model, datasets, criterion, device):
    model.eval()
    losses = list()
    corrects = list()

    for step, (input_ids, labels) in enumerate(datasets):
        input_ids = input_ids.to(device)
        labels = labels.to(device).unsqueeze(1)

        logits = model(input_ids)
        loss = criterion(logits, labels)
        losses.append(loss.item())
        yhat = torch.sigmoid(logits)>.5
        corrects.extend(
            torch.eq(yhat, labels).cpu().tolist()
        )
    print(f'Val Loss : {np.mean(losses)}, Val Accuracy : {np.mean(corrects)}')

In [23]:
# Train for five epochs, logging the running loss every 500 steps and
# evaluating on the held-out loader after each epoch.
epochs = 5
interval = 500
for epoch in range(epochs):
    train(classifier, train_loader, criterion, optimizer, device, interval)
    test(classifier, test_loader, criterion, device)

Train Loss 0: 0.7001947164535522
Train Loss 500: 0.693438629904193
Train Loss 1000: 0.692995163527402
Train Loss 1500: 0.6807639113987866
Train Loss 2000: 0.6570099064822438
Train Loss 2500: 0.6348229087099748
Val Loss : 0.5196410457071025, Val Accuracy : 0.761
Train Loss 0: 0.622211217880249
Train Loss 500: 0.4764801027056224
Train Loss 1000: 0.47221748003890585
Train Loss 1500: 0.46395730927119966
Train Loss 2000: 0.45882484196559126
Train Loss 2500: 0.45338339929948185
Val Loss : 0.42812847015195, Val Accuracy : 0.8006
Train Loss 0: 0.4823038578033447
Train Loss 500: 0.37774101536193055
Train Loss 1000: 0.37726656680221443
Train Loss 1500: 0.37932356872870715
Train Loss 2000: 0.37941422996700913
Train Loss 2500: 0.37917396082121674
Val Loss : 0.421879488105972, Val Accuracy : 0.796
Train Loss 0: 0.3029881417751312
Train Loss 500: 0.33133462168082983
Train Loss 1000: 0.33316914476714765
Train Loss 1500: 0.33145839197785754
Train Loss 2000: 0.33323383683535174
Train Loss 2500: 0.33664

In [24]:
# Extract the learned embedding vectors from the trained model
embedding_matrix = classifier.embedding.weight.detach().cpu().numpy()
# Pair each vocabulary token with its row of the embedding matrix.
token_to_embedding = dict(zip(vocab, embedding_matrix))

token = vocab[1000]
print(token, token_to_embedding[token])

보고싶다 [-2.7252326  -1.6364336   0.2682625   0.9603476   1.2904423  -2.8310812
 -0.44858474  0.53599757  0.51257706  0.12093402  1.0669913   1.146301
 -0.6921513   0.5708175   1.6692575  -1.5926312  -1.4043725   2.3508148
 -2.5604475   0.2849467  -2.2112677   0.92306584  0.14503534 -0.5120045
  1.6272874   0.02130163  0.10072117 -0.78453374  0.08679244 -1.0180925
  2.1305308  -0.53132004 -0.61496264  0.6868998  -0.91903543 -1.2498301
 -0.50619674  1.2894588  -1.0364957  -0.09542676  1.1538835   0.20951161
 -0.6741956  -0.6324141  -0.1256448  -0.7929636  -0.06957742 -0.72102255
 -0.2644245  -1.1001936   0.78355145  0.21204749  0.90505666 -1.6403633
  0.5900494  -0.47923645  0.84786457  0.27278078  0.7322763   1.2687445
  0.531476    0.8172104  -0.5445721   0.26276433  2.507312   -0.4791292
 -1.3700311   1.0872015   0.51680446  1.3758634   0.6327788  -0.5940596
  0.8857784  -0.88710934 -1.6362709  -0.17476794  0.04585332  0.14498466
 -0.7323526   0.02354389 -1.3686942  -0.8400328  -0.03826

In [28]:
# 사전 학습된 모델로 임베딩 계층 초기화
from gensim.models import Word2Vec

# Load the pretrained Word2Vec model and start from an all-zero embedding table.
word2vec = Word2Vec.load('../models/word2vec.model')
init_embeddings = np.zeros((n_vocab, embedding_dim))

for index, token in id_to_token.items():
    # '<pad>' and '<unk>' have no pretrained vector, and a vocabulary token may
    # be missing from the Word2Vec model; every such row stays zero instead of
    # raising a KeyError.
    if token not in ['<pad>', '<unk>'] and token in word2vec.wv:
        init_embeddings[index] = word2vec.wv[token]

# Embedding layer initialised from the pretrained table.
embedding_layer = nn.Embedding.from_pretrained(
    torch.tensor(init_embeddings, dtype=torch.float32)
)

In [29]:
# 사전 학습된 임베딩 계층 적용

class SentenceClassifier2(nn.Module):
    """Binary sentence classifier with an optional pretrained embedding table.

    Args:
        n_vocab: Number of rows in the embedding table; only used when no
            pretrained table is given.
        hidden_dim: Hidden size of the recurrent layer, per direction.
        embedding_dim: Size of each token embedding vector. It is also the
            recurrent layer's input size, so a pretrained table must have this
            many columns.
        n_layers: Number of stacked recurrent layers.
        dropout: Dropout probability, used between stacked recurrent layers
            and on the features fed to the final linear layer.
        bidirectional: If True the recurrent layer runs in both directions and
            the linear head takes ``hidden_dim * 2`` input features.
        model_type: ``'rnn'`` selects ``nn.RNN``; ``'lstm'`` selects ``nn.LSTM``.
        pretrained_embedding: Optional array-like or tensor of embedding
            weights. When given, the embedding layer is created with
            ``nn.Embedding.from_pretrained`` (weights frozen by default).

    Raises:
        ValueError: If ``model_type`` is neither ``'rnn'`` nor ``'lstm'``.
    """

    def __init__(self,
                 n_vocab,
                 hidden_dim,
                 embedding_dim,
                 n_layers,
                 dropout=0.5,
                 bidirectional=True,
                 model_type='lstm',
                 pretrained_embedding=None
                 ):
        super().__init__()

        # Build exactly one embedding layer; index 0 is the padding token in
        # both branches.
        if pretrained_embedding is not None:
            if isinstance(pretrained_embedding, torch.Tensor):
                # Copy so the layer never shares storage with the caller's tensor.
                weights = pretrained_embedding.detach().clone().to(torch.float32)
            else:
                weights = torch.tensor(pretrained_embedding, dtype=torch.float32)
            self.embedding = nn.Embedding.from_pretrained(weights, padding_idx=0)
        else:
            self.embedding = nn.Embedding(
                num_embeddings=n_vocab,
                embedding_dim=embedding_dim,
                padding_idx=0
            )

        recurrent_layers = {'rnn': nn.RNN, 'lstm': nn.LSTM}
        if model_type not in recurrent_layers:
            # Fail at construction time instead of leaving `self.model` unset
            # and hitting an AttributeError on the first forward pass.
            raise ValueError(
                f"model_type must be one of {sorted(recurrent_layers)}, got {model_type!r}"
            )
        self.model = recurrent_layers[model_type](
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=n_layers,
            bidirectional=bidirectional,
            # Recurrent dropout only acts between stacked layers; passing a
            # non-zero value with a single layer has no effect and only warns.
            dropout=dropout if n_layers > 1 else 0.0,
            batch_first=True,
        )

        # A bidirectional encoder concatenates both directions' features.
        classifier_in = hidden_dim * 2 if bidirectional else hidden_dim
        self.classifier = nn.Linear(classifier_in, 1)
        self.dropout = nn.Dropout(dropout)

    def forward(self, inputs):
        """Return one raw logit per input sequence.

        Args:
            inputs: Integer token ids; the recurrent layer is batch-first, so
                the expected layout is (batch, sequence).

        Returns:
            Tensor of logits with a trailing dimension of size 1.
        """
        embeddings = self.embedding(inputs)
        output, _ = self.model(embeddings)
        # Use the encoder output at the final time step as the sentence feature.
        # NOTE(review): for right-padded batches this position is usually padding.
        last_output = output[:, -1, :]
        last_output = self.dropout(last_output)
        logits = self.classifier(last_output)
        return logits

In [31]:
# Train a model that uses the pretrained embeddings
# Same hyperparameters as the first classifier, but the embedding layer is
# initialised from `init_embeddings`.
classifier2 = SentenceClassifier2(
    n_vocab=n_vocab, hidden_dim=hidden_dim, embedding_dim=embedding_dim,
    n_layers=n_layers, pretrained_embedding=init_embeddings
).to(device)
# Fresh loss and optimizer; these rebind `criterion` and `optimizer` so the
# optimizer now tracks the parameters of `classifier2`.
criterion = nn.BCEWithLogitsLoss().to(device)
optimizer = optim.RMSprop(classifier2.parameters(), lr=0.001)

In [32]:
# Train the pretrained-embedding model for five epochs, logging the running
# loss every 500 steps and evaluating on the held-out loader after each epoch.
epochs = 5
interval = 500
for epoch in range(epochs):
    train(classifier2, train_loader, criterion, optimizer, device, interval)
    test(classifier2, test_loader, criterion, device)

Train Loss 0: 0.6844468116760254
Train Loss 500: 0.6665256898441239
Train Loss 1000: 0.6643826766804858
Train Loss 1500: 0.6460254401028117
Train Loss 2000: 0.6330621158373707
Train Loss 2500: 0.6146822075088803
Val Loss : 0.49753828422901347, Val Accuracy : 0.7704
Train Loss 0: 0.6750407218933105
Train Loss 500: 0.526442036687019
Train Loss 1000: 0.5409470669307432
Train Loss 1500: 0.5406075732796133
Train Loss 2000: 0.5430256086683226
Train Loss 2500: 0.5399326432149251
Val Loss : 0.5102891431639369, Val Accuracy : 0.7566
Train Loss 0: 0.4817048907279968
Train Loss 500: 0.5146990398506442
Train Loss 1000: 0.5090822963775335
Train Loss 1500: 0.5040682891382526
Train Loss 2000: 0.49618661636742634
Train Loss 2500: 0.49410975490008197
Val Loss : 0.47849095072418735, Val Accuracy : 0.7824
Train Loss 0: 0.6110131740570068
Train Loss 500: 0.4788760099106444
Train Loss 1000: 0.47007176603053835
Train Loss 1500: 0.467499750701687
Train Loss 2000: 0.46518161624059384
Train Loss 2500: 0.462057