### RNN 책 예제
- 네이버 영화 리뷰


In [12]:
import pandas as pd
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optima
from torch.utils.data import TensorDataset, DataLoader

from Korpora import Korpora
from konlpy.tag import Okt
from collections import Counter

In [3]:
# model class
class SentenceClassifier(nn.Module):
    """Binary sentence classifier: embedding -> recurrent encoder -> linear head.

    Args:
        n_vocab: Number of rows in the embedding table (vocabulary size).
        hidden_dim: Hidden size of the recurrent encoder (per direction).
        embedding_dim: Size of each token embedding.
        n_layers: Number of stacked recurrent layers.
        dropout: Dropout probability used between stacked recurrent layers
            and on the features fed into the linear head.
        bidirectional: If True the encoder is bidirectional and the head
            receives ``hidden_dim * 2`` features.
        model_type: ``'rnn'`` selects ``nn.RNN``, ``'lstm'`` selects ``nn.LSTM``.

    Raises:
        ValueError: If ``model_type`` is neither ``'rnn'`` nor ``'lstm'``.
    """

    def __init__(self, n_vocab, hidden_dim, embedding_dim, n_layers, dropout=0.5, bidirectional=True, model_type='lstm'):
        super().__init__()

        # Fail fast: previously an unknown model_type left `self.model` unset
        # and the error only surfaced later inside forward().
        if model_type not in ('rnn', 'lstm'):
            raise ValueError(
                f"model_type must be 'rnn' or 'lstm', got {model_type!r}"
            )

        # Index 0 is the padding id: its embedding gets no gradient updates.
        self.embedding = nn.Embedding(
            num_embeddings=n_vocab,
            embedding_dim=embedding_dim,
            padding_idx=0
        )

        recurrent_kwargs = dict(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=n_layers,
            bidirectional=bidirectional,
            # Recurrent dropout only acts between stacked layers; passing a
            # non-zero value with a single layer just triggers a warning.
            dropout=dropout if n_layers > 1 else 0.0,
            batch_first=True,
        )
        if model_type == 'rnn':
            self.model = nn.RNN(**recurrent_kwargs)
        else:
            self.model = nn.LSTM(proj_size=0, **recurrent_kwargs)

        # A bidirectional encoder concatenates both directions' features.
        n_directions = 2 if bidirectional else 1
        self.classifier = nn.Linear(hidden_dim * n_directions, 1)
        self.dropout = nn.Dropout(dropout)

    def forward(self, inputs):
        """Return one logit per sequence.

        Args:
            inputs: Integer token ids laid out batch-first, i.e.
                ``(batch, seq_len)``.

        Returns:
            Tensor of shape ``(batch, 1)`` holding un-normalised logits.
        """
        embeddings = self.embedding(inputs)
        output, _ = self.model(embeddings)
        # Features of the final time step. NOTE(review): for right-padded
        # inputs this position is a padding token — confirm that is intended.
        last_output = output[:, -1, :]
        last_output = self.dropout(last_output)
        logits = self.classifier(last_output)
        return logits

In [5]:
# load dataset
# Load the NSMC (Naver sentiment movie corpus) through Korpora.
corpus = Korpora.load('nsmc')
# Only the corpus' `test` split is turned into a DataFrame; it is re-split below.
corpus_df = pd.DataFrame(corpus.test)

# Seeded 90/10 split: sample 90% of the rows for training and keep the rows
# whose index was not sampled for evaluation.
# NOTE(review): `train` and `test` are rebound to functions by later cells, so
# every cell that reads these DataFrames must run before those definitions.
train = corpus_df.sample(frac=0.9, random_state=4)
test = corpus_df.drop(train.index)

# Preview the first training rows and report the size of each split.
print(train.head(5).to_markdown())
print(f"train size: {len(train)}")
print(f"test size: {len(test)}")


    Korpora 는 다른 분들이 연구 목적으로 공유해주신 말뭉치들을
    손쉽게 다운로드, 사용할 수 있는 기능만을 제공합니다.

    말뭉치들을 공유해 주신 분들에게 감사드리며, 각 말뭉치 별 설명과 라이센스를 공유 드립니다.
    해당 말뭉치에 대해 자세히 알고 싶으신 분은 아래의 description 을 참고,
    해당 말뭉치를 연구/상용의 목적으로 이용하실 때에는 아래의 라이센스를 참고해 주시기 바랍니다.

    # Description
    Author : e9t@github
    Repository : https://github.com/e9t/nsmc
    References : www.lucypark.kr/docs/2015-pyconkr/#39

    Naver sentiment movie corpus v1.0
    This is a movie review dataset in the Korean language.
    Reviews were scraped from Naver Movies.

    The dataset construction is based on the method noted in
    [Large movie review dataset][^1] from Maas et al., 2011.

    [^1]: http://ai.stanford.edu/~amaas/data/sentiment/

    # License
    CC0 1.0 Universal (CC0 1.0) Public Domain Dedication
    Details in https://creativecommons.org/publicdomain/zero/1.0/

[Korpora] Corpus `nsmc` is already installed at C:\Users\PC\Korpora\nsmc\ratings_train.txt
[Korpora] Corpus `nsmc` is already installed at C:\Users\PC\Kor

In [7]:
# data tokenization & create vocab
def build_vocab(corpus, n_vocab, spacial_tokens):
    """Build a vocabulary list: special tokens first, then frequent tokens.

    Args:
        corpus: Iterable of token sequences (one sequence per document).
        n_vocab: Maximum number of corpus tokens to keep, most frequent first.
        spacial_tokens: Special tokens placed at the start of the vocabulary.
            The sequence passed in is not modified.

    Returns:
        A new list with the special tokens followed by up to ``n_vocab``
        tokens ordered by descending frequency.
    """
    counter = Counter()
    for tokens in corpus:
        counter.update(tokens)

    # Copy instead of aliasing: appending to the caller's list would grow it
    # as a side effect (and corrupt it on repeated calls).
    vocab = list(spacial_tokens)
    vocab.extend(token for token, _ in counter.most_common(n_vocab))
    return vocab

In [9]:
# Split every review into morphemes with the Okt analyzer.
tokenizer = Okt()
train_tokens = list(map(tokenizer.morphs, train.text))
test_tokens = list(map(tokenizer.morphs, test.text))

# The vocabulary comes from the training tokens only: the two special tokens
# first, followed by at most 5000 of the most frequent tokens.
vocab = build_vocab(train_tokens, 5000, ['<pad>', '<unk>'])

# Lookup tables in both directions (list position <-> token).
id_to_token = dict(enumerate(vocab))
token_to_id = {token: idx for idx, token in id_to_token.items()}

print(vocab[:10])
print(len(vocab))

['<pad>', '<unk>', '.', '이', '영화', '의', '..', '에', '가', '...']
5002


In [10]:
# integer encoding & padding
def pad_sequences(sequences, max_length, pad_value):
    """Truncate or right-pad every sequence to exactly ``max_length`` items.

    Args:
        sequences: Iterable of sequences (lists, tuples or 1-D arrays).
        max_length: Length of every row in the result.
        pad_value: Value appended to sequences shorter than ``max_length``.

    Returns:
        ``numpy.ndarray`` of shape ``(len(sequences), max_length)``; an empty
        input yields an array of shape ``(0, max_length)``.
    """
    result = []
    for sequence in sequences:
        # Materialise as a list so padding always concatenates: `tuple + list`
        # raises, and `ndarray + list` would add element-wise instead.
        row = list(sequence[:max_length])
        row.extend([pad_value] * (max_length - len(row)))
        result.append(row)

    if not result:
        # np.asarray([]) would be 1-D float64; keep the 2-D contract instead.
        return np.empty((0, max_length), dtype=np.asarray(pad_value).dtype)
    return np.asarray(result)

In [11]:
# Ids of the special tokens and the fixed length of every encoded review.
unk_id = token_to_id['<unk>']
pad_id = token_to_id['<pad>']
max_length = 32

# Map each token to its vocabulary id (out-of-vocabulary tokens fall back to
# `unk_id`), then truncate / right-pad every review to `max_length` ids.
train_ids = pad_sequences(
    [[token_to_id.get(token, unk_id) for token in review] for review in train_tokens],
    max_length,
    pad_id,
)
test_ids = pad_sequences(
    [[token_to_id.get(token, unk_id) for token in review] for review in test_tokens],
    max_length,
    pad_id,
)

print(train_ids[0])
print(test_ids[0])

[   1 1356 4829   51    6    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0]
[  1  13   1  61 147  13 147   2   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0]


In [13]:
# Wrap the encoded reviews and their labels in datasets / data loaders.

# Training split: id matrix + float32 labels, served in shuffled batches of 16.
train_ids = torch.tensor(train_ids)
train_labels = torch.tensor(train['label'].values, dtype=torch.float32)
train_dataset = TensorDataset(train_ids, train_labels)
train_loader = DataLoader(dataset=train_dataset, batch_size=16, shuffle=True)

# Evaluation split: same layout, but iterated in a fixed order.
test_ids = torch.tensor(test_ids)
test_labels = torch.tensor(test['label'].values, dtype=torch.float32)
test_dataset = TensorDataset(test_ids, test_labels)
test_loader = DataLoader(dataset=test_dataset, batch_size=16, shuffle=False)

In [14]:
# loss function & optimizer
n_vocab = len(token_to_id)
hidden_dim = 64
embedding_dim = 128
n_layer = 2

# Seed PyTorch's global RNG before the model is built so that weight
# initialisation, dropout masks and the DataLoader shuffle order are repeatable
# on "Restart & Run All" (GPU kernels may still add small nondeterminism).
# The value matches the random_state used for the train/test split.
torch.manual_seed(4)

# Run on the GPU when one is available, otherwise on the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
classifier = SentenceClassifier(n_vocab, hidden_dim, embedding_dim, n_layer).to(device)
# The model emits a single raw logit per review, so the sigmoid is folded
# into the loss.
criterion = nn.BCEWithLogitsLoss().to(device)
optimizer = optima.RMSprop(classifier.parameters(), lr=0.001)

In [16]:
# model train
def train(model, datasets, criterion, optimizer, device, interval):
    """Run one optimisation pass over ``datasets``.

    Args:
        model: Module mapping a batch of inputs to one logit per example.
        datasets: Iterable yielding ``(input_ids, labels)`` batches.
        criterion: Loss called as ``criterion(logits, labels)``.
        optimizer: Optimizer that updates ``model``'s parameters.
        device: Device the batches are moved to.
        interval: The running mean loss is printed every ``interval`` steps.
    """
    model.train()
    history = []

    for step, batch in enumerate(datasets):
        input_ids, labels = (tensor.to(device) for tensor in batch)
        # Give the labels a trailing axis so they line up with the logits.
        targets = labels.unsqueeze(1)

        loss = criterion(model(input_ids), targets)
        history.append(loss.item())

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if step % interval:
            continue
        # Mean over every batch seen so far in this pass.
        print(f"train loss {step}: {np.mean(history)}")

In [17]:
# model test
def test(model, datasets, criterion, device):
    model.eval()
    losses = list()
    corrects = list()
    
    for step, (input_ids, labels) in enumerate(datasets):
        input_ids = input_ids.to(device)
        labels = labels.to(device).unsqueeze(1)
        
        logits = model(input_ids)
        loss = criterion(logits, labels)
        losses.append(loss.item())
        yhat = torch.sigmoid(logits) > 0.5
        corrects.extend(torch.eq(yhat, labels).cpu().tolist())
    print(f"validation loss: {np.mean(losses)}, accuracy: {np.mean(corrects)}")

In [18]:
# Alternate one training pass with one evaluation pass for every epoch.
interval = 500  # steps between progress prints during training
epochs = 5

for epoch in range(epochs):
    train(
        model=classifier,
        datasets=train_loader,
        criterion=criterion,
        optimizer=optimizer,
        device=device,
        interval=interval,
    )
    test(
        model=classifier,
        datasets=test_loader,
        criterion=criterion,
        device=device,
    )
    print()

train loss 0: 0.6933217644691467
train loss 500: 0.6943136583307308
train loss 1000: 0.6905747528200026
train loss 1500: 0.6749273884264649
train loss 2000: 0.6555213789562175
train loss 2500: 0.6367976230556895
validation loss: 0.5279639783187415, accuracy: 0.7494
train loss 0: 0.31776538491249084
train loss 500: 0.5169963894668453
train loss 1000: 0.5211648493707478
train loss 1500: 0.5198383837501499
train loss 2000: 0.5221993933091575
train loss 2500: 0.5202475592690627
validation loss: 0.5058195431487629, accuracy: 0.7298
train loss 0: 0.5961745977401733
train loss 500: 0.48100612843464946
train loss 1000: 0.4821624792033023
train loss 1500: 0.48279716428878067
train loss 2000: 0.4821023522750072
train loss 2500: 0.4799151421367812
validation loss: 0.4666051073862722, accuracy: 0.7894
train loss 0: 0.18460184335708618
train loss 500: 0.43342965932008987
train loss 1000: 0.4310774285342548
train loss 1500: 0.4307644139729604
train loss 2000: 0.4270257778290747
train loss 2500: 0.42

In [19]:
# Extract the embedding vectors from the trained model.
# Copy the embedding table out of the model as a NumPy array (one row per id).
embedding_matrix = classifier.embedding.weight.detach().cpu().numpy()
# Pair each vocabulary token with the row at the same position.
token_to_embedding = dict(zip(vocab, embedding_matrix))

token = vocab[1000]
print(token, token_to_embedding[token])

리메이크 [ 0.48094285 -0.83728343 -0.31191462  0.2117208   0.04564255 -0.596668
  1.0199872   0.20349438  0.30593103  0.5277261   0.1662459   0.89788216
  0.1946853   1.195648    0.8976036  -1.0416138  -0.58721733  0.6944218
 -0.46866128 -0.60765386 -2.1934917  -0.4531325   1.1143396   0.37227768
 -1.4145187   0.2735119  -0.14097662  1.9225914   0.45480484 -0.17459026
  1.5516007   0.2936152   0.64963645 -0.4215774   0.74884295 -0.6422965
 -0.6698972   0.30147874  0.16571277  0.19205159 -0.5371739  -1.991847
 -0.27439493 -0.7088234   1.4496316  -0.41890925 -0.76643884  0.88281
 -0.8680259   1.3656698   0.11580013 -0.35289472  1.0115812   0.29589006
 -0.82661825 -0.01542456 -0.58443356 -0.18068504  0.8624229   0.44413677
  0.1760467   1.1405487   0.9097203  -0.6424394   0.10887486  1.4188663
  0.769108   -1.9375563   1.9752502   0.12015982 -0.8225379  -2.1790164
  0.02257181 -1.6617163   0.9023272  -0.54046655 -2.1477268   1.0831587
 -0.38413635 -0.44572026 -0.25716084  0.24758536 -0.536951