##### [한글 데이터셋 RNN] <hr>


In [1]:
import pandas as pd
from Korpora import Korpora


In [2]:
corpus = Korpora.load("nsmc")



    Korpora 는 다른 분들이 연구 목적으로 공유해주신 말뭉치들을
    손쉽게 다운로드, 사용할 수 있는 기능만을 제공합니다.

    말뭉치들을 공유해 주신 분들에게 감사드리며, 각 말뭉치 별 설명과 라이센스를 공유 드립니다.
    해당 말뭉치에 대해 자세히 알고 싶으신 분은 아래의 description 을 참고,
    해당 말뭉치를 연구/상용의 목적으로 이용하실 때에는 아래의 라이센스를 참고해 주시기 바랍니다.

    # Description
    Author : e9t@github
    Repository : https://github.com/e9t/nsmc
    References : www.lucypark.kr/docs/2015-pyconkr/#39

    Naver sentiment movie corpus v1.0
    This is a movie review dataset in the Korean language.
    Reviews were scraped from Naver Movies.

    The dataset construction is based on the method noted in
    [Large movie review dataset][^1] from Maas et al., 2011.

    [^1]: http://ai.stanford.edu/~amaas/data/sentiment/

    # License
    CC0 1.0 Universal (CC0 1.0) Public Domain Dedication
    Details in https://creativecommons.org/publicdomain/zero/1.0/

[Korpora] Corpus `nsmc` is already installed at C:\Users\KDP-25\Korpora\nsmc\ratings_train.txt
[Korpora] Corpus `nsmc` is already installed at C:\Users\KD

In [3]:
nsmcDF = pd.DataFrame(corpus.test)
nsmcDF.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    50000 non-null  object
 1   label   50000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 781.4+ KB


In [4]:
trainDF = nsmcDF.sample(frac=0.9, random_state=42)
trainDF.info()


<class 'pandas.core.frame.DataFrame'>
Index: 45000 entries, 33553 to 6838
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    45000 non-null  object
 1   label   45000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 1.0+ MB


In [5]:
testDF = nsmcDF.drop(trainDF.index)
testDF.info()


<class 'pandas.core.frame.DataFrame'>
Index: 5000 entries, 9 to 49997
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    5000 non-null   object
 1   label   5000 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 117.2+ KB


-   [2] 단어 사전 생성 <hr>
    -   토큰화 진행 ==> 형태소 분석기 선택
    -   단어 사전


-   [2-1] 토큰화 진행 ==> 문장 --> 단어


In [6]:
## ===> 모듈 로딩
from konlpy.tag import Okt

### 토큰화 인스턴스 생성
tokenizer = Okt()


In [7]:
### ===> 문장 ====> [단어 분리]
# for text in trainDF.text:
#     print(tokenizer.morphs(text, stem=True))
#     break
train_tokens = [tokenizer.morphs(text, stem=True) for text in trainDF.text]
test_tokens = [tokenizer.morphs(text, stem=True) for text in testDF.text]


In [8]:
print(f"[train_tokens] {len(train_tokens)}개\n[test_tokens] {len(test_tokens)}개")
print(
    f"[train_tokens[0]] {len(train_tokens[0])}개\n[test_tokens[0]] {len(test_tokens[0])}개"
)
print(
    f"[train_tokens[1]] {len(train_tokens[1])}개\n[test_tokens[1]] {len(test_tokens[1])}개"
)


[train_tokens] 45000개
[test_tokens] 5000개
[train_tokens[0]] 19개
[test_tokens[0]] 18개
[train_tokens[1]] 14개
[test_tokens[1]] 6개


-   [2-2] 토큰 ===> 단어/어휘 사전 생성


In [9]:
from collections import Counter
import string

string.punctuation


'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [10]:
### 단어 사전 생성 함수
def build_vocab(corpus, vocab_size, special_token):
    counter = Counter()

    # 단어/토큰에 대한 빈도수 계산
    for token in corpus:
        for pun in string.punctuation:
            if pun in token:
                token.remove(pun)
        counter.update(token)

    # 단어/어휘 사전 생성
    vocab = special_token

    # 단어/어휘 사전에 빈도수가 높은 단어 추가
    for token, count in counter.most_common(vocab_size):
        vocab.append(token)
    return vocab


In [11]:
VOCAB = build_vocab(train_tokens, 5000, ["<PAD>", "<UNK>"])


In [12]:
print(f"[VOCAB] ---> {len(VOCAB)}개\n{VOCAB[:30]}")


[VOCAB] ---> 5002개
['<PAD>', '<UNK>', '이', '영화', '보다', '하다', '의', '..', '에', '가', '.', '...', '을', '도', '들', '는', '를', '은', '없다', '이다', '있다', '좋다', '너무', '다', '정말', '한', '되다', '재밌다', '만', '진짜']


-   [2-3] 인코딩 & 디코딩 인덱싱


In [13]:
### 인코딩 : 문자 >>>> 숫자로 변환
token_to_id = {vo: id for id, vo in enumerate(VOCAB)}

### 디코딩 : 숫자 >>>> 문자로 변환
id_to_token = {id: vo for id, vo in enumerate(VOCAB)}


-   [3] 데이터 가공 <hr>
    -   토큰 데이터 정수 인코딩
    -   데이터 길이 표준화 => 다른 길이의 데이터를 길이 맞추기 -> 1개 문장 구성하는 단어 수 맞추기


-   [3-1] 토큰 정수화


In [14]:
### ===> 리뷰의 문자를 정수로 변환 및 단어/어휘 사전에 없는 문자도 처리
unk_id = token_to_id["<UNK>"]
train_ids = [
    [token_to_id.get(token, unk_id) for token in text] for text in train_tokens
]
test_ids = [[token_to_id.get(token, unk_id) for token in text] for text in test_tokens]


-   [3-2] 데이터 구성 단어 수 맞추기 즉, 패딩(padding)
    -   단어 수 선정 필요
    -   선정된 단어 수에 맞게 데이터 조절 => 길면 잘라내기, 짧으면 채우기


In [15]:
a = [1, 2, 3, 4, 5, 6, 7, 8, 9, 0]
b = 2
a[:2], a[-b:]


([1, 2], [9, 0])

In [16]:
### ===> 패딩 처리 함수
# - sentences : 토큰화된 문장 데이터
# - max_length : 최대 문장 길이 즉, 1개 문장 구성 단어 수
# - pad : 패딩 처리 시 추가될 문자 값
# - start : 패딩 시 처리 방향 [기:R 오른쪽 즉, 뒷부분 자르기/추가하기]
def pad_sequence(sentences, max_length, pad, start="R"):
    result = []
    for sen in sentences:
        sen = sen[:max_length] if start == "R" else sen[-max_length:]
        padd_sen = (
            (sen + [pad] * (max_length - len(sen)))
            if start == "R"
            else ([pad] * (max_length - len(sen)) + sen)
        )
        result.append(padd_sen)
    return result


In [17]:
### 학습용, 테스트용 데이터 패딩 처리
PAD_ID = token_to_id["<PAD>"]
MAX_LENGTH = 32
train_ids = pad_sequence(train_ids, MAX_LENGTH, PAD_ID)
test_ids = pad_sequence(test_ids, MAX_LENGTH, PAD_ID)


In [18]:
print(f"[train_ids[0]] ---> {len(train_ids[0])}개\n{train_ids[0]}")
print(f"[test_ids[0]] ---> {len(test_ids[0])}개")


[train_ids[0]] ---> 32개
[256, 1610, 12, 1362, 167, 219, 361, 3, 2090, 1038, 253, 33, 3985, 1, 1, 1023, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[test_ids[0]] ---> 32개


-   [4] 학습 준비 <hr>
    -   데이터 로더 준비
    -   학습용/테스트용 함수
    -   모델 클래스
    -   학습 관련 변수 => DEVICE, OPTIMIZER, MODEL 인스턴스, EPOCHS, BATCH_SIZE, LOSS_FN


In [19]:
import torch
from torch.utils.data import TensorDataset, DataLoader


In [20]:
### ==> 데이터셋 생성 : List >>>> Tensor
# 학습용 데이터셋
dataTS = torch.LongTensor(train_ids)
labelTS = torch.FloatTensor(trainDF.label.values)

print(f"dataTS => {dataTS.shape}, labelTS => {labelTS.shape}")

trainDS = TensorDataset(dataTS, labelTS)


dataTS => torch.Size([45000, 32]), labelTS => torch.Size([45000])


In [21]:
# 테스트용 데이터셋
dataTS_ = torch.LongTensor(test_ids)
labelTS_ = torch.FloatTensor(testDF.label.values)

print(f"dataTS_ => {dataTS_.shape}, labelTS_ => {labelTS_.shape}")

testDS = TensorDataset(dataTS_, labelTS_)


dataTS_ => torch.Size([5000, 32]), labelTS_ => torch.Size([5000])


In [22]:
### ==> 데이터로더 생성
BATCH_SIZE = 32

trainDL = DataLoader(trainDS, batch_size=BATCH_SIZE, shuffle=True)
testDL = DataLoader(testDS, batch_size=BATCH_SIZE, shuffle=True)


In [26]:
for x in trainDL:
    print(x)
    break


[tensor([[1077, 2024, 1079,  ...,    0,    0,    0],
        [ 122,   42,    1,  ...,    0,    0,    0],
        [   7, 1020,    1,  ...,    0,    0,    0],
        ...,
        [ 221,   20,   21,  ...,    0,    0,    0],
        [ 106,   53,   26,  ...,    0,    0,    0],
        [   1, 1605, 1938,  ...,    0,    0,    0]]), tensor([1., 1., 1., 0., 0., 1., 1., 0., 0., 0., 0., 1., 0., 0., 1., 1., 1., 0.,
        0., 0., 1., 0., 1., 0., 0., 0., 1., 0., 0., 1., 0., 0.])]


-   [4-2] 모델 클래스 정의
    -   입력층 : Embedding Layer
    -   은닉층 : RNN/LSTM Layer
    -   은닉층 : Dropout Layer
    -   출력층 : Linear Layer


In [23]:
from torch import nn


class SentenceClassifier(nn.Module):
    def __init__(
        self,
        n_vocab,
        hidden_dim,
        embedding_dim,
        n_layer,
        dropout=0.5,
        bidirectional=True,
        model_type="lstm",
    ) -> None:
        super().__init__()
        self.embedding = nn.Embedding(
            num_embeddings=n_vocab, embedding_dim=embedding_dim, padding_idx=0
        )
        if model_type == "rnn":
            self.model = nn.RNN(
                input_size=embedding_dim,
                hidden_size=hidden_dim,
                num_layers=n_layer,
                bidirectional=bidirectional,
                dropout=dropout,
                batch_first=True,
            )
        elif model_type == "lstm":
            self.model = nn.LSTM(
                input_size=embedding_dim,
                hidden_size=hidden_dim,
                num_layers=n_layer,
                bidirectional=bidirectional,
                dropout=dropout,
                batch_first=True,
            )

        if bidirectional:
            self.classifier = nn.Linear(hidden_dim * 2, 1)
        else:
            self.classifier = nn.Linear(hidden_dim, 1)

        self.dropout = nn.Dropout(dropout)

    def forward(self, inputs):
        embeddings = self.embedding(inputs)
        output, _ = self.model(embeddings)
        last_output = output[:, -1, :]
        last_output = self.dropout(last_output)
        logits = self.classifier(last_output)
        return logits


In [24]:
from torch import optim

n_vocab = len(token_to_id)
hidden_dim = 64
embedding_dim = 128
n_layer = 2

device = "cuda" if torch.cuda.is_available() else "cpu"
classifier = SentenceClassifier(
    n_vocab=n_vocab, hidden_dim=hidden_dim, embedding_dim=embedding_dim, n_layer=n_layer
).to(device)
criterion = nn.BCEWithLogitsLoss().to(device)
optimizer = optim.RMSprop(classifier.parameters(), lr=0.001)


In [28]:
from torchinfo import summary

summary(classifier)


Layer (type:depth-idx)                   Param #
SentenceClassifier                       --
├─Embedding: 1-1                         640,256
├─LSTM: 1-2                              198,656
├─Linear: 1-3                            129
├─Dropout: 1-4                           --
Total params: 839,041
Trainable params: 839,041
Non-trainable params: 0

In [25]:
import numpy as np


def train(model, datasets, criterion, optimizer, device, interval):
    model.train()
    losses = list()

    for step, (input_ids, labels) in enumerate(datasets):
        input_ids = input_ids.to(device)
        labels = labels.to(device).unsqueeze(1)

        logits = model(input_ids)
        loss = criterion(logits, labels)
        losses.append(loss.item())

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if step % interval == 0:
            print(f"Train Loss {step} : {np.mean(losses)}")


def test(model, datasets, criterion, device):
    model.eval()
    losses = list()
    corrects = list()

    with torch.no_grad():
        for step, (input_ids, labels) in enumerate(datasets):
            input_ids = input_ids.to(device)
            labels = labels.to(device).unsqueeze(1)

            logits = model(input_ids)
            loss = criterion(logits, labels)
            losses.append(loss.item())
            yhat = torch.sigmoid(logits) > 0.5
            corrects.extend(torch.eq(yhat, labels).cpu().tolist())

    print(f"Val Loss : {np.mean(losses)}, Val Accuracy : {np.mean(corrects)}")


epochs = 5
interval = 500

for epoch in range(epochs):
    train(classifier, trainDL, criterion, optimizer, device, interval)
    test(classifier, testDL, criterion, device)


Train Loss 0 : 0.6981192827224731
Train Loss 500 : 0.6728089472014985
Train Loss 1000 : 0.6483240639651334
Val Loss : 0.6151549288421679, Val Accuracy : 0.6624
Train Loss 0 : 0.5543519854545593
Train Loss 500 : 0.5826633009249103
Train Loss 1000 : 0.5664007042671417
Val Loss : 0.503889705724777, Val Accuracy : 0.7748
Train Loss 0 : 0.6051123142242432
Train Loss 500 : 0.447650252612765
Train Loss 1000 : 0.4324425974568644
Val Loss : 0.4439732933500011, Val Accuracy : 0.7936
Train Loss 0 : 0.38300520181655884
Train Loss 500 : 0.3621136610379476
Train Loss 1000 : 0.35731632111372646
Val Loss : 0.3828697977172341, Val Accuracy : 0.8246
Train Loss 0 : 0.24730445444583893
Train Loss 500 : 0.31350751969212304
Train Loss 1000 : 0.3182889421160643
Val Loss : 0.3992704557385414, Val Accuracy : 0.8336


In [27]:
for step, (input_ids, labels) in enumerate(trainDL):
    input_ids = input_ids.to(device)
    labels = labels.to(device).unsqueeze(1)
    print(step, input_ids.shape, labels.shape)


0 torch.Size([32, 32]) torch.Size([32, 1])
1 torch.Size([32, 32]) torch.Size([32, 1])
2 torch.Size([32, 32]) torch.Size([32, 1])
3 torch.Size([32, 32]) torch.Size([32, 1])
4 torch.Size([32, 32]) torch.Size([32, 1])
5 torch.Size([32, 32]) torch.Size([32, 1])
6 torch.Size([32, 32]) torch.Size([32, 1])
7 torch.Size([32, 32]) torch.Size([32, 1])
8 torch.Size([32, 32]) torch.Size([32, 1])
9 torch.Size([32, 32]) torch.Size([32, 1])
10 torch.Size([32, 32]) torch.Size([32, 1])
11 torch.Size([32, 32]) torch.Size([32, 1])
12 torch.Size([32, 32]) torch.Size([32, 1])
13 torch.Size([32, 32]) torch.Size([32, 1])
14 torch.Size([32, 32]) torch.Size([32, 1])
15 torch.Size([32, 32]) torch.Size([32, 1])
16 torch.Size([32, 32]) torch.Size([32, 1])
17 torch.Size([32, 32]) torch.Size([32, 1])
18 torch.Size([32, 32]) torch.Size([32, 1])
19 torch.Size([32, 32]) torch.Size([32, 1])
20 torch.Size([32, 32]) torch.Size([32, 1])
21 torch.Size([32, 32]) torch.Size([32, 1])
22 torch.Size([32, 32]) torch.Size([32, 1]