# 데이터 전처리 

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim

from torchtext.data import Field, BucketIterator, TabularDataset

import spacy # 영어
from eunjeon import Mecab # 한글
import numpy as np
import random
import math
import time

# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [2]:
# Seed every random number generator used in this file (Python, NumPy,
# PyTorch CPU and all CUDA devices) with the same value.
SEED = 1234

for _seed_rng in (random.seed, np.random.seed, torch.manual_seed,
                  torch.cuda.manual_seed_all):
    _seed_rng(SEED)

In [3]:
# NOTE(review): the 'en' shortcut is presumably only accepted by older spaCy
# releases; newer ones need a full model name such as 'en_core_web_sm' -- confirm
# against the installed version.
mecab = Mecab() # Korean morphological analyzer
spacy_en = spacy.load('en') # English spaCy pipeline

In [4]:
def tokenize_ko(text):
    """Split Korean ``text`` into morphemes and return them in reverse order.

    The reversal is deliberate: the original author notes that splitting the
    source sentence back-to-front is said to give better results.
    """
    morphemes = list(mecab.morphs(text))
    morphemes.reverse()
    return morphemes

def tokenize_en(text):
    """Split English ``text`` into a list of token strings with spaCy's tokenizer."""
    doc = spacy_en.tokenizer(text)
    return [word.text for word in doc]

In [5]:
# Quick check of the Korean tokenizer: the morphemes come back in reverse order.
tokenize_ko('한글을 너무 어렵당!')

['!', '당', '어렵', '너무', '을', '한글']

In [6]:
# Quick check of the English tokenizer.
tokenize_en('Korean is too dificult for me!')

['Korean', 'is', 'too', 'dificult', 'for', 'me', '!']

In [7]:
# SRC processes the Korean side, TRG processes the English side.
# '<sos>' marks the start of a sentence and '<eos>' marks its end.
SRC = Field(
    tokenize=tokenize_ko,
    init_token='<sos>',
    eos_token='<eos>',
)

TRG = Field(
    tokenize=tokenize_en,
    init_token='<sos>',
    eos_token='<eos>',
    lower=True,
)

In [8]:
# Map each CSV column to (attribute name on the example, Field that processes it):
# column 'ko' becomes example.src via SRC, column 'en' becomes example.trg via TRG.
fields = {'ko':('src', SRC), 'en':('trg', TRG)}

In [9]:
# All three CSV splits live in the same directory, so the directory is spelled
# once and every split is loaded through a single TabularDataset.splits call
# (previously the validation file repeated the absolute path in a second call).
DATA_DIR = 'C:\\Users\\abc\\jupyter\\pytorch\\Seq2Seq'

# splits() returns the requested datasets in train / validation / test order.
train_data, valid_data, test_data = TabularDataset.splits(
    path=DATA_DIR,
    train='train_data.csv',
    validation='valid_data.csv',
    test='test_data.csv',
    format='csv',
    fields=fields,
)

In [10]:
# Inspect the first tokenized example of the training and validation sets.
vars(train_data[0]), vars(valid_data[0])

({'src': ['.', '요', '가', '안', '가', '이해', '이', '문장', '이', '님', '선생'],
  'trg': ['sir',
   ',',
   'i',
   'do',
   "n't",
   'understand',
   'this',
   'sentence',
   'here',
   '.']},
 {'src': ['.', '가요', '로', '기숙사', '자마자', '끝나', '가', '학교'],
  'trg': ['i',
   'go',
   'to',
   'dormitory',
   'as',
   'soon',
   'as',
   'i',
   'finished',
   'class',
   '.']})

In [11]:
# Report how many examples each split contains.
print(f"Number of training examples: {len(train_data.examples)}")
print(f"Number of validation examples: {len(valid_data.examples)}")
print(f"Number of testing examples: {len(test_data.examples)}")

Number of training examples: 94463
Number of validation examples: 31688
Number of testing examples: 31822


In [12]:
# Build both vocabularies from the training split only, using words that
# appear at least twice (min_freq=2).
SRC.build_vocab(train_data, min_freq=2)
TRG.build_vocab(train_data, min_freq=2)

In [13]:
# Show the size of each vocabulary.
print(f"Unique tokens in source (ko) vocabulary: {len(SRC.vocab)}")
print(f"Unique tokens in target (en) vocabulary: {len(TRG.vocab)}")

Unique tokens in source (ko) vocabulary: 33056
Unique tokens in target (en) vocabulary: 24617


In [14]:
BATCH_SIZE = 32

# One iterator per split, all sharing the same batch size and target device.
# The sort key measures an example by the length of its source sentence.
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=BATCH_SIZE,
    sort_key=lambda example: len(example.src),
    sort_within_batch=True,
    device=device,
)

In [15]:
# Peek at the shape of the source tensor of one training batch.
next(iter(train_iterator)).src.shape

torch.Size([12, 32])

# 모델 생성

## 인코더 생성

In [16]:
class Encoder(nn.Module):
    """LSTM encoder: embeds source tokens and returns the LSTM's final states.

    Args:
        input_dim: number of rows in the embedding table (source vocabulary size).
        emb_dim: embedding dimension.
        hid_dim: LSTM hidden dimension.
        n_layers: number of stacked LSTM layers.
        dropout: dropout probability, applied to the embeddings and passed
            to the LSTM as its own ``dropout`` argument.
    """

    def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()

        # Kept as attributes so code that holds an encoder (Seq2Seq in this
        # file) can compare them with the decoder's sizes.
        self.hid_dim, self.n_layers = hid_dim, n_layers

        self.embedding = nn.Embedding(num_embeddings=input_dim,
                                      embedding_dim=emb_dim)
        self.rnn = nn.LSTM(input_size=emb_dim, hidden_size=hid_dim,
                           num_layers=n_layers, dropout=dropout)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, src):
        """Encode ``src`` and return ``(hidden, cell)`` from the LSTM.

        Expected shapes (as annotated by the original author):
            src:    [src_len, batch_size]
            hidden: final hidden state of every layer
            cell:   final cell state of every layer
        The per-step LSTM outputs are discarded.
        """
        # Embedding looks up a vector per token; dropout is then applied to
        # those vectors (it is not a second tokenization step).
        token_vectors = self.embedding(src)
        # token_vectors = [src_len, batch_size, emb_dim]
        _, final_state = self.rnn(self.dropout(token_vectors))
        hidden, cell = final_state
        return hidden, cell

In [17]:
# Smoke-test an encoder on a single training batch.
src = next(iter(train_iterator))
enc = Encoder(len(SRC.vocab), 200, 100, 2, 0.5)
# Use the same `device` the iterators were built with; a hard-coded 'cuda'
# fails on machines without a GPU.
enc.to(device)
enc(src.src)

(tensor([[[ 0.3193,  0.0409,  0.0325,  ...,  0.0209,  0.2238, -0.2870],
          [ 0.0503, -0.0967, -0.0239,  ...,  0.2544,  0.0280, -0.1930],
          [ 0.1695, -0.0434,  0.0893,  ..., -0.0180,  0.0344, -0.1957],
          ...,
          [-0.3918,  0.0924,  0.2154,  ...,  0.1492,  0.0814, -0.1922],
          [-0.1330, -0.0775,  0.0657,  ...,  0.1448, -0.0845, -0.1287],
          [ 0.0517, -0.0693, -0.1414,  ...,  0.1019, -0.1464, -0.1123]],
 
         [[-0.0287,  0.0170,  0.0409,  ..., -0.0051, -0.1357,  0.0211],
          [-0.0899, -0.0935, -0.0207,  ..., -0.0160, -0.0189,  0.1072],
          [-0.0941,  0.0912, -0.0602,  ...,  0.0333, -0.0636,  0.1001],
          ...,
          [-0.0327,  0.0656, -0.0036,  ..., -0.0886, -0.0398,  0.0667],
          [-0.0271, -0.0153, -0.0428,  ...,  0.0090, -0.1114,  0.0553],
          [-0.1139,  0.0036, -0.0602,  ..., -0.0117,  0.0072,  0.1298]]],
        device='cuda:0', grad_fn=<CudnnRnnBackward>),
 tensor([[[ 0.5717,  0.0704,  0.0969,  ...,  0.

## 디코더 생성

In [18]:
class Decoder(nn.Module):
    """LSTM decoder that predicts one target token per call.

    Args:
        output_dim: size of the target vocabulary (embedding rows and width
            of the output layer).
        emb_dim: embedding dimension.
        hid_dim: LSTM hidden dimension.
        n_layers: number of stacked LSTM layers.
        dropout: dropout probability, applied to the embeddings and passed
            to the LSTM as its own ``dropout`` argument.
    """

    def __init__(self, output_dim, emb_dim, hid_dim , n_layers, dropout):
        super().__init__()

        self.output_dim, self.hid_dim, self.n_layers = output_dim, hid_dim, n_layers

        self.embedding = nn.Embedding(num_embeddings=output_dim,
                                      embedding_dim=emb_dim)
        self.rnn = nn.LSTM(input_size=emb_dim, hidden_size=hid_dim,
                           num_layers=n_layers, dropout=dropout)
        self.fc_out = nn.Linear(in_features=hid_dim, out_features=output_dim)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, input, hidden, cell):
        """Run a single decoding step.

        Expected shapes (as annotated by the original author):
            input:  [batch_size] token indices for this step
            hidden: [n_layers * n_directions, batch_size, hid_dim]
            cell:   [n_layers * n_directions, batch_size, hid_dim]

        Returns:
            ``(prediction, hidden, cell)`` where prediction is
            [batch_size, output_dim] and hidden/cell are the updated states.
        """
        # Add a leading length-1 sequence axis: [1, batch_size].
        step_tokens = input.unsqueeze(dim=0)

        # [1, batch_size, emb_dim]
        step_embedded = self.dropout(self.embedding(step_tokens))

        # step_output = [1, batch_size, hid_dim * n_directions]
        step_output, (hidden, cell) = self.rnn(step_embedded, (hidden, cell))

        # Drop the sequence axis again before projecting onto the vocabulary.
        prediction = self.fc_out(step_output.squeeze(dim=0))

        return prediction, hidden, cell

# Seq2Seq 생성

In [19]:
class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes the target one step at a time.

    Args:
        encoder: module called as ``encoder(src)`` returning ``(hidden, cell)``;
            must expose ``hid_dim`` and ``n_layers``.
        decoder: module called as ``decoder(input, hidden, cell)`` returning
            ``(prediction, hidden, cell)``; must expose ``hid_dim``,
            ``n_layers`` and ``output_dim``.
        device: device on which the output tensor is allocated.

    Raises:
        AssertionError: if encoder and decoder differ in hidden size or depth.
    """

    def __init__(self, encoder, decoder, device):
        super().__init__()

        self.encoder = encoder
        self.decoder = decoder
        self.device = device

        # The encoder's final (hidden, cell) states are passed unchanged to
        # the decoder as its initial state, so both must agree in hidden
        # size and in number of layers.
        same_width = encoder.hid_dim == decoder.hid_dim
        assert same_width, \
        "Hidden dimensions of encoder and decoder must be equal!"
        same_depth = encoder.n_layers == decoder.n_layers
        assert same_depth, \
        "Encoder and decoder must have equal number of layers!"

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        """Decode ``trg.shape[0] - 1`` steps and return every step's scores.

        Args:
            src: source batch passed to the encoder.
            trg: target batch, indexed as [trg_len, batch_size]; row 0 is
                used as the first decoder input (the <sos> tokens).
            teacher_forcing_ratio: probability, per step, of feeding the
                ground-truth token ``trg[t]`` as the next decoder input
                instead of the decoder's own best guess.

        Returns:
            Tensor [trg_len, batch_size, output_dim]; index 0 is never
            written and stays all zeros.
        """
        trg_len, batch_size = trg.shape[0], trg.shape[1]
        vocab_size = self.decoder.output_dim

        # Buffer that collects the decoder scores of every step.
        outputs = torch.zeros(trg_len, batch_size, vocab_size).to(self.device)

        # The encoder's final states seed the decoder.
        hidden, cell = self.encoder(src)

        # First decoder input: row 0 of the target (<sos>).
        decoder_input = trg[0,:]

        for t in range(1, trg_len):
            step_scores, hidden, cell = self.decoder(decoder_input, hidden, cell)
            outputs[t] = step_scores

            # Teacher forcing: with the given probability, continue from the
            # true token rather than from the model's own prediction.
            use_ground_truth = random.random() < teacher_forcing_ratio
            best_guess = step_scores.argmax(1)

            if use_ground_truth:
                decoder_input = trg[t]
            else:
                decoder_input = best_guess

        return outputs

In [20]:
# Vocabulary sizes fix the embedding tables and the decoder's output width.
INPUT_DIM = len(SRC.vocab)
OUTPUT_DIM = len(TRG.vocab)

# Encoder and decoder share the hidden size and depth (Seq2Seq asserts this).
ENC_EMB_DIM = 128
DEC_EMB_DIM = 128
HID_DIM = 256
N_LAYERS = 2
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

enc = Encoder(
    input_dim=INPUT_DIM,
    emb_dim=ENC_EMB_DIM,
    hid_dim=HID_DIM,
    n_layers=N_LAYERS,
    dropout=ENC_DROPOUT,
)
dec = Decoder(
    output_dim=OUTPUT_DIM,
    emb_dim=DEC_EMB_DIM,
    hid_dim=HID_DIM,
    n_layers=N_LAYERS,
    dropout=DEC_DROPOUT,
)

model = Seq2Seq(encoder=enc, decoder=dec, device=device).to(device)

In [21]:
# Initial model values are drawn uniformly from (-0.08, 0.08).
# NOTE(review): presumably this follows the initialisation used in the
# original sequence-to-sequence paper -- confirm.
def init_weights(m):
    """Overwrite every parameter reachable from module ``m`` in place.

    Each parameter tensor is filled with samples from a uniform distribution
    between -0.08 and 0.08. Returns ``None``.
    """
    low, high = -0.08, 0.08
    for weight in m.parameters():
        nn.init.uniform_(weight.data, low, high)

# Re-initialise the model's parameters with init_weights.
model.apply(init_weights)

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(33056, 128)
    (rnn): LSTM(128, 256, num_layers=2, dropout=0.5)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Decoder(
    (embedding): Embedding(24617, 128)
    (rnn): LSTM(128, 256, num_layers=2, dropout=0.5)
    (fc_out): Linear(in_features=256, out_features=24617, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
)

In [22]:
def count_parameters(model):
    """Return the total element count of all parameters with ``requires_grad`` set."""
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

# Print the trainable-parameter count with thousands separators.
print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 15,551,913 trainable parameters


# 모델 훈련, 검증 함수 생성

In [23]:
# Adam over all model parameters; no hyperparameters are overridden.
optimizer = optim.Adam(model.parameters())

In [24]:
# Vocabulary index of the target field's padding token.
TRG_PAD_IDX = TRG.vocab.stoi[TRG.pad_token]

# Cross-entropy loss that ignores target positions equal to the padding index.
criterion = nn.CrossEntropyLoss(ignore_index=TRG_PAD_IDX)

In [25]:
def train(model, iterator, optimizer, criterion, clip):
    """Train ``model`` for one pass over ``iterator``.

    Args:
        model: called as ``model(src, trg)``; expected to return scores
            shaped [trg_len, batch_size, output_dim].
        iterator: iterable of batches exposing ``.src`` and ``.trg``; must
            support ``len()``.
        optimizer: optimizer stepped once per batch.
        criterion: loss called as ``criterion(scores, targets)``.
        clip: maximum gradient norm passed to ``clip_grad_norm_``.

    Returns:
        The sum of the per-batch losses divided by ``len(iterator)``.
    """
    model.train()
    running_loss = 0

    for batch in iterator:
        source, target = batch.src, batch.trg

        optimizer.zero_grad()
        scores = model(source, target)
        # target = [trg_len, batch_size]
        # scores = [trg_len, batch_size, output_dim]

        # Skip position 0 (the <sos> slot) and flatten the remaining
        # positions so every target token becomes one row:
        # flat_scores = [(trg_len - 1) * batch_size, output_dim]
        # flat_target = [(trg_len - 1) * batch_size]
        vocab_size = scores.shape[-1]
        flat_scores = scores[1:].view(-1, vocab_size)
        flat_target = target[1:].view(-1)

        batch_loss = criterion(flat_scores, flat_target)
        batch_loss.backward()

        # Gradient clipping: cap the gradient norm at `clip` to guard
        # against exploding gradients.
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)

        optimizer.step()
        running_loss += batch_loss.item()

    return running_loss / len(iterator)

In [26]:
def evaluate(model, iterator, criterion):
    """Compute the mean per-batch loss of ``model`` over ``iterator``.

    The model is switched to eval mode and run without gradient tracking.

    Args:
        model: called as ``model(src, trg, 0)``; expected to return scores
            shaped [trg_len, batch_size, output_dim].
        iterator: iterable of batches exposing ``.src`` and ``.trg``; must
            support ``len()``.
        criterion: loss called as ``criterion(scores, targets)``.

    Returns:
        The sum of the per-batch losses divided by ``len(iterator)``.
    """
    model.eval()
    running_loss = 0

    with torch.no_grad():
        for batch in iterator:
            source, target = batch.src, batch.trg

            # The third positional argument is the teacher-forcing ratio
            # (not a device): 0 turns teacher forcing off.
            scores = model(source, target, 0)

            # Skip position 0 (the <sos> slot) and flatten the rest.
            vocab_size = scores.shape[-1]
            flat_scores = scores[1:].view(-1, vocab_size)
            flat_target = target[1:].view(-1)

            running_loss += criterion(flat_scores, flat_target).item()

    return running_loss / len(iterator)

In [27]:
def epoch_time(start_time, end_time):
    """Split the span between two timestamps (in seconds) into minutes and seconds.

    Returns:
        ``(minutes, seconds)`` as two ints, both truncated toward zero.
    """
    elapsed = end_time - start_time
    whole_minutes = int(elapsed / 60)
    leftover_seconds = elapsed - (whole_minutes * 60)
    return whole_minutes, int(leftover_seconds)

# 모델 훈련

In [28]:
# Train for a fixed number of epochs, validating after each one and keeping
# the weights of the epoch with the lowest validation loss on disk.
N_EPOCHS = 5
CLIP = 1  # maximum gradient norm handed to train()

# Starts at +inf so the first epoch always counts as an improvement.
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    start_time = time.time()
    
    train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, valid_iterator, criterion)
    
    end_time = time.time()
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    # Overwrite the checkpoint only when validation loss improves.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut1-model.pt')
        
    # Perplexity (PPL) is reported as exp(loss).
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

Epoch: 01 | Time: 18m 42s
	Train Loss: 5.718 | Train PPL: 304.431
	 Val. Loss: 5.777 |  Val. PPL: 322.776
Epoch: 02 | Time: 18m 14s
	Train Loss: 5.211 | Train PPL: 183.297
	 Val. Loss: 5.601 |  Val. PPL: 270.768
Epoch: 03 | Time: 18m 39s
	Train Loss: 4.932 | Train PPL: 138.616
	 Val. Loss: 5.432 |  Val. PPL: 228.544
Epoch: 04 | Time: 17m 48s
	Train Loss: 4.701 | Train PPL: 110.068
	 Val. Loss: 5.329 |  Val. PPL: 206.254
Epoch: 05 | Time: 17m 59s
	Train Loss: 4.527 | Train PPL:  92.437
	 Val. Loss: 5.230 |  Val. PPL: 186.755


In [29]:
# Restore the best checkpoint written by the training loop.
# map_location remaps the stored tensors onto the current `device`, so a
# checkpoint saved on a GPU machine can also be loaded on a CPU-only one.
model.load_state_dict(torch.load('tut1-model.pt', map_location=device))

# Score the restored model on the held-out test split.
test_loss = evaluate(model, test_iterator, criterion)

print(f'| Test Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):7.3f} |')

| Test Loss: 5.237 | Test PPL: 188.135 |


In [30]:
def predict_sentiment(model, sentence, max_len=100):
    """Translate a Korean ``sentence`` into English with the trained model.

    Despite its name this function performs translation, not sentiment
    analysis; the name is kept so existing calls keep working. It switches
    ``model`` to eval mode and runs it without gradient tracking.

    Args:
        model: trained model, called as ``model(src, trg, 0)`` and expected
            to return scores shaped [trg_len, 1, output_dim].
        sentence: raw Korean sentence.
        max_len: maximum number of target tokens to generate. Defaults to
            100, the value that used to be hard-coded.

    Returns:
        The predicted target tokens joined by single spaces; the text stops
        at the first ``<eos>`` prediction or after ``max_len`` tokens.
    """
    model.eval()

    # Tokenize the Korean sentence into morphemes, in reverse order
    # (the same convention tokenize_ko applies to the training data).
    tokenized = list(reversed(mecab.morphs(sentence)))

    # Wrap the sentence in <sos> ... <eos> and map tokens to vocabulary indices.
    src_stoi = SRC.vocab.stoi
    indexed = ([src_stoi[SRC.init_token]]
               + [src_stoi[tok] for tok in tokenized]
               + [src_stoi[SRC.eos_token]])

    # [src_len, 1]: a batch holding this single sentence.
    src_tensor = torch.LongTensor(indexed).unsqueeze(1).to(device)

    # Placeholder target: <sos> followed by max_len zeros, shaped
    # [max_len + 1, 1]. Teacher forcing is disabled below, so with the
    # Seq2Seq model from this file only row 0 is used as a decoder input;
    # the zeros merely fix the number of decoding steps.
    sos_idx = TRG.vocab.stoi[TRG.init_token]
    zero_trg = torch.LongTensor([sos_idx] + [0] * max_len).unsqueeze(1).to(device)

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        outputs = model(src_tensor, zero_trg, 0)

    # Read off the best token of every step until <eos> is predicted.
    eos_idx = TRG.vocab.stoi[TRG.eos_token]
    res = []
    for step_scores in outputs[1:]:
        # .item() yields a plain int for the comparison and the list lookup.
        ind = step_scores.argmax(1).item()
        if ind == eos_idx:
            break
        res.append(TRG.vocab.itos[ind])
    return ' '.join(res)

In [31]:
# Example translation of a short Korean question.
predict_sentiment(model, '밥은 먹고 다니냐?')

'do you like to eat coffee ?'

In [32]:
# Example translation of a short Korean statement.
predict_sentiment(model, '오늘 하늘은 하루종일 맑다.')

"today 's the day day today ."