In [1]:
from google.colab import drive
drive.mount('/content/gdrive/', force_remount=True)

Mounted at /content/gdrive/


In [2]:
# 훈련 데이터, 데이터 로더, 파이썬 파일 등을 불러오기 위해 경로 설정
import sys
sys.path.append('gdrive/My Drive/RogerHeederer/NLP_KR/6.CHATBOT_heederer')

In [3]:
!pip install konlpy

Collecting konlpy
[?25l  Downloading https://files.pythonhosted.org/packages/85/0e/f385566fec837c0b83f216b2da65db9997b35dd675e107752005b7d392b1/konlpy-0.5.2-py2.py3-none-any.whl (19.4MB)
[K     |████████████████████████████████| 19.4MB 1.4MB/s 
Collecting JPype1>=0.7.0
[?25l  Downloading https://files.pythonhosted.org/packages/fd/96/1030895dea70855a2e1078e3fe0d6a63dcb7c212309e07dc9ee39d33af54/JPype1-1.1.2-cp36-cp36m-manylinux2010_x86_64.whl (450kB)
[K     |████████████████████████████████| 460kB 58.6MB/s 
[?25hCollecting colorama
  Downloading https://files.pythonhosted.org/packages/44/98/5b86278fbbf250d239ae0ecb724f8572af1c91f4a11edf4d36a206189440/colorama-0.4.4-py2.py3-none-any.whl
Collecting beautifulsoup4==4.6.0
[?25l  Downloading https://files.pythonhosted.org/packages/9e/d4/10f46e5cfac773e22707237bfcd51bbffeaf0a576b0a847ec7ab15bd7ace/beautifulsoup4-4.6.0-py3-none-any.whl (86kB)
[K     |████████████████████████████████| 92kB 8.9MB/s 
[?25hCollecting tweepy>=3.7.0
  Downloa

In [4]:
import tensorflow as tf
import numpy as np
import os

from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
import matplotlib.pyplot as plt
from preprocess import *

In [5]:
def plot_graphs(history, string):
  plt.plot(history.history[string])
  plt.plot(history.history['val_'+string], '')
  plt.xlabel("Epochs")
  plt.ylabel(string)
  plt.legend([string, 'val_'+string])
  plt.show()

학습 데이터 경로 정의

In [6]:
BASE_PATH = '/content/gdrive/My Drive/RogerHeederer/NLP_KR/6.CHATBOT_heederer'
DATA_IN_PATH = BASE_PATH + '/data_in/'
DATA_OUT_PATH = BASE_PATH + '/data_out/'
TRAIN_INPUTS = 'train_inputs.npy'
TRAIN_OUTPUTS = 'train_outputs.npy'
TRAIN_TARGETS = 'train_targets.npy'
DATA_CONFIGS = 'data_configs.json'

In [7]:
SEED_NUM = 1234
tf.random.set_seed(SEED_NUM)

파일 로드

In [8]:
index_inputs = np.load(open(DATA_IN_PATH + TRAIN_INPUTS, 'rb'))
index_outputs = np.load(open(DATA_IN_PATH + TRAIN_OUTPUTS , 'rb'))
index_targets = np.load(open(DATA_IN_PATH + TRAIN_TARGETS , 'rb'))
prepro_configs = json.load(open(DATA_IN_PATH + DATA_CONFIGS, 'r'))

In [9]:
print(len(index_inputs), len(index_outputs), len(index_targets))

11823 11823 11823


모델 만들기에 필요한 파라미터들 선언

In [10]:
MODEL_NAME = 'seq2seq_kor'
BATCH_SIZE = 2
MAX_SEQUENCE = 25
EPOCH = 30
UNITS = 1024
EMBEDDING_DIM = 256
VALIDATION_SPLIT = 0.1 

char2idx = prepro_configs['char2idx']
idx2char = prepro_configs['idx2char']
std_index = prepro_configs['std_symbol']
end_index = prepro_configs['end_symbol']
vocab_size = prepro_configs['vocab_size']

##인코더##

In [11]:
class Encoder(tf.keras.layers.Layer):
  def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
    super(Encoder, self).__init__()
    self.batch_sz = batch_sz
    self.enc_units = enc_units
    self.vocab_size = vocab_size
    self.embedding_dim = embedding_dim

    self.embedding = tf.keras.layers.Embedding(self.vocab_size, self.embedding_dim)
    self.gru = tf.keras.layers.GRU(self.enc_units,
                                   return_sequences=True,
                                   return_state=True,
                                   recurrent_initializer='glorot_uniform')
    
  def call(self, x, hidden):
    x = self.embedding(x)
    output, state = self.gru(x, initial_state = hidden)
    return output, state

  def initialize_hidden_state(self, inp):
    return tf.zeros((tf.shape(inp)[0], self.enc_units))

##어텐션

In [12]:
class BahdanauAttention(tf.keras.layers.Layer):
  
  def __init__(self, units):
    super(BahdanauAttention, self).__init__()
    self.W1 = tf.keras.layers.Dense(units)
    self.W2 = tf.keras.layers.Dense(units)
    self.V = tf.keras.layers.Dense(1)

  def call(self, query, values):
    hidden_with_time_axis = tf.expand_dims(query, 1)

    score = self.V(tf.nn.tanh(self.W1(values) + self.W2(hidden_with_time_axis)))

    attention_weights = tf.nn.softmax(score, axis=1)

    context_vector = attention_weights * values
    context_vector = tf.reduce_sum(context_vector, axis=1)

    return context_vector, attention_weights

##디코더##

In [13]:
class Decoder(tf.keras.layers.Layer):
    def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
        super(Decoder, self).__init__()
        
        self.batch_sz = batch_sz
        self.dec_units = dec_units
        self.vocab_size = vocab_size 
        self.embedding_dim = embedding_dim  
        
        self.embedding = tf.keras.layers.Embedding(self.vocab_size, self.embedding_dim)
        self.gru = tf.keras.layers.GRU(self.dec_units,
                                       return_sequences=True,
                                       return_state=True,
                                       recurrent_initializer='glorot_uniform')
        self.fc = tf.keras.layers.Dense(self.vocab_size)

        self.attention = BahdanauAttention(self.dec_units)
        
    def call(self, x, hidden, enc_output):
        context_vector, attention_weights = self.attention(hidden, enc_output)

        x = self.embedding(x)

        x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)

        output, state = self.gru(x)
        output = tf.reshape(output, (-1, output.shape[2]))
            
        x = self.fc(output)
        
        return x, state, attention_weights

In [14]:
optimizer = tf.keras.optimizers.Adam()
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')
train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='accuracy')

def loss(real, pred): #인자로 정답과 예측값을 받아서
   #real 값 중 0인값(<PAD>)와 비교, real 값이 0(PAD) 이면 True 반환/ 아니면 False
   #logical_not을 먹여서 0은 1로  1은 0으로 변경
  mask = tf.math.logical_not(tf.math.equal(real, 0))
  loss_ = loss_object(real, pred) #로쓰를 계산하고
  mask = tf.cast(mask, dtype=loss_.dtype) 
  loss_ *= mask # <pad>는 손실 계산에서 뺀다.
  return tf.reduce_mean(loss_)

def accuracy(real, pred):
  mask = tf.math.logical_not(tf.math.equal(real, 0))
  mask = tf.expand_dims(tf.cast(mask, dtype=pred.dtype), axis=-1)
  pred *= mask
  acc = train_accuracy(real, pred)

  return tf.reduce_mean(acc)

##Seq To Seq 모델##

각각 분리돼 있는 위 클래스들을 이어주는 메인 클래스

In [15]:
class seq2seq(tf.keras.Model):
  def __init__(self, vocab_size, embedding_dim, enc_units, dec_units, batch_sz, end_token_idx=2):
    super(seq2seq, self).__init__()
    self.end_token_idx = end_token_idx
    self.encoder = Encoder(vocab_size, embedding_dim, enc_units, batch_sz)
    self.decoder = Decoder(vocab_size, embedding_dim, dec_units, batch_sz)

  def call(self, x):
    inp, tar = x

    enc_hidden = self.encoder.initialize_hidden_state(inp)
    enc_output, enc_hidden = self.encoder(inp, enc_hidden)

    dec_hidden = enc_hidden

    predict_tokens = list()
    for t in range(0, tar.shape[1]):
      dec_input = tf.dtypes.cast(tf.expand_dims(tar[:, t], 1), tf.float32)
      predictions, dec_hidden, _= self.decoder(dec_input, dec_hidden, enc_output)
      predict_tokens.append(tf.dtypes.cast(predictions, tf.float32))
    return tf.stack(predict_tokens, axis=1)
    
  def inference(self, x): #사용자의 입력에 대한 모델의 결과값을 확인하기 위해 테스트 목적 함수
    inp = x

    enc_hidden = self.encoder.initialize_hidden_state(inp)
    enc_output, enc_hidden = self.encoder(inp, enc_hidden)
    dec_hidden = enc_hidden
    dec_input = tf.expand_dims([char2idx[std_index]], 1)

    predict_tokens = list()
    for t in range(0, MAX_SEQUENCE):
      predictions, dec_hidden, _ = self.decoder(dec_input, dec_hidden, enc_output)
      predict_token = tf.argmax(predictions[0])

      if predict_token == self.end_token_idx:
        break

      predict_tokens.append(predict_token)
      dec_input = tf.dtypes.cast(tf.expand_dims([predict_token], 0), tf.float32)

    return tf.stack(predict_tokens, axis=0).numpy()

In [28]:
model = seq2seq(vocab_size, EMBEDDING_DIM, UNITS, UNITS, BATCH_SIZE, char2idx[end_index])
model.compile(loss=loss, optimizer=tf.keras.optimizers.Adam(1e-3), metrics=[accuracy])

In [None]:
PATH = DATA_OUT_PATH + MODEL_NAME
if not(os.path.isdir(PATH)):
        os.makedirs(os.path.join(PATH))
        
checkpoint_path = DATA_OUT_PATH + MODEL_NAME + '/weights.h5'
    
cp_callback = ModelCheckpoint(
    checkpoint_path, monitor='val_accuracy', verbose=1, save_best_only=True, save_weights_only=True)

earlystop_callback = EarlyStopping(monitor='val_accuracy', min_delta=0.0001, patience=10)

history = model.fit([index_inputs, index_outputs], index_targets,
                    batch_size=BATCH_SIZE, epochs=EPOCH,
                    validation_split=VALIDATION_SPLIT, callbacks=[earlystop_callback, cp_callback])

model.save(filepath = '/content/gdrive/My Drive/RogerHeederer/NLP_KR/6.CHATBOT_heederer/data_out/seq2seq_kor/', save_format="tf")

Epoch 1/30
Epoch 00001: val_accuracy improved from -inf to 0.80997, saving model to /content/gdrive/My Drive/RogerHeederer/NLP_KR/6.CHATBOT_heederer/data_out/seq2seq_kor/weights.h5
Epoch 2/30
Epoch 00002: val_accuracy improved from 0.80997 to 0.81580, saving model to /content/gdrive/My Drive/RogerHeederer/NLP_KR/6.CHATBOT_heederer/data_out/seq2seq_kor/weights.h5
Epoch 3/30
Epoch 00003: val_accuracy improved from 0.81580 to 0.82255, saving model to /content/gdrive/My Drive/RogerHeederer/NLP_KR/6.CHATBOT_heederer/data_out/seq2seq_kor/weights.h5
Epoch 4/30
Epoch 00004: val_accuracy improved from 0.82255 to 0.82894, saving model to /content/gdrive/My Drive/RogerHeederer/NLP_KR/6.CHATBOT_heederer/data_out/seq2seq_kor/weights.h5
Epoch 5/30
Epoch 00005: val_accuracy improved from 0.82894 to 0.83548, saving model to /content/gdrive/My Drive/RogerHeederer/NLP_KR/6.CHATBOT_heederer/data_out/seq2seq_kor/weights.h5
Epoch 6/30
Epoch 00006: val_accuracy improved from 0.83548 to 0.84241, saving model

INFO:tensorflow:Assets written to: /content/gdrive/My Drive/RogerHeederer/NLP_KR/6.CHATBOT_heederer/data_out/seq2seq_kor/assets


In [17]:
SAVE_FILE_NM = "weights.h5"
print(os.path.join(DATA_OUT_PATH, MODEL_NAME, SAVE_FILE_NM))

/content/gdrive/My Drive/RogerHeederer/NLP_KR/6.CHATBOT_heederer/data_out/seq2seq_kor/weights.h5


In [30]:
model.fit([index_inputs, index_outputs], index_targets,
                    batch_size=BATCH_SIZE, epochs=1)

  58/5912 [..............................] - ETA: 20:51 - loss: 1.8006 - accuracy: 0.8023

KeyboardInterrupt: ignored

In [36]:
model.fit([index_inputs, index_outputs], index_targets)



<tensorflow.python.keras.callbacks.History at 0x7f913b76aa20>

모델 웨이트를 로드하기 전에는 model.compile이 된 베이스를 바탕으로 fit을 먼저 해줘야

모델 웨이트 로딩을 할 수 있는 것 같다

In [37]:
model.load_weights(os.path.join(DATA_OUT_PATH, MODEL_NAME, SAVE_FILE_NM))

In [39]:
query = "요즘 잠이 안와"

test_index_inputs, _ = enc_processing([query], char2idx)    
predict_tokens = model.inference(test_index_inputs)
print(predict_tokens)

print(' '.join([idx2char[str(t)] for t in predict_tokens]))

[ 4290 13858 11647  1976 15501  1274]
계속 좋지 않으면 병원 에가 보세요
