<a href="https://colab.research.google.com/github/ParkEunHyeok/AI_Study/blob/main/NLP/seq2seq_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install Korpora
!pip install konlpy

Collecting Korpora
  Downloading Korpora-0.2.0-py3-none-any.whl (57 kB)
[?25l[K     |█████▊                          | 10 kB 19.8 MB/s eta 0:00:01[K     |███████████▍                    | 20 kB 17.8 MB/s eta 0:00:01[K     |█████████████████               | 30 kB 19.2 MB/s eta 0:00:01[K     |██████████████████████▊         | 40 kB 12.6 MB/s eta 0:00:01[K     |████████████████████████████▍   | 51 kB 5.5 MB/s eta 0:00:01[K     |████████████████████████████████| 57 kB 2.7 MB/s 
[?25hCollecting dataclasses>=0.6
  Downloading dataclasses-0.6-py3-none-any.whl (14 kB)
Collecting tqdm>=4.46.0
  Downloading tqdm-4.61.2-py2.py3-none-any.whl (76 kB)
[K     |████████████████████████████████| 76 kB 3.7 MB/s 
Collecting xlrd>=1.2.0
  Downloading xlrd-2.0.1-py2.py3-none-any.whl (96 kB)
[K     |████████████████████████████████| 96 kB 4.7 MB/s 
Installing collected packages: xlrd, tqdm, dataclasses, Korpora
  Attempting uninstall: xlrd
    Found existing installation: xlrd 1.1.0
    Unin

In [2]:
# songys 챗봇 데이터 불러오는 korpora 모듈
from Korpora import KoreanChatbotKorpus
from Korpora import Korpora
corpus = KoreanChatbotKorpus()

# regex expression 적용하는 모듈
import re

# konlpy 형태소 분석기 사용하여 형태소 분석
from konlpy.tag import Okt

# Tokenize
import numpy as np
import warnings
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# One Hot Encoding
from tensorflow.keras.utils import to_categorical

# seq2seq Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Dropout, Attention
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import TensorBoard, ModelCheckpoint
from tensorflow.keras.utils import plot_model

# WARNING 무시
warnings.filterwarnings('ignore')


    Korpora 는 다른 분들이 연구 목적으로 공유해주신 말뭉치들을
    손쉽게 다운로드, 사용할 수 있는 기능만을 제공합니다.

    말뭉치들을 공유해 주신 분들에게 감사드리며, 각 말뭉치 별 설명과 라이센스를 공유 드립니다.
    해당 말뭉치에 대해 자세히 알고 싶으신 분은 아래의 description 을 참고,
    해당 말뭉치를 연구/상용의 목적으로 이용하실 때에는 아래의 라이센스를 참고해 주시기 바랍니다.

    # Description
    Author : songys@github
    Repository : https://github.com/songys/Chatbot_data
    References :

    Chatbot_data_for_Korean v1.0
      1. 챗봇 트레이닝용 문답 페어 11,876개
      2. 일상다반사 0, 이별(부정) 1, 사랑(긍정) 2로 레이블링
    자세한 내용은 위의 repository를 참고하세요.

    # License
    CC0 1.0 Universal (CC0 1.0) Public Domain Dedication
    Details in https://creativecommons.org/publicdomain/zero/1.0/



[korean_chatbot_data] download ChatbotData.csv: 893kB [00:00, 8.44MB/s]                           


In [3]:
import os
from google.colab import drive
drive.mount('/content/gdrive/')
path = "gdrive/My Drive/Colab Notebooks/seq2seq"

Mounted at /content/gdrive/


In [4]:
# 구어체로 되어 있음.
# text와 pair가 쌍으로 이루어짐
corpus.get_all_texts()[:5]
corpus.get_all_pairs()[:5]

('하루가 또 가네요.', '위로해 드립니다.', '여행은 언제나 좋죠.', '여행은 언제나 좋죠.', '눈살이 찌푸려지죠.')

In [5]:
texts = []
pairs = []

for sentence in corpus.get_all_texts():
    texts.append(sentence)

for sentence in corpus.get_all_pairs():
    pairs.append(sentence)

In [6]:
texts = texts[:3000]
pairs = pairs[:3000]

In [7]:
list(zip(texts, pairs))[:5]

[('12시 땡!', '하루가 또 가네요.'),
 ('1지망 학교 떨어졌어', '위로해 드립니다.'),
 ('3박4일 놀러가고 싶다', '여행은 언제나 좋죠.'),
 ('3박4일 정도 놀러가고 싶다', '여행은 언제나 좋죠.'),
 ('PPL 심하네', '눈살이 찌푸려지죠.')]

In [8]:
def clean_sentence(sentence):
    # 한글, 숫자를 제외한 모든 문자 제거
    sentence = re.sub(r'[^0-9ㄱ-ㅎㅏ-ㅣ가-힣 ]',r'', sentence)
    return sentence
  
clean_sentence('abcef가나다^^$%@12시 땡^^!??')

'가나다12시 땡'

In [9]:
okt = Okt()

# 형태소 변환해주는 함수
# morphs 함수 안에 변환한 한글 문장 입력
def process_morph(sentence):
    return ' '.join(okt.morphs(sentence))

In [10]:
def clean_and_morph(sentence, is_question=True):
    sentence = clean_sentence(sentence)
    sentence = process_morph(sentence)
    # Question 인 경우, Answer인 경우를 분기
    if is_question:
        return sentence
    else:
        # START 토큰 : decoder input, END 토큰 : decoder output
        return ('<START> ' + sentence, sentence + ' <END>')

In [11]:
def preprocess(texts, pairs):
    questions = []
    answer_in = []
    answer_out = []

    # Question 전처리
    for text in texts:
        # 전처리와 morph 수행
        question = clean_and_morph(text, is_question=True)
        questions.append(question)

    # Answer 전처리
    for pair in pairs:
        # 전처리와 morph 수행
        in_, out_ = clean_and_morph(pair, is_question=False)
        answer_in.append(in_)
        answer_out.append(out_)
    
    return questions, answer_in, answer_out

In [12]:
questions, answer_in, answer_out = preprocess(texts, pairs)

In [13]:
questions[:5]

['12시 땡', '1 지망 학교 떨어졌어', '3 박 4일 놀러 가고 싶다', '3 박 4일 정도 놀러 가고 싶다', '심하네']

In [14]:
answer_out[:5]

['하루 가 또 가네요 <END>',
 '위로 해 드립니다 <END>',
 '여행 은 언제나 좋죠 <END>',
 '여행 은 언제나 좋죠 <END>',
 '눈살 이 찌푸려지죠 <END>']

In [15]:
all_sentences = questions + answer_in + answer_out

In [16]:
a = (' '.join(questions) + ' '.join(answer_in) + ' '.join(answer_out)).split()
len(set(a))

4658

In [17]:
# Tokenizer 정의
tokenizer = Tokenizer(filters='', lower=False, oov_token='<OOV>')

# Word Index Vocabulary 만들기
tokenizer.fit_on_texts(all_sentences)

In [18]:
for word, idx in tokenizer.word_index.items():
    print(f'{word}\t\t => \t{idx}')
    if idx > 10:
        break

<OOV>		 => 	1
<START>		 => 	2
<END>		 => 	3
이		 => 	4
거		 => 	5
을		 => 	6
가		 => 	7
예요		 => 	8
도		 => 	9
해보세요		 => 	10
에		 => 	11


In [19]:
# 토큰 개수 확인
len(tokenizer.word_index)

4657

In [20]:
# Text To Sequence Encoding
question_sequence = tokenizer.texts_to_sequences(questions)
answer_in_sequence = tokenizer.texts_to_sequences(answer_in)
answer_out_sequence = tokenizer.texts_to_sequences(answer_out)

In [21]:
MAX_LENGTH = 30

question_padded = pad_sequences(question_sequence, maxlen=MAX_LENGTH, truncating='post', padding='post')
answer_in_padded = pad_sequences(answer_in_sequence, maxlen=MAX_LENGTH, truncating='post', padding='post')
answer_out_padded = pad_sequences(answer_out_sequence, maxlen=MAX_LENGTH, truncating='post', padding='post')

In [22]:
print(question_padded.shape)
print(answer_in_padded.shape, answer_out_padded.shape)

(3000, 30)
(3000, 30) (3000, 30)


In [23]:
# Encoder
class Encoder(tf.keras.Model):
    def __init__(self, units, vocab_size, embedding_dim, time_steps):
        super(Encoder, self).__init__()
        self.embedding = Embedding(vocab_size, embedding_dim, input_length=time_steps, name='Embedding')
        self.dropout = Dropout(0.2, name='Dropout')
        # (attention) return_sequences=True 추가
        self.lstm = LSTM(units, return_state=True, return_sequences=True, name='LSTM')
        
    def call(self, inputs):
        x = self.embedding(inputs)
        x = self.dropout(x)
        x, hidden_state, cell_state = self.lstm(x)
        # (attention) x return 추가
        return x, [hidden_state, cell_state]

In [24]:
# Decoder
class Decoder(tf.keras.Model):
    def __init__(self, units, vocab_size, embedding_dim, time_steps):
        super(Decoder, self).__init__()
        self.embedding = Embedding(vocab_size, embedding_dim, input_length=time_steps, name='Embedding')
        self.dropout = Dropout(0.2, name='Dropout')
        self.lstm = LSTM(units, 
                         return_state=True, 
                         return_sequences=True, 
                         name='LSTM'
                        )
        self.attention = Attention(name='Attention')
        self.dense = Dense(VOCAB_SIZE, activation='softmax', name='Dense')
    
    def call(self, inputs, initial_state):
        # (attention) encoder_inputs 추가
        encoder_inputs, decoder_inputs = inputs
        x = self.embedding(decoder_inputs)
        x = self.dropout(x)
        x, hidden_state, cell_state = self.lstm(x, initial_state=initial_state)
        
        # (attention) key_value, attention_matrix 추가
        # 이전 hidden_state의 값을 concat으로 만들어 vector를 생성합니다.        
        key_value = tf.concat([initial_state[0][:, tf.newaxis, :], x[:, :-1, :]], axis=1)        
        # 이전 hidden_state의 값을 concat으로 만든 vector와 encoder에서 나온 출력 값들로 attention을 구합니다.
        attention_matrix = self.attention([key_value, encoder_inputs])
        # 위에서 구한 attention_matrix와 decoder의 출력 값을 concat 합니다.
        x = tf.concat([x, attention_matrix], axis=-1)
        
        x = self.dense(x)
        return x, hidden_state, cell_state

In [25]:
class Seq2Seq(tf.keras.Model):
    def __init__(self, units, vocab_size, embedding_dim, time_steps, start_token, end_token):
        super(Seq2Seq, self).__init__()
        self.start_token = start_token
        self.end_token = end_token
        self.time_steps = time_steps
        
        self.encoder = Encoder(units, vocab_size, embedding_dim, time_steps)
        self.decoder = Decoder(units, vocab_size, embedding_dim, time_steps)
        
        
    def call(self, inputs, training=True):
        if training:
            encoder_inputs, decoder_inputs = inputs
            # (attention) encoder 출력 값 수정
            encoder_outputs, context_vector = self.encoder(encoder_inputs)
            # (attention) decoder 입력 값 수정
            decoder_outputs, _, _ = self.decoder((encoder_outputs, decoder_inputs), initial_state=context_vector)
            return decoder_outputs
        else:
            x = inputs
            # (attention) encoder 출력 값 수정
            encoder_outputs, context_vector = self.encoder(x)
            target_seq = tf.constant([[self.start_token]], dtype=tf.float32)
            results = tf.TensorArray(tf.int32, self.time_steps)
            
            for i in tf.range(self.time_steps):
                decoder_output, decoder_hidden, decoder_cell = self.decoder((encoder_outputs, target_seq), initial_state=context_vector)
                decoder_output = tf.cast(tf.argmax(decoder_output, axis=-1), dtype=tf.int32)
                decoder_output = tf.reshape(decoder_output, shape=(1, 1))
                results = results.write(i, decoder_output)
                
                if decoder_output == self.end_token:
                    break
                    
                target_seq = decoder_output
                context_vector = [decoder_hidden, decoder_cell]
                
            return tf.reshape(results.stack(), shape=(1, self.time_steps))

In [26]:
# 단어 별 One-Hot Encoding
# answer를 인코딩함

VOCAB_SIZE = len(tokenizer.word_index)+1

def convert_to_one_hot(padded):
    # 원핫인코딩 초기화
    one_hot_vector = np.zeros((len(answer_out_padded), MAX_LENGTH, VOCAB_SIZE))

    # 디코더 목표를 원핫인코딩으로 변환
    # 학습시 입력은 인덱스이지만, 출력은 원핫인코딩 형식임
    for i, sequence in enumerate(answer_out_padded):
        for j, index in enumerate(sequence):
            one_hot_vector[i, j, index] = 1

    return one_hot_vector

In [27]:
answer_in_one_hot = convert_to_one_hot(answer_in_padded)
answer_out_one_hot = convert_to_one_hot(answer_out_padded)

In [28]:
answer_in_one_hot[0].shape, answer_in_one_hot[0].shape

((30, 4658), (30, 4658))

In [29]:
def convert_index_to_text(indexs, end_token): 
    
    sentence = ''
    
    # 모든 문장에 대해서 반복
    for index in indexs:
        if index == end_token:
            # 끝 단어이므로 예측 중비
            break;
        # 사전에 존재하는 단어의 경우 단어 추가
        if index > 0 and tokenizer.index_word[index] is not None:
            sentence += tokenizer.index_word[index]
        else:
        # 사전에 없는 인덱스면 빈 문자열 추가
            sentence += ''
            
        # 빈칸 추가
        sentence += ' '
    return sentence

In [30]:
BUFFER_SIZE = 1000
BATCH_SIZE = 64
EMBEDDING_DIM = 100
TIME_STEPS = MAX_LENGTH
START_TOKEN = tokenizer.word_index['<START>']
END_TOKEN = tokenizer.word_index['<END>']

UNITS = 128

VOCAB_SIZE = len(tokenizer.word_index)+1
DATA_LENGTH = len(questions)
SAMPLE_SIZE = 3

In [31]:
checkpoint_path = 'gdrive/My Drive/Colab Notebooks/seq2seq/training_checkpoint-6.ckpt'
checkpoint = ModelCheckpoint(filepath=checkpoint_path, 
                             save_weights_only=True,
                             save_best_only=True, 
                             monitor='loss', 
                             verbose=1
                            )

In [32]:
strategy = tf.distribute.MirroredStrategy()

INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:CPU:0',)


In [33]:
strategy.num_replicas_in_sync

1

In [34]:
# 분산 환경 적용시
with strategy.scope():
    seq2seq = Seq2Seq(UNITS, VOCAB_SIZE, EMBEDDING_DIM, TIME_STEPS, START_TOKEN, END_TOKEN)
    seq2seq.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])

In [35]:
seq2seq = Seq2Seq(UNITS, VOCAB_SIZE, EMBEDDING_DIM, TIME_STEPS, START_TOKEN, END_TOKEN)
seq2seq.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])

In [36]:
# 연속하여 학습시 체크포인트를 로드하여 이어서 학습합니다.
seq2seq.load_weights(checkpoint_path)

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7fcb609ef390>

In [37]:
def make_prediction(model, question_inputs):
    results = model(inputs=question_inputs, training=False)
    # 변환된 인덱스를 문장으로 변환
    results = np.asarray(results).reshape(-1)
    return results

In [38]:
for epoch in range(5):
    seq2seq.fit([question_padded, answer_in_padded],
                answer_out_one_hot,
                epochs=10,
                batch_size=16, 
                callbacks=[checkpoint]
               )
    # 랜덤한 샘플 번호 추출
    samples = np.random.randint(DATA_LENGTH, size=SAMPLE_SIZE)

    # 예측 성능 테스트
    for idx in samples:
        question_inputs = question_padded[idx]
        # 문장 예측
        results = make_prediction(seq2seq, np.expand_dims(question_inputs, 0))
        
        # 변환된 인덱스를 문장으로 변환
        results = convert_index_to_text(results, END_TOKEN)
        
        print(f'Q: {questions[idx]}')
        print(f'A: {results}\n')
        print()

Epoch 1/10

Epoch 00001: loss improved from inf to 0.75450, saving model to gdrive/My Drive/Colab Notebooks/seq2seq/training_checkpoint-6.ckpt
Epoch 2/10
 12/188 [>.............................] - ETA: 31s - loss: 0.7357 - acc: 0.8684

KeyboardInterrupt: ignored

In [39]:
# 자연어 (질문 입력) 대한 전처리 함수
def make_question(sentence):
    sentence = clean_and_morph(sentence)
    question_sequence = tokenizer.texts_to_sequences([sentence])
    question_padded = pad_sequences(question_sequence, maxlen=MAX_LENGTH, truncating='post', padding='post')
    return question_padded

In [40]:
make_question('오늘 날씨가 정말 화창합니다')

array([[163, 227,   7, 322,   1,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0]], dtype=int32)

In [41]:
def run_chatbot(question):
    question_inputs = make_question(question)
    results = make_prediction(seq2seq, question_inputs)
    results = convert_index_to_text(results, END_TOKEN)
    return results

In [42]:
while True:
    user_input = input('<< 말을 걸어 보세요!\n')
    if user_input == 'q':
        break
    print('>> 챗봇 응답: {}'.format(run_chatbot(user_input)))

<< 말을 걸어 보세요!
오늘 날씨가 정말 화창합니다
>> 챗봇 응답: 잘 할 수 있을 거 예요 
<< 말을 걸어 보세요!
나 슬퍼
>> 챗봇 응답: 잘 할 수 있을 거 예요 


KeyboardInterrupt: ignored