<a href="https://colab.research.google.com/github/ParkEunHyeok/AI_Study/blob/main/NLP/seq2seq_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [49]:
import numpy as np
import re
import pandas as pd
import csv
import tensorflow as tf
from tensorflow import keras
from keras.models import Model, load_model, save_model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, Masking
from tensorflow.keras.models import Model

In [50]:
import os
from google.colab import drive
drive.mount('/content/gdrive/')
path = "gdrive/My Drive/Colab Notebooks/squad"

Drive already mounted at /content/gdrive/; to attempt to forcibly remount, call drive.mount("/content/gdrive/", force_remount=True).


In [51]:
train = pd.read_csv(path+"/songysData.csv")
train[:5]

Unnamed: 0,Q,A,label
0,12시 땡!,하루가 또 가네요.,0
1,1지망 학교 떨어졌어,위로해 드립니다.,0
2,3박4일 놀러가고 싶다,여행은 언제나 좋죠.,0
3,3박4일 정도 놀러가고 싶다,여행은 언제나 좋죠.,0
4,PPL 심하네,눈살이 찌푸려지죠.,0


In [52]:
# 데이터 불러오기, 데이터에서 숫자, 문자가 아닌 문자 제거
train['Q'] = train['Q'].str.replace("[^\w]", " ")
train['A'] = train['A'].str.replace("[^\w]", " ")
train[:5]

Unnamed: 0,Q,A,label
0,12시 땡,하루가 또 가네요,0
1,1지망 학교 떨어졌어,위로해 드립니다,0
2,3박4일 놀러가고 싶다,여행은 언제나 좋죠,0
3,3박4일 정도 놀러가고 싶다,여행은 언제나 좋죠,0
4,PPL 심하네,눈살이 찌푸려지죠,0


In [53]:
# 데이터를 단어별로 나누고, <start>, <end> 태그 바르기
encoder_input, decoder_input, decoder_output = [], [], []

for stc in train['Q']:
    encoder_input.append(stc.split())

for stc in train['A']:
    decoder_input.append(("<start> "+stc).split())

for stc in train['A']:
    decoder_output.append((stc+" <end>").split())

In [54]:
# tokenizer 객체 생성해서 fit, tokenize
tokenizer_q = Tokenizer()
tokenizer_q.fit_on_texts(encoder_input)
encoder_input = tokenizer_q.texts_to_sequences(encoder_input)

tokenizer_a = Tokenizer()
tokenizer_a.fit_on_texts(decoder_input)
tokenizer_a.fit_on_texts(decoder_output)
decoder_input = tokenizer_a.texts_to_sequences(decoder_input)
decoder_output = tokenizer_a.texts_to_sequences(decoder_output)

In [55]:
# padding
encoder_input = pad_sequences(encoder_input, padding="post")
decoder_input = pad_sequences(decoder_input, padding="post")
decoder_output = pad_sequences(decoder_output, padding="post")

In [56]:
a_to_index = tokenizer_a.word_index
index_to_a = tokenizer_a.index_word

In [57]:
# 학습 데이터와 테스트 데이터 가르기
test_size = 2500
encoder_input_train = encoder_input[:-test_size]
decoder_input_train = decoder_input[:-test_size]
decoder_output_train = decoder_output[:-test_size]

encoder_input_test = encoder_input[-test_size:]
decoder_input_test = decoder_input[-test_size:]
decoder_output_test = decoder_output[-test_size:]

In [58]:
# 인코더 신경망 설계
encoder_inputs = Input(shape=(15,))
encoder_embed = Embedding(len(tokenizer_q.word_index)+1, 50)(encoder_inputs)
encoder_mask = Masking(mask_value=0)(encoder_embed)
encoder_outputs, h_state, c_state = LSTM(50, return_state=True)(encoder_mask)


# 디코더 신경망 설계
decoder_inputs = Input(shape=(22,))
decoder_embed = Embedding(len(tokenizer_a.word_index)+1, 50)(decoder_inputs)
decoder_mask = Masking(mask_value=0)(decoder_embed)

decoder_lstm = LSTM(50, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_mask, initial_state=[h_state, c_state])

decoder_dense = Dense(len(tokenizer_a.word_index)+1, activation='softmax')
decoder_softmax_outputs = decoder_dense(decoder_outputs)

In [59]:
model = Model([encoder_inputs, decoder_inputs], decoder_softmax_outputs)
model.compile(optimizer='rmsprop', loss='sparse_categorical_crossentropy', metrics=['acc'])

In [66]:
if not(os.path.isdir(path)):
    os.makedirs(os.path.join(path))

model.load_weights(os.path.join('gdrive/My Drive/Colab Notebooks/squad/best_model_save.h5'))

In [79]:
callback = ModelCheckpoint('gdrive/My Drive/Colab Notebooks/squad/best_model_save.h5', monitor = 'val_acc', verbose =10, save_best_only= True,
                            save_weights_only =True)
earlystop = EarlyStopping(monitor ='val_acc', min_delta = 0.001, patience =30)

In [80]:
model.fit(x = [encoder_input_train, decoder_input_train],
          y = decoder_output_train,
          validation_data = ([encoder_input_test,
                              decoder_input_test],
                             decoder_output_test),
          batch_size = 128, epochs = 100,
          callbacks = [earlystop, callback])

Epoch 1/100

Epoch 00001: val_acc improved from -inf to 0.79782, saving model to gdrive/My Drive/Colab Notebooks/squad/best_model_save.h5
Epoch 2/100

Epoch 00002: val_acc improved from 0.79782 to 0.79904, saving model to gdrive/My Drive/Colab Notebooks/squad/best_model_save.h5
Epoch 3/100

Epoch 00003: val_acc improved from 0.79904 to 0.79949, saving model to gdrive/My Drive/Colab Notebooks/squad/best_model_save.h5
Epoch 4/100

Epoch 00004: val_acc improved from 0.79949 to 0.79964, saving model to gdrive/My Drive/Colab Notebooks/squad/best_model_save.h5
Epoch 5/100

Epoch 00005: val_acc improved from 0.79964 to 0.79984, saving model to gdrive/My Drive/Colab Notebooks/squad/best_model_save.h5
Epoch 6/100

Epoch 00006: val_acc did not improve from 0.79984
Epoch 7/100

Epoch 00007: val_acc improved from 0.79984 to 0.80029, saving model to gdrive/My Drive/Colab Notebooks/squad/best_model_save.h5
Epoch 8/100

Epoch 00008: val_acc did not improve from 0.80029
Epoch 9/100

Epoch 00009: val_a

<tensorflow.python.keras.callbacks.History at 0x7ff98d1e68d0>

In [81]:
# 인코딩 결과로 발생할 상태값을 반환할 모델
encoder_model = Model(encoder_inputs, [h_state, c_state])

In [82]:
# input : 디코더 모델 만들기
# decoder_lstm : decoder_mask 거침
# decoder_dense : 결과치를 가지고 softmax 결과를 뽑아내 단어를 찾음

encoder_h_state = Input(shape=(50,))
encoder_c_state = Input(shape=(50,))

pd_decoder_outputs, pd_h_state, pd_c_state = decoder_lstm(decoder_mask, initial_state=[encoder_h_state, encoder_c_state])

pd_decoder_softmax_outputs = decoder_dense(pd_decoder_outputs)

decoder_model = Model([decoder_inputs, encoder_h_state, encoder_c_state], [pd_decoder_softmax_outputs, pd_h_state, pd_c_state])

In [83]:
input_stc = input()
token_stc = input_stc.split()
encode_stc = tokenizer_q.texts_to_sequences([token_stc])
pad_stc = pad_sequences(encode_stc, maxlen=15, padding="post")

states_value = encoder_model.predict(pad_stc)

predicted_seq = np.zeros((1,1))
predicted_seq[0, 0] = a_to_index['<start>']

decoded_stc = []

while True:
    output_words, h, c = decoder_model.predict([predicted_seq] + states_value)

    predicted_word = index_to_a[np.argmax(output_words[0,0])]

    if predicted_word == '<end>':
        break

    decoded_stc.append(predicted_word)

    predicted_seq = np.zeros((1,1))
    predicted_seq[0,0] = np.argmax(output_words[0,0])

    states_value = [h, c]

print(' '.join(decoded_stc))

나 우울해
자신을 해야 안에
