<a href="https://colab.research.google.com/github/ParkEunHyeok/AI_Study/blob/main/NLP/Chatbot_data_seq2seq.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Colab setup: install the third-party packages that the import cell relies on.
# konlpy supplies the Okt tagger; `preprocessing` is star-imported by the import cell.
!pip install konlpy
!pip install preprocessing



In [2]:
import pandas as pd
import tensorflow as tf
from tqdm import tqdm
import os
import re
import json
import numpy as np

from tensorflow import keras
from keras.models import Model, load_model, save_model
from konlpy.tag import Okt
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
import matplotlib.pyplot as plt
from preprocessing import *

In [3]:
# Mount Google Drive so the dataset and the generated artifacts can be read and written.
import os
from google.colab import drive
drive.mount('/content/gdrive/')
# Base directory for every file this notebook reads or writes.
# NOTE(review): this is a relative path, so it resolves against the kernel's working
# directory — presumably /content on Colab; confirm before running elsewhere.
path = "gdrive/My Drive/Colab Notebooks/squad"

Drive already mounted at /content/gdrive/; to attempt to forcibly remount, call drive.mount("/content/gdrive/", force_remount=True).


In [4]:
# Read the raw question/answer CSV and preview its first five rows.
train = pd.read_csv(path + "/songysData.csv")
train.head(5)

Unnamed: 0,Q,A,label
0,12시 땡!,하루가 또 가네요.,0
1,1지망 학교 떨어졌어,위로해 드립니다.,0
2,3박4일 놀러가고 싶다,여행은 언제나 좋죠.,0
3,3박4일 정도 놀러가고 싶다,여행은 언제나 좋죠.,0
4,PPL 심하네,눈살이 찌푸려지죠.,0


In [5]:
'''
 데이터 전처리
'''
# Data preprocessing: constants shared by the tokenizing and encoding helpers.
# Character class of punctuation that is removed from every sentence.
FILTERS = "([~.,!?\"':;)(])"
CHANGE_FILTER = re.compile(FILTERS) # compiled once up front and reused
PAD, PAD_INDEX = "<PAD>", 0 # padding token
STD, STD_INDEX = "<SOS>", 1 # start-of-sequence token
END, END_INDEX = "<END>", 2 # end-of-sequence token
UNK, UNK_INDEX = "<UNK>", 3 # word that is not in the vocabulary
# Special tokens listed in the same order as their *_INDEX values above;
# load_vocabulary() puts this list at the front of the word list.
MARKER = [PAD,STD,END,UNK]
# Length every id sequence is truncated/padded to. The spelling is kept as-is
# because the encoding helpers reference this exact name.
MAX_SEQUNECE = 25

In [6]:
# Data reading
def load_data(path):
    """Read the question/answer CSV and return both columns as lists.

    Args:
        path: Location of a CSV file whose first row is a header containing
            the columns ``Q`` and ``A``.

    Returns:
        A ``(question, answer)`` tuple of Python lists, in file order.
    """
    print(path)
    frame = pd.read_csv(path, header=0)
    question, answer = (list(frame[column]) for column in ('Q', 'A'))
    return question, answer
# Show the base directory, then load the question and answer sentences from the CSV.
print(path)
inputs, outputs = load_data(path+"/songysData.csv")

gdrive/My Drive/Colab Notebooks/squad
gdrive/My Drive/Colab Notebooks/squad/songysData.csv


In [7]:
# Tokenizing
def data_tokenizer(data):
    """Split every sentence into whitespace-separated words.

    The punctuation matched by the pre-compiled ``CHANGE_FILTER`` pattern is
    removed from each sentence before it is split.

    Args:
        data: Iterable of sentence strings.

    Returns:
        One flat list holding the non-empty words of all sentences, in order.
    """
    return [
        word
        for sentence in data
        for word in re.sub(CHANGE_FILTER, "", sentence).split()
        if word
    ]

In [8]:
# 형태소 분리 
def prepro_like_morphlized(data):
    """Re-segment each sentence into morphemes with the Okt analyzer.

    All spaces are stripped from a sentence before it is analyzed, and the
    resulting morphemes are joined back together with single spaces.

    Args:
        data: Iterable of sentence strings; progress is shown through tqdm.

    Returns:
        List with one space-joined morpheme string per input sentence.
    """
    analyzer = Okt()
    return [
        " ".join(analyzer.morphs(sentence.replace(' ', '')))
        for sentence in tqdm(data)
    ]

In [9]:
# 단어 사전을 불러오는 함수
def load_vocabulary(path, vocab_path):
    """Load the vocabulary file, building it from the raw CSV when it is missing.

    When ``vocab_path`` does not exist, the ``Q`` and ``A`` columns of the CSV
    at ``path`` are tokenized with ``data_tokenizer`` and written to
    ``vocab_path`` one word per line, with the ``MARKER`` tokens first. The
    file is then read back (whether it was just created or already existed).

    Args:
        path: CSV file with ``Q`` and ``A`` columns; only read when the
            vocabulary file has to be created.
        vocab_path: Text file holding one vocabulary entry per line.

    Returns:
        ``(word2idx, idx2word, vocab_size)`` where the two dicts come from
        ``make_vocabulary`` and ``vocab_size`` is ``len(word2idx)``.
    """
    if not os.path.exists(vocab_path):
        df = pd.read_csv(path, encoding='utf-8')
        data = list(df['Q']) + list(df['A'])

        # Sort the unique words: iteration order of a set of strings differs
        # between interpreter runs, which would give every regenerated
        # vocabulary different ids and silently invalidate saved weights.
        # Marker tokens are excluded from the corpus words so each of them
        # appears exactly once, at its reserved position at the front.
        unique_words = set(data_tokenizer(data)) - set(MARKER)
        words = list(MARKER) + sorted(unique_words)

        with open(vocab_path, 'w', encoding='utf-8') as vocabulary_file:
            for word in words:
                vocabulary_file.write(word + '\n')

    with open(vocab_path, 'r', encoding='utf-8') as vocabulary_file:
        vocabulary_list = [line.strip() for line in vocabulary_file]

    word2idx, idx2word = make_vocabulary(vocabulary_list)

    return word2idx, idx2word, len(word2idx)

In [10]:
def make_vocabulary(vocabulary_list):
    """Build the word->index and index->word lookup tables.

    Args:
        vocabulary_list: Vocabulary entries; an entry's position is its index.

    Returns:
        ``(word2idx, idx2word)``. If a word occurs more than once, ``word2idx``
        keeps the index of its last occurrence.
    """
    word2idx = {}
    for position, token in enumerate(vocabulary_list):
        word2idx[token] = position
    idx2word = dict(enumerate(vocabulary_list))

    return word2idx, idx2word

In [11]:
# 인코더와 디코더 부분 처리하기
def enc_processing(value, dictionary):
    """Convert sentences into fixed-length id rows for the encoder.

    Each sentence has the ``CHANGE_FILTER`` punctuation removed, is split on
    whitespace, and every word is mapped through ``dictionary`` (words that
    are missing map to the ``UNK`` entry). Rows are cut to ``MAX_SEQUNECE``
    ids and right-padded with the ``PAD`` id.

    Args:
        value: Iterable of sentence strings.
        dictionary: Mapping from word to integer id.

    Returns:
        ``(ids, lengths)``: ``ids`` is ``np.asarray`` of the padded rows and
        ``lengths`` lists each row's length before padding.
    """
    padded_rows = []
    lengths = []

    for sentence in value:
        cleaned = re.sub(CHANGE_FILTER, "", sentence)
        ids = [
            dictionary[token] if dictionary.get(token) is not None else dictionary[UNK]
            for token in cleaned.split()
        ]
        # Enforce the length limit, then remember the unpadded length.
        ids = ids[:MAX_SEQUNECE]
        lengths.append(len(ids))

        # e.g. "hello" -> [hello, <PAD>, <PAD>, ...]
        padding = [dictionary[PAD]] * (MAX_SEQUNECE - len(ids))
        padded_rows.append(ids + padding)

    return np.asarray(padded_rows), lengths

In [12]:
# Decoder input

def dec_output_processing(value, dictionary):
    """Convert answer sentences into fixed-length decoder input rows.

    Each sentence has the ``CHANGE_FILTER`` punctuation removed and is split
    on whitespace. The row starts with the ``STD`` (start) id, followed by the
    word ids; it is cut to ``MAX_SEQUNECE`` ids and right-padded with the
    ``PAD`` id.

    Words missing from ``dictionary`` map to the ``UNK`` id, the same fallback
    ``enc_processing`` uses, instead of raising ``KeyError``.

    Args:
        value: Iterable of sentence strings.
        dictionary: Mapping from word to integer id.

    Returns:
        ``(ids, lengths)``: ``ids`` is ``np.asarray`` of the padded rows and
        ``lengths`` lists each row's length (start token included) before
        padding.
    """
    sequences_output_index = []
    sequences_length = []

    for sequence in value:
        sequence = re.sub(CHANGE_FILTER, "", sequence)
        # Start token first, then the word ids with an <UNK> fallback.
        sequence_index = [dictionary[STD]] + [
            dictionary[word] if dictionary.get(word) is not None else dictionary[UNK]
            for word in sequence.split()
        ]

        # Enforce the length limit.
        sequence_index = sequence_index[:MAX_SEQUNECE]

        sequences_length.append(len(sequence_index))
        sequence_index += (MAX_SEQUNECE - len(sequence_index)) * [dictionary[PAD]]

        sequences_output_index.append(sequence_index)
    return np.asarray(sequences_output_index), sequences_length

In [13]:
# 디코더 Target 값 전처리
def dec_target_processing(value,dictionary):
    """Convert answer sentences into fixed-length decoder target rows.

    Each sentence has the ``CHANGE_FILTER`` punctuation removed and is split
    on whitespace. The word ids are cut to ``MAX_SEQUNECE - 1`` entries so the
    ``END`` id always fits as the last real token, and the row is then
    right-padded with the ``PAD`` id. Unlike the decoder input, the row has no
    start token and always ends with ``END``.

    Words missing from ``dictionary`` map to the ``UNK`` id, the same fallback
    ``enc_processing`` uses, instead of raising ``KeyError``.

    Args:
        value: Iterable of sentence strings.
        dictionary: Mapping from word to integer id.

    Returns:
        ``np.asarray`` of the padded target rows.
    """
    sequences_target_index = []
    for sequence in value:
        sequence = re.sub(CHANGE_FILTER, "", sequence)
        sequence_index = [
            dictionary[word] if dictionary.get(word) is not None else dictionary[UNK]
            for word in sequence.split()
        ]
        # Keep at most MAX_SEQUNECE - 1 words, then close the row with <END>.
        # Shorter rows are unaffected by the slice.
        sequence_index = sequence_index[:MAX_SEQUNECE - 1] + [dictionary[END]]

        sequence_index += (MAX_SEQUNECE - len(sequence_index)) * [dictionary[PAD]]
        sequences_target_index.append(sequence_index)

    return np.asarray(sequences_target_index)

In [14]:
if __name__ == "__main__":
    # Preprocess the raw CSV and persist the id matrices plus the vocabulary config.
    PATH = path+"/songysData.csv"
    VOCAB_PATH = path+"/vocabulary.txt"
    # Load the question/answer sentences.
    inputs, outputs = load_data(PATH)
    # Load the vocabulary (it is built from the CSV when the file is missing).
    # TODO (original author's note): switch this to tokenizer-based processing.
    char2idx, idx2char, vocab_size = load_vocabulary(PATH,VOCAB_PATH)

    # Encoder input, decoder input and decoder target id matrices.
    index_inputs, input_seq_len = enc_processing(inputs, char2idx)
    index_outputs, output_seq_len = dec_output_processing(outputs, char2idx)
    index_targets =  dec_target_processing(outputs, char2idx)

    data_configs = {}
    data_configs['char2idx'] =char2idx
    data_configs['idx2char'] = idx2char
    data_configs['vocab_size'] = vocab_size
    data_configs['pad_symbol'] = PAD
    data_configs['std_symbol'] = STD
    data_configs['end_symbol'] = END
    data_configs['unk_symbol'] = UNK

    DATA_IN_PATH = path
    # Pass file names so NumPy opens and closes the files itself; the previous
    # open(...) handles were never closed.
    np.save(DATA_IN_PATH+'/train_inputs.npy', index_inputs)
    np.save(DATA_IN_PATH+'/train_outputs.npy', index_outputs)
    np.save(DATA_IN_PATH+'/train_targets.npy', index_targets)

    # The '/' separator was missing here, so the config landed next to the data
    # directory ("...squaddata_configs.json") instead of inside it, where the
    # loading cell looks for it.
    # Note: JSON object keys are strings, so the integer keys of idx2char are
    # stored as strings.
    with open(DATA_IN_PATH+'/data_configs.json','w') as config_file:
        json.dump(data_configs, config_file)

gdrive/My Drive/Colab Notebooks/squad/songysData.csv


In [15]:
'''전처리 결과'''
# Load the preprocessing results saved by the previous cell.
seed = 99
tf.random.set_seed(seed)

# File names are passed to np.load so NumPy opens and closes each file itself;
# the previous open(...) handles were never closed.
# NOTE(review): allow_pickle=True is kept from the original. It permits pickled
# object arrays, which is unsafe for files from an untrusted source.
# Encoder input ids
index_inputs = np.load(path+'/train_inputs.npy', allow_pickle=True)
# Decoder input ids
index_outputs = np.load(path+'/train_outputs.npy', allow_pickle=True)
# Decoder target ids
index_targets = np.load(path+'/train_targets.npy', allow_pickle=True)
# Vocabulary dictionaries and special-token symbols
with open(path+'/data_configs.json') as config_file:
    prepro_configs = json.load(config_file)

'''
인코더 Input : 최대 길이만큼 <PAD>
디코더 Input : 시작을 알리는 <SOS>
디코더 타겟 : 끝을 알리는 <END>
'''

'\n인코더 Input : 최대 길이만큼 <PAD>\n디코더 Input : 시작을 알리는 <SOS>\n디코더 타겟 : 끝을 알리는 <END>\n'

In [16]:
# Training hyperparameters.
BATCH_SIZE = 2  # original note: a larger batch led to a NoneType error (memory issue)
MAX_SEQUENCE = 25  # upper bound on decoding steps in seq2seq.inference
# NOTE(review): the model.fit call in this notebook passes literal values
# (epochs = 5, validation_split = 0.2) rather than the two constants below.
EPOCH = 5
UNITS =1024
EMBEDDING_DIM = 256
VALIDATION_SPLIT = 0.1

# Vocabulary lookups restored from the saved config.
char2idx = prepro_configs['char2idx']
idx2char = prepro_configs['idx2char']
# Despite the "_index" names, these two hold the token strings stored under
# 'std_symbol' / 'end_symbol'; they are turned into ids via char2idx[...] where used.
std_index = prepro_configs['std_symbol']
end_index = prepro_configs['end_symbol']
vocab_size = prepro_configs['vocab_size']

In [17]:
''' Encoder '''
class Encoder(tf.keras.layers.Layer):
    """Encoder layer: an embedding lookup followed by a single GRU."""

    def __init__(self, vocab_size, embedding_dim, enc_units, batch_size):
        """Store the sizes and create the embedding and GRU sub-layers."""
        super().__init__()

        self.batch_size = batch_size
        self.enc_units = enc_units
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim

        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        # The GRU returns both the per-step outputs and the final state.
        # 'glorot_uniform' is the Glorot (Xavier) initializer.
        self.gru = tf.keras.layers.GRU(
            enc_units,
            return_sequences=True,
            return_state=True,
            recurrent_initializer='glorot_uniform',
        )

    def call(self, x, hidden):
        """Embed ``x`` and run the GRU starting from ``hidden``.

        Returns:
            ``(output, state)`` exactly as produced by the GRU.
        """
        embedded = self.embedding(x)
        output, state = self.gru(embedded, initial_state=hidden)
        return output, state

    def initialize_hidden_state(self, inp):
        """Return an all-zero initial state with one row per row of ``inp``."""
        rows = tf.shape(inp)[0]
        return tf.zeros((rows, self.enc_units))

In [18]:
# BandanauAttention : Attention 가중치도 같이 학습 시키는 것
class BandanauAttention(tf.keras.layers.Layer):
    """Additive attention whose scoring weights are learned with the model."""

    def __init__(self, units):
        """Create the two projection layers (``units`` wide) and the scoring layer."""
        super().__init__()
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, query, values):
        """Weight ``values`` by their attention score against ``query``.

        Returns:
            ``(context_vector, attention_weights)``: the weighted sum of
            ``values`` over axis 1, and the softmax weights over axis 1.
        """
        # Insert an axis at position 1 so the projected query broadcasts
        # against the projected values.
        query_with_time_axis = tf.expand_dims(query, 1)

        projected = self.W1(values) + self.W2(query_with_time_axis)
        score = self.V(tf.nn.tanh(projected))
        attention_weights = tf.nn.softmax(score, axis=1)

        context_vector = tf.reduce_sum(attention_weights * values, axis=1)

        return context_vector, attention_weights

In [19]:

''' Decoder '''
class Decoder(tf.keras.layers.Layer):
    """Decoder layer: attention context + embedding -> GRU -> vocabulary logits."""

    def __init__(self, vocab_size, embedding_dim, dec_units, batch_size):
        """Store the sizes and create the embedding, GRU, output and attention layers."""
        super().__init__()

        self.batch_size = batch_size
        self.dec_units = dec_units

        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim

        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(
            dec_units,
            return_sequences=True,
            return_state=True,
            recurrent_initializer='glorot_uniform',
        )
        self.fc = tf.keras.layers.Dense(vocab_size)
        self.attention = BandanauAttention(dec_units)

    def call(self, x, hidden, enc_output):
        """Run one decoding call.

        Args:
            x: Decoder input tokens.
            hidden: State used as the attention query.
            enc_output: Encoder outputs that are attended over.

        Returns:
            ``(logits, state, attention_weights)``.
        """
        context_vector, attention_weights = self.attention(hidden, enc_output)
        embedded = self.embedding(x)

        # Prepend the context vector to the embedded input along the last axis.
        gru_input = tf.concat([tf.expand_dims(context_vector, 1), embedded], axis=-1)
        # The GRU is called without an explicit initial state; `hidden` is used
        # only as the attention query above.
        output, state = self.gru(gru_input)

        # Merge the leading axes so the dense layer sees one row per step.
        flattened = tf.reshape(output, (-1, output.shape[2]))
        logits = self.fc(flattened)

        return logits, state, attention_weights


In [20]:
# NOTE(review): the model.compile call in this notebook builds its own Adam(1e-3)
# instead of using this optimizer object.
optimizer = tf.keras.optimizers.Adam()

# Cross-entropy on logits; reduction='none' keeps the per-element values so the
# loss() helper can mask padded positions itself.
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits = True, reduction= 'none')
# Accuracy metric object used by the accuracy() helper
train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name = 'accuracy')

def loss(real, pred):
    """Cross-entropy that ignores positions whose target id is 0 (``<PAD>``).

    The per-element loss from ``loss_object`` is multiplied by a mask that is
    1 where ``real`` is non-zero and 0 where it is zero, and the masked values
    are then averaged. The mean is taken over every element, so masked
    positions contribute zeros to it.
    """
    is_token = tf.math.logical_not(tf.math.equal(real, 0))
    per_element = loss_object(real, pred)
    weights = tf.cast(is_token, dtype=per_element.dtype)
    return tf.reduce_mean(per_element * weights)

def accuracy(real, pred):
    """Accuracy metric computed after zeroing predictions at ``<PAD>`` positions.

    Predictions are multiplied by a mask that is 0 wherever the target id is 0
    and are then passed, together with ``real``, to the shared
    ``train_accuracy`` metric object.

    NOTE(review): zeroing the predictions does not drop those positions from
    the metric — presumably an all-zero row resolves to class 0 and is counted
    as a match; confirm whether that is intended.
    """
    keep = tf.math.logical_not(tf.math.equal(real, 0))
    # Add a trailing axis so the mask broadcasts over the class dimension.
    keep = tf.expand_dims(tf.cast(keep, dtype=pred.dtype), axis=-1)
    pred *= keep
    metric_value = train_accuracy(real, pred)

    return tf.reduce_mean(metric_value)

In [21]:
''' Main Class : encoding+decoding'''
class seq2seq(tf.keras.Model):
    """Encoder/decoder model: ``call`` is used for training, ``inference`` for testing."""

    def __init__(self, vocab_size, embedding_dim, enc_units, dec_units, batch_size, end_token_idx=2):
        """Create the encoder and decoder and remember the end-token id."""
        super().__init__()
        self.end_token_idx = end_token_idx
        self.encoder = Encoder(vocab_size, embedding_dim, enc_units, batch_size)
        self.decoder = Decoder(vocab_size, embedding_dim, dec_units, batch_size)

    def call(self, x):
        """Decode every target position, feeding the given target tokens as input.

        Args:
            x: Pair ``(inp, tar)`` of encoder input and decoder input.

        Returns:
            The decoder predictions of all steps, stacked along axis 1.
        """
        inp, tar = x

        # Encode, starting from an all-zero state.
        initial_state = self.encoder.initialize_hidden_state(inp)
        enc_output, dec_hidden = self.encoder(inp, initial_state)

        # One decoder call per target position; each call attends over enc_output.
        step_predictions = []
        for t in range(tar.shape[1]):
            dec_input = tf.cast(tf.expand_dims(tar[:, t], 1), tf.float32)
            predictions, dec_hidden, _ = self.decoder(dec_input, dec_hidden, enc_output)
            step_predictions.append(tf.cast(predictions, tf.float32))

        return tf.stack(step_predictions, axis=1)

    def inference(self, x):
        """Greedily decode a reply for the encoder input ``x`` (for testing).

        Decoding starts from the start token looked up through the notebook
        globals ``char2idx`` / ``std_index`` and runs for at most
        ``MAX_SEQUENCE`` steps, stopping early when the end token is predicted.

        Returns:
            NumPy array of the predicted token ids (end token not included).
        """
        initial_state = self.encoder.initialize_hidden_state(x)
        enc_output, dec_hidden = self.encoder(x, initial_state)

        dec_input = tf.expand_dims([char2idx[std_index]], 1)

        generated = []
        for step in range(MAX_SEQUENCE):
            predictions, dec_hidden, _ = self.decoder(dec_input, dec_hidden, enc_output)
            predict_token = tf.argmax(predictions[0])

            # Stop as soon as the end token is produced.
            if predict_token == self.end_token_idx:
                break
            generated.append(predict_token)
            # Feed the prediction back in as the next decoder input.
            dec_input = tf.cast(tf.expand_dims([predict_token], 0), tf.float32)

        return tf.stack(generated, axis=0).numpy()

In [24]:
''' Model 생성'''
# Create the model. The same UNITS value is used for the encoder and the decoder,
# and the end-token id is looked up from the vocabulary.
model = seq2seq(vocab_size, EMBEDDING_DIM, UNITS, UNITS,BATCH_SIZE, char2idx[end_index])
# Compile with the masked loss/accuracy helpers and an Adam optimizer (learning rate 1e-3).
model.compile(loss = loss, optimizer= tf.keras.optimizers.Adam(1e-3), metrics =  [accuracy])

In [37]:
# Make sure the output directory exists.
os.makedirs(path, exist_ok=True)

chk_path = path + '/weights.h5'

# Resume from an earlier checkpoint only when one exists, so the cell also runs
# on a fresh kernel / first training run. HDF5 weights can only be loaded into
# a subclassed model after its variables have been created, so run one small
# forward pass first.
if os.path.exists(chk_path):
    model((tf.constant(index_inputs[:BATCH_SIZE]), tf.constant(index_outputs[:BATCH_SIZE])))
    model.load_weights(chk_path)

# Save the weights whenever the validation accuracy improves.
callback = ModelCheckpoint( chk_path, monitor = 'val_accuracy', verbose =1, save_best_only= True,
                            save_weights_only =True)
# NOTE(review): patience (10) is larger than the number of epochs, so early
# stopping cannot trigger with the current settings.
earlystop = EarlyStopping(monitor ='val_accuracy', min_delta = 0.001, patience =10)

history = model.fit([index_inputs, index_outputs], index_targets,
                   batch_size =BATCH_SIZE,
                   epochs = EPOCH,
                   # Original note: the set may be too small for val_loss to be computed.
                   # NOTE(review): kept at 0.2; the VALIDATION_SPLIT constant is 0.1.
                   validation_split= 0.2,
                   callbacks = [earlystop, callback])

Epoch 1/5
   6/4729 [..............................] - ETA: 8:54:40 - loss: 0.2235 - accuracy: 0.9310

KeyboardInterrupt: ignored

In [29]:
# Reload the best checkpoint written during training. The file name constant
# and the shared base directory replace a second hard-coded copy of the path.
SAVE_FILE_NM = "weights.h5"
model.load_weights(os.path.join(path, SAVE_FILE_NM))

In [30]:
def plot_graphs(history, string):
    """Plot a training metric and its validation counterpart per epoch.

    Args:
        history: Object whose ``history`` attribute maps metric names to
            per-epoch values (as returned by ``model.fit``).
        string: Metric name; ``'val_' + string`` is plotted alongside it.
    """
    val_key = 'val_' + string
    plt.plot(history.history[string])
    plt.plot(history.history[val_key], '')
    plt.xlabel('epochs')
    plt.ylabel(string)
    plt.legend([string, val_key])
    plt.show()
# Plot training vs. validation accuracy. `history` is assigned by model.fit in the
# training cell; the NameError recorded below means it was not defined when this
# cell ran — presumably because that fit call was interrupted.
plot_graphs(history,'accuracy')

NameError: ignored

In [None]:
# Plot training vs. validation loss (requires `history` from a completed model.fit call).
plot_graphs(history,'loss')
# Original author's note: something went wrong.

In [36]:
# Encode a sample question, decode greedily, and print the reply as words.
query = "나는 어때"

test_index_inputs, _ = enc_processing([query], char2idx)
predict_tokens = model.inference(test_index_inputs)
# idx2char was restored from JSON, so its keys are strings.
print(' '.join(idx2char[str(token)] for token in predict_tokens))

오늘 단단히 많이 생각했나봐요


In [32]:
# Encode a sample question, decode greedily, and print the reply as words.
query = "남자친구 승진 선물로 뭐가 좋을까"

test_index_inputs, _ = enc_processing([query], char2idx)
predict_tokens = model.inference(test_index_inputs)
# idx2char was restored from JSON, so its keys are strings.
print(' '.join(idx2char[str(token)] for token in predict_tokens))

그렇게 해보세요


In [33]:
# Encode a sample question, decode greedily, and print the reply as words.
query = "뭐야?"

test_index_inputs, _ = enc_processing([query], char2idx)
predict_tokens = model.inference(test_index_inputs)
# idx2char was restored from JSON, so its keys are strings.
print(' '.join(idx2char[str(token)] for token in predict_tokens))

많이 사랑했나봐요
