In [1]:
# 필요한 모듈 임포트
import pandas as pd
import tensorflow as tf
from tensorflow.keras import preprocessing
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Dense, Dropout, Conv1D, GlobalMaxPool1D, concatenate
import os

In [2]:
# ① 데이터 읽어오기
train_file = os.path.join('./models/intent', "Intent_train_data.csv")
df = pd.read_csv(train_file, delimiter=',', header=None)
df

Unnamed: 0,0,1,2
0,사이드메뉴,1,메뉴판 요구
1,사이드 메뉴,1,메뉴판 요구
2,빵 종류,1,메뉴판 요구
3,종류,1,메뉴판 요구
4,메뉴,1,메뉴판 요구
...,...,...,...
35627,후장,12,욕설
35628,후장입사,12,욕설
35629,ac발,12,욕설
35630,x대가리,12,욕설


In [3]:
df.columns = ['query', 'intent', 'intent_info']
seq_len = list(map(lambda x : len(x.split(' ')), df['query']))
max(seq_len)  # 문장 최대길이

10

In [4]:
# 단어 시퀀스 벡터 크기
MAX_SEQ_LEN = max(seq_len)

def GlobalParams():
    global MAX_SEQ_LEN

In [5]:
# '질문(query)' 과 '의도(intent)'
queries = df['query'].tolist()
intents = df['intent'].tolist()

In [6]:
queries[:20]

['사이드메뉴',
 '사이드 메뉴',
 '빵 종류',
 '종류',
 '메뉴',
 '메뉴판',
 '사이드메뉴는 뭐가 있나요?',
 '사이드 메뉴는 뭐가 있나요?',
 '빵 종류는 뭐가 있나요?',
 '종류는 뭐가 있나요?',
 '메뉴는 뭐가 있나요?',
 '메뉴판는 뭐가 있나요?',
 '사이드메뉴는 어디 있나요?',
 '사이드 메뉴는 어디 있나요?',
 '빵 종류는 어디 있나요?',
 '종류는 어디 있나요?',
 '메뉴는 어디 있나요?',
 '메뉴판는 어디 있나요?',
 '사이드메뉴는 어디 있어요?',
 '사이드 메뉴는 어디 있어요?']

In [7]:
# 전처리 모듈 불러오기
from utils.Preprocess import Preprocess
p = Preprocess(word2index_dic=os.path.join('./train_tools/dict', 'chatbot_dict.bin'),
               userdic=os.path.join('./utils', 'my_dict.tsv'))

In [8]:
# 단어 시퀀스 생성 (가장 시간 많이 걸림)
# 해당 단어에 매칭되는 번호로 시퀀스 생성

# ★ 시간 제법 걸림 * 몇십초 정도..

sequences = []
for sentence in queries:
    pos = p.pos(sentence)
    keywords = p.get_keywords(pos, without_tag=True)
    seq = p.get_wordidx_sequence(keywords)
    sequences.append(seq)

In [15]:
sequences

[[7837, 67],
 [7837, 67],
 [64, 8941],
 [8941],
 [67],
 [67, 3344],
 [7837, 67, 552, 25],
 [7837, 67, 552, 25],
 [64, 8941, 552, 25],
 [8941, 552, 25],
 [67, 552, 25],
 [67, 3344, 552, 25],
 [7837, 67, 167, 25],
 [7837, 67, 167, 25],
 [64, 8941, 167, 25],
 [8941, 167, 25],
 [67, 167, 25],
 [67, 3344, 167, 25],
 [7837, 67, 167, 25],
 [7837, 67, 167, 25],
 [64, 8941, 167, 25],
 [8941, 167, 25],
 [67, 167, 25],
 [67, 3344, 167, 25],
 [7837, 67, 176],
 [7837, 67, 176],
 [64, 8941, 176],
 [8941, 176],
 [67, 176],
 [67, 3344, 176],
 [7837, 67, 176],
 [7837, 67, 176],
 [64, 8941, 176],
 [8941, 176],
 [67, 176],
 [67, 3344, 176],
 [7837, 67, 176],
 [7837, 67, 176],
 [64, 8941, 176],
 [8941, 176],
 [67, 176],
 [67, 3344, 176],
 [7837, 67, 176],
 [7837, 67, 176],
 [64, 8941, 176],
 [8941, 176],
 [67, 176],
 [67, 3344, 176],
 [7837, 67, 1851],
 [7837, 67, 1851],
 [64, 8941, 1851],
 [8941, 1851],
 [67, 1851],
 [67, 3344, 1851],
 [7837, 67, 1, 147, 1851, 19],
 [7837, 67, 1, 147, 1851, 19],
 [64, 89

In [9]:
# ② 단어 인덱스 시퀀스 벡터 
# 단어 시퀀스 벡터 크기 (MAX_SEQ_LEN 로 동일하게 맞추기, 패딩처리)
from config.GlobalParams import MAX_SEQ_LEN
padded_seqs = preprocessing.sequence.pad_sequences(sequences, maxlen=MAX_SEQ_LEN, padding='post')

In [16]:
padded_seqs

array([[ 7837,    67,     0, ...,     0,     0,     0],
       [ 7837,    67,     0, ...,     0,     0,     0],
       [   64,  8941,     0, ...,     0,     0,     0],
       ...,
       [10258,   188,     0, ...,     0,     0,     0],
       [ 3599,  3566,     0, ...,     0,     0,     0],
       [    1,     0,     0, ...,     0,     0,     0]])

In [10]:
# ③ 학습용, 검증용, 테스트용 데이터셋 생성
ds = tf.data.Dataset.from_tensor_slices((padded_seqs, intents)) # 패딩처리된 시퀀스와 의도(intent) 리스트 전체를 데이터셋 객체로
ds = ds.shuffle(len(queries)) # 랜덤 섞기

# 학습셋:검증셋:테스트셋 = 7:2:1
train_size = int(len(padded_seqs) * 0.7)
val_size = int(len(padded_seqs) * 0.2)
test_size = int(len(padded_seqs) * 0.1)

train_ds = ds.take(train_size).batch(20)
val_ds = ds.skip(train_size).take(val_size).batch(20)
test_ds = ds.skip(train_size + val_size).take(test_size).batch(20)

# 하이퍼 파라미터 설정
dropout_prob = 0.5
EMB_SIZE = 128
EPOCH = 1
VOCAB_SIZE = len(p.word_index) + 1 #전체 단어 개수


In [19]:
train_ds

<BatchDataset element_spec=(TensorSpec(shape=(None, 10), dtype=tf.int32, name=None), TensorSpec(shape=(None,), dtype=tf.int32, name=None))>

In [11]:
# ④ CNN 모델 정의
# keras 함수형 모델 방식으로 구현
input_layer = Input(shape=(MAX_SEQ_LEN,))  # 입력크기
embedding_layer = Embedding(VOCAB_SIZE, EMB_SIZE, input_length=MAX_SEQ_LEN)(input_layer)
dropout_emb = Dropout(rate=dropout_prob)(embedding_layer)

conv1 = Conv1D(
    filters=128,
    kernel_size=3,
    padding='valid',
    activation=tf.nn.relu)(dropout_emb)
pool1 = GlobalMaxPool1D()(conv1)

conv2 = Conv1D(
    filters=128,
    kernel_size=4,
    padding='valid',
    activation=tf.nn.relu)(dropout_emb)
pool2 = GlobalMaxPool1D()(conv2)

conv3 = Conv1D(
    filters=128,
    kernel_size=5,
    padding='valid',
    activation=tf.nn.relu)(dropout_emb)
pool3 = GlobalMaxPool1D()(conv3)

# 3,4,5gram 이후 합치기
concat = concatenate([pool1, pool2, pool3])

hidden = Dense(128, activation=tf.nn.relu)(concat)
dropout_hidden = Dropout(rate=dropout_prob)(hidden)
logits = Dense(12, name='logits')(dropout_hidden)  # 최종적으로 12가의 의도 클래스를 분류. 결과로 나온 값(logits) 을을 점수(score) 라 부른다
predictions = Dense(12, activation=tf.nn.softmax)(logits)


In [12]:
# ⑤ 모델 생성 
model = Model(inputs=input_layer, outputs=predictions)
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

In [13]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 10)]         0           []                               
                                                                                                  
 embedding (Embedding)          (None, 10, 128)      2272256     ['input_1[0][0]']                
                                                                                                  
 dropout (Dropout)              (None, 10, 128)      0           ['embedding[0][0]']              
                                                                                                  
 conv1d (Conv1D)                (None, 8, 128)       49280       ['dropout[0][0]']                
                                                                                              

In [14]:
# 모델학습
# ★ 시간 걸림 ★
model.fit(train_ds, validation_data=val_ds, epochs=EPOCH, verbose=1)

   1/1248 [..............................] - ETA: 32:03 - loss: 2.4622 - accuracy: 0.0500

InvalidArgumentError: Graph execution error:

Detected at node 'sparse_categorical_crossentropy/SparseSoftmaxCrossEntropyWithLogits/SparseSoftmaxCrossEntropyWithLogits' defined at (most recent call last):
    File "C:\ProgramData\Anaconda3\lib\runpy.py", line 197, in _run_module_as_main
      return _run_code(code, main_globals, None,
    File "C:\ProgramData\Anaconda3\lib\runpy.py", line 87, in _run_code
      exec(code, run_globals)
    File "C:\ProgramData\Anaconda3\lib\site-packages\ipykernel_launcher.py", line 16, in <module>
      app.launch_new_instance()
    File "C:\ProgramData\Anaconda3\lib\site-packages\traitlets\config\application.py", line 846, in launch_instance
      app.start()
    File "C:\ProgramData\Anaconda3\lib\site-packages\ipykernel\kernelapp.py", line 677, in start
      self.io_loop.start()
    File "C:\ProgramData\Anaconda3\lib\site-packages\tornado\platform\asyncio.py", line 199, in start
      self.asyncio_loop.run_forever()
    File "C:\ProgramData\Anaconda3\lib\asyncio\base_events.py", line 596, in run_forever
      self._run_once()
    File "C:\ProgramData\Anaconda3\lib\asyncio\base_events.py", line 1890, in _run_once
      handle._run()
    File "C:\ProgramData\Anaconda3\lib\asyncio\events.py", line 80, in _run
      self._context.run(self._callback, *self._args)
    File "C:\ProgramData\Anaconda3\lib\site-packages\ipykernel\kernelbase.py", line 457, in dispatch_queue
      await self.process_one()
    File "C:\ProgramData\Anaconda3\lib\site-packages\ipykernel\kernelbase.py", line 446, in process_one
      await dispatch(*args)
    File "C:\ProgramData\Anaconda3\lib\site-packages\ipykernel\kernelbase.py", line 353, in dispatch_shell
      await result
    File "C:\ProgramData\Anaconda3\lib\site-packages\ipykernel\kernelbase.py", line 648, in execute_request
      reply_content = await reply_content
    File "C:\ProgramData\Anaconda3\lib\site-packages\ipykernel\ipkernel.py", line 353, in do_execute
      res = shell.run_cell(code, store_history=store_history, silent=silent)
    File "C:\ProgramData\Anaconda3\lib\site-packages\ipykernel\zmqshell.py", line 533, in run_cell
      return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
    File "C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 2901, in run_cell
      result = self._run_cell(
    File "C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 2947, in _run_cell
      return runner(coro)
    File "C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\async_helpers.py", line 68, in _pseudo_sync_runner
      coro.send(None)
    File "C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 3172, in run_cell_async
      has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
    File "C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 3364, in run_ast_nodes
      if (await self.run_code(code, result,  async_=asy)):
    File "C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 3444, in run_code
      exec(code_obj, self.user_global_ns, self.user_ns)
    File "C:\Users\2020M~1\AppData\Local\Temp/ipykernel_1980/2688748011.py", line 3, in <module>
      model.fit(train_ds, validation_data=val_ds, epochs=EPOCH, verbose=1)
    File "C:\ProgramData\Anaconda3\lib\site-packages\keras\utils\traceback_utils.py", line 64, in error_handler
      return fn(*args, **kwargs)
    File "C:\ProgramData\Anaconda3\lib\site-packages\keras\engine\training.py", line 1384, in fit
      tmp_logs = self.train_function(iterator)
    File "C:\ProgramData\Anaconda3\lib\site-packages\keras\engine\training.py", line 1021, in train_function
      return step_function(self, iterator)
    File "C:\ProgramData\Anaconda3\lib\site-packages\keras\engine\training.py", line 1010, in step_function
      outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "C:\ProgramData\Anaconda3\lib\site-packages\keras\engine\training.py", line 1000, in run_step
      outputs = model.train_step(data)
    File "C:\ProgramData\Anaconda3\lib\site-packages\keras\engine\training.py", line 860, in train_step
      loss = self.compute_loss(x, y, y_pred, sample_weight)
    File "C:\ProgramData\Anaconda3\lib\site-packages\keras\engine\training.py", line 918, in compute_loss
      return self.compiled_loss(
    File "C:\ProgramData\Anaconda3\lib\site-packages\keras\engine\compile_utils.py", line 201, in __call__
      loss_value = loss_obj(y_t, y_p, sample_weight=sw)
    File "C:\ProgramData\Anaconda3\lib\site-packages\keras\losses.py", line 141, in __call__
      losses = call_fn(y_true, y_pred)
    File "C:\ProgramData\Anaconda3\lib\site-packages\keras\losses.py", line 245, in call
      return ag_fn(y_true, y_pred, **self._fn_kwargs)
    File "C:\ProgramData\Anaconda3\lib\site-packages\keras\losses.py", line 1862, in sparse_categorical_crossentropy
      return backend.sparse_categorical_crossentropy(
    File "C:\ProgramData\Anaconda3\lib\site-packages\keras\backend.py", line 5202, in sparse_categorical_crossentropy
      res = tf.nn.sparse_softmax_cross_entropy_with_logits(
Node: 'sparse_categorical_crossentropy/SparseSoftmaxCrossEntropyWithLogits/SparseSoftmaxCrossEntropyWithLogits'
Received a label value of 12 which is outside the valid range of [0, 12).  Label values: 2 4 6 8 2 1 11 9 4 5 7 2 2 12 6 4 4 12 7 2
	 [[{{node sparse_categorical_crossentropy/SparseSoftmaxCrossEntropyWithLogits/SparseSoftmaxCrossEntropyWithLogits}}]] [Op:__inference_train_function_1282]

In [None]:
# ⑦ 모델 평가(테스트 데이터 셋 이용)
loss, accuracy = model.evaluate(test_ds, verbose=1)
print('Accuracy: %f' % (accuracy * 100))
print('loss: %f' % (loss))

In [None]:
# ⑧ 모델 저장
model.save(os.path.join('./models/intent','intent_model.h5'))

## 의도분류 모듈 테스트

In [None]:
from utils.Preprocess import Preprocess
from models.intent.IntentModel import IntentModel
import os

p = Preprocess(word2index_dic=os.path.join('./train_tools/dict', 'chatbot_dict.bin'),
               userdic=os.path.join('./utils', 'train.tsv'))

In [None]:
intent = IntentModel(model_name=os.path.join('./models/intent', 'intent_model.h5'), preprocess=p)

In [None]:
query = "화장실"

In [None]:
predict = intent.predict_class(query)
predict_label = intent.labels[predict]

In [None]:
print(query)
print('의도 예측 클래스 : ', predict)
print('의도 예측 레이블 : ', predict_label)