In [1]:
import os
import re
import json
import numpy as np
import pandas as pd
from preprocess import *
from tqdm import tqdm
from konlpy.tag import Okt

In [2]:
# --- Special vocabulary tokens -------------------------------------------
# Padding, start-of-sequence, end-of-sequence and unknown-word markers.
PAD, STD, END, UNK = "<PAD>", "<SOS>", "<END>", "<UNK>"
MARKER = [PAD, STD, END, UNK]

# Reserved indices: each token's index is its position in MARKER (0..3).
PAD_INDEX, STD_INDEX, END_INDEX, UNK_INDEX = range(len(MARKER))

# --- Text cleaning -------------------------------------------------------
# Character class of punctuation marks (inside one capturing group) and its
# compiled form.
FILTERS = "([~.,!?\"':;])"
CHANGE_FILTER = re.compile(FILTERS)

# --- Data locations ------------------------------------------------------
PATH = 'data_in/ChatBotData.csv'
VOCAB_PATH = 'data_in/vocabulary.txt'

# Not referenced by the cells visible in this notebook; presumably the
# maximum token length used by the preprocessing helpers -- TODO confirm.
MAX_SEQUENCE = 25

In [3]:
# Load the raw dataset from PATH and unpack it into two collections.
# load_data is not defined in this notebook -- presumably it is provided by
# `from preprocess import *`, and the two results are the question and answer
# texts of the chatbot corpus -- TODO confirm against preprocess.py.
inputs, outputs = load_data(PATH)

In [4]:
# Obtain the token -> index lookup, the index -> token lookup and the
# vocabulary size, using the dataset at PATH and the vocabulary file at
# VOCAB_PATH.
# NOTE(review): tokenize_as_morph=False presumably disables morpheme-level
# tokenization (the keys shown in the next cell look like whole words) --
# confirm in preprocess.py.
char2idx, idx2char, vocab_size = load_vocabulary(PATH, VOCAB_PATH, tokenize_as_morph=False)

In [5]:
# Display the token -> index mapping as a visual sanity check; the four
# special tokens should occupy indices 0-3.
char2idx

{'<PAD>': 0,
 '<SOS>': 1,
 '<END>': 2,
 '<UNK>': 3,
 '마음을': 4,
 '있어도': 5,
 '사세요': 6,
 '믿어줘': 7,
 '끄고': 8,
 '많이': 9,
 '혼자인게': 10,
 '켜놓고': 11,
 '하세요': 12,
 '쫄딱': 13,
 '교회': 14,
 '좋다': 15,
 '나갔어': 16,
 '오늘': 17,
 '따라': 18,
 '따뜻하게': 19,
 '필요했던': 20,
 '더': 21,
 '사람도': 22,
 '나라를': 23,
 '잊고': 24,
 '나오세요': 25,
 '거예요': 26,
 '것': 27,
 '나': 28,
 '구하셨나요': 29,
 '빨리': 30,
 '가스비': 31,
 '그': 32,
 '집에': 33,
 '같아요': 34,
 '자의': 35,
 '승진': 36,
 '함께': 37,
 '땀을': 38,
 '생각해보세요': 39,
 '나온거': 40,
 '가상화폐': 41,
 '빠를수록': 42,
 '평소에': 43,
 '열': 44,
 '줄까': 45,
 '때까지': 46,
 '망함': 47,
 '너무': 48,
 '같아': 49,
 '비싼데': 50,
 '운동만': 51,
 '남자친구가': 52,
 '가난한': 53,
 '그럴': 54,
 '바빠': 55,
 '달에는': 56,
 '뭘': 57,
 '설움': 58,
 '해보세요': 59,
 '운동을': 60,
 '돌아가서': 61,
 '감기': 62,
 '식혀주세요': 63,
 '적당히': 64,
 '궁금해': 65,
 '즐기세요': 66,
 '거짓말': 67,
 '갔어': 68,
 '해': 69,
 '전생에': 70,
 '마세요': 71,
 '좋을': 72,
 '집착하지': 73,
 '남자친구': 74,
 '안': 75,
 '뭐가': 76,
 '잘생겼어': 77,
 '결단은': 78,
 '필요한': 79,
 '보인다': 80,
 '들어올': 81,
 '땀난다': 82,
 '게': 83,
 '싶어': 84,
 '다음'

In [8]:
# Convert the raw texts into index sequences using the char2idx lookup.
# The helpers are not defined in this notebook (presumably they come from
# `from preprocess import *`); the descriptions below are inferred from the
# names and must be confirmed against preprocess.py.
# Encoder side: indexed inputs plus a second value (named as a sequence length).
index_inputs, input_seq_len = enc_processing(inputs, char2idx, tokenize_as_morph=False)
# Decoder input side: indexed outputs plus a second value (named as a sequence length).
index_outputs, output_seq_len = dec_output_processing(outputs, char2idx, tokenize_as_morph=False) 
# Decoder target side: only a single value is returned here.
index_targets = dec_target_processing(outputs, char2idx, tokenize_as_morph=False)

In [14]:
# Bundle the vocabulary lookups, the vocabulary size and the four special
# token strings into one dictionary so they can be persisted together.
data_configs = {
    'char2idx': char2idx,
    'idx2char': idx2char,
    'vocab_size': vocab_size,
    'pad_symbol': PAD,
    'std_symbol': STD,
    'end_symbol': END,
    'unk_symbol': UNK,
}

In [15]:
# Output directory and file names for the preprocessed training artifacts.
DATA_IN_PATH = './data_in/'
TRAIN_INPUTS = 'train_inputs.npy'
TRAIN_OUTPUTS = 'train_outputs.npy'
TRAIN_TARGETS = 'train_targets.npy'
DATA_CONFIGS = 'data_configs.json'

# Persist the three index arrays in NumPy's binary .npy format.
# Each file is opened in a `with` block so the handle is flushed and closed
# deterministically instead of being left to the garbage collector.
with open(DATA_IN_PATH + TRAIN_INPUTS, 'wb') as fp:
    np.save(fp, index_inputs)
with open(DATA_IN_PATH + TRAIN_OUTPUTS, 'wb') as fp:
    np.save(fp, index_outputs)
with open(DATA_IN_PATH + TRAIN_TARGETS, 'wb') as fp:
    np.save(fp, index_targets)

# Persist the vocabulary/config dictionary as JSON.
# NOTE(review): JSON object keys are always strings, so if idx2char is keyed
# by integers those keys will come back as strings after json.load -- confirm
# that the consumer converts them.
with open(DATA_IN_PATH + DATA_CONFIGS, 'w') as fp:
    json.dump(data_configs, fp)