In [None]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
%matplotlib inline


In [None]:
# Directory that holds the input CSV files.
DATA_IN_PATH = './data_in/'

# Print the size (in MB, 2 decimals) of every entry whose name contains 'csv'.
print('파일 크기: ')
csv_names = [name for name in os.listdir(DATA_IN_PATH) if 'csv' in name]
for name in csv_names:
    size_mb = round(os.path.getsize(DATA_IN_PATH + name) / 1000000, 2)
    print(name.ljust(30) + str(size_mb) + 'MB')

In [None]:
# Read the training CSV and preview its first rows.
train_csv_path = DATA_IN_PATH + 'train.csv'
train_data = pd.read_csv(train_csv_path)
train_data.head()

In [None]:
# Read the test CSV and preview its first rows.
test_csv_path = DATA_IN_PATH + 'test.csv'
test_data = pd.read_csv(test_csv_path)
test_data.head()

In [None]:
# Report how many rows each split contains.
n_train_rows, n_test_rows = len(train_data), len(test_data)
print("전체 학습데이터 갯수 : {}".format(n_train_rows))
print("전체 테스트 갯수 : {}".format(n_test_rows))

In [None]:
# Character length of every training text.
# Missing values are replaced with '' first: a bare astype(str) would turn NaN
# into the literal string 'nan' and count each empty row as 3 characters,
# skewing the min/mean/quartile statistics computed from this series.
train_long = train_data['data'].fillna('').astype(str).str.len()

In [None]:
# Preview the first five text lengths.
train_long.head(n=5)

In [None]:
# Log-scaled histogram of the text lengths in the training set.
plt.figure(figsize=(12, 5))
plt.hist(train_long, bins=300, alpha=0.4, color='b', label='word')
# `nonposy` was deprecated in Matplotlib 3.3 and removed afterwards;
# `nonpositive` is the current keyword (clips non-positive values on the log axis).
plt.yscale('log', nonpositive='clip')
plt.title("Log Histogram of length - voice")
plt.xlabel('의견 길이')
plt.ylabel('의견의 수')

In [None]:
# Summary statistics of the text lengths (NumPy reductions over train_long).
length_max = np.max(train_long)
length_min = np.min(train_long)
length_mean = np.mean(train_long)
length_std = np.std(train_long)
length_median = np.median(train_long)

print("의견 길이 최대값 : {}".format(length_max))
print("의견 길이 최소값 : {}".format(length_min))
print("의견 길이 평균값 : {}".format(length_mean))
print("의견 길이 표준편차, 중간값 : {}, {}".format(length_std, length_median))

In [None]:
# First quartile of the text lengths, followed by a box plot with the mean marked.
first_quartile = np.percentile(train_long, 25)
print("의견 길이 1사분위값 :", first_quartile, "글자")

plt.figure(figsize=(12, 5))
plt.boxplot(train_long, labels=['counts'], showmeans=True)

In [None]:
# Number of rows per category.
# The original passed the un-called method `.describe`, which printed the
# bound-method repr instead of any statistics; value_counts() yields the
# per-category counts that the printed label announces.
print("각 특징 의견 개수 ", train_data['category'].value_counts())

In [None]:
import numpy as np
import re
import json
from konlpy.tag import Okt #nltk
from tensorflow.python.keras.preprocessing.sequence import pad_sequences
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tqdm import tqdm

In [None]:
# Okt analyzer instance handed to preprocessing() for morpheme splitting.
okt = Okt()

# Tokens dropped from the morpheme output when stop-word removal is enabled.
stop_words = [
    '은', '는', '이', '가', '하', '아', '것', '들',
    '의', '있', '되', '수', '보', '주', '등', '한',
]
def preprocessing(voice_text, okt, remove_stopwords = False, stop_words = None):
    """Clean one raw text and split it into stemmed morphemes.

    Args:
        voice_text: Raw text (str) to clean.
        okt: Analyzer object exposing ``morphs(text, stem=True)``.
        remove_stopwords: When True, tokens present in ``stop_words`` are dropped.
        stop_words: Iterable of tokens to drop; treated as empty when omitted.

    Returns:
        list: The tokens produced by ``okt.morphs``, filtered by ``stop_words``
        only when ``remove_stopwords`` is True.
    """
    # Strip everything except Hangul syllables, standalone jamo and whitespace.
    voice_text = re.sub("[^가-힣ㄱ-ㅎㅏ-ㅣ\\s]", "", voice_text)

    tokens = okt.morphs(voice_text, stem=True)

    if not remove_stopwords:
        # Previously the result variable was only assigned inside the
        # stop-word branch, so this path raised UnboundLocalError.
        return list(tokens)

    # Build the lookup set once instead of scanning a list for every token.
    blocked = set(stop_words or ())
    return [token for token in tokens if token not in blocked]

In [None]:
# Tokenize every training text. Entries that are not plain strings (e.g. empty
# cells) become an empty token list so the pass never stops on missing data.
clean_train_voice = [
    preprocessing(voice, okt, remove_stopwords = True, stop_words=stop_words)
    if type(voice) is str
    else []
    for voice in tqdm(train_data['data'])
]

In [None]:
# Tokenize every test text; non-string entries map to an empty token list.
clean_test_voice = [
    preprocessing(voice, okt, remove_stopwords = True, stop_words=stop_words)
    if type(voice) is str
    else []
    for voice in tqdm(test_data['data'])
]

In [None]:
# Fit the vocabulary on the training tokens only, then index both splits with it.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(clean_train_voice)

train_seq, test_seq = [
    tokenizer.texts_to_sequences(corpus)
    for corpus in (clean_train_voice, clean_test_voice)
]

# Token -> integer index mapping learned by the tokenizer.
word_vocab = tokenizer.word_index

In [None]:
# Maximum sentence length (in tokens) of the padded sequences.
MAX_SEQ_LEN = 512

# Both splits share the same padding settings: pad at the end up to MAX_SEQ_LEN.
pad_options = dict(maxlen=MAX_SEQ_LEN, padding='post')

train_inputs = pad_sequences(train_seq, **pad_options)
test_inputs = pad_sequences(test_seq, **pad_options)

# Labels come straight from the 'category' column of the training frame.
train_labels = np.array(train_data['category'])

In [None]:
# Output directory and file names for the preprocessed artifacts.
DATA_IN_PATH = './data_in/'
train_input_data = 'train_input.npy'
train_label_data = 'train_label.npy'
test_input_data = 'test_input.npy'
DATA_CONFIGS = 'config_data.json'

# Vocabulary metadata stored next to the arrays.
data_configs = {
    'vocab': word_vocab,
    'vocab_size': len(word_vocab),
}

In [None]:
# Create the output directory if needed. exist_ok=True replaces the separate
# exists() check, removing the window between the check and the creation.
os.makedirs(DATA_IN_PATH, exist_ok=True)

In [None]:
# Persist the padded input arrays. The files are opened in `with` blocks so the
# handles are flushed and closed deterministically (the original never closed them).
with open(DATA_IN_PATH+train_input_data, 'wb') as train_input_file:
    np.save(train_input_file, train_inputs)
with open(DATA_IN_PATH+test_input_data, 'wb') as test_input_file:
    np.save(test_input_file, test_inputs)

In [None]:
# Persist the training labels; `with` guarantees the handle is flushed and
# closed (the original left it open).
with open(DATA_IN_PATH+train_label_data, 'wb') as train_label_file:
    np.save(train_label_file, train_labels)

In [None]:
# Write the vocabulary config as JSON. The handle is closed via `with` so the
# content is fully flushed to disk (the original never closed the file).
# NOTE(review): the file is still written with the platform default encoding
# while ensure_ascii=False emits raw non-ASCII text — confirm readers of this
# file use the same encoding before pinning one here.
with open(DATA_IN_PATH + DATA_CONFIGS, 'w') as config_file:
    json.dump(data_configs, config_file, ensure_ascii=False)
