In [None]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Directory that holds the input CSV files read by the cells below.
DATA_IN_PATH = './data_in/'

In [None]:
# Report the on-disk size (in MB, two decimals) of every entry in DATA_IN_PATH
# whose name contains 'csv'.
print('파일 크기: ')
csv_names = [name for name in os.listdir(DATA_IN_PATH) if 'csv' in name]
for name in csv_names:
    size_mb = round(os.path.getsize(DATA_IN_PATH + name) / 1000000, 2)
    print(name.ljust(30) + str(size_mb) + 'MB')

In [None]:
# Load the training CSV and preview its first rows.
train_data = pd.read_csv(DATA_IN_PATH + 'train.csv')
train_data.head()

In [None]:
# Load the test CSV and preview its first rows.
test_data = pd.read_csv(DATA_IN_PATH + 'test.csv')
test_data.head()

In [None]:
# Number of rows in the training set.
print("전체 학습데이터 갯수 : {}".format(len(train_data)))

In [None]:
# Number of rows in the test set.
print("전체 테스트 갯수 : {}".format(len(test_data)))

In [None]:
# Character count of every training text. Each entry is stringified first so
# len() is only ever applied to str values.
train_text = train_data['data'].astype(str)
train_long = train_text.map(len)

In [None]:
# Preview the first few text lengths.
train_long.head()

In [None]:
# Histogram of text lengths with a logarithmic count axis so the sparse
# long-length bins remain visible.
plt.figure(figsize=(12, 5))
plt.hist(train_long, bins=300, alpha=0.4, color='b', label='word')
# The `nonposy` keyword was renamed to `nonpositive` in Matplotlib 3.3 and later
# removed. 'clip' is the default treatment of non-positive values, so omitting
# the keyword keeps the same behavior on both old and new Matplotlib versions.
plt.yscale('log')
plt.title("Log Histogram of length - voice")
plt.xlabel('Length of voice')
plt.ylabel('Number of voice')

In [None]:
# Summary statistics of the text lengths: max, min, mean, then std and median.
length_summary = [
    ("의견 길이 최대값 : {}", (np.max(train_long),)),
    ("의견 길이 최소값 : {}", (np.min(train_long),)),
    ("의견 길이 평균값 : {}", (np.mean(train_long),)),
    ("의견 길이 표준편차, 중간값 : {}, {}", (np.std(train_long), np.median(train_long))),
]
for template, values in length_summary:
    print(template.format(*values))

In [None]:
# 25th percentile (first quartile) of the text lengths.
print("의견 길이 1사분위값 :", np.percentile(train_long, 25), "글자")

In [None]:
# Box plot of the text lengths; showmeans=True also marks the mean.
plt.figure(figsize=(12, 5))

plt.boxplot(train_long, labels=['counts'],showmeans=True )

In [None]:
# Summary of the label column. `describe` has to be called: printing the bare
# attribute only shows the bound-method repr instead of the statistics.
print("각 특징 의견 개수 ", train_data['category'].describe())

In [None]:
import tensorflow as tf

In [1]:
import numpy as np
import pandas as pd
import re
import json
from konlpy.tag import Okt #nltk
from tensorflow.python.keras.preprocessing.sequence import pad_sequences
from tensorflow.python.keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.models import Model
from keras.layers import Input, Embedding, Conv1D, BatchNormalization, Activation, Add, MaxPooling1D, Dense
from keras.callbacks import ModelCheckpoint, TensorBoard
from keras.optimizers import SGD
from keras.engine.topology import get_source_inputs
import custom_callbacks
from k_maxpooling import *
import keras.backend as K
from vdcnn import *
from tqdm import tqdm
from datetime import datetime
from sklearn.model_selection import train_test_split

Using TensorFlow backend.


In [None]:
# voice_text = re.sub("[^가-힣ㄱ-ㅎㅏ-ㅣ\\s]", "", train_data['data'][0])
# print(voice_text)
# voice_text = train_data['data'][0]

In [2]:
import os
# Set the TF_FORCE_GPU_ALLOW_GROWTH environment variable for this process.
# NOTE(review): presumably meant to make TensorFlow grow GPU memory on demand
# instead of reserving it all up front — confirm against the TensorFlow GPU guide.
os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true'

In [None]:
# Okt analyzer instance; its morphs() method is used by preprocessing() below.
okt = Okt()

In [None]:
# Tokens dropped from every text when stop-word removal is enabled.
stop_words = ['은', '는', '이', '가', '하', '아', '것', '들','의', '있', '되', '수', '보', '주', '등', '한']
def preprocessing(voice_text, okt, remove_stopwords = False, stop_words = []):
    """Clean one raw text and split it into stemmed tokens.

    Args:
        voice_text: Raw text to clean (str).
        okt: Object exposing ``morphs(text, stem=True)`` that returns the
            tokens of ``text``.
        remove_stopwords: When True, tokens listed in ``stop_words`` are dropped.
        stop_words: Tokens to drop. Only read, never mutated, so the shared
            default list is safe.

    Returns:
        The list of tokens, with stop words removed only if requested.
    """
    # Keep only Hangul syllables, Hangul jamo and whitespace.
    voice_text = re.sub("[^가-힣ㄱ-ㅎㅏ-ㅣ\\s]", "", voice_text)

    words = okt.morphs(voice_text, stem=True)

    if remove_stopwords:
        # A set gives constant-time membership tests for long texts.
        excluded = set(stop_words)
        words = [token for token in words if token not in excluded]
    # `words` is always bound here; previously it was only assigned inside the
    # branch above, so the default remove_stopwords=False raised UnboundLocalError.
    return words

In [None]:
# Tokenise every training text. Entries that are not str (e.g. missing values)
# become an empty token list, so processing never stops on them and the result
# stays aligned with the rows of train_data.
clean_train_voice = [
    preprocessing(voices, okt, remove_stopwords = True, stop_words=stop_words)
    if type(voices) == str
    else []
    for voices in tqdm(train_data['data'])
]

In [None]:
# Tokenise every test text the same way as the training texts; non-str entries
# become an empty token list.
clean_test_voice = [
    preprocessing(voices, okt, remove_stopwords = True, stop_words=stop_words)
    if type(voices) == str
    else []
    for voices in tqdm(test_data['data'])
]

In [None]:
# Fit one tokenizer on the training tokens and then on the test tokens, and
# convert both corpora to integer sequences.
vocab_size = 30000
tokenizer = Tokenizer(num_words=vocab_size)
for corpus in (clean_train_voice, clean_test_voice):
    tokenizer.fit_on_texts(corpus)

train_seq = tokenizer.texts_to_sequences(clean_train_voice)
test_seq = tokenizer.texts_to_sequences(clean_test_voice)

# Token -> integer index mapping learned by the tokenizer.
word_vocab = tokenizer.word_index

In [None]:
# Maximum sequence length; shorter sequences are padded at the end ('post').
MAX_SEQ_LEN = 500

pad_options = dict(maxlen=MAX_SEQ_LEN, padding='post')
train_inputs = pad_sequences(train_seq, **pad_options)
test_inputs = pad_sequences(test_seq, **pad_options)

# Labels come straight from the 'category' column of the training frame.
train_labels = np.array(train_data['category'])

In [None]:
# Output location and file names for the preprocessed arrays and vocabulary.
DATA_IN_PATH = './data_in/'
train_input_data = 'train_input.npy'
train_label_data = 'train_label.npy'
test_input_data = 'test_input.npy'
DATA_CONFIGS = 'config_data.json'

# Vocabulary mapping and its size, written to DATA_CONFIGS further below.
data_configs = {
    'vocab': word_vocab,
    'vocab_size': len(word_vocab),
}

In [None]:
# Number of distinct tokens in the tokenizer's word index.
print(len(word_vocab))

In [None]:
# Create the output directory when it is missing. exist_ok=True replaces the
# separate existence check, which could race with another process.
os.makedirs(DATA_IN_PATH, exist_ok=True)

In [None]:
# Persist the padded input arrays. Passing the path lets NumPy open and close
# the file itself; the previous open(...) handles were never closed. The file
# names already end in '.npy', so NumPy does not alter them.
np.save(DATA_IN_PATH+train_input_data, train_inputs)
np.save(DATA_IN_PATH+test_input_data, test_inputs)

In [None]:
# Persist the training labels. Passing the path lets NumPy manage the file
# handle; the previous open(...) handle was never closed.
np.save(DATA_IN_PATH+train_label_data, train_labels)

In [None]:
# Write the vocabulary config as JSON. The context manager guarantees the file
# is flushed and closed; ensure_ascii=False keeps non-ASCII tokens readable.
with open(DATA_IN_PATH + DATA_CONFIGS, 'w') as config_file:
    json.dump(data_configs, config_file, ensure_ascii=False)

# ----------

In [3]:
# Locations of the preprocessed training arrays and the vocabulary config.
DATA_IN_PATH = './data_in/'
DATA_OUT_PATH = './data_out/'
INPUT_TRAIN_DATA = 'train_input.npy'
INPUT_LABEL_DATA = 'train_label.npy'
DATA_CONFIGS = 'config_data.json'

# Passing paths to np.load lets NumPy manage the file handles; the previous
# open(...) handles were never closed.
input_data = np.load(DATA_IN_PATH + INPUT_TRAIN_DATA)
# Use the to_categorical imported in the import cell rather than
# tf.keras.utils: `tf` is only imported in a cell that is not part of the
# executed sequence, so relying on it breaks Restart & Run All.
input_label = to_categorical(np.load(DATA_IN_PATH + INPUT_LABEL_DATA))
with open(DATA_IN_PATH + DATA_CONFIGS, 'r') as config_file:
    pre_config = json.load(config_file)

In [4]:
# Number of label rows loaded.
len(input_label)

40000

In [5]:
# Show the one-hot encoded labels produced by to_categorical.
print(input_label)

[[0. 0. 1.]
 [1. 0. 0.]
 [0. 1. 0.]
 ...
 [0. 0. 1.]
 [0. 0. 1.]
 [1. 0. 0.]]


In [6]:
# Split settings and hyper-parameters used by the training cells below.
SEED = 42
TEST_SPLIT = 0.2
BATCH_SIZE = 16
NUM_EPOCH = 20
EMB_SIZE = 120
VOCAB_SIZE = pre_config['vocab_size']+1

# X_* hold the input sequences, Y_* the matching one-hot labels.
X_train, X_test, Y_train, Y_test = train_test_split(
    input_data,
    input_label,
    test_size=TEST_SPLIT,
    random_state=SEED,
)

In [7]:
def train(x_train, y_train, x_test, y_test):
    """Build, compile and fit a VDCNN classifier, checkpointing the best weights.

    Args:
        x_train: 2-D array of padded integer sequences used for fitting.
        y_train: One-hot labels for ``x_train`` (one column per class).
        x_test: Validation inputs, same layout as ``x_train``.
        y_test: One-hot validation labels.

    Returns:
        The fitted Keras model, so later cells can run predictions with it.
    """
    # Clear leftover Keras state *before* building the model. Clearing after
    # training (as was done previously) can leave the trained model unusable.
    K.clear_session()

    # Init Keras Model here. The class count and sequence length are taken from
    # the data so the network always matches the arrays it is trained on.
    # NOTE(review): VOCAB_SIZE is computed in the notebook but never passed to
    # VDCNN. If VDCNN's embedding is smaller than the largest token id, that may
    # explain the recorded "GPU sync failed" error — confirm against vdcnn.py.
    model = VDCNN(num_classes=y_train.shape[1],
                  sequence_length=x_train.shape[1],
                  shortcut=False,
                  pool_type='k_max',
                  sorted=False,
                  use_bias=False,embedding_dim=EMB_SIZE)

    model.compile(optimizer=SGD(lr=0.007, momentum=0.99), loss='categorical_crossentropy', metrics=['accuracy'])

    model_json = model.to_json()
    with open("vdcnn_model.json","w") as json_file:
        json_file.write(model_json)                    # Save model architecture
    time_str = datetime.now().isoformat()
    print("{}: Model saved as json.".format(time_str))
    print("")

    # Trainer
    # The checkpoint directory must exist before the first weights file is written.
    os.makedirs("./checkpoints", exist_ok=True)
    # The recorded training log reports the metric as `accuracy`, so the
    # validation metric is `val_accuracy`. Monitoring or formatting with
    # `val_acc` refers to a key that does not exist in the epoch logs.
    # Tensorboard and extra callback to support steps history
#     tensorboard = TensorBoard(log_dir='./logs', histogram_freq=50, write_graph=True, write_images=True)
    checkpointer = ModelCheckpoint(filepath="./checkpoints/vdcnn_weights_val_acc_{val_accuracy:.4f}.h5", period=1,
                                   verbose=10, save_best_only=True, mode='max', monitor='val_accuracy')
#     loss_history = custom_callbacks.loss_history(model, tensorboard)
#     evaluate_step = custom_callbacks.evaluate_step(model, checkpointer, 100, BATCH_SIZE, x_test, y_test)

    # Fit model
    model.fit(x_train, y_train, batch_size=BATCH_SIZE, epochs=NUM_EPOCH, validation_data=(x_test, y_test),
              verbose=1, callbacks=[checkpointer])
    print('-'*30)
    time_str = datetime.now().isoformat()
    print("{}: Done training.".format(time_str))
    print('-'*30)
    print()
    return model

if __name__=='__main__':
    # Keep the fitted model in the notebook namespace for the prediction cell.
    model = train(x_train=X_train, y_train=Y_train, x_test=X_test, y_test=Y_test)

2020-06-11T04:09:46.033555: Model saved as json.


Train on 32000 samples, validate on 8000 samples
Epoch 1/20
 5168/32000 [===>..........................] - ETA: 1:01 - loss: 1.4400 - accuracy: 0.3373

InternalError: GPU sync failed

In [None]:
# Predict a class index for every test sample and write the submission file.
# Requires a trained Keras model bound to `model` in the kernel namespace.
# The stray indentation (an IndentationError) is removed. Keras functional
# models have no `predict_class` method, so the class index is the argmax over
# the predicted class probabilities.
# NOTE(review): the original referenced an undefined `x_test`; the preprocessed
# test inputs saved earlier as test_input.npy are assumed to be the intended
# data — confirm that its row count matches sample_submission.csv.
submission_inputs = np.load(DATA_IN_PATH + 'test_input.npy')
y_pred = np.argmax(model.predict(submission_inputs, batch_size=BATCH_SIZE), axis=1)
sample_submission = pd.read_csv('sample_submission.csv')
sample_submission['category'] = y_pred
sample_submission.to_csv('submission.csv', encoding='utf-8', index=False)