In [1]:
import pandas as pd
import numpy as np
import pickle as pkl
import tensorflow as tf
import os, sys, math
import config

tf.set_random_seed(seed=1991)
np.random.seed(seed=1991)

  from ._conv import register_converters as _register_converters


In [2]:
raw_df = pd.read_csv('./data/dataset_kor/교통사망사고정보/Kor_Train_교통사망사고정보(12.1~17.6).csv', engine='python')
raw_df = raw_df[config.test_cols]

In [3]:
for key in config.kv_map.keys():
#    print(key)
    raw_df[key] = raw_df[key].apply(lambda x: config.kv_map[key][x])

In [4]:
config.keep_prop = 0.7
config.train_size = 20480
config.cate_len = len(config.cate_cols)
config.cont_len = len(config.cont_cols)
config.cate_lens = [len(config.kv_map[k]) for k in config.cate_cols]

In [5]:
train_df = raw_df[:config.train_size]
valid_df = raw_df[config.train_size:]

# split

In [6]:
train_df_cate = train_df.loc[:,config.cate_cols]
train_df_cont = train_df.loc[:,config.cont_cols]

In [7]:
valid_df_cate = valid_df.loc[:,config.cate_cols]
valid_df_cont = valid_df.loc[:,config.cont_cols]

In [8]:
np.random.seed(seed=910919)
config.vaild_drop_mask = np.random.rand(valid_df_cate.values.shape[0], valid_df_cate.values.shape[1])> (1- config.keep_prop)

In [73]:
np.random.seed(seed=3632)
config.vaild_test_drop_mask = np.random.rand(valid_df_cate.values.shape[0], valid_df_cate.values.shape[1])> (1- config.keep_prop)

# model

In [131]:
config.dir_ckpt = '.\\ckpt'
config.path_ckpt = os.path.join(config.dir_ckpt, 'best.ckpt')
config.l1_size = 32
config.out_size = sum(config.cate_lens)
config.epochs = 500
config.batch_size = 1024

train_step = math.ceil(len(train_df)/ config.batch_size)
valid_step = math.ceil(len(valid_df)/ config.batch_size)

os.makedirs(config.dir_ckpt , exist_ok=True)

In [132]:
def build_model(config):
    tf.reset_default_graph()
    inp_cate = tf.placeholder(dtype=tf.int32, shape=[None, config.cate_len], name='categorical_input_layer')
    inp_cont = tf.placeholder(dtype=tf.int32, shape=[None, config.cont_len], name='continuous_input_layer')
    inp_cate_y = tf.placeholder(dtype=tf.int32, shape=[None, config.cate_len], name='categorical_input_y_layer')
    inp_cont_y = tf.placeholder(dtype=tf.int32, shape=[None, config.cont_len], name='continuous_input_y_layer')

    cate_oh = [tf.one_hot(inp_cate[:, i], depth=config.cate_lens[i]) for i in range(len(config.cate_cols))]
    cate_oh_y = [tf.one_hot(inp_cate_y[:, i], depth=config.cate_lens[i]) for i in range(len(config.cate_cols))]

    concat_cate = tf.concat(cate_oh, axis=-1)
    concat_cate_y = tf.concat(cate_oh_y, axis=-1)

    l1 = tf.contrib.layers.fully_connected(concat_cate, config.l1_size)
    l2 = tf.contrib.layers.fully_connected(concat_cate, config.out_size, activation_fn = tf.nn.sigmoid)
    
    pred = l2

    loss_op = tf.losses.mean_squared_error(concat_cate_y, l2)
    optimizer = tf.train.AdamOptimizer()
    train_op = optimizer.minimize(loss_op)

    return loss_op, train_op, inp_cate, inp_cont, inp_cate_y, inp_cont_y, pred


In [133]:
mdl = build_model(config)

In [134]:
saver = tf.train.Saver(max_to_keep=5)
init = tf.global_variables_initializer()

In [135]:
def run_session(sess, max_step, datas, config, epoch, mode=1):
    total_loss = 0.0
    loss_op, train_op, inp_cate, inp_cont, inp_cate_y, inp_cont_y, pred = mdl

    #train
    if mode ==1:
        drop_mask = np.random.rand(datas[0].values.shape[0], datas[0].values.shape[1])> (1- config.keep_prop)
        droped_cate = datas[0].values * drop_mask
        
        for step in range(max_step):
            start_idx = step * config.batch_size
            loss_, _ = sess.run([loss_op, train_op], feed_dict = {inp_cate:droped_cate[start_idx:start_idx+config.batch_size], 
                                                                  inp_cont:datas[1].values[start_idx:start_idx+config.batch_size], 
                                                                  inp_cate_y:datas[0].values[start_idx:start_idx+config.batch_size]})
            
            total_loss += loss_
        total_loss /= max_step
        return total_loss 
    
    #validation
    elif mode == 2:
        droped_valid_cate = datas[0].values * config.vaild_drop_mask
        
        for step in range(max_step):
            start_idx = step * config.batch_size
            loss_ = sess.run(loss_op, feed_dict = {inp_cate:droped_valid_cate[start_idx:start_idx+config.batch_size], 
                                                   inp_cont:datas[1].values[start_idx:start_idx+config.batch_size], 
                                                   inp_cate_y:datas[0].values[start_idx:start_idx+config.batch_size]})
            
            total_loss += loss_
        total_loss /= max_step
        return total_loss 
    
    #valid_test
    elif mode == 3:
        droped_valid_cate = datas[0].values * config.vaild_test_drop_mask
        preds = []
        for step in range(max_step):
            start_idx = step * config.batch_size
            pred_ = sess.run(pred, feed_dict = {inp_cate:droped_valid_cate[start_idx:start_idx+config.batch_size], 
                                                inp_cont:datas[1].values[start_idx:start_idx+config.batch_size]})
            preds.append(pred_)
            
        return preds
    
    else:
        print('error')
        return None

In [136]:
# =======================================================================================
# run session
with tf.Session() as sess:
    tf.set_random_seed(seed=1991)
    np.random.seed(seed=1991)
    sess.run(init)
    min_val_loss = 9999
    
    for epoch in range(1, config.epochs+1):
        print("Epoch : {}".format(epoch), end='\t')
        
        #train
        trn_total_loss_ = run_session(sess, train_step, [train_df_cate, train_df_cont], config, epoch, mode=1)
        print("Train_loss : {:.6f}".format(trn_total_loss_), end = '\t')
        
        #valid
        val_total_loss_ = run_session(sess, valid_step, [valid_df_cate, valid_df_cont], config, epoch, mode=2)
        print("Valid_loss : {:.6f}".format(val_total_loss_), end = '\t')
        
        if val_total_loss_ < min_val_loss:
            saver.save(sess, config.path_ckpt)
            min_val_loss = val_total_loss_
            print("Saved")
        else:
            print("No Saved")

Epoch : 1	Train_loss : 0.227814	Valid_loss : 0.201575	Saved
Epoch : 2	Train_loss : 0.180795	Valid_loss : 0.159486	Saved
Epoch : 3	Train_loss : 0.143410	Valid_loss : 0.127049	Saved
Epoch : 4	Train_loss : 0.115071	Valid_loss : 0.102857	Saved
Epoch : 5	Train_loss : 0.094035	Valid_loss : 0.084999	Saved
Epoch : 6	Train_loss : 0.078521	Valid_loss : 0.071767	Saved
Epoch : 7	Train_loss : 0.066933	Valid_loss : 0.061840	Saved
Epoch : 8	Train_loss : 0.058201	Valid_loss : 0.054271	Saved
Epoch : 9	Train_loss : 0.051460	Valid_loss : 0.048399	Saved
Epoch : 10	Train_loss : 0.046195	Valid_loss : 0.043761	Saved
Epoch : 11	Train_loss : 0.042003	Valid_loss : 0.040034	Saved
Epoch : 12	Train_loss : 0.038626	Valid_loss : 0.036993	Saved
Epoch : 13	Train_loss : 0.035846	Valid_loss : 0.034475	Saved
Epoch : 14	Train_loss : 0.033501	Valid_loss : 0.032362	Saved
Epoch : 15	Train_loss : 0.031560	Valid_loss : 0.030566	Saved
Epoch : 16	Train_loss : 0.029872	Valid_loss : 0.029024	Saved
Epoch : 17	Train_loss : 0.028434	

In [137]:
# =======================================================================================
# valid_test
with tf.Session() as sess:
    saver.restore(sess, config.path_ckpt)
    preds_ = run_session(sess, valid_step, [valid_df_cate, valid_df_cont], config, epoch, mode=3)
    preds_ = np.concatenate(preds_)

pred_args = []
for p_ in preds_:
    start_idx = 0
    pred_arg = []
    for kl in config.cate_lens:
        pred_arg.append(np.argmax(p_[start_idx: start_idx+kl]))
        start_idx += kl
    pred_args.append(pred_arg)

pred_args_np = np.array(pred_args)

val_test_ijv = []
for i, vs in enumerate(config.vaild_test_drop_mask):
    for j, v in enumerate(vs):
        if v:
            val_test_ijv.append([i,j,valid_df_cate.values[i,j]])    

sum_true = 0
for ijv in val_test_ijv:
    i,j,v = ijv
    sum_true+=(pred_args_np[i, j] == v)

val_tst_score = sum_true/len(val_test_ijv)
print("val_tst_score : {:6f}".format(val_tst_score))

INFO:tensorflow:Restoring parameters from .\ckpt\best.ckpt


INFO:tensorflow:Restoring parameters from .\ckpt\best.ckpt


val_tst_score : 0.987826


Epoch : 500	Train_loss : 0.004380	Valid_loss : 0.004452	Saved => 0.987826