In [1]:
import pandas as pd
import numpy as np
import pickle as pkl
import tensorflow as tf
import os, sys, math
import config

tf.set_random_seed(seed=1991)
np.random.seed(seed=1991)

  from ._conv import register_converters as _register_converters


In [2]:
raw_df = pd.read_csv('./data/dataset_kor/교통사망사고정보/Kor_Train_교통사망사고정보(12.1~17.6).csv', engine='python')
raw_df = raw_df[config.test_cols]

In [3]:
for key in config.kv_map.keys():
#    print(key)
    raw_df[key] = raw_df[key].apply(lambda x: config.kv_map[key][x])

In [4]:
config.keep_prop = 0.7
config.train_size = 22528
config.cate_len = len(config.cate_cols)
config.cont_len = len(config.cont_cols)
config.cate_lens = [len(config.kv_map[k]) for k in config.cate_cols]

In [5]:
train_df = raw_df[:config.train_size]
valid_df = raw_df[config.train_size:]

# train_df = raw_df[len(raw_df)-config.train_size:]
# valid_df = raw_df[:len(raw_df)-config.train_size]

# split

In [6]:
train_df_cate = train_df.loc[:,config.cate_cols]
train_df_cont = train_df.loc[:,config.cont_cols]

In [7]:
valid_df_cate = valid_df.loc[:,config.cate_cols]
valid_df_cont = valid_df.loc[:,config.cont_cols]

In [8]:
np.random.seed(seed=910919)
config.vaild_drop_mask = np.random.rand(valid_df_cate.values.shape[0], valid_df_cate.values.shape[1])> (1- config.keep_prop)

In [9]:
np.random.seed(seed=3632)
config.vaild_test_drop_mask = np.random.rand(valid_df_cate.values.shape[0], valid_df_cate.values.shape[1])> (1- config.keep_prop)

# model

In [10]:
config.dir_ckpt = '.\\ckpt'
config.path_ckpt = os.path.join(config.dir_ckpt, 'best.ckpt')
config.l1_size = 128
config.l2_size = 64
config.cate_out_size = sum(config.cate_lens)
config.cont_out_size = config.cont_len
config.epochs = 600
config.batch_size = 64

train_step = math.ceil(len(train_df)/ config.batch_size)
valid_step = math.ceil(len(valid_df)/ config.batch_size)

os.makedirs(config.dir_ckpt , exist_ok=True)

In [35]:
def build_model(config):
    tf.reset_default_graph()
    inp_cate = tf.placeholder(dtype=tf.int32, shape=[None, config.cate_len], name='categorical_input_layer')
    inp_cont = tf.placeholder(dtype=tf.float32, shape=[None, config.cont_len], name='continuous_input_layer')
    inp_cate_y = tf.placeholder(dtype=tf.int32, shape=[None, config.cate_len], name='categorical_input_y_layer')
    inp_cont_y = tf.placeholder(dtype=tf.float32, shape=[None, config.cont_len], name='continuous_input_y_layer')

    cate_oh = [tf.one_hot(inp_cate[:, i], depth=config.cate_lens[i]) for i in range(len(config.cate_cols))]
    cate_oh_y = [tf.one_hot(inp_cate_y[:, i], depth=config.cate_lens[i]) for i in range(len(config.cate_cols))]
    concat_cate_y = tf.concat(cate_oh_y, axis=-1)
    
    concat_cate = tf.concat(cate_oh, axis=-1)
    concat_all = tf.concat([concat_cate, inp_cont], axis=-1)
    
    l1 = tf.contrib.layers.fully_connected(concat_all, config.l1_size)
#    l2 = tf.contrib.layers.fully_connected(l1, config.l1_size)
    
    logit_cates = [tf.contrib.layers.fully_connected(l1, l, activation_fn = None) for l in config.cate_lens]
    logit_cont = tf.contrib.layers.fully_connected(l1, config.cont_out_size, activation_fn = None)
    
    pred_cates = [tf.nn.softmax(logit) for logit in logit_cates]
    pred_cate = tf.concat(pred_cates, axis=-1)
    pred_cont = tf.nn.softmax(logit_cont)
    
#    loss_cate_op = tf.losses.mean_squared_error(concat_cate_y, pred_cate)
    loss_cate_ops = [tf.reduce_sum(tf.nn.sparse_softmax_cross_entropy_with_logits(labels=inp_cate_y[:,idx], logits=logit_cates[idx])) for idx in range(11)]
    loss_cate_op = tf.reduce_sum(loss_cate_ops, axis=-1)

    loss_cont_op = tf.losses.mean_squared_error(inp_cont_y, pred_cont)
    loss_op = loss_cate_op
    #+ loss_cont_op
    
    optimizer = tf.train.AdamOptimizer(learning_rate=0.0005)
    train_op = optimizer.minimize(loss_op)
    
    return loss_op, train_op, inp_cate, inp_cont, inp_cate_y, inp_cont_y, pred_cate, pred_cont

In [36]:
mdl = build_model(config)

saver = tf.train.Saver(max_to_keep=5)
init = tf.global_variables_initializer()

In [37]:
def run_session(sess, max_step, datas, config, epoch, mode=1):
    total_loss = 0.0
    loss_op, train_op, inp_cate, inp_cont, inp_cate_y, inp_cont_y, pred_cate, pred_cont = mdl

    #train
    if mode ==1:
        drop_mask = np.random.rand(datas[0].values.shape[0], datas[0].values.shape[1])> (1- config.keep_prop)
        droped_cate = datas[0].values * drop_mask
        
        for step in range(max_step):
            start_idx = step * config.batch_size
            loss_, _ = sess.run([loss_op, train_op], feed_dict = {inp_cate:droped_cate[start_idx:start_idx+config.batch_size], 
                                                                  inp_cont:datas[1].values[start_idx:start_idx+config.batch_size], 
                                                                  inp_cate_y:datas[0].values[start_idx:start_idx+config.batch_size],
                                                                  inp_cont_y:datas[1].values[start_idx:start_idx+config.batch_size]})
            
#            print(loss_)
            total_loss += loss_
        total_loss /= max_step
        return total_loss 
    
    #validation
    elif mode == 2:
        droped_valid_cate = datas[0].values * config.vaild_drop_mask
        
        for step in range(max_step):
            start_idx = step * config.batch_size
            loss_ = sess.run(loss_op, feed_dict = {inp_cate:droped_valid_cate[start_idx:start_idx+config.batch_size], 
                                                   inp_cont:datas[1].values[start_idx:start_idx+config.batch_size], 
                                                   inp_cate_y:datas[0].values[start_idx:start_idx+config.batch_size],
                                                   inp_cont_y:datas[1].values[start_idx:start_idx+config.batch_size]})
            
            total_loss += loss_
        total_loss /= max_step
        return total_loss 
    
    #valid_test
    elif mode == 3:
        droped_valid_cate = datas[0].values * config.vaild_test_drop_mask
        preds_cate, preds_cont = [], []
        for step in range(max_step):
            start_idx = step * config.batch_size
            pred_cate_, pred_cont_ = sess.run([pred_cate, pred_cont], 
                                              feed_dict = {inp_cate:droped_valid_cate[start_idx:start_idx+config.batch_size], 
                                                           inp_cont:datas[1].values[start_idx:start_idx+config.batch_size]})
            preds_cate.append(pred_cate_)
            preds_cont.append(pred_cont_)
            
        return preds_cate, preds_cont
    
    else:
        print('error')
        return None

In [38]:
# =======================================================================================
# run session
with tf.Session() as sess:
    tf.set_random_seed(seed=1991)
    np.random.seed(seed=1991)
    sess.run(init)
    min_val_loss = 9999
    
    for epoch in range(1, config.epochs+1):
        print("Epoch : {}".format(epoch), end='\t')
        
        #train
        trn_total_loss_ = run_session(sess, train_step, [train_df_cate, train_df_cont], config, epoch, mode=1)
        print(trn_total_loss_)
        print("Train_loss : {:.6f}".format(trn_total_loss_), end = '\t')
        
        #valid
        val_total_loss_ = run_session(sess, valid_step, [valid_df_cate, valid_df_cont], config, epoch, mode=2)
        print("Valid_loss : {:.6f}".format(val_total_loss_), end = '\t')
        
        if val_total_loss_ < min_val_loss:
            saver.save(sess, config.path_ckpt)
            min_val_loss = val_total_loss_
            print("Saved")
        else:
            print("No Saved")

Epoch : 1	1070.1061467257414
Train_loss : 1070.106147	Valid_loss : 733.772060	Saved
Epoch : 2	633.4662672389637
Train_loss : 633.466267	Valid_loss : 539.211455	Saved
Epoch : 3	490.59025851163
Train_loss : 490.590259	Valid_loss : 447.232726	Saved
Epoch : 4	417.1036650917747
Train_loss : 417.103665	Valid_loss : 392.022780	Saved
Epoch : 5	369.46913892572576
Train_loss : 369.469139	Valid_loss : 355.940692	Saved
Epoch : 6	335.4026738947088
Train_loss : 335.402674	Valid_loss : 330.145559	Saved
Epoch : 7	314.40616403926504
Train_loss : 314.406164	Valid_loss : 312.230476	Saved
Epoch : 8	297.05015507611364
Train_loss : 297.050155	Valid_loss : 297.827835	Saved
Epoch : 9	284.5749568939209
Train_loss : 284.574957	Valid_loss : 286.191349	Saved
Epoch : 10	271.0269792730158
Train_loss : 271.026979	Valid_loss : 278.503057	Saved
Epoch : 11	266.8626154119318
Train_loss : 266.862615	Valid_loss : 272.647685	Saved
Epoch : 12	260.52639350024134
Train_loss : 260.526394	Valid_loss : 267.003379	Saved
Epoch : 1

KeyboardInterrupt: 

In [39]:
# =======================================================================================
# valid_test
with tf.Session() as sess:
    saver.restore(sess, config.path_ckpt)
    preds_cate_, preds_cont_ = run_session(sess, valid_step, [valid_df_cate, valid_df_cont], config, epoch, mode=3)
    
    preds_cate_ = np.concatenate(preds_cate_)
    preds_cont_ = np.concatenate(preds_cont_)


INFO:tensorflow:Restoring parameters from .\ckpt\best.ckpt


In [40]:
pred_args = []
for p_ in preds_cate_:
    start_idx = 0
    pred_arg = []
    for kl in config.cate_lens:
        pred_arg.append(np.argmax(p_[start_idx: start_idx+kl]))
        start_idx += kl
    pred_args.append(pred_arg)

pred_args_np = np.array(pred_args)

val_test_ijv = []
for i, vs in enumerate(config.vaild_test_drop_mask):
    for j, v in enumerate(vs):
        if v:
            val_test_ijv.append([i,j,valid_df_cate.values[i,j]])    

sum_true = 0
for ijv in val_test_ijv:
    i,j,v = ijv
    sum_true+=(pred_args_np[i, j] == v)

val_tst_score = sum_true/len(val_test_ijv)
print("val_tst_score : {:6f}".format(val_tst_score))

val_tst_score : 0.999429


Epoch : 500	Train_loss : 0.004380	Valid_loss : 0.004452	Saved => 0.987826