In [1]:
import pandas as pd
import numpy as np
import pickle as pkl
import tensorflow as tf
import os, sys, math
import config

tf.set_random_seed(seed=1991)
np.random.seed(seed=1991)

  from ._conv import register_converters as _register_converters


In [2]:
raw_df = pd.read_csv('./data/dataset_kor/교통사망사고정보/Kor_Train_교통사망사고정보(12.1~17.6).csv', engine='python')
raw_df = raw_df[config.test_cols]

for key in config.kv_map.keys():
    raw_df[key] = raw_df[key].apply(lambda x: config.kv_map[key][x])

In [3]:
config.keep_prop = 0.7
config.train_size = 20480
config.cate_len = len(config.cate_cols)
config.cont_len = len(config.cont_cols)
config.cate_lens = [len(config.kv_map[k]) for k in config.cate_cols]

In [4]:
train_df = raw_df[:config.train_size]
valid_df = raw_df[config.train_size:]

# split

In [5]:
train_df_cate = train_df.loc[:,config.cate_cols]
train_df_cont = train_df.loc[:,config.cont_cols]

valid_df_cate = valid_df.loc[:,config.cate_cols]
valid_df_cont = valid_df.loc[:,config.cont_cols]

np.random.seed(seed=910919)
config.vaild_drop_mask = np.random.rand(valid_df_cate.values.shape[0], valid_df_cate.values.shape[1])>(1- config.keep_prop)
config.vaild_cont_mask = np.random.rand(valid_df_cont.values.shape[0], valid_df_cont.values.shape[1])<(1- config.keep_prop)

np.random.seed(seed=3632)
config.vaild_test_drop_mask = np.random.rand(valid_df_cate.values.shape[0], valid_df_cate.values.shape[1])>(1- config.keep_prop)
config.vaild_test_cont_mask = np.random.rand(valid_df_cont.values.shape[0], valid_df_cont.values.shape[1])<(1- config.keep_prop)

# model

In [6]:
config.dir_ckpt = '.\\ckpt'
config.path_ckpt = os.path.join(config.dir_ckpt, 'best.ckpt')
config.l1_size = 256
config.l2_size = 32
config.cate_out_size = sum(config.cate_lens)
config.cont_out_size = config.cont_len
config.epochs = 500
config.batch_size = 1024
config.replace_cont = -1
config.learning_rate = 0.0023

train_step = math.ceil(len(train_df)/ config.batch_size)
valid_step = math.ceil(len(valid_df)/ config.batch_size)

os.makedirs(config.dir_ckpt , exist_ok=True)

In [7]:
def build_model(config):
    tf.reset_default_graph()
    #inputs
    inp_cate = tf.placeholder(dtype=tf.int32, shape=[None, config.cate_len], name='categorical_input_layer')
    inp_cont = tf.placeholder(dtype=tf.float32, shape=[None, config.cont_len], name='continuous_input_layer')
    inp_cont_scale = tf.placeholder(dtype=tf.float32, shape=[None, config.cont_len], name='continuous_scaled_input_layer')
    inp_cont_tf = tf.placeholder(dtype=tf.int32, shape=[None, config.cont_len], name='continuous_tf_input_layer')
    inp_cate_y = tf.placeholder(dtype=tf.int32, shape=[None, config.cate_len], name='categorical_input_y_layer')
    inp_cont_y = tf.placeholder(dtype=tf.float32, shape=[None, config.cont_len], name='continuous_input_y_layer')
    
    #embedding
    cate_oh = [tf.one_hot(inp_cate[:, i], depth=config.cate_lens[i]) for i in range(len(config.cate_cols))]
    cate_oh_y = [tf.one_hot(inp_cate_y[:, i], depth=config.cate_lens[i]) for i in range(len(config.cate_cols))]
    concat_cate_y = tf.concat(cate_oh_y, axis=-1)
    
    oh_cont_tf = [tf.one_hot(inp_cont_tf[:, i], depth=2) for i in range(config.cont_len)]
    
    concat_cate = tf.concat(cate_oh, axis=-1)
    concat_cont_tf = tf.concat(oh_cont_tf, axis=-1)
    concat_all = tf.concat([concat_cate, inp_cont, concat_cont_tf], axis=-1)
    
    #hidden layer
    l1 = tf.contrib.layers.fully_connected(concat_all, config.l1_size)
    l12 = tf.contrib.layers.fully_connected(l1, config.l2_size)
    l21 = tf.contrib.layers.fully_connected(concat_all, config.l1_size)
    l22 = tf.contrib.layers.fully_connected(l21, config.l2_size)
    
    l2 = tf.concat([l12, l22], axis=-1)
    #logits
    logit_cates = [tf.contrib.layers.fully_connected(l1, length, activation_fn = None) for length in config.cate_lens]
    logit_cont1 = tf.contrib.layers.fully_connected(l12, config.cont_out_size, activation_fn = None)
    logit_cont2 = tf.contrib.layers.fully_connected(l2, config.cont_out_size, activation_fn = None)
    logit_cont3 = tf.contrib.layers.fully_connected(l2, config.cont_out_size, activation_fn = None)
    logit_cont = tf.reduce_mean([logit_cont1, logit_cont2, logit_cont3], axis=0)
    
    #predictions
    pred_cates = [tf.nn.softmax(logit) for logit in logit_cates]
    pred_cate = tf.concat(pred_cates, axis=-1)
    pred_cont = logit_cont

    #losses
    loss_cate_ops = [tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(labels=inp_cate_y[:,idx], logits=logit_cates[idx])) for idx in range(11)]
    loss_cate_op = tf.reduce_sum(loss_cate_ops, axis=-1)
    loss_cont_op = tf.losses.mean_squared_error(inp_cont_y, pred_cont)
    
    loss_op = loss_cate_op + loss_cont_op*2
    
    #opt
    optimizer = tf.train.AdamOptimizer(learning_rate=config.learning_rate)
    train_op = optimizer.minimize(loss_op)
    
    return loss_op, loss_cate_op, loss_cont_op, train_op, inp_cate, inp_cont, inp_cont_scale, inp_cont_tf, inp_cate_y, inp_cont_y, pred_cate, pred_cont

In [8]:
mdl = build_model(config)

saver = tf.train.Saver(max_to_keep=5)
init = tf.global_variables_initializer()

In [11]:
def run_session(sess, max_step, datas, config, epoch, mode=1):
    total_loss = np.array([0.0, 0.0, 0.0])
    loss_op, loss_cate_op, loss_cont_op, train_op, inp_cate, inp_cont, inp_cont_scale, inp_cont_tf, inp_cate_y, inp_cont_y, pred_cate, pred_cont = mdl

    #train
    if mode ==1:
        drop_mask = np.random.rand(datas[0].values.shape[0], datas[0].values.shape[1])> (1- config.keep_prop)
        droped_cate = datas[0].values.copy()
        droped_cate = droped_cate * drop_mask
        
        cont_mask = np.random.rand(datas[1].values.shape[0], datas[1].values.shape[1])< (1- config.keep_prop)
        droped_cont = datas[1].values.copy()
        droped_cont[cont_mask] = config.replace_cont
        droped_cont_scaled = (droped_cont - config.cont_min)/(config.cont_max - config.cont_min)
        
        for step in range(max_step):
            start_idx = step * config.batch_size
            loss_ = sess.run([loss_op, loss_cate_op, loss_cont_op, train_op], feed_dict = {inp_cate:droped_cate[start_idx:start_idx+config.batch_size], 
                                                                  inp_cont:droped_cont[start_idx:start_idx+config.batch_size],
                                                                                           inp_cont_scale:droped_cont_scaled[start_idx:start_idx+config.batch_size],
                                                                                           inp_cont_tf:cont_mask[start_idx:start_idx+config.batch_size],
                                                                  inp_cate_y:datas[0].values[start_idx:start_idx+config.batch_size],
                                                                  inp_cont_y:datas[1].values[start_idx:start_idx+config.batch_size]})
            
            total_loss += loss_[:-1]
        total_loss /= max_step
        return total_loss 
    
    #validation
    elif mode == 2:
        droped_valid_cate = datas[0].values * config.vaild_drop_mask
        droped_valid_cont = datas[1].values.copy()
        droped_valid_cont[config.vaild_cont_mask] = config.replace_cont
        droped_valid_cont_scaled = (droped_valid_cont - config.cont_min)/(config.cont_max - config.cont_min)
        
        for step in range(max_step):
            start_idx = step * config.batch_size
            loss_ = sess.run([loss_op, loss_cate_op, loss_cont_op], feed_dict = {inp_cate:droped_valid_cate[start_idx:start_idx+config.batch_size], 
                                                   inp_cont:droped_valid_cont[start_idx:start_idx+config.batch_size], 
                                                                                 inp_cont_scale:droped_valid_cont_scaled[start_idx:start_idx+config.batch_size],
                                                                                 inp_cont_tf:config.vaild_cont_mask[start_idx:start_idx+config.batch_size],
                                                                                 
                                                   inp_cate_y:datas[0].values[start_idx:start_idx+config.batch_size],
                                                   inp_cont_y:datas[1].values[start_idx:start_idx+config.batch_size]})
            
            total_loss += loss_
        total_loss /= max_step
        return total_loss 
    
    #valid_test
    elif mode == 3:
        droped_valid_cate = datas[0].values * config.vaild_test_drop_mask
        droped_valid_cont = datas[1].values.copy()
        droped_valid_cont[config.vaild_test_cont_mask] = config.replace_cont
        droped_valid_cont_scaled = (droped_valid_cont - config.cont_min)/(config.cont_max - config.cont_min)
        
        
        preds_cate, preds_cont = [], []
        for step in range(max_step):
            start_idx = step * config.batch_size
            pred_cate_, pred_cont_ = sess.run([pred_cate, pred_cont], 
                                              feed_dict = {inp_cate:droped_valid_cate[start_idx:start_idx+config.batch_size], 
                                                           inp_cont:droped_valid_cont[start_idx:start_idx+config.batch_size],
                                                           inp_cont_scale:droped_valid_cont_scaled[start_idx:start_idx+config.batch_size],
                                                           inp_cont_tf:config.vaild_test_cont_mask[start_idx:start_idx+config.batch_size]})
            preds_cate.append(pred_cate_)
            preds_cont.append(pred_cont_)
            
        return preds_cate, preds_cont
    
    else:
        print('error')
        return None

In [10]:
# =======================================================================================
# run session
with tf.Session() as sess:
    tf.set_random_seed(seed=1991)
    np.random.seed(seed=1991)
    sess.run(init)
#     saver.restore(sess, config.path_ckpt)
    min_val_loss = 9999
    
    for epoch in range(1, config.epochs+1):
        print("Epoch : {}".format(epoch), end='\t')
        
        #train
        trn_total_loss_ = run_session(sess, train_step, [train_df_cate, train_df_cont], config, epoch, mode=1)
        print("Train_loss : {:.6f} / {:.6f} / {:.6f}".format(trn_total_loss_[0], trn_total_loss_[1], trn_total_loss_[2]), end = '\t')
        
        #valid
        val_total_loss_ = run_session(sess, valid_step, [valid_df_cate, valid_df_cont], config, epoch, mode=2)
        print("Valid_loss : {:.6f} / {:.6f} / {:.6f}".format(val_total_loss_[0], val_total_loss_[1], val_total_loss_[2]), end = '\t')
        
        if val_total_loss_[0] < min_val_loss:
            saver.save(sess, config.path_ckpt)
            min_val_loss = val_total_loss_[0]
            print("Saved")
        else:
            print("No Saved")

Epoch : 1	Train_loss : 23.273476 / 21.367727 / 0.952875	Valid_loss : 17.904607 / 16.697383 / 0.603612	Saved
Epoch : 2	Train_loss : 14.813630 / 14.080837 / 0.366396	Valid_loss : 12.508539 / 11.691790 / 0.408375	Saved
Epoch : 3	Train_loss : 10.482196 / 10.038865 / 0.221666	Valid_loss : 9.224146 / 8.594645 / 0.314751	Saved
Epoch : 4	Train_loss : 7.971938 / 7.613899 / 0.179019	Valid_loss : 7.284482 / 6.765380 / 0.259551	Saved
Epoch : 5	Train_loss : 6.710840 / 6.208759 / 0.251040	Valid_loss : 6.199692 / 5.709598 / 0.245047	Saved
Epoch : 6	Train_loss : 5.774313 / 5.360130 / 0.207092	Valid_loss : 5.531622 / 5.081039 / 0.225291	Saved
Epoch : 7	Train_loss : 5.332740 / 4.832860 / 0.249940	Valid_loss : 5.063585 / 4.686594 / 0.188495	Saved
Epoch : 8	Train_loss : 4.868441 / 4.465268 / 0.201586	Valid_loss : 4.774282 / 4.430639 / 0.171822	Saved
Epoch : 9	Train_loss : 4.649607 / 4.265882 / 0.191862	Valid_loss : 4.595292 / 4.271451 / 0.161921	Saved
Epoch : 10	Train_loss : 4.413883 / 4.134060 / 0.139911

KeyboardInterrupt: 

In [12]:
# =======================================================================================
# valid_test
with tf.Session() as sess:
    saver.restore(sess, config.path_ckpt)
    preds_cate_, preds_cont_ = run_session(sess, valid_step, [valid_df_cate, valid_df_cont], config, epoch, mode=3)
    
    preds_cate_ = np.concatenate(preds_cate_)
    preds_cont_ = np.concatenate(preds_cont_)
    #cliping
    preds_cont_[preds_cont_<0] = 0.0

INFO:tensorflow:Restoring parameters from .\ckpt\best.ckpt


In [13]:
# Categorical Vars Eval
pred_args = []
for p_ in preds_cate_:
    start_idx = 0
    pred_arg = []
    for kl in config.cate_lens:
        pred_arg.append(np.argmax(p_[start_idx: start_idx+kl]))
        start_idx += kl
    pred_args.append(pred_arg)

pred_args_np = np.array(pred_args)

val_test_ijv = []
for i, vs in enumerate(config.vaild_test_drop_mask):
    for j, v in enumerate(vs):
        if v:
            val_test_ijv.append([i,j,valid_df_cate.values[i,j]])    

sum_true = 0
for ijv in val_test_ijv:
    i,j,v = ijv
    sum_true+=(pred_args_np[i, j] == v)

val_tst_score = sum_true/len(val_test_ijv)
print("val_tst_score : {:6f}".format(val_tst_score))

# Numerical Vars Eval
val_test_ijv = []
for i, vs in enumerate(config.vaild_test_cont_mask):
    for j, v in enumerate(vs):
        if v:
            val_test_ijv.append([i,j,valid_df_cont.values[i,j]]) 

mse = 0.0
score = 0.0
for ijv in val_test_ijv:
    i,j,v = ijv
    mse += ((v - preds_cont_[i, j])**2)
    score += math.exp(-((v - preds_cont_[i, j])**2))

mse /= len(val_test_ijv)
score /= len(val_test_ijv)
print("mse : {:6f}, \tscore : {:6f}".format(mse, score))

val_tst_score : 0.999715
mse : 1.450852, 	score : 0.937216


# End