In [1]:
from __future__ import absolute_import
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np 
import sklearn
from datetime import datetime

In [2]:
trn_df = pd.read_csv('./train/train.csv', encoding='utf8')
tst_df = pd.read_csv('./test/test.csv', encoding='utf8')
trn_df.head()

Unnamed: 0,acqic,bacno,cano,conam,contp,csmcu,ecfg,etymd,flbmk,flg_3dsmk,...,iterm,locdt,loctm,mcc,mchno,ovrlt,scity,stocn,stscd,txkey
0,6881,113261,38038,513.8,5,0,N,0,N,N,...,0,33,172652.0,457,59333,N,0,102,0,516056
1,0,134508,45725,465.62,5,0,N,2,N,N,...,0,9,105114.0,451,0,N,5817,102,0,4376
2,6881,15408,188328,513.8,5,0,N,0,N,N,...,0,6,152458.0,457,59333,N,0,102,0,483434
3,6716,157159,29967,1016.11,5,62,N,5,N,N,...,0,5,172946.0,247,50436,N,3281,102,0,1407164
4,5975,105985,81305,713.66,5,62,N,4,N,N,...,0,6,182129.0,263,93775,N,5817,102,0,1051004


In [3]:
trn_df['loctm'] = trn_df.loctm.astype(int).astype(str).str.zfill(6)
trn_df['loctm'] = pd.to_datetime(trn_df.loctm, format='%H%M%S', errors='raise')
trn_df['loctm'] = trn_df.loctm.dt.time

tst_df['loctm'] = tst_df.loctm.astype(int).astype(str).str.zfill(6)
tst_df['loctm'] = pd.to_datetime(tst_df.loctm, format='%H%M%S', errors='raise')
tst_df['loctm'] = tst_df.loctm.dt.time

In [4]:
feature_list = ['locdt', 'loctm', 'contp', 'etymd', 'mcc', 'conam', 'ecfg', 'insfg', 'iterm', 'stocn', 'stscd', 'hcefg', 'csmcu']
trn_df = trn_df[feature_list + ['fraud_ind']]
tst_df = tst_df[feature_list]
trn_df.head()

Unnamed: 0,locdt,loctm,contp,etymd,mcc,conam,ecfg,insfg,iterm,stocn,stscd,hcefg,csmcu,fraud_ind
0,33,17:26:52,5,0,457,513.8,N,N,0,102,0,5,0,0
1,9,10:51:14,5,2,451,465.62,N,N,0,102,0,0,0,0
2,6,15:24:58,5,0,457,513.8,N,N,0,102,0,5,0,0
3,5,17:29:46,5,5,247,1016.11,N,N,0,102,0,5,62,0
4,6,18:21:29,5,4,263,713.66,N,N,0,102,0,5,62,0


In [5]:
def add_noise(series, noise_level):
    return series * (1 + noise_level * np.random.randn(len(series)))

def target_encode(trn_series=None, 
                  tst_series=None, 
                  target=None, 
                  min_samples_leaf=1, 
                  smoothing=1,
                  noise_level=0):
    """
    Smoothing is computed like in the following paper by Daniele Micci-Barreca
    https://kaggle2.blob.core.windows.net/forum-message-attachments/225952/7441/high%20cardinality%20categoricals.pdf
    trn_series : training categorical feature as a pd.Series
    tst_series : test categorical feature as a pd.Series
    target : target data as a pd.Series
    min_samples_leaf (int) : minimum samples to take category average into account
    smoothing (int) : smoothing effect to balance categorical average vs prior  
    """ 
    assert len(trn_series) == len(target)
    assert trn_series.name == tst_series.name
    temp = pd.concat([trn_series, target], axis=1)
    # Compute target mean 
    averages = temp.groupby(by=trn_series.name)[target.name].agg(["mean", "count"])
    # Compute smoothing
    smoothing = 1 / (1 + np.exp(-(averages["count"] - min_samples_leaf) / smoothing))
    # Apply average function to all target data
    prior = target.mean()
    # The bigger the count the less full_avg is taken into account
    averages[target.name] = prior * (1 - smoothing) + averages["mean"] * smoothing
    averages.drop(["mean", "count"], axis=1, inplace=True)
    # Apply averages to trn and tst series
    ft_trn_series = pd.merge(
        trn_series.to_frame(trn_series.name),
        averages.reset_index().rename(columns={'index': target.name, target.name: 'average'}),
        on=trn_series.name,
        how='left')['average'].rename(trn_series.name + '_mean').fillna(prior)
    # pd.merge does not keep the index so restore it
    ft_trn_series.index = trn_series.index 
    ft_tst_series = pd.merge(
        tst_series.to_frame(tst_series.name),
        averages.reset_index().rename(columns={'index': target.name, target.name: 'average'}),
        on=tst_series.name,
        how='left')['average'].rename(trn_series.name + '_mean').fillna(prior)
    # pd.merge does not keep the index so restore it
    ft_tst_series.index = tst_series.index
    return add_noise(ft_trn_series, noise_level), add_noise(ft_tst_series, noise_level)

In [6]:
trn_contp, tst_contp = target_encode(trn_df["contp"], 
                         tst_df["contp"], 
                         target=trn_df.fraud_ind, 
                         min_samples_leaf=100,
                         smoothing=10,
                         noise_level=0.01)

In [7]:
trn_etymd, tst_etymd = target_encode(trn_df["etymd"], 
                         tst_df["etymd"], 
                         target=trn_df.fraud_ind, 
                         min_samples_leaf=100,
                         smoothing=10,
                         noise_level=0.01)

In [8]:
trn_iterm, tst_iterm = target_encode(trn_df["iterm"], 
                         tst_df["iterm"], 
                         target=trn_df.fraud_ind, 
                         min_samples_leaf=100,
                         smoothing=10,
                         noise_level=0.01)

In [9]:
trn_stocn, tst_stocn = target_encode(trn_df["stocn"], 
                         tst_df["stocn"], 
                         target=trn_df.fraud_ind, 
                         min_samples_leaf=100,
                         smoothing=10,
                         noise_level=0.01)

In [10]:
trn_hcefg, tst_hcefg = target_encode(trn_df["hcefg"], 
                         tst_df["hcefg"], 
                         target=trn_df.fraud_ind, 
                         min_samples_leaf=100,
                         smoothing=10,
                         noise_level=0.01)

In [11]:
trn_csmcu, tst_csmcu = target_encode(trn_df["csmcu"], 
                         tst_df["csmcu"], 
                         target=trn_df.fraud_ind, 
                         min_samples_leaf=100,
                         smoothing=10,
                         noise_level=0.01)

In [69]:
trn_ecfg_insfg, tst_ecfg_insfg = pd.get_dummies(trn_df[['ecfg', 'insfg']]), pd.get_dummies(tst_df[['ecfg', 'insfg']])
# trn_insfg, tst_insfg = pd.get_dummies(trn_df['insfg']), pd.get_dummies(tst_df['insfg'])

In [13]:
target_encode_feature_list = ['contp', 'etymd', 'iterm', 'stocn', 'hcefg', 'csmcu']
one_hot_encode_feature_list = ['ecfg', 'insfg']

In [70]:
trn_ready = pd.concat([trn_ecfg_insfg, trn_contp, trn_etymd, trn_iterm, trn_stocn, trn_hcefg, trn_csmcu], axis=1)
trn_label = trn_df[['fraud_ind']]
trn_label = pd.get_dummies(trn_label.astype(object))

In [71]:
tst_ready = pd.concat([tst_ecfg_insfg, tst_contp, tst_etymd, tst_iterm, tst_stocn, tst_hcefg, tst_csmcu], axis=1)
# tst_label = tst_df[['fraud_ind']]

In [72]:
from __future__ import absolute_import, division, print_function, unicode_literals

import os
import sys
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import clear_output
from six.moves import urllib

### Train, Eval, Test

In [18]:
DATASET_SIZE = len(trn_ready)
train_size = int(0.7 * DATASET_SIZE)
val_size = int(0.15 * DATASET_SIZE)
test_size = int(0.15 * DATASET_SIZE)

full_dataset = tf.data.Dataset.from_tensor_slices((trn_ready.values, trn_label.values))
full_dataset = full_dataset.shuffle(1)
train_dataset = full_dataset.take(train_size)
test_dataset = full_dataset.skip(train_size)
val_dataset = test_dataset.skip(test_size)
test_dataset = test_dataset.take(test_size)

In [87]:
feature_key = list(trn_ready.columns)

['ecfg_N',
 'ecfg_Y',
 'insfg_N',
 'insfg_Y',
 'contp_mean',
 'etymd_mean',
 'iterm_mean',
 'stocn_mean',
 'hcefg_mean',
 'csmcu_mean']

### Model Build

In [93]:
inputs = {key: tf.keras.layers.Input(shape=(), name=key) for key in feature_key}

In [95]:
model = tf.keras.models.Sequential([
    tf.keras.layers.InputLayer(input_shape=(10,), name='input'),
    tf.keras.layers.Dense(16, activation='relu', input_shape=(10,)),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(2, activation='softmax')
])

In [96]:
model.compile(optimizer='sgd',loss='categorical_crossentropy',metrics=['accuracy'])
model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_6 (Dense)              (None, 16)                176       
_________________________________________________________________
dropout_3 (Dropout)          (None, 16)                0         
_________________________________________________________________
dense_7 (Dense)              (None, 2)                 34        
Total params: 210
Trainable params: 210
Non-trainable params: 0
_________________________________________________________________


In [97]:
model.fit(trn_ready, trn_label, validation_split=0.7, epochs=5, verbose=2)





Train on 456536 samples, validate on 1065251 samples
Epoch 1/5
456536/456536 - 49s - loss: 0.0684 - accuracy: 0.9844 - val_loss: 0.0568 - val_accuracy: 0.9868
Epoch 2/5
456536/456536 - 49s - loss: 0.0583 - accuracy: 0.9863 - val_loss: 0.0542 - val_accuracy: 0.9868
Epoch 3/5
456536/456536 - 49s - loss: 0.0552 - accuracy: 0.9863 - val_loss: 0.0512 - val_accuracy: 0.9868
Epoch 4/5
456536/456536 - 49s - loss: 0.0520 - accuracy: 0.9863 - val_loss: 0.0484 - val_accuracy: 0.9868
Epoch 5/5
456536/456536 - 49s - loss: 0.0498 - accuracy: 0.9863 - val_loss: 0.0464 - val_accuracy: 0.9868


<tensorflow.python.keras.callbacks.History at 0x2acd3c0fc50>

In [None]:
test_loss, test_acc = model.evaluate(trn_ready, pd.get_dummies(trn_label.astype(object)), verbose=2)

In [98]:
keras_estimator = tf.keras.estimator.model_to_estimator(
    keras_model=model,
#     keras_model_path=None,
#     custom_objects=None,
    model_dir="/tmp/tfkeras_example/",
#     config=None,
    checkpoint_format='checkpoint'
)

INFO:tensorflow:Using default config.


INFO:tensorflow:Using default config.


INFO:tensorflow:Using the Keras model provided.


INFO:tensorflow:Using the Keras model provided.


INFO:tensorflow:Using config: {'_model_dir': '/tmp/tfkeras_example/', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x000002ACDB8B64E0>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


INFO:tensorflow:Using config: {'_model_dir': '/tmp/tfkeras_example/', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x000002ACDB8B64E0>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [None]:
model_dir = "/tmp/tfkeras_example/"
keras_estimator = tf.keras.estimator.model_to_estimator(
    keras_model=model, model_dir=model_dir)


In [99]:
def np_training_input_fn(x, y):
    return tf.compat.v1.estimator.inputs.numpy_input_fn(
        x= x,
        y= y,
        batch_size= 32,
        num_epochs= 5, # this way you can leave out steps from training
        shuffle= True,
        queue_capacity= 5000
    )

In [100]:
np_trn_label = trn_label.to_dict('list')
for key in np_trn_label.keys():
    np_trn_label[key] = np.array(np_trn_label[key])
    
np_trn_ready = trn_ready.to_dict('list')
for key in np_trn_ready.keys():
    np_trn_ready[key] = np.array(np_trn_ready[key])

In [101]:
def pd_input_fn(train_data, label_data):
    return tf.compat.v1.estimator.inputs.pandas_input_fn(
        x=train_data,
        y=label_data,
        batch_size = 32,
        num_epochs = 5,
        shuffle = True,
        queue_capacity = 1000,
        num_threads = 1
    )

In [102]:
feature_cols = [tf.feature_column.numeric_column(k) for k in trn_ready.to_dict('list').keys()]

In [103]:
# keras_estimator = keras_estimator
keras_estimator.train(input_fn=np_training_input_fn(np_trn_ready, np_trn_label), steps=2000)

INFO:tensorflow:Calling model_fn.


INFO:tensorflow:Calling model_fn.


KeyError: "The dictionary passed into features does not have the expected inputs keys defined in the keras model.\n\tExpected keys: {'input'}\n\tfeatures keys: {'ecfg_Y', 'insfg_N', 'stocn_mean', 'etymd_mean', 'insfg_Y', 'contp_mean', 'ecfg_N', 'hcefg_mean', 'csmcu_mean', 'iterm_mean'}\n\tDifference: {'ecfg_Y', 'input', 'insfg_N', 'stocn_mean', 'etymd_mean', 'contp_mean', 'insfg_Y', 'ecfg_N', 'hcefg_mean', 'csmcu_mean', 'iterm_mean'}"