In [45]:
from __future__ import absolute_import
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np 
import sklearn
from datetime import datetime

In [46]:
trn_df = pd.read_csv('./train/train.csv', encoding='utf8')
tst_df = pd.read_csv('./test/test.csv', encoding='utf8')
trn_df.head()

Unnamed: 0,acqic,bacno,cano,conam,contp,csmcu,ecfg,etymd,flbmk,flg_3dsmk,...,iterm,locdt,loctm,mcc,mchno,ovrlt,scity,stocn,stscd,txkey
0,6881,113261,38038,513.8,5,0,N,0,N,N,...,0,33,172652.0,457,59333,N,0,102,0,516056
1,0,134508,45725,465.62,5,0,N,2,N,N,...,0,9,105114.0,451,0,N,5817,102,0,4376
2,6881,15408,188328,513.8,5,0,N,0,N,N,...,0,6,152458.0,457,59333,N,0,102,0,483434
3,6716,157159,29967,1016.11,5,62,N,5,N,N,...,0,5,172946.0,247,50436,N,3281,102,0,1407164
4,5975,105985,81305,713.66,5,62,N,4,N,N,...,0,6,182129.0,263,93775,N,5817,102,0,1051004


In [3]:
trn_df['loctm'] = trn_df.loctm.astype(int).astype(str).str.zfill(6)
trn_df['loctm'] = pd.to_datetime(trn_df.loctm, format='%H%M%S', errors='raise')
trn_df['loctm'] = trn_df.loctm.dt.time

tst_df['loctm'] = tst_df.loctm.astype(int).astype(str).str.zfill(6)
tst_df['loctm'] = pd.to_datetime(tst_df.loctm, format='%H%M%S', errors='raise')
tst_df['loctm'] = tst_df.loctm.dt.time

In [74]:
feature_list = ['locdt', 'loctm', 'contp', 'etymd', 'mcc', 'conam', 'ecfg', 'insfg', 'iterm', 'stocn', 'stscd', 'hcefg', 'csmcu']
trn_df = trn_df[feature_list + ['fraud_ind']]
trn_df = trn_df.dropna()
tst_df = tst_df[feature_list]
trn_df.head()

Unnamed: 0,locdt,loctm,contp,etymd,mcc,conam,ecfg,insfg,iterm,stocn,stscd,hcefg,csmcu,fraud_ind
0,33,172652.0,5,0,457,513.8,N,N,0,102,0,5,0,0
1,9,105114.0,5,2,451,465.62,N,N,0,102,0,0,0,0
2,6,152458.0,5,0,457,513.8,N,N,0,102,0,5,0,0
3,5,172946.0,5,5,247,1016.11,N,N,0,102,0,5,62,0
4,6,182129.0,5,4,263,713.66,N,N,0,102,0,5,62,0


In [5]:
def add_noise(series, noise_level):
    return series * (1 + noise_level * np.random.randn(len(series)))

def target_encode(trn_series=None, 
                  tst_series=None, 
                  target=None, 
                  min_samples_leaf=1, 
                  smoothing=1,
                  noise_level=0):
    """
    Smoothing is computed like in the following paper by Daniele Micci-Barreca
    https://kaggle2.blob.core.windows.net/forum-message-attachments/225952/7441/high%20cardinality%20categoricals.pdf
    trn_series : training categorical feature as a pd.Series
    tst_series : test categorical feature as a pd.Series
    target : target data as a pd.Series
    min_samples_leaf (int) : minimum samples to take category average into account
    smoothing (int) : smoothing effect to balance categorical average vs prior  
    """ 
    assert len(trn_series) == len(target)
    assert trn_series.name == tst_series.name
    temp = pd.concat([trn_series, target], axis=1)
    # Compute target mean 
    averages = temp.groupby(by=trn_series.name)[target.name].agg(["mean", "count"])
    # Compute smoothing
    smoothing = 1 / (1 + np.exp(-(averages["count"] - min_samples_leaf) / smoothing))
    # Apply average function to all target data
    prior = target.mean()
    # The bigger the count the less full_avg is taken into account
    averages[target.name] = prior * (1 - smoothing) + averages["mean"] * smoothing
    averages.drop(["mean", "count"], axis=1, inplace=True)
    # Apply averages to trn and tst series
    ft_trn_series = pd.merge(
        trn_series.to_frame(trn_series.name),
        averages.reset_index().rename(columns={'index': target.name, target.name: 'average'}),
        on=trn_series.name,
        how='left')['average'].rename(trn_series.name + '_mean').fillna(prior)
    # pd.merge does not keep the index so restore it
    ft_trn_series.index = trn_series.index 
    ft_tst_series = pd.merge(
        tst_series.to_frame(tst_series.name),
        averages.reset_index().rename(columns={'index': target.name, target.name: 'average'}),
        on=tst_series.name,
        how='left')['average'].rename(trn_series.name + '_mean').fillna(prior)
    # pd.merge does not keep the index so restore it
    ft_tst_series.index = tst_series.index
    return add_noise(ft_trn_series, noise_level), add_noise(ft_tst_series, noise_level)

In [6]:
trn_contp, tst_contp = target_encode(trn_df["contp"], 
                         tst_df["contp"], 
                         target=trn_df.fraud_ind, 
                         min_samples_leaf=100,
                         smoothing=10,
                         noise_level=0.01)

In [7]:
trn_etymd, tst_etymd = target_encode(trn_df["etymd"], 
                         tst_df["etymd"], 
                         target=trn_df.fraud_ind, 
                         min_samples_leaf=100,
                         smoothing=10,
                         noise_level=0.01)

In [8]:
trn_iterm, tst_iterm = target_encode(trn_df["iterm"], 
                         tst_df["iterm"], 
                         target=trn_df.fraud_ind, 
                         min_samples_leaf=100,
                         smoothing=10,
                         noise_level=0.01)

In [9]:
trn_stocn, tst_stocn = target_encode(trn_df["stocn"], 
                         tst_df["stocn"], 
                         target=trn_df.fraud_ind, 
                         min_samples_leaf=100,
                         smoothing=10,
                         noise_level=0.01)

In [10]:
trn_hcefg, tst_hcefg = target_encode(trn_df["hcefg"], 
                         tst_df["hcefg"], 
                         target=trn_df.fraud_ind, 
                         min_samples_leaf=100,
                         smoothing=10,
                         noise_level=0.01)

In [11]:
trn_csmcu, tst_csmcu = target_encode(trn_df["csmcu"], 
                         tst_df["csmcu"], 
                         target=trn_df.fraud_ind, 
                         min_samples_leaf=100,
                         smoothing=10,
                         noise_level=0.01)

In [12]:
trn_ecfg_insfg, tst_ecfg_insfg = pd.get_dummies(trn_df[['ecfg', 'insfg']]), pd.get_dummies(tst_df[['ecfg', 'insfg']])
# trn_insfg, tst_insfg = pd.get_dummies(trn_df['insfg']), pd.get_dummies(tst_df['insfg'])

In [13]:
target_encode_feature_list = ['contp', 'etymd', 'iterm', 'stocn', 'hcefg', 'csmcu']
one_hot_encode_feature_list = ['ecfg', 'insfg']

In [14]:
trn_ready = pd.concat([trn_ecfg_insfg, trn_contp, trn_etymd, trn_iterm, trn_stocn, trn_hcefg, trn_csmcu], axis=1)
trn_label = trn_df[['fraud_ind']]
trn_label = pd.get_dummies(trn_label.astype(object))

In [31]:
tst_ready = pd.concat([tst_ecfg_insfg, tst_contp, tst_etymd, tst_iterm, tst_stocn, tst_hcefg, tst_csmcu], axis=1)
# tst_label = tst_df[['fraud_ind']]

In [16]:
trn_ready.head()

Unnamed: 0,ecfg_N,ecfg_Y,insfg_N,insfg_Y,contp_mean,etymd_mean,iterm_mean,stocn_mean,hcefg_mean,csmcu_mean
0,1,0,1,0,0.014307,0.001408,0.013701,0.000945,0.014345,0.001059
1,1,0,1,0,0.014351,0.020658,0.013587,0.000955,0.000387,0.001097
2,1,0,1,0,0.014218,0.001397,0.013712,0.000959,0.014116,0.001089
3,1,0,1,0,0.01416,0.000502,0.013584,0.000948,0.014382,0.007218
4,1,0,1,0,0.014239,0.000556,0.013794,0.000947,0.014028,0.007179


In [17]:
from __future__ import absolute_import, division, print_function, unicode_literals

import os
import sys
import tensorflow as tf

# Method 2

In [None]:
trn_ready = trn_df[target_encode_feature_list+one_hot_encode_feature_list]

In [68]:
contp = tf.feature_column.categorical_column_with_vocabulary_list('contp', [5, 3, 6, 2, 4, 0, 1])
etymd = tf.feature_column.categorical_column_with_vocabulary_list('etymd', [0, 2, 5, 4, 8, 1, 9, 6, 3, 7, 10])
iterm = tf.feature_column.categorical_column_with_vocabulary_list('iterm', [0, 1, 2, 8, 3, 4, 7, 6, 5])
stocn = tf.feature_column.categorical_column_with_vocabulary_list('stocn', list(trn_df.stocn.unique()))
hcefg = tf.feature_column.categorical_column_with_vocabulary_list('hcefg', [5, 0, 3, 1, 2, 7, 8, 9, 6])
csmcu = tf.feature_column.categorical_column_with_vocabulary_list('csmcu', list(trn_df.csmcu.unique()))
# Binary
ecfg = tf.feature_column.categorical_column_with_vocabulary_list('ecfg', ['N', 'Y'])
insfg =tf.feature_column.categorical_column_with_vocabulary_list('insfg', ['N', 'Y'])


contp_one_hot = tf.feature_column.indicator_column(contp)
etymd_one_hot = tf.feature_column.indicator_column(etymd)
iterm_one_hot = tf.feature_column.indicator_column(iterm)
stocn_one_hot = tf.feature_column.indicator_column(stocn)
hcefg_one_hot = tf.feature_column.indicator_column(hcefg)
csmcu_one_hot = tf.feature_column.indicator_column(csmcu)
ecfg_one_hot = tf.feature_column.indicator_column(ecfg)
insfg_one_hot = tf.feature_column.indicator_column(insfg)
feature_cols = [contp_one_hot, etymd_one_hot, iterm_one_hot, stocn_one_hot, hcefg_one_hot, csmcu_one_hot, ecfg_one_hot, insfg_one_hot]

# Method 1

In [18]:
# Multi
contp_mean = tf.feature_column.numeric_column('contp_mean')
etymd_mean = tf.feature_column.numeric_column('etymd_mean')
iterm_mean = tf.feature_column.numeric_column('iterm_mean')
stocn_mean = tf.feature_column.numeric_column('stocn_mean')
hcefg_mean = tf.feature_column.numeric_column('hcefg_mean')
csmcu_mean = tf.feature_column.numeric_column('csmcu_mean')
# Binary
ecfg_N = tf.feature_column.numeric_column('ecfg_N')
ecfg_Y = tf.feature_column.numeric_column('ecfg_Y')
insfg_N = tf.feature_column.numeric_column('insfg_N')
insfg_Y = tf.feature_column.numeric_column('insfg_Y')

In [79]:
trn_label = trn_df[['fraud_ind']]
# labels = tf.feature_column.categorical_column_with_hash_bucket('fraud_ind', hash_bucket_size=2)

In [20]:
feature_cols = [contp_mean, etymd_mean, iterm_mean, stocn_mean, hcefg_mean, csmcu_mean, ecfg_N, ecfg_Y, insfg_N, insfg_Y]

In [80]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(trn_df, trn_label, test_size=0.3)

In [81]:
def input_fn(features, labels=None, training=True, batch_size=256):
    
    """An input function for training or evaluating"""
    # 将输入转换为数据集。
    if labels is not None:
        labels = labels.values.reshape(-1,)
        inputs = (dict(features), labels)
    else:
        inputs = (dict(features))
    dataset = tf.data.Dataset.from_tensor_slices(inputs)
    # 如果在训练模式下混淆并重复数据。
    if training:
        dataset = dataset.shuffle(1000).repeat()
    dataset = dataset.batch(batch_size)
    return dataset

In [82]:
input_fn(x_train, labels = y_train)

RuntimeError: Graph is finalized and cannot be modified.

In [78]:
def eval_input_func(features, labels, training=True, batch_size=256):
    labels = labels.values.reshape(-1,)
    """An input function for training or evaluating"""
    # 将输入转换为数据集。
    if labels is not None:
        inputs = (dict(features), labels)
    else:
        inputs = (dict(features))
    dataset = tf.data.Dataset.from_tensor_slices(inputs)

    # 如果在训练模式下混淆并重复数据。
    if training:
        dataset = dataset.shuffle(1000).repeat()
    
    return dataset.batch(batch_size)

In [25]:
eval_input_func(x_test, y_test)

<BatchDataset shapes: ({ecfg_N: (None,), ecfg_Y: (None,), insfg_N: (None,), insfg_Y: (None,), contp_mean: (None,), etymd_mean: (None,), iterm_mean: (None,), stocn_mean: (None,), hcefg_mean: (None,), csmcu_mean: (None,)}, (None,)), types: ({ecfg_N: tf.int32, ecfg_Y: tf.int32, insfg_N: tf.int32, insfg_Y: tf.int32, contp_mean: tf.float64, etymd_mean: tf.float64, iterm_mean: tf.float64, stocn_mean: tf.float64, hcefg_mean: tf.float64, csmcu_mean: tf.float64}, tf.int64)>

In [83]:
model = tf.estimator.LinearClassifier(
    feature_columns=feature_cols,
    n_classes=2
)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': 'C:\\Users\\tim19\\AppData\\Local\\Temp\\tmp9aer9ixv', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x00000277F70AF710>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [87]:
results = model.train(input_fn=lambda: input_fn(x_train, y_train), steps=100000)

INFO:tensorflow:Calling model_fn.


To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from C:\Users\tim19\AppData\Local\Temp\tmp9aer9ixv\model.ckpt-5000
Instructions for updating:
Use standard file utilities to get mtimes.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 5000 into C:\Users\tim19\AppData\Local\Temp\tmp9aer9ixv\model.ckpt.
INFO:tensorflow:loss = 0.08527371, step = 5000
INFO:tensorflow:global_step/sec: 35.3894
INFO:tensorflow:loss = 0.03924777, step = 5100 (2.810 sec)
INFO:tensorflow:global_step/sec: 1

### Evaluate

In [88]:
results = model.evaluate(input_fn = lambda: eval_input_func(x_test, y_test, training=False))

INFO:tensorflow:Calling model_fn.


To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2019-10-15T01:02:05Z
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from C:\Users\tim19\AppData\Local\Temp\tmp9aer9ixv\model.ckpt-105000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2019-10-15-01:02:34
INFO:tensorflow:Saving dict for global step 105000: accuracy = 0.9867634, accuracy_baseline = 0.9865444, auc = 0.9420263, auc_precision_recall = 0.31993854, average_loss = 0.04014173, global_step = 105000, label/mean = 0.013455645, loss = 0.040165257, precision = 0.65337425, predi

In [89]:
results

{'accuracy': 0.9867634,
 'accuracy_baseline': 0.9865444,
 'auc': 0.9420263,
 'auc_precision_recall': 0.31993854,
 'average_loss': 0.04014173,
 'label/mean': 0.013455645,
 'loss': 0.040165257,
 'precision': 0.65337425,
 'prediction/mean': 0.013078295,
 'recall': 0.034673613,
 'global_step': 105000}

In [43]:
predictions = model.predict(input_fn = lambda: input_fn(tst_ready, training=False))

In [44]:
expected = ['fraud_ind']
# CATEGORY = ['Setosa', 'Versicolor', 'Virginica']
for pred_dict, expec in zip(predictions, expected):
    class_id = pred_dict['class_ids'][0]
    probability = pred_dict['probabilities'][class_id]

    print('Prediction is "{}" ({:.1f}%), expected "{}"'.format(
        class_id, 100 * probability, expec))

INFO:tensorflow:Calling model_fn.


To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from C:\Users\tim19\AppData\Local\Temp\tmp54zprb5r\model.ckpt-5000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
Prediction is "0" (99.8%), expected "fraud_ind"


### Plot