In [1]:
import numpy as np
import pandas as pd

from sklearn.metrics import f1_score

from preprocess.preprocess_transaction_frequecy import preprocess_transaction_frequency
from preprocess.preprocess_init import preprocess_init
from preprocess.preprocess_time import preprocess_time
from preprocess.preprocess_change_card import preprocess_change_card
from preprocess.preprocess_mchno import preprocess_mchno
from preprocess.preprocess_special_features import preprocess_special_features
from preprocess.preprocess_conam import preprocess_conam
from preprocess.preprocess_train_test_split import preprocess_train_test_split

from util.generate_X_y import generate_X_y
from util.remove_outlier import remove_outlier
from util.generate_statistic import generate_statistic

from model.lgbm_model import LGBM_Model
from model.shap_importance import shap_importance
from model.plot import plot_importance

# import warnings
# warnings.filterwarnings("ignore")

# Let wide / long frames be displayed (up to 999 columns and rows) when inspected.
pd.set_option("display.max_columns", 999)
pd.set_option("display.max_rows", 999)

### Before running the following program, please make sure
### that the features created by new_features.ipynb are complete

# Model Explanation

- Total: 5 models

- Base model: Built from the base features, which comprise 20 raw features, 13 preprocessed features and 60 preprocessed frequency features.

- Special features: 4 special features that capture information from the label, because some users (bacno) appear in both the training and testing sets.

- Each of the four models is trained by the base features plus one special feature.

- If a transaction in the testing set meets the condition of a special feature, it is predicted by the corresponding special model. Otherwise the transaction is predicted by the base model.

- Note: These special features may not work in the real world, since the label is not supposed to be available there.

# Key Takeaways

- Apply a limited number of new features
    - This keeps the models simple so that they generalize better

- Replace the value of a categorical feature in the training set with NA if the value does not appear in the testing set
    - The model then does not learn values that are useless when applied to the testing set, and can focus on the values which also exist in the testing set.

- Use early stopping and split the training set by GroupKFold
    - The model will stop training once the model performance stops improving on a hold out validation dataset.
    - Grouping the training set by user(bacno) makes the model stop earlier which prevents overfitting.

- Drop extreme cases
    - Drop the prediction of the fold if it is out of 1 standard deviation boundary, since some of the predictions of testing set are very extreme between folds.

# Define features

These are the features I used in the model.

In [7]:
# Feature-name groups used to assemble the model inputs.
# NOTE: list order is kept exactly as originally written because it defines
# the column order of the training matrix. Spellings such as "mhcno" vs
# "mchno" are intentional: they match the column names present in the data.

raw_bool_features = [
    'ecfg',
    'flbmk',
    'flg_3dsmk',
    'insfg',
    'ovrlt',
]

raw_categorial_features = [
    'contp',
    'stscd',
    'etymd',
    'stocn',
    'mcc',
    'csmcu',
    'hcefg',
    'bacno',
    'cano',
    'mchno',
    'acqic',
    'scity',
]

raw_contiuous_feautres = [
    'loctm',
    'conam',
    'iterm',
]

transaction_frequency_feautres = [
    # same card number: number of occurrences within the same period (30/60/90)
    'cano_days_txkey_count',
    # same card number: number of occurrences on the same day
    'cano_locdt_txkey_count',
    # same account, same day, same merchant: number of occurrences
    'bacno_locdt_mchno_txkey_count',
]

time_feautres = [
    'last_time_days',
    'next_time_days',
    'cano_locdt_global_time_std',
]

change_card_feautres = [
    'diff_locdt_with_last_trans_cano',
    'diff_locdt_of_two_card',
]

conam_feautres = [
    'cano_locdt_conam_min',
    'cano_locdt_conam_max',
    'diff_gtime_with_conam_zero_trans_locdt',
]

mchno_features = [
    'bacno_mchno_locdt_head_tail_diff',
    'cano_days_mchno_index',
]

# Label-derived features; each one is added to the base features to train one
# of the "special" models further down in the notebook.
special_feautures = [
    # this merchant appeared in past transactions and those were normal
    'mchno_in_normal_mchno_list',
    # this merchant appeared in past transactions and those were fraudulent
    'mchno_in_fraud_mchno_list',
    # this amount appeared in past transactions that were abnormal
    'conam_in_fraud_conam_list',
    # time elapsed since this card number was first judged fraudulent
    'diff_with_first_fraud_locdt',
]

# Additional engineered columns, group 1 (bacno/cano + conam).
new_features = [
    'bacno_cano_conam_mean_1h',
    'bacno_cano_count_sum_1h',
    'bacno_cano_conam_mean_6h',
    'bacno_cano_count_sum_6h',
    'bacno_cano_conam_mean_1d',
    'bacno_cano_count_sum_1d',
    'bacno_cano_conam_mean_30d',
    'bacno_cano_count_sum_30d',
    'bacno_cano_conam_mean_7d',
    'bacno_cano_count_sum_7d',
    'bacno_cano_conam_mean_120d',
    'bacno_cano_count_sum_120d',
    'bacno_cano_conam_mean_1h_xg_conam',
    'bacno_cano_conam_mean_6h_xg_conam',
    'bacno_cano_conam_mean_1d_xg_conam',
    'bacno_cano_conam_mean_7d_xg_conam',
    'bacno_cano_conam_mean_30d_xg_conam',
    'bacno_cano_conam_mean_120d_xg_conam',
    'bacno_cano_count_sum_7d_xg_1h',
    'bacno_cano_count_sum_1d_xg_1h',
    'bacno_cano_count_sum_6h_xg_1h',
    'bacno_cano_conam_mean_1h_div_conam',
    'bacno_cano_conam_mean_6h_div_conam',
    'bacno_cano_conam_mean_1d_div_conam',
    'bacno_cano_conam_mean_7d_div_conam',
    'bacno_cano_conam_mean_30d_div_conam',
    'bacno_cano_conam_mean_120d_div_conam',
]

# Additional engineered columns, group 2 (cano/merchant + conam).
new2_features = [
    'cano_mhcno_conam_mean_1h',
    'cano_mhcno_count_sum_1h',
    'cano_mhcno_conam_mean_6h',
    'cano_mhcno_count_sum_6h',
    'cano_mhcno_conam_mean_1d',
    'cano_mhcno_count_sum_1d',
    'cano_mhcno_conam_mean_7d',
    'cano_mhcno_count_sum_7d',
    'cano_mhcno_conam_mean_30d',
    'cano_mhcno_count_sum_30d',
    'cano_mhcno_conam_mean_120d',
    'cano_mhcno_count_sum_120d',
    'cano_mchno_conam_mean_1h_xg_conam',
    'cano_mchno_conam_mean_6h_xg_conam',
    'cano_mchno_conam_mean_1d_xg_conam',
    'cano_mchno_conam_mean_7d_xg_conam',
    'cano_mchno_conam_mean_30d_xg_conam',
    'cano_mchno_conam_mean_120d_xg_conam',
    'cano_mhcno_count_sum_7d_xg_1h',
    'cano_mhcno_count_sum_1d_xg_1h',
    'cano_mhcno_count_sum_6h_xg_1h',
    'cano_mhcno_conam_mean_1h_div_conam',
    'cano_mhcno_conam_mean_6h_div_conam',
    'cano_mhcno_conam_mean_1d_div_conam',
    'cano_mhcno_conam_mean_7d_div_conam',
    'cano_mhcno_conam_mean_30d_div_conam',
    'cano_mhcno_conam_mean_120d_div_conam',
]

# Additional engineered columns, group 3 (bacno/cano unique-category counts).
new3_features = [
    'bacno_cano_stocn_unique_2d',
    'bacno_cano_stocn_unique_6h',
    'bacno_cano_mchno_unique_2d',
    'bacno_cano_mchno_unique_6h',
    'bacno_cano_mcc_unique_2d',
    'bacno_cano_mcc_unique_6h',
]

# Every feature fed to the base model, concatenated group by group.
base_features = [
    *raw_bool_features,
    *raw_categorial_features,
    *raw_contiuous_feautres,
    *transaction_frequency_feautres,
    *time_feautres,
    *change_card_feautres,
    *conam_feautres,
    *mchno_features,
    *new_features,
    *new2_features,
    *new3_features,
]

# Name of the target column.
label = 'fraud_ind'

# Preprocess

In [8]:
# Load the preprocessed frame `df`, rebuilding it from the raw CSVs when needed.
#
# preprocess_group = True  -> always rebuild from data/train.csv + data/test.csv
#                             and overwrite the cached pickle.
# preprocess_group = False -> reuse the cached pickle; if the pickle does not
#                             exist yet (e.g. first run on a fresh checkout),
#                             fall back to rebuilding instead of failing.
preprocess_group = False
preprocessed_path = 'data/df_preprocessed.pkl'

df = None
if not preprocess_group:
    try:
        df = pd.read_pickle(preprocessed_path)
    except FileNotFoundError:
        # No cache available: fall through to the rebuild branch below.
        print('{} not found; rebuilding it from the raw CSVs'.format(preprocessed_path))

if df is None:
    df_train_raw = pd.read_csv('data/train.csv')
    df_test_raw = pd.read_csv('data/test.csv')
    # The preprocessing steps are applied in this fixed order; each one takes
    # the frame returned by the previous step.
    df = preprocess_init(df_train_raw, df_test_raw, raw_bool_features)
    df = preprocess_transaction_frequency(df)
    df = preprocess_time(df)
    df = preprocess_change_card(df)
    df = preprocess_mchno(df)
    df = preprocess_conam(df)
    df = preprocess_special_features(df)
    # Cache the result so later runs can skip the preprocessing.
    df.to_pickle(preprocessed_path)

In [9]:
# Extra feature table 1; the first CSV column becomes the index.
df_new_x = pd.read_csv('new_x_bacno_cano_conam.csv', index_col=0)

In [10]:
# Extra feature table 2; the first CSV column becomes the index.
df_new2_x = pd.read_csv('new2_x_cano_mhcno_conam.csv', index_col=0)

In [11]:
# Extra feature table 3; the first CSV column becomes the index.
df_new3_x = pd.read_csv('new3_x_bacno_cano_category_count.csv', index_col=0)

In [13]:
# Left-join feature table 1 onto the main frame by transaction key.
df = df.merge(df_new_x, on='txkey', how='left')

In [14]:
# Left-join feature table 2 onto the main frame by transaction key.
df = df.merge(df_new2_x, on='txkey', how='left')

In [15]:
# Left-join feature table 3 onto the main frame by transaction key.
df = df.merge(df_new3_x, on='txkey', how='left')

In [16]:
df.shape[1]  # sanity check: confirm the column count after the merges is as expected

119

# Base features

In [19]:
# Split the combined frame back into train / test.
# Per the original note: categorical values in the training set are replaced
# with NA when the value does not occur in the testing set.
df_train, df_test = preprocess_train_test_split(df, raw_categorial_features)

# The base model uses the base features only.
input_features = base_features
X_train, y_train, groups, X_test = generate_X_y(
    df_train, df_test, label, input_features
)
X_train.tail(10)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user

Unnamed: 0,ecfg,flbmk,flg_3dsmk,insfg,ovrlt,contp,stscd,etymd,stocn,mcc,csmcu,hcefg,bacno,cano,mchno,acqic,scity,loctm,conam,iterm,cano_days_txkey_count,cano_locdt_txkey_count,bacno_locdt_mchno_txkey_count,last_time_days,next_time_days,cano_locdt_global_time_std,diff_locdt_with_last_trans_cano,diff_locdt_of_two_card,cano_locdt_conam_min,cano_locdt_conam_max,diff_gtime_with_conam_zero_trans_locdt,bacno_mchno_locdt_head_tail_diff,cano_days_mchno_index,bacno_cano_conam_mean_1h,bacno_cano_count_sum_1h,bacno_cano_conam_mean_6h,bacno_cano_count_sum_6h,bacno_cano_conam_mean_1d,bacno_cano_count_sum_1d,bacno_cano_conam_mean_30d,bacno_cano_count_sum_30d,bacno_cano_conam_mean_7d,bacno_cano_count_sum_7d,bacno_cano_conam_mean_120d,bacno_cano_count_sum_120d,bacno_cano_conam_mean_1h_xg_conam,bacno_cano_conam_mean_6h_xg_conam,bacno_cano_conam_mean_1d_xg_conam,bacno_cano_conam_mean_7d_xg_conam,bacno_cano_conam_mean_30d_xg_conam,bacno_cano_conam_mean_120d_xg_conam,bacno_cano_count_sum_7d_xg_1h,bacno_cano_count_sum_1d_xg_1h,bacno_cano_count_sum_6h_xg_1h,bacno_cano_conam_mean_1h_div_conam,bacno_cano_conam_mean_6h_div_conam,bacno_cano_conam_mean_1d_div_conam,bacno_cano_conam_mean_7d_div_conam,bacno_cano_conam_mean_30d_div_conam,bacno_cano_conam_mean_120d_div_conam,cano_mhcno_conam_mean_1h,cano_mhcno_count_sum_1h,cano_mhcno_conam_mean_6h,cano_mhcno_count_sum_6h,cano_mhcno_conam_mean_1d,cano_mhcno_count_sum_1d,cano_mhcno_conam_mean_7d,cano_mhcno_count_sum_7d,cano_mhcno_conam_mean_30d,cano_mhcno_count_sum_30d,cano_mhcno_conam_mean_120d,cano_mhcno_count_sum_120d,cano_mchno_conam_mean_1h_xg_conam,cano_mchno_conam_mean_6h_xg_conam,cano_mchno_conam_mean_1d_xg_conam,cano_mchno_conam_mean_7d_xg_conam,cano_mchno_conam_mean_30d_xg_conam,cano_mchno_conam_mean_120d_xg_conam,cano_mhcno_count_sum_7d_xg_1h,cano_mhcno_count_sum_1d_xg_1h,cano_mhcno_count_sum_6h_xg_1h,cano_mhcno_conam_mean_1h_div_conam,cano_mhcno_conam_mean_6h_div_conam,cano_mhcno_conam_mean_1d_div_conam,cano_mhcno_conam_mean_7d_div_conam
,cano_mhcno_conam_mean_30d_div_conam,cano_mhcno_conam_mean_120d_div_conam,bacno_cano_stocn_unique_2d,bacno_cano_stocn_unique_6h,bacno_cano_mchno_unique_2d,bacno_cano_mchno_unique_6h,bacno_cano_mcc_unique_2d,bacno_cano_mcc_unique_6h
1943429,0,0,0,0,0,5.0,0.0,5.0,102.0,292.0,62.0,5.0,,,47022.0,6716.0,4526.0,202808.0,815.08,0,9,1,1,99046.0,,,0,,815.08,815.08,,0,1,815.08,1.0,815.08,1.0,815.08,1.0,619.404,10.0,782.833333,3.0,788.288235,17.0,-2.273737e-13,-2.273737e-13,-2.273737e-13,32.24667,195.676,26.791765,2.0,0.0,0.0,1.0,1.0,1.0,1.041192,1.31591,1.033987,815.08,1.0,815.08,1.0,815.08,1.0,815.08,1.0,815.08,1.0,815.08,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,1.0,2.0,1.0
1943430,1,0,0,0,0,5.0,0.0,8.0,102.0,209.0,62.0,5.0,,,38207.0,6322.0,5817.0,61702.0,750.24,0,9,3,1,,-46675.0,29430.624781,-13,,381.65,841.64,,0,1,750.24,1.0,750.24,1.0,750.24,1.0,657.79,10.0,774.685,4.0,786.174444,18.0,-2.273737e-13,-2.273737e-13,-2.273737e-13,-24.445,92.45,-35.934444,3.0,0.0,0.0,1.0,1.0,1.0,0.968445,1.140546,0.954292,750.24,1.0,750.24,1.0,750.24,1.0,750.24,1.0,750.24,1.0,750.24,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,1.0,2.0,1.0
1943431,0,0,0,0,0,5.0,0.0,5.0,102.0,270.0,62.0,5.0,,,39044.0,6716.0,5820.0,191457.0,841.64,0,9,3,1,46675.0,-7720.0,29430.624781,-13,,381.65,841.64,,0,1,841.64,1.0,841.64,1.0,795.94,2.0,674.503636,11.0,788.076,5.0,789.093684,19.0,-2.273737e-13,-2.273737e-13,45.7,53.564,167.136364,52.546316,4.0,1.0,0.0,1.0,1.0,1.057416,1.067968,1.247792,1.066591,841.64,1.0,841.64,1.0,841.64,1.0,841.64,1.0,841.64,1.0,841.64,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,3.0,1.0,3.0,1.0
1943432,0,0,0,0,0,5.0,0.0,5.0,102.0,306.0,62.0,5.0,,,88672.0,5975.0,5817.0,212337.0,381.65,0,9,3,1,7720.0,-84800.0,29430.624781,-13,,381.65,841.64,,0,1,381.65,1.0,611.645,2.0,657.843333,3.0,650.099167,12.0,720.338333,6.0,768.7215,20.0,-2.273737e-13,-229.995,-276.1933,-338.6883,-268.449167,-387.0715,5.0,2.0,1.0,1.0,0.623973,0.580153,0.52982,0.587064,0.496474,381.65,1.0,381.65,1.0,381.65,1.0,381.65,1.0,381.65,1.0,381.65,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,3.0,2.0,3.0,2.0
1943433,0,0,0,0,0,5.0,0.0,5.0,102.0,192.0,62.0,5.0,,,,5975.0,5817.0,205657.0,366.38,0,9,1,1,84800.0,-240959.0,,-12,,366.38,366.38,,0,1,366.38,1.0,366.38,1.0,374.015,2.0,623.76,12.0,669.772857,7.0,749.562381,21.0,-2.273737e-13,-2.273737e-13,-7.635,-303.3929,-257.38,-383.182381,6.0,1.0,0.0,1.0,1.0,0.979586,0.547021,0.587373,0.488792,366.38,1.0,366.38,1.0,366.38,1.0,366.38,1.0,366.38,1.0,366.38,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,4.0,1.0,4.0,1.0
1943434,0,0,0,0,0,5.0,0.0,5.0,102.0,288.0,62.0,5.0,,,,5975.0,5817.0,155256.0,1119.11,0,9,4,1,240959.0,-5347.0,3661.951802,-9,,1103.64,1334.91,,0,1,1119.11,1.0,1119.11,1.0,1119.11,1.0,661.863846,13.0,780.08,7.0,766.36,22.0,-2.273737e-13,-2.273737e-13,-2.273737e-13,339.03,457.246154,352.75,6.0,0.0,0.0,1.0,1.0,1.0,1.434609,1.690846,1.460293,1119.11,1.0,1119.11,1.0,1119.11,1.0,1119.11,1.0,1119.11,1.0,1119.11,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
1943435,0,0,0,0,0,5.0,0.0,5.0,102.0,247.0,62.0,5.0,,,6475.0,6767.0,5817.0,172203.0,1334.91,0,9,4,3,5347.0,-2025.0,3661.951802,-9,,1103.64,1334.91,,0,1,1334.91,1.0,1227.01,2.0,1227.01,2.0,709.938571,14.0,801.287143,7.0,791.079565,23.0,-2.273737e-13,107.9,107.9,533.6229,624.971429,543.830435,6.0,1.0,1.0,1.0,1.087937,1.087937,1.665957,1.880318,1.687454,1334.91,1.0,1334.91,1.0,1334.91,1.0,1334.91,1.0,1334.91,1.0,1334.91,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,2.0,2.0,2.0
1943436,0,0,0,0,0,5.0,0.0,5.0,102.0,247.0,62.0,5.0,,,6475.0,6767.0,5817.0,175548.0,1125.71,0,9,4,3,2025.0,-735.0,3661.951802,-9,,1103.64,1334.91,,0,2,1230.31,2.0,1193.243333,3.0,1193.243333,3.0,737.656667,15.0,841.84,8.0,805.0225,24.0,-104.6,-67.53333,-67.53333,283.87,388.053333,320.6875,6.0,1.0,1.0,0.914981,0.943404,0.943404,1.337202,1.526062,1.398358,1230.31,2.0,1230.31,2.0,1230.31,2.0,1230.31,2.0,1230.31,2.0,1230.31,2.0,-104.6,-104.6,-104.6,-104.6,-104.6,-104.6,0.0,0.0,0.0,0.914981,0.914981,0.914981,0.914981,0.914981,0.914981,1.0,1.0,2.0,2.0,2.0,2.0
1943437,0,0,0,0,0,5.0,0.0,5.0,102.0,247.0,62.0,5.0,,,6475.0,6767.0,5817.0,180803.0,1103.64,0,9,4,3,735.0,-749289.0,3661.951802,-9,,1103.64,1334.91,,0,3,1188.086667,3.0,1170.8425,4.0,1170.8425,4.0,760.530625,16.0,870.928889,9.0,816.9672,25.0,-84.44667,-67.2025,-67.2025,232.7111,343.109375,286.6728,6.0,1.0,1.0,0.928922,0.942603,0.942603,1.267199,1.451145,1.350899,1188.086667,3.0,1188.086667,3.0,1188.086667,3.0,1188.086667,3.0,1188.086667,3.0,1188.086667,3.0,-84.446667,-84.446667,-84.446667,-84.446667,-84.446667,-84.446667,0.0,0.0,0.0,0.928922,0.928922,0.928922,0.928922,0.928922,0.928922,1.0,1.0,2.0,2.0,2.0,2.0
1943438,1,0,0,1,0,5.0,0.0,8.0,102.0,247.0,62.0,5.0,,,82174.0,6769.0,5817.0,101612.0,1194.66,1,9,1,1,749289.0,,,0,,1194.66,1194.66,,0,1,1194.66,1.0,1194.66,1.0,1194.66,1.0,803.008667,15.0,1194.66,1.0,831.493846,26.0,-2.273737e-13,-1.136868e-12,-1.136868e-12,-2.273737e-13,391.651333,363.166154,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.48773,1.436763,1194.66,1.0,1194.66,1.0,1194.66,1.0,1194.66,1.0,1194.66,1.0,1194.66,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


# Base Model

In [18]:
# Train the base model (wait for this cell to finish before moving on).
lgbm = LGBM_Model(input_features)
oof_preds_LGBM, df_sub_preds_LGBM, clf = lgbm.run(
    X_train,
    y_train,
    groups,
    X_test,
    lgbm.lgbm_averge_precision,
    n_splits=10,
    model_name='base_model',
)

Starting LightGBM. Fold 1,Train shape: (1521787, 93), test shape: (421665, 93)
Training until validation scores don't improve for 100 rounds
[100]	training's Averge Precision: 0.800959	valid_1's Averge Precision: 0.647371


KeyboardInterrupt: 

# Summarise the base model's test-set predictions via generate_statistic.
df_sub_preds_statistics = generate_statistic(df_sub_preds_LGBM)

# Remove Outlier

Since some of the predictions above are very extreme, I drop a prediction if it falls outside the 1-standard-deviation boundary.

In [15]:
# Row by row, compute the outlier-filtered mean with remove_outlier.
df_sub_preds_statistics['mean_remove_outlier'] = df_sub_preds_statistics.apply(
    remove_outlier, axis='columns'
)

In [16]:
# Keep the base model's out-of-fold predictions alongside the training rows.
df_train['oof_base_model'] = oof_preds_LGBM

# Renumber df_test in place before attaching the test-set prediction column
# (presumably so the assigned Series lines up with df_test by index - verify).
df_test.reset_index(inplace=True, drop=True)
df_test['sub_base_model'] = df_sub_preds_statistics['mean_remove_outlier']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


# Special Model - Whitelist of Merchant

In [17]:
# Base features plus the first special feature (merchant seen before as normal).
input_features = [*base_features, special_feautures[0]]
X_train, y_train, groups, X_test = generate_X_y(
    df_train, df_test, label, input_features
)
X_train['mchno_in_normal_mchno_list'].unique()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


array([0, 1], dtype=int64)

In [18]:
# Train the special model that adds the merchant-whitelist feature.
lgbm = LGBM_Model(input_features)
oof_preds_LGBM, df_sub_preds_LGBM, clf = lgbm.run(
    X_train,
    y_train,
    groups,
    X_test,
    lgbm.lgbm_averge_precision,
    n_splits=10,
    model_name='Whitelist_of_Merchant',
)

Starting LightGBM. Fold 1,Train shape: (1521787, 94), test shape: (421665, 94)
Training until validation scores don't improve for 100 rounds.
[100]	training's Averge Precision: 0.822541	valid_1's Averge Precision: 0.708602
[200]	training's Averge Precision: 0.86193	valid_1's Averge Precision: 0.727492
[300]	training's Averge Precision: 0.886318	valid_1's Averge Precision: 0.736274
[400]	training's Averge Precision: 0.903138	valid_1's Averge Precision: 0.741672
[500]	training's Averge Precision: 0.915386	valid_1's Averge Precision: 0.744526
[600]	training's Averge Precision: 0.924736	valid_1's Averge Precision: 0.746202
[700]	training's Averge Precision: 0.932667	valid_1's Averge Precision: 0.746335
[800]	training's Averge Precision: 0.939236	valid_1's Averge Precision: 0.747505
[900]	training's Averge Precision: 0.944812	valid_1's Averge Precision: 0.74673
Early stopping, best iteration is:
[802]	training's Averge Precision: 0.93936	valid_1's Averge Precision: 0.74756
Starting LightGBM

In [19]:
# Summarise the test-set predictions and compute the outlier-filtered mean per row.
df_sub_preds_statistics = generate_statistic(df_sub_preds_LGBM)
df_sub_preds_statistics['mean_remove_outlier'] = df_sub_preds_statistics.apply(
    remove_outlier, axis='columns'
)

# Store this model's out-of-fold and test-set predictions.
df_train['oof_normal_mchno_model'] = oof_preds_LGBM
df_test['sub_normal_mchno_model'] = df_sub_preds_statistics['mean_remove_outlier']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


# Special Model - Blacklist of Merchant

In [20]:
# Base features plus the second special feature (merchant seen before as fraudulent).
input_features = [*base_features, special_feautures[1]]
X_train, y_train, groups, X_test = generate_X_y(
    df_train, df_test, label, input_features
)
X_train['mchno_in_fraud_mchno_list'].unique()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


array([0, 1], dtype=int64)

In [21]:
# Train the special model that adds the merchant-blacklist feature.
lgbm = LGBM_Model(input_features)
oof_preds_LGBM, df_sub_preds_LGBM, clf = lgbm.run(
    X_train,
    y_train,
    groups,
    X_test,
    lgbm.lgbm_averge_precision,
    n_splits=10,
    model_name='blacklist_of_Merchant',
)

Starting LightGBM. Fold 1,Train shape: (1521787, 94), test shape: (421665, 94)
Training until validation scores don't improve for 100 rounds.
[100]	training's Averge Precision: 0.831112	valid_1's Averge Precision: 0.741078
[200]	training's Averge Precision: 0.867921	valid_1's Averge Precision: 0.758423
[300]	training's Averge Precision: 0.888832	valid_1's Averge Precision: 0.76456
[400]	training's Averge Precision: 0.905362	valid_1's Averge Precision: 0.769277
[500]	training's Averge Precision: 0.916613	valid_1's Averge Precision: 0.768652
[600]	training's Averge Precision: 0.926519	valid_1's Averge Precision: 0.770257
[700]	training's Averge Precision: 0.933882	valid_1's Averge Precision: 0.770482
[800]	training's Averge Precision: 0.940261	valid_1's Averge Precision: 0.771231
[900]	training's Averge Precision: 0.945892	valid_1's Averge Precision: 0.771014
Early stopping, best iteration is:
[809]	training's Averge Precision: 0.940858	valid_1's Averge Precision: 0.771697
Starting Light

In [22]:
# Summarise the test-set predictions and compute the outlier-filtered mean per row.
df_sub_preds_statistics = generate_statistic(df_sub_preds_LGBM)
df_sub_preds_statistics['mean_remove_outlier'] = df_sub_preds_statistics.apply(
    remove_outlier, axis='columns'
)

# Store this model's out-of-fold and test-set predictions.
df_train['oof_fraud_mchno_model'] = oof_preds_LGBM
df_test['sub_fraud_mchno_model'] = df_sub_preds_statistics['mean_remove_outlier']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


# Special Model - Blacklist of Transaction Amount

In [23]:
# Base features plus the third special feature (amount seen before as abnormal).
input_features = [*base_features, special_feautures[2]]
X_train, y_train, groups, X_test = generate_X_y(
    df_train, df_test, label, input_features
)
X_train['conam_in_fraud_conam_list'].unique()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


array([0, 1], dtype=int64)

In [24]:
# Train the special model that adds the transaction-amount-blacklist feature.
lgbm = LGBM_Model(input_features)
oof_preds_LGBM, df_sub_preds_LGBM, clf = lgbm.run(
    X_train,
    y_train,
    groups,
    X_test,
    lgbm.lgbm_averge_precision,
    n_splits=10,
    model_name='Blacklist_of_Transaction_Amount',
)

Starting LightGBM. Fold 1,Train shape: (1521787, 94), test shape: (421665, 94)
Training until validation scores don't improve for 100 rounds.
[100]	training's Averge Precision: 0.816676	valid_1's Averge Precision: 0.69224
[200]	training's Averge Precision: 0.856489	valid_1's Averge Precision: 0.708883
[300]	training's Averge Precision: 0.87841	valid_1's Averge Precision: 0.7144
[400]	training's Averge Precision: 0.895186	valid_1's Averge Precision: 0.71769
[500]	training's Averge Precision: 0.908079	valid_1's Averge Precision: 0.719589
[600]	training's Averge Precision: 0.917924	valid_1's Averge Precision: 0.719857
Early stopping, best iteration is:
[531]	training's Averge Precision: 0.911371	valid_1's Averge Precision: 0.72022
Starting LightGBM. Fold 2,Train shape: (1521787, 94), test shape: (421665, 94)
Training until validation scores don't improve for 100 rounds.
[100]	training's Averge Precision: 0.818198	valid_1's Averge Precision: 0.728444
[200]	training's Averge Precision: 0.85

In [25]:
# Summarise the test-set predictions and compute the outlier-filtered mean per row.
df_sub_preds_statistics = generate_statistic(df_sub_preds_LGBM)
df_sub_preds_statistics['mean_remove_outlier'] = df_sub_preds_statistics.apply(
    remove_outlier, axis='columns'
)

# Store this model's out-of-fold and test-set predictions.
df_train['oof_fraud_conam_model'] = oof_preds_LGBM
df_test['sub_fraud_conam_model'] = df_sub_preds_statistics['mean_remove_outlier']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


# Special Model - Elapsed Days Since First Fraudulent Transaction (if it exists)

In [26]:
# Base features plus the fourth special feature (time since the card's first fraud).
input_features = [*base_features, special_feautures[3]]
X_train, y_train, groups, X_test = generate_X_y(
    df_train, df_test, label, input_features
)
X_train['diff_with_first_fraud_locdt'].unique()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


array([nan,  7.,  1., 12., 11., 14.,  9.,  2.,  5.,  8., 57.,  3., 15.,
        6., 13., 17., 18., 21., 22., 23., 30., 16., 28., 31., 41., 51.,
       52., 71., 34.,  4., 24., 19., 26., 27., 36., 37., 33., 42., 44.,
       45., 10., 46., 47., 56., 58., 61., 63., 29., 20., 50., 55., 38.,
       39., 49., 70., 72., 74., 25., 43., 54., 48., 35., 32., 53., 40.,
       62., 60., 59., 75., 64., 67., 68., 69., 73., 76., 77., 80., 81.,
       83., 88., 65., 78., 79., 82., 87., 66., 84., 85.])

In [27]:
# Train the special model that adds the first-fraudulent-transaction feature.
lgbm = LGBM_Model(input_features)
oof_preds_LGBM, df_sub_preds_LGBM, clf = lgbm.run(
    X_train,
    y_train,
    groups,
    X_test,
    lgbm.lgbm_averge_precision,
    n_splits=10,
    model_name='First_Fraudulent_Transaction',
)

Starting LightGBM. Fold 1,Train shape: (1521787, 94), test shape: (421665, 94)
Training until validation scores don't improve for 100 rounds.
[100]	training's Averge Precision: 0.836907	valid_1's Averge Precision: 0.74294
[200]	training's Averge Precision: 0.874894	valid_1's Averge Precision: 0.759197
[300]	training's Averge Precision: 0.894684	valid_1's Averge Precision: 0.763892
[400]	training's Averge Precision: 0.911028	valid_1's Averge Precision: 0.767644
[500]	training's Averge Precision: 0.92278	valid_1's Averge Precision: 0.77085
[600]	training's Averge Precision: 0.932288	valid_1's Averge Precision: 0.771871
[700]	training's Averge Precision: 0.939607	valid_1's Averge Precision: 0.77215
Early stopping, best iteration is:
[684]	training's Averge Precision: 0.938599	valid_1's Averge Precision: 0.772534
Starting LightGBM. Fold 2,Train shape: (1521787, 94), test shape: (421665, 94)
Training until validation scores don't improve for 100 rounds.
[100]	training's Averge Precision: 0.

In [28]:
# Summarise the test-set predictions and compute the outlier-filtered mean per row.
df_sub_preds_statistics = generate_statistic(df_sub_preds_LGBM)
df_sub_preds_statistics['mean_remove_outlier'] = df_sub_preds_statistics.apply(
    remove_outlier, axis='columns'
)

# Store this model's out-of-fold and test-set predictions.
df_train['oof_first_fraud_model'] = oof_preds_LGBM
df_test['sub_first_fraud_model'] = df_sub_preds_statistics['mean_remove_outlier']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [29]:
# Progress marker: the printed text means "finished running".
print('跑完')

跑完


## Find the range of threshold which maximizes the f1 score

In [30]:
# Scan thresholds 0.00, 0.01, ..., 0.69 on the base model's out-of-fold
# predictions and print the F1 score obtained at each one.
# (Author's progress note on this cell: "ran up to here".)
oof_base_scores = df_train['oof_base_model']
for i in range(70):
    predicted_labels = np.where(oof_base_scores > i / 100, 1, 0)
    print(i, f1_score(y_train, predicted_labels))

0 0.026398347233912316
1 0.3465040024912532
2 0.42884876268887123
3 0.4811164741788456
4 0.5194893667436763
5 0.547837567004406
6 0.568708623145898
7 0.585598373727284
8 0.5994000545404963
9 0.6110746744120679
10 0.6203671631451333
11 0.6289935000984833
12 0.6357821049243643
13 0.6416928092513994
14 0.6467335739770902
15 0.6514675941635562
16 0.6567781993790963
17 0.6610543808628343
18 0.6639050108546364
19 0.6664571595322004
20 0.6688048769391748
21 0.6717451842305125
22 0.6724917284123212
23 0.674572921078639
24 0.676664210851534
25 0.6776950780312125
26 0.6795334965933615
27 0.6809374694197085
28 0.6822390676312996
29 0.6835159362549801
30 0.684379547393246
31 0.6846432094705689
32 0.6852758233914551
33 0.6860972728673412
34 0.6862227740700501
35 0.6863788068418857
36 0.6860141695093152
37 0.6861279158851347
38 0.6860242501595406
39 0.685500066943366
40 0.684969151601692
41 0.683969300029832
42 0.6830359578763573
43 0.6821760719870511
44 0.6819322978453388
45 0.6805382161187404
46 0

## Create Submission

Use different models to predict based on different conditions. Change the threshold to create the submission.

In [31]:
threshold = 0.35

# Row conditions that route a test transaction to one of the special models.
# The "== 1" and "== 0" masks are built separately rather than by negation.
merchant_is_blacklisted = df_test['mchno_in_fraud_mchno_list'] == 1
merchant_not_blacklisted = df_test['mchno_in_fraud_mchno_list'] == 0
after_first_fraud = merchant_not_blacklisted & (df_test['diff_with_first_fraud_locdt'] >= 1)
merchant_is_whitelisted = merchant_not_blacklisted & (df_test['mchno_in_normal_mchno_list'] > 0)
amount_is_blacklisted = merchant_not_blacklisted & (df_test['conam_in_fraud_conam_list'] == 1)

# Use base model for default
df_test['fraud_ind'] = np.where(df_test['sub_base_model'] > threshold, 1, 0)

# The overrides below run in the original order, so a later rule wins when
# several conditions hold for the same row.

# Merchant blacklist model: decides both ways.
df_test.loc[merchant_is_blacklisted & (df_test['sub_fraud_mchno_model'] > threshold), 'fraud_ind'] = 1
df_test.loc[merchant_is_blacklisted & (df_test['sub_fraud_mchno_model'] <= threshold), 'fraud_ind'] = 0

# First-fraud model: only ever raises the flag to 1, never clears it.
df_test.loc[after_first_fraud & (df_test['sub_first_fraud_model'] > threshold), 'fraud_ind'] = 1

# Merchant whitelist model: decides both ways.
df_test.loc[merchant_is_whitelisted & (df_test['sub_normal_mchno_model'] > threshold), 'fraud_ind'] = 1
df_test.loc[merchant_is_whitelisted & (df_test['sub_normal_mchno_model'] <= threshold), 'fraud_ind'] = 0

# Amount blacklist model: decides both ways.
df_test.loc[amount_is_blacklisted & (df_test['sub_fraud_conam_model'] > threshold), 'fraud_ind'] = 1
df_test.loc[amount_is_blacklisted & (df_test['sub_fraud_conam_model'] <= threshold), 'fraud_ind'] = 0

# Write the submission; the threshold is embedded in the file name.
df_test[['txkey', 'fraud_ind']].to_csv('submission_{}.csv'.format(threshold), index=False)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user