# Imports

In [1]:
import pandas as pd
import yaml
import os
project_dir = 'C:\\Users\\diego\\OneDrive\\Cursos\\Python\\learning_curve'

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score, precision_recall_curve, roc_auc_score, make_scorer
from sklearn.metrics import average_precision_score
from sklearn.model_selection import RandomizedSearchCV

# Data

In [2]:
treino = pd.read_parquet(os.path.join(project_dir, 'data', 'feat', 'treino.parquet.gzip'))

In [3]:
treino = treino.reset_index()

In [4]:
treino.head()

Unnamed: 0,index,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,day_of_week,time_of_day,client_age,sum_last_30_minutes,sum_last_1_hour,sum_last_2_hour,sum_last_8_hour,sum_last_12_hour,sum_last_24_hour,sum_last_72_hour
0,0,2019-01-01 12:47:15,60416207185,"fraud_Jones, Sawayn and Romaguera",0,7.27,Mary,Diaz,F,9886 Anita Drive,...,1,0,32.0,7.27,7.27,7.27,7.27,7.27,7.27,7.27
1,1,2019-01-02 08:44:57,60416207185,fraud_Berge LLC,1,52.94,Mary,Diaz,F,9886 Anita Drive,...,2,1,32.0,52.94,52.94,52.94,52.94,52.94,60.21,60.21
2,2,2019-01-02 08:47:36,60416207185,fraud_Luettgen PLC,1,82.08,Mary,Diaz,F,9886 Anita Drive,...,2,1,32.0,135.02,135.02,135.02,135.02,135.02,142.29,142.29
3,3,2019-01-02 12:38:14,60416207185,fraud_Daugherty LLC,2,34.79,Mary,Diaz,F,9886 Anita Drive,...,2,0,32.0,34.79,34.79,34.79,169.81,169.81,177.08,177.08
4,4,2019-01-02 13:10:46,60416207185,fraud_Beier and Sons,3,27.18,Mary,Diaz,F,9886 Anita Drive,...,2,0,32.0,27.18,61.97,61.97,196.99,196.99,196.99,204.26


In [5]:
treino.shape

(1296675, 36)

In [7]:
sample_treino = treino[(treino.index <= 200000) | (treino.is_fraud == 1)]

In [8]:
print(sample_treino.shape)
print(sample_treino.is_fraud.mean())

(206256, 36)
0.036391668606004186


In [9]:
features = yaml.safe_load(open(os.path.join(project_dir, 'src', 'feature', 'config', 'variaveis.yaml'), 'rb'))

# Hipertunning

In [10]:
clf = RandomForestClassifier(n_estimators=100, random_state=777, n_jobs=-1)

In [11]:
sample_treino.is_fraud.value_counts()

is_fraud
0    198750
1      7506
Name: count, dtype: int64

In [12]:
def gini_score(y_true, y_pred):
    return (2*roc_auc_score(y_true,y_pred))-1

In [13]:
params = {
    'max_depth' : [2,3,4],
    'min_samples_leaf' : [200, 500, 700, 1000, 3000, 5000],
    'class_weight' : ['balanced', {1 : 100, 0 : 1}, None, {1 : 200, 0 : 1}, 'balanced_subsample']
}

In [14]:
search = RandomizedSearchCV(clf,
                            params,
                            n_iter=30,
                            scoring=make_scorer(average_precision_score),
                            cv=3,
                            n_jobs=-1,
                            verbose=True,
                            return_train_score=True,
                            random_state=777)

In [15]:
%%time
search.fit(sample_treino[features['descritivas']], sample_treino.is_fraud)

Fitting 3 folds for each of 30 candidates, totalling 90 fits
CPU times: total: 40.8 s
Wall time: 4min 18s


In [16]:
results_cv = pd.DataFrame(search.cv_results_)
results_cv['Dif_train_teste'] = results_cv['mean_train_score'] - results_cv['mean_test_score']
results_cv[['params', 'mean_test_score', 'std_test_score', 'mean_train_score', 'std_train_score','Dif_train_teste']].sort_values(by='Dif_train_teste').head(10)

Unnamed: 0,params,mean_test_score,std_test_score,mean_train_score,std_train_score,Dif_train_teste
26,"{'min_samples_leaf': 3000, 'max_depth': 2, 'cl...",0.21037,0.062443,0.215041,0.012231,0.004671
23,"{'min_samples_leaf': 5000, 'max_depth': 4, 'cl...",0.31683,0.075306,0.322712,0.00907,0.005883
22,"{'min_samples_leaf': 5000, 'max_depth': 4, 'cl...",0.317476,0.075293,0.323612,0.009601,0.006135
15,"{'min_samples_leaf': 5000, 'max_depth': 3, 'cl...",0.326945,0.077822,0.336267,0.01002,0.009322
3,"{'min_samples_leaf': 500, 'max_depth': 2, 'cla...",0.219525,0.069522,0.230895,0.012725,0.01137
12,"{'min_samples_leaf': 5000, 'max_depth': 2, 'cl...",0.337356,0.07552,0.350136,0.012868,0.01278
8,"{'min_samples_leaf': 3000, 'max_depth': 2, 'cl...",0.362463,0.080051,0.380019,0.011259,0.017557
10,"{'min_samples_leaf': 500, 'max_depth': 2, 'cla...",0.136467,0.06154,0.156614,0.011277,0.020147
29,"{'min_samples_leaf': 3000, 'max_depth': 4, 'cl...",0.228837,0.113934,0.249153,0.003532,0.020316
16,"{'min_samples_leaf': 1000, 'max_depth': 2, 'cl...",0.375929,0.078199,0.406086,0.015961,0.030158


In [37]:
results_cv.sort_values(by='Dif_train_teste')

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_min_samples_leaf,param_max_depth,param_class_weight,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,mean_train_score,std_train_score,Dif_train_teste
12,3.947183,0.42255,3.327271,0.161258,5000,2,balanced,"{'min_samples_leaf': 5000, 'max_depth': 2, 'cl...",0.274296,0.270168,0.327728,0.290731,0.026215,13,0.323221,0.299708,0.305492,0.309474,0.010003,0.018743
15,8.315665,0.222436,2.306739,0.6424,5000,3,balanced,"{'min_samples_leaf': 5000, 'max_depth': 3, 'cl...",0.277479,0.266834,0.299923,0.281412,0.013792,14,0.315006,0.294654,0.299425,0.303028,0.008691,0.021616
8,10.417777,0.336725,1.726177,0.635141,3000,2,balanced_subsample,"{'min_samples_leaf': 3000, 'max_depth': 2, 'cl...",0.343702,0.347083,0.382364,0.357716,0.017483,8,0.393624,0.380531,0.387207,0.38712,0.005346,0.029404
11,12.685594,0.276557,2.36633,0.559815,500,2,balanced_subsample,"{'min_samples_leaf': 500, 'max_depth': 2, 'cla...",0.425241,0.398099,0.408025,0.410455,0.011213,3,0.461639,0.428081,0.437032,0.44225,0.014188,0.031796
6,10.125181,3.4713,4.182152,1.817485,500,2,balanced,"{'min_samples_leaf': 500, 'max_depth': 2, 'cla...",0.425615,0.39799,0.408135,0.41058,0.01141,2,0.462063,0.428081,0.437097,0.442414,0.014374,0.031834
9,10.885394,0.263379,1.14535,0.092157,700,2,balanced_subsample,"{'min_samples_leaf': 700, 'max_depth': 2, 'cla...",0.424245,0.398142,0.40686,0.409749,0.01085,5,0.46058,0.427036,0.437886,0.441834,0.013976,0.032085
16,7.067147,0.239335,0.768553,0.281888,1000,2,balanced,"{'min_samples_leaf': 1000, 'max_depth': 2, 'cl...",0.42538,0.397753,0.406491,0.409874,0.01153,4,0.462915,0.426566,0.437048,0.442176,0.015276,0.032302
17,6.882983,0.132362,4.137689,0.12153,3000,4,"{1: 200, 0: 1}","{'min_samples_leaf': 3000, 'max_depth': 4, 'cl...",0.170818,0.142773,0.056036,0.123209,0.048859,19,0.171476,0.150207,0.157785,0.159823,0.008802,0.036613
10,7.664947,0.311518,3.009671,0.241882,500,2,"{1: 200, 0: 1}","{'min_samples_leaf': 500, 'max_depth': 2, 'cla...",0.16062,0.156006,0.052659,0.123095,0.049842,20,0.181981,0.168132,0.161654,0.170589,0.008478,0.047494
3,11.090848,0.236842,1.255248,0.193267,500,2,"{1: 100, 0: 1}","{'min_samples_leaf': 500, 'max_depth': 2, 'cla...",0.210756,0.216139,0.130134,0.185676,0.039336,17,0.246554,0.242271,0.259277,0.249367,0.007222,0.063691


In [17]:
results_cv.iloc[15,7]

{'min_samples_leaf': 5000, 'max_depth': 3, 'class_weight': 'balanced'}

In [18]:
params = {'min_samples_leaf': 5000, 'max_depth': 3, 'class_weight': 'balanced'}