# 1 - Reading, Cleaning, Encoding and Balancing

In [151]:
import pandas as pd
import numpy as np
import datetime as dt

#First, we read our base:
base=pd.read_csv('1.BASE.csv',sep=';')

#Exchange infinity by nan and dropna:
base=base.replace([np.inf, -np.inf], np.nan)
base=base.dropna()

In [152]:
from datetime import date

def idade(born):
    '''Function to determinate the age of a person'''
    
    today = date.today()
    return today.year - born.year - ((today.month, today.day) < (born.month, born.day))

In [153]:
#Let's explore our data:
base.columns

Index(['ID', 'FRAUDE', 'DT_CONTRATACAO', 'DATA_ATIVACAO',
       'LIMITE_CREDITO_CARTAO', 'DATA_NASCIMENTO_CLIENTE',
       'FLAG_CARTAO_ADICIONAL', 'OPTIN_FATURA_POR_EMAIL',
       'FLAG_VALIDACAO_CADASTRAL', 'FLAG_EMAIL_INFORMADO', 'RENDA_INFORMADA',
       'RENDA_BUREAU1', 'RENDA_BUREAU2', 'SCORE_CREDITO_BUREAU1',
       'SCORE_CREDITO_BUREAU2', 'SCORE_FRAUDE', 'FLAG_DESENVOLVIMENTO',
       'DIAS_ATIVACAO', 'ORIGEM_VENDA_CONTRATO_OUTBOUND', 'GENERO_MASCULINO',
       'DATA_NASCIMENTO_CLIENTE_Y', 'NATURALIDADE_N', 'ESTADO_CIVIL_N',
       'DOMINIO_EMAIL_N', 'ESCOLARIDADE_N', 'CEP_COUNT', 'CEP_RES_COM',
       'CEP_RES_B1', 'CEP_RES_B2', 'DDD', 'UF_EMISSAO_RG_N', 'CIDADE_1',
       'CIDADE_2', 'CIDADE_3', 'CIDADE_4', 'CIDADE_5', 'RENDA_DIF_1',
       'RENDA_DIF_2', 'PERC_LIM_RI', 'PERC_LIM_B1', 'PERC_LIM_B2',
       'DIAS_REF_NASC', '('QT_DIAS_ATRASO', 'count')',
       '('QT_DIAS_ATRASO', 'mean')', '('QT_DIAS_ATRASO', 'sum')',
       '('DIAS_REF_CONT', 'count')', '('DIAS_REF_CONT', 

In [154]:
#First, we are looking for features to encode:
print(base['CIDADE_1'].unique())
print(base['CIDADE_2'].unique())
print(base['CIDADE_3'].unique())
print(base['CIDADE_4'].unique())
print(base['CIDADE_5'].unique())
print(base['NATURALIDADE_N'].unique())
print(base['CEP_RES_COM'].unique())
print(base['CEP_RES_B1'].unique())
print(base['CEP_RES_B2'].unique())
print(base['UF_EMISSAO_RG_N'].unique())
print(base['ESTADO_CIVIL_N'].unique())
print(base['DDD'].unique())
print(base['GENERO_MASCULINO'].unique())

print('Every feature for localization and gender is encoded.')

[1 0]
[0 1]
[0 1]
[0 1]
[0 1]
[0 1]
[0 1]
[0 1]
[0 1]
[1 0]
[1 0]
[0 1]
[1 0]
Every feature for localization and gender is encoded.


In [155]:
print(base['FLAG_CARTAO_ADICIONAL'].unique())
print(base['OPTIN_FATURA_POR_EMAIL'].unique())
print(base['FLAG_VALIDACAO_CADASTRAL'].unique())
print(base['FLAG_EMAIL_INFORMADO'].unique())
print(base['FLAG_DESENVOLVIMENTO'].unique())
print(base['ORIGEM_VENDA_CONTRATO_OUTBOUND'].unique())
print(base['DOMINIO_EMAIL_N'].unique())

print("Every feature for flagging is encoded")

[1 0]
[1 0]
[1 0]
[1 0]
[0 1]
[0 1]
[1. 0.]
Every feature for flagging is encoded


In [156]:
#For our datetime data, we can adjust as following:
contrat=pd.to_datetime(base['DT_CONTRATACAO'])
ativ=pd.to_datetime(base['DATA_ATIVACAO'])

#We set the delta between "ativacao" and "contratacao":
base['Tempo entre ativação e contratação']=contrat-ativ
base['Tempo entre ativação e contratação']=base['Tempo entre ativação e contratação'].dt.days

In [157]:
#Now we calculate the age of each client:

base['DATA_NASCIMENTO_CLIENTE']=pd.to_datetime(base['DATA_NASCIMENTO_CLIENTE'])
base['Idade do Cliente'] = base['DATA_NASCIMENTO_CLIENTE'].apply(idade)

In [158]:
#Since:
print(len(base['ID'].unique()))
print(base.shape)  #Such a small base!

#We can set "ID" as index to our df:
base=base.set_index('ID')

84978
(84978, 80)


In [159]:
#We replace the datetime data by our delta and our age feature:
X = base.drop(columns=['FRAUDE','DATA_NASCIMENTO_CLIENTE','DT_CONTRATACAO','DATA_ATIVACAO'])
y = base['FRAUDE']

In [160]:
print("The fraudulent samples are given by y=1 \n")
p=100*( len(y[y==1]) )/( len(y[y==1])+len(y[y==0]) )
print('In our set there are {:.2f}% of fraudulent samples.'.format(p))

The fraudulent samples are given by y=1 

In our set there are 0.66% of fraudulent samples.


In [161]:
#Even though gradient boosting is an ensemble method, let's check our correlation table: 
base.corr(method='pearson', min_periods=1)

Unnamed: 0,FRAUDE,LIMITE_CREDITO_CARTAO,FLAG_CARTAO_ADICIONAL,OPTIN_FATURA_POR_EMAIL,FLAG_VALIDACAO_CADASTRAL,FLAG_EMAIL_INFORMADO,RENDA_INFORMADA,RENDA_BUREAU1,RENDA_BUREAU2,SCORE_CREDITO_BUREAU1,...,"('PERC_APOS', 'sum')","('PERC_APOS', 'mean')","('PERC_VAL', 'sum')","('PERC_VAL', 'mean')","('D_ATIV', 'sum')","('D_ATIV', 'mean')",LIMITE_DISPONIVEL_APOS_TRANSACAO_mean,LIMITE_DISPONIVEL_APOS_TRANSACAO_min,Tempo entre ativação e contratação,Idade do Cliente
FRAUDE,1.000000,0.103157,-0.002275,0.041370,0.054656,0.033648,-0.000281,0.057480,0.017883,0.064035,...,-0.007857,0.001083,-0.012287,0.003017,-0.044403,-0.092203,0.047607,-0.000208,0.027753,0.012404
LIMITE_CREDITO_CARTAO,0.103157,1.000000,0.044880,-0.094702,-0.202148,-0.049232,-0.002601,0.088293,0.102165,0.262917,...,0.063679,0.107078,-0.061197,-0.056722,0.108510,0.030705,0.803417,0.459128,-0.053369,0.267574
FLAG_CARTAO_ADICIONAL,-0.002275,0.044880,1.000000,-0.113888,-0.173894,-0.105539,-0.001069,-0.002664,0.009879,0.065760,...,0.005307,0.019252,-0.003586,-0.004020,-0.007822,-0.032055,0.040726,0.021871,-0.012571,0.123256
OPTIN_FATURA_POR_EMAIL,0.041370,-0.094702,-0.113888,1.000000,0.748855,0.760431,0.002435,0.018131,-0.009477,-0.270724,...,0.023402,-0.075772,0.026526,-0.016727,0.093375,0.083328,-0.117678,-0.119957,0.171247,-0.407197
FLAG_VALIDACAO_CADASTRAL,0.054656,-0.202148,-0.173894,0.748855,1.000000,0.656818,0.002821,0.009797,-0.021223,-0.344365,...,0.021167,-0.112907,0.043572,-0.013772,0.120737,0.112036,-0.218878,-0.187774,0.227955,-0.443797
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
"('D_ATIV', 'mean')",-0.092203,0.030705,-0.032055,0.083328,0.112036,0.087035,-0.004760,-0.004494,-0.018690,-0.077721,...,0.137542,-0.082305,0.125364,-0.048998,0.631707,1.000000,-0.022019,-0.109164,0.338487,-0.062019
LIMITE_DISPONIVEL_APOS_TRANSACAO_mean,0.047607,0.803417,0.040726,-0.117678,-0.218878,-0.077995,-0.002128,0.072145,0.088695,0.223618,...,0.195880,0.478531,-0.045225,-0.042820,0.010110,-0.022019,1.000000,0.754817,0.019386,0.258055
LIMITE_DISPONIVEL_APOS_TRANSACAO_min,-0.000208,0.459128,0.021871,-0.119957,-0.187774,-0.090094,-0.000867,0.046538,0.055273,0.150047,...,-0.004995,0.341424,-0.053426,-0.011495,-0.104312,-0.109164,0.754817,1.000000,-0.079683,0.175219
Tempo entre ativação e contratação,0.027753,-0.053369,-0.012571,0.171247,0.227955,0.145560,0.000557,0.005408,0.003805,-0.085569,...,0.072077,0.099500,0.064772,0.001026,0.192206,0.338487,0.019386,-0.079683,1.000000,-0.058229


In [162]:
print("This table can lead us to better results if our model prediction performs badly")

This table can lead us to better results if our model prediction performs badly


In [163]:
from sklearn.model_selection import train_test_split

#We implement a regularization of our features:
X=(X-np.mean(X,axis=0))/np.std(X,axis=0)

#Then, we divide our set:
X_treino, X_test, y_treino, y_test = train_test_split(X, y, test_size=0.3)

In [164]:
#We will balance classes using SMOTE method:
from imblearn import under_sampling, over_sampling
from imblearn.over_sampling import SMOTE

smo=SMOTE(random_state=2)
over_X,over_y = smo.fit_sample(X_treino, y_treino)

# 2 - Let's boost

In [165]:
import xgboost as xgb

#Now we convert our data do a DMatrix:

dmatrix = xgb.DMatrix(data=X,label=y)
dmatrix_test = xgb.DMatrix(X_test, label=y_test)

In [168]:
xg_reg = xgb.XGBClassifier(objective='binary:logistic',colsample_bytree = 1,max_depth = 6, 
                                min_child_weight=1, eta=.3, subsample=1)

In [169]:
#Time to fit and predict:
xg_reg.fit(over_X, over_y)

pred_1 = xg_reg.predict(over_X)
pred_2 = xg_reg.predict(X_test)

**Some metrics**

In [170]:
from sklearn.metrics import roc_auc_score

auc = np.sqrt(roc_auc_score(over_y, pred_1))
print("For training set:  %f" % (auc))

auc = np.sqrt(roc_auc_score(y_test, pred_2))
print("For test set:  %f" % (auc))

For training set:  1.000000
For test set:  0.898144


In [171]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support as score

print('For training set:\n')
print('Accuracy: {}'.format(accuracy_score(over_y,pred_1)))

precision, recall, fscore, support=score(over_y,pred_1)
print('Precision for each class:{}'.format(precision))
print('F_1-score for each class: {}'.format(fscore))
print('Recall for each class:{}'.format(recall))



print('\nFor test set:\n')
print('Accuracy: {}'.format(accuracy_score(y_test,pred_2)))

precision, recall, fscore, support = score(y_test,pred_2)
print('Precision for each class:{}'.format(precision))
print('F_1-score for each class: {}'.format(fscore))
print('Recall for each class:{}'.format(recall))


For training set:

Accuracy: 1.0
Precision for each class:[1. 1.]
F_1-score for each class: [1. 1.]
Recall for each class:[1. 1.]

For test set:

Accuracy: 0.9967051070840197
Precision for each class:[0.9973986  0.85365854]
F_1-score for each class: [0.998343   0.71428571]
Recall for each class:[0.99928918 0.61403509]


Overall **it is a pretty ok result**. Is it possible to improve this by tuning some parameters? Let's find out:

## 3 - Tuning and Learning

In [91]:
#We set our initial parameters:
parameters = {
    'max_depth':6,
    'min_child_weight': 1,
    'eta':.3,
    'subsample': 1,
    'colsample_bytree': 1,
    'objective':'reg:logistic',
}

#Setting our evaluation metric and num_boost_round (maximum number of boosts)
parameters['eval_metric'] = "auc"
num_boost_round = 900

#First we set our grid:
grid_parameters = [
    (max_depth, min_child_weight,eta)
    for max_depth in range(9,12)
    for min_child_weight in range(5,8)
    for eta in [.3, .2, .1, .05, .01, .005]   #We will try to use alpha as a regularization factor
]

#Set initial minimal rmse and best parameters:
min_auc = float("Inf") #this infinity dude will make sense on the next cell 
best_parameters = None

In [92]:
#Let's run our grid searching the best parameters:

for max_depth, min_child_weight,eta in grid_parameters:
    print("Cross-validation with max_depth={}, min_child_weight={}, eta={}".format(max_depth,min_child_weight, eta))    
    
    parameters['max_depth'] = max_depth
    parameters['min_child_weight'] = min_child_weight
    parameters['eta'] = eta
    
    #Cross-validation time!
    cv_results = xgb.cv(
        parameters,
        dmatrix,
        num_boost_round=num_boost_round,
        seed=42,
        nfold=5,
        metrics={'auc'},
        early_stopping_rounds=10
    )    
    
    #Update auc:
    mean_auc = cv_results['test-auc-mean'].min()
    boost_rounds = cv_results['test-auc-mean'].argmin()
    
    print("\tauc {} for {} rounds".format(mean_auc, boost_rounds))
    if mean_auc < min_auc:
        min_auc = mean_auc
        best_parameters = (max_depth,min_child_weight,eta)

Cross-validation with max_depth=9, min_child_weight=5, eta=0.3
	auc 0.8367486 for 0 rounds
Cross-validation with max_depth=9, min_child_weight=5, eta=0.2
	auc 0.8367486 for 0 rounds
Cross-validation with max_depth=9, min_child_weight=5, eta=0.1
	auc 0.8367486 for 0 rounds
Cross-validation with max_depth=9, min_child_weight=5, eta=0.05
	auc 0.8367486 for 0 rounds
Cross-validation with max_depth=9, min_child_weight=5, eta=0.01
	auc 0.8367462 for 1 rounds
Cross-validation with max_depth=9, min_child_weight=5, eta=0.005
	auc 0.8367464 for 2 rounds
Cross-validation with max_depth=9, min_child_weight=6, eta=0.3
	auc 0.8352456 for 0 rounds
Cross-validation with max_depth=9, min_child_weight=6, eta=0.2
	auc 0.8352456 for 0 rounds
Cross-validation with max_depth=9, min_child_weight=6, eta=0.1
	auc 0.8352456 for 0 rounds
Cross-validation with max_depth=9, min_child_weight=6, eta=0.05
	auc 0.8352456 for 0 rounds
Cross-validation with max_depth=9, min_child_weight=6, eta=0.01
	auc 0.8352424 for 1 

In [93]:
#The end of our adveture:
print("Best parameters: {}, {}, {}, auc: {}".format(best_parameters[0], best_parameters[1], best_parameters[2], min_auc))


Best parameters: 9, 7, 0.3, auc: 0.8352006000000001


In [172]:
#Finally, we reach our golden pot! Let's train and test:
xg_classifier = xgb.XGBClassifier(objective ='binary:logistic', colsample_bytree = 1,max_depth = 9,
                                min_child_weight=7, eta=.3, subsample=1)

In [173]:
#Le fit:
xg_classifier.fit(over_X,over_y)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, eta=0.3, gamma=0,
              gpu_id=-1, importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=9,
              min_child_weight=7, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [174]:
#Le predict:
pred_1 = xg_classifier.predict(over_X)
pred_2 = xg_classifier.predict(X_test)

In [175]:
from sklearn.metrics import roc_auc_score

auc = np.sqrt(roc_auc_score(over_y, pred_1))
print("For training set:  %f" % (auc))

auc = np.sqrt(roc_auc_score(y_test, pred_2))
print("For test set:  %f" % (auc))

For training set:  1.000000
For test set:  0.909391


In [176]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support as score

print('For training set:\n')
print('Accuracy: {}'.format(accuracy_score(over_y,pred_1)))

precision, recall, fscore, support=score(over_y,pred_1)
print('Precision for each class:{}'.format(precision))
print('F_1-score for each class: {}'.format(fscore))
print('Recall for each class:{}'.format(recall))



print('\nFor test set:\n')
print('Accuracy: {}'.format(accuracy_score(y_test,pred_2)))

precision, recall, fscore, support = score(y_test,pred_2)
print('Precision for each class:{}'.format(precision))
print('F_1-score for each class: {}'.format(fscore))
print('Recall for each class:{}'.format(recall))


For training set:

Accuracy: 1.0
Precision for each class:[1. 1.]
F_1-score for each class: [1. 1.]
Recall for each class:[1. 1.]

For test set:

Accuracy: 0.9967051070840197
Precision for each class:[0.99767323 0.81751825]
F_1-score for each class: [0.99834254 0.72727273]
Recall for each class:[0.99901276 0.65497076]


And there it is: a slight improvement in performance.

It is a tipical case of a mild overfit.