# Avaliação do Modelo

In [2]:
import pandas as pd
import pickle
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score, roc_curve, average_precision_score

# Dados

In [3]:
# Read the three encoded splits (train / validation / test) from parquet,
# in that order, and bind each one to its own frame.
df_train, df_valid, df_test = (
    pd.read_parquet(split_path)
    for split_path in (
        '../data/encoded/fraud_dataset_v2_train.parquet.gzip',
        '../data/encoded/fraud_dataset_v2_valid.parquet.gzip',
        '../data/encoded/fraud_dataset_v2_test.parquet.gzip',
    )
)

In [4]:
# Mean of the `fraude` column for each split, one value per line
# (train, validation, test). In the recorded run the test split shows a
# visibly higher mean than the other two.
print(
    *(split['fraude'].mean() for split in (df_train, df_valid, df_test)),
    sep='\n',
)

0.03100739300211449
0.030991480429682677
0.04832882688418804


In [5]:
# Fold the validation rows into the training frame, tag every row with the
# split it belongs to ('train' or 'test'), then stack both into a single `df`.
# NOTE(review): this cell rebinds `df_train`, so running it a second time
# without re-running the loading cell appends `df_valid` again.
df_train = (
    pd.concat((df_train, df_valid), ignore_index=True)
    .assign(group='train')
)
df_test['group'] = 'test'  # added in place on the already-loaded test frame
df = pd.concat((df_train, df_test), ignore_index=True)

# Carregamento do Modelo e Escoragem

In [7]:
# Load the fitted model from disk (presumably a LightGBM classifier, judging
# by the file name — confirm against the training notebook).
# The `with` block closes the file handle as soon as unpickling finishes or
# fails; a bare `open(...)` inside `pickle.load` leaves it open until the
# garbage collector gets to it.
# SECURITY: unpickling can execute arbitrary code. Only load artifacts that
# were produced by this project's own training pipeline.
with open('../model/predictors/lgbm_model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

In [12]:
# Score every row with the loaded model.
# Columns are selected by `model.feature_names_in_` so the input matches the
# feature names recorded on the fitted model.
model_inputs = df[model.feature_names_in_]
# Second column of `predict_proba` (presumably the probability of the fraud
# class — confirm the class order via `model.classes_`).
fraud_probability = model.predict_proba(model_inputs)[:, 1]
# Express the score on a 0-100 scale with two decimals.
df['new_score'] = (fraud_probability * 100).round(2)

## Estabilidade

In [13]:
# Amount attributed to fraud on each row: `monto` times the `fraude` flag
# (assumes `fraude` is a 0/1 indicator — TODO confirm).
df['valor_fraude'] = df['monto'] * df['fraude']

# One row per distinct score, ordered from the highest score downwards.
# After the cumulative sum, each row holds the totals of every transaction
# whose score is at or above that row's score, i.e. what a cut-off placed at
# that score would flag.
#   qtd_*   -> counts of transactions (fraud / all)
#   valor_* -> summed amounts (fraud / all)
# The derived metrics come in two flavours: `_#` by count and `_$` by amount.
# The `.max()` of a cumulative column is used as the overall total.
df_thresholds = (
    df
    .groupby('new_score')
    .agg(
        qtd_fraude=('fraude', 'sum'),
        qtd_total=('fraude', 'count'),
        valor_fraude=('valor_fraude', 'sum'),
        valor_total=('monto', 'sum'),
    )
    .sort_values(by='new_score', ascending=False)
    .cumsum()
    .reset_index()
    .assign(**{
        # Non-fraud flagged so far / all non-fraud.
        'fpr_#': lambda t: (t['qtd_total'] - t['qtd_fraude'])
        / (t['qtd_total'].max() - t['qtd_fraude'].max()),
        'fpr_$': lambda t: (t['valor_total'] - t['valor_fraude'])
        / (t['valor_total'].max() - t['valor_fraude'].max()),
        # Fraud flagged so far / everything flagged so far.
        'precision_#': lambda t: t['qtd_fraude'] / t['qtd_total'],
        'precision_$': lambda t: t['valor_fraude'] / t['valor_total'],
        # Fraud flagged so far / all fraud.
        'recall_#': lambda t: t['qtd_fraude'] / t['qtd_fraude'].max(),
        'recall_$': lambda t: t['valor_fraude'] / t['valor_fraude'].max(),
    })
)

# Show a small window of the table (positions 188 to 194).
df_thresholds.iloc[188:195, :]

Unnamed: 0,new_score,qtd_fraude,qtd_total,valor_fraude,valor_total,fpr_#,fpr_$,precision_#,precision_$,recall_#,recall_$
188,74.73,273,299,7566.61,9737.12,0.000108,0.00021,0.913043,0.777089,0.029431,0.011147
189,74.4,274,300,7575.06,9745.57,0.000108,0.00021,0.913333,0.777282,0.029539,0.011159
190,74.3,274,301,7575.06,9749.55,0.000112,0.000211,0.910299,0.776965,0.029539,0.011159
191,74.16,276,303,8184.16,10358.65,0.000112,0.000211,0.910891,0.79008,0.029754,0.012056
192,74.09,277,304,8248.62,10423.11,0.000112,0.000211,0.911184,0.791378,0.029862,0.012151
193,73.87,277,306,8248.62,10427.81,0.00012,0.000211,0.905229,0.791021,0.029862,0.012151
194,73.83,279,308,8312.86,10492.05,0.00012,0.000211,0.905844,0.792301,0.030078,0.012246


In [16]:
# Feature-importance table: one row per model feature, most important first,
# with a running total of the importance values.
df_importance = (
    pd.DataFrame({
        'feature': model.feature_names_in_,
        'importance': model.feature_importances_,
    })
    # `ignore_index=True` renumbers the rows 0..n-1 after sorting.
    .sort_values(by='importance', ascending=False, ignore_index=True)
    .assign(importance_cumsum=lambda t: t['importance'].cumsum())
)
df_importance

Unnamed: 0,feature,importance,importance_cumsum
0,a,453,453
1,m,224,677
2,l,207,884
3,h,186,1070
4,d,173,1243
5,f,165,1408
6,br_qty_cat_last7d,163,1571
7,o,135,1706
8,vl_fraud_cat_last7d,86,1792
9,br_vl_cat_last7d,38,1830
