In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, RandomizedSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix, precision_recall_curve, mean_absolute_error
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_selection import SelectKBest, f_classif


In [2]:
df = pd.read_csv("../data/chargeback_data_final.csv")
df

Unnamed: 0,dia,dia_semana,periodo_semana,periodo_mes,hora,hora_completa_segundos,periodo_dia,cartão,valor,media_dia,media_cartao,total_uso_cartao,cbk,vez_uso_cartao,cartao_usado_antes
0,1,4,semana,inicio,0,114,madrugada,536518******2108,36.54,139.59,36.54,2,Não,1,False
1,1,4,semana,inicio,0,226,madrugada,536518******2108,36.54,139.59,36.54,2,Não,2,True
2,1,4,semana,inicio,0,530,madrugada,453211******1239,69.00,139.59,69.00,1,Não,1,False
3,1,4,semana,inicio,0,1620,madrugada,548827******1705,193.43,139.59,193.43,1,Não,1,False
4,1,4,semana,inicio,1,5566,madrugada,531681******9778,132.00,139.59,132.00,1,Não,1,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10999,30,5,fds,fim,23,83327,noite,439354******5281,15.00,92.96,15.00,2,Não,1,False
11000,30,5,fds,fim,23,83724,noite,549167******1648,20.00,92.96,20.00,2,Não,1,False
11001,30,5,fds,fim,23,83861,noite,518759******8384,70.00,92.96,70.00,2,Não,1,False
11002,30,5,fds,fim,23,85891,noite,518759******0329,20.00,92.96,20.00,2,Não,1,False


In [3]:
df

Unnamed: 0,dia,dia_semana,periodo_semana,periodo_mes,hora,hora_completa_segundos,periodo_dia,cartão,valor,media_dia,media_cartao,total_uso_cartao,cbk,vez_uso_cartao,cartao_usado_antes
0,1,4,semana,inicio,0,114,madrugada,536518******2108,36.54,139.59,36.54,2,Não,1,False
1,1,4,semana,inicio,0,226,madrugada,536518******2108,36.54,139.59,36.54,2,Não,2,True
2,1,4,semana,inicio,0,530,madrugada,453211******1239,69.00,139.59,69.00,1,Não,1,False
3,1,4,semana,inicio,0,1620,madrugada,548827******1705,193.43,139.59,193.43,1,Não,1,False
4,1,4,semana,inicio,1,5566,madrugada,531681******9778,132.00,139.59,132.00,1,Não,1,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10999,30,5,fds,fim,23,83327,noite,439354******5281,15.00,92.96,15.00,2,Não,1,False
11000,30,5,fds,fim,23,83724,noite,549167******1648,20.00,92.96,20.00,2,Não,1,False
11001,30,5,fds,fim,23,83861,noite,518759******8384,70.00,92.96,70.00,2,Não,1,False
11002,30,5,fds,fim,23,85891,noite,518759******0329,20.00,92.96,20.00,2,Não,1,False


### Comparando modelos

In [None]:
df['dia_sin'] = np.sin(df.dia*(2.*np.pi/30))
df['dia_cos'] = np.cos(df.dia*(2.*np.pi/30))
df['hora_sin'] = np.sin(df.hora*(2.*np.pi/24))
df['hora_cos'] = np.cos(df.hora*(2.*np.pi/24))
df['dia_semana_sin'] = np.sin(df.hora*(2.*np.pi/7))
df['dia_semana_cos'] 

In [21]:
features_list = ['dia_sin', 'dia_cos', 'hora_sin', 'hora_cos', 'dia_semana_sin', 'dia_semana_cos', 'periodo_semana', 'periodo_mes',
       'hora_completa_segundos', 'periodo_dia', 'valor', 'cartao_usado_antes']

# Separando as features e o target
# Sem usar media_dia, media_cartao e quantidade_uso_cartao para não ter data leakage
X = df.loc[:,features_list]
y = df['cbk']

# Dividindo o conjunto de dados em treinamento e teste
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# label encoding
le = LabelEncoder()
for x in ['periodo_semana', 'periodo_mes', 'periodo_dia', 'cartao_usado_antes']:
    X_train[x] = le.fit_transform(X_train[x])
    X_test[x] = le.transform(X_test[x])

y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)


In [24]:
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from xgboost import XGBClassifier
from sklearn.svm import SVC

# prepare models
models = []
models.append(('LR', LogisticRegression()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('DTC', DecisionTreeClassifier()))
models.append(('GNB', GaussianNB()))
models.append(('BNB', BernoulliNB()))
models.append(('SVM', SVC(kernel='linear')))
models.append(('RFC', RandomForestClassifier()))
models.append(('XGB', XGBClassifier()))
# evaluate each model in turn
results = []
names = []
scoring = 'accuracy'
X2 = X.copy()

# label encoding
le = LabelEncoder()
for x in ['periodo_semana', 'periodo_mes', 'periodo_dia']:
    X2[x] = le.fit_transform(X2[x])

y_labeled = le.fit_transform(y)

X_scaled = StandardScaler().fit_transform(X2)


for name, model in models:
    kfold = model_selection.KFold(n_splits=10, random_state=None)
    cv_results = model_selection.cross_val_score(model, X2, y_labeled, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)
# boxplot algorithm comparison
fig = plt.figure()
fig.suptitle('Algorithm Comparison')
ax = fig.add_subplot(111)
plt.boxplot(results)
ax.set_xticklabels(names)
plt.show()

LR: 0.946924 (0.017802)
KNN: 0.943381 (0.015464)
DTC: 0.919838 (0.027937)
GNB: 0.937924 (0.032554)
BNB: 0.944196 (0.021501)


### Random Forest Classifier

#### Transformando variáveis ​​cíclicas

In [20]:
df['dia_sin'] = np.sin(df.dia*(2.*np.pi/30))
df['dia_cos'] = np.cos(df.dia*(2.*np.pi/30))
df['hora_sin'] = np.sin(df.hora*(2.*np.pi/24))
df['hora_cos'] = np.cos(df.hora*(2.*np.pi/24))
df['dia_semana_sin'] = np.sin(df.hora*(2.*np.pi/7))
df['dia_semana_cos'] = np.cos(df.hora*(2.*np.pi/7))

In [None]:
param_grid = { 
    'n_estimators': [50, 100, 200, 300],
    'max_features': ['sqrt', 'log2'],
    'criterion' : ['gini', 'entropy', 'log_loss'], 
    'min_samples_split' : [2, 3, 4, 5], 
    'min_samples_leaf': [1, 2, 3, 4], 
    'bootstrap': [True, False],
    'max_leaf_nodes': [3,5, 6, 9, None],

}

In [None]:
rfc2=RandomForestClassifier(random_state=42)
rf_RandomGrid = RandomizedSearchCV(estimator = rfc2, param_distributions = param_grid, cv = 10, n_jobs = 4)
rf_RandomGrid.fit(X_train, y_train)

rf_RandomGrid.best_params_

In [19]:
# Criando o modelo Random Forest
rf = RandomForestClassifier(random_state=42, max_features='sqrt', n_estimators= 100, criterion='entropy')

# Treinando o modelo
rf.fit(X_train, y_train)

# Fazendo previsões no conjunto de teste
y_pred = rf.predict(X_test)

# Avaliando o modelo
print('Acurácia: ', accuracy_score(y_test, y_pred))
print('Precisão: ', precision_score(y_test, y_pred))
print('Recall: ', recall_score(y_test, y_pred))
print('F1-Score: ', f1_score(y_test, y_pred))

print(classification_report(y_test, y_pred))

Acurácia:  0.975920036347115
Precisão:  0.8571428571428571
Recall:  0.6111111111111112
F1-Score:  0.7135135135135134
              precision    recall  f1-score   support

           0       0.98      0.99      0.99      2093
           1       0.86      0.61      0.71       108

    accuracy                           0.98      2201
   macro avg       0.92      0.80      0.85      2201
weighted avg       0.97      0.98      0.97      2201



In [None]:
confusion_matrix(y_test, y_pred)

In [None]:
# predict probabilidade
y_prob = rf.predict_proba(X_test)
y_prob = y_prob[:, 1]

# Calculando precision recall curve
precision, recall, thresholds = precision_recall_curve(y_test, y_prob)
# convertendo para f score
fscore = (2 * precision * recall) / (precision + recall)
# pegando o index com maior f score
ix = np.argmax(fscore)
print('Best Threshold=%f, F-Score=%.3f' % (thresholds[ix], fscore[ix]))
# plotando precision recall curve
no_skill = len(y_test[y_test==1]) / len(y_test)
plt.plot([0,1], [no_skill,no_skill], linestyle='--', label='No Skill')
plt.plot(recall, precision, marker='.', label='RFC')
plt.scatter(recall[ix], precision[ix], marker='o', color='black', label='Best')

plt.xlabel('Recall')
plt.ylabel('Precision')
plt.legend()

plt.show()

In [None]:
threshold = 0.43

predicted_proba = rf.predict_proba(X_test)
y_pred_proba = (predicted_proba [:,1] >= threshold).astype('int')

# Avaliando o modelo
print('Acurácia: ', accuracy_score(y_test, y_pred_proba))
print('Precisão: ', precision_score(y_test, y_pred_proba))
print('Recall: ', recall_score(y_test, y_pred_proba))
print('F1-Score: ', f1_score(y_test, y_pred_proba))

print(classification_report(y_test, y_pred_proba))

In [None]:
# Feature importance
importances = rf.feature_importances_
std = np.std([tree.feature_importances_ for tree in rf.estimators_], axis=0)
feature_names = [X.columns[i] for i in range(X.shape[1])]
forest_importances = pd.Series(importances, index=feature_names)

fig, ax = plt.subplots()
forest_importances.plot.bar(yerr=std, ax=ax)
ax.set_title("Feature importances using MDI")
ax.set_ylabel("Mean decrease in impurity")
fig.tight_layout()

### Prevendo novas transações

In [None]:
# Carregando o data frame
df_pred = pd.read_excel("../data/chargeback_data.xlsx", sheet_name=1)
df_pred

In [None]:
def get_day_period(time):
    period = ""
    if 6 <= time <= 12:
        period = "manha"
    elif 13 <= time <= 18:
        period = "tarde"
    elif 19 <= time <= 23:
        period = "noite"
    elif 00 <= time <= 5:
        period = "madrugada"
    return period

def get_month_period(day):
    period = ""
    if 1 <= day <= 10:
        period = "inicio"
    elif 11 <= day <= 20:
        period = "meio"
    elif 21 <= day <= 31:
        period = "fim"
    return period

def get_week_period(day):
    period = ""
    if day in range(0,5):
        period = "semana"
    else:
        period = "fds"

    return period


df_pred_pre = df_pred.copy()
df_pred_pre.columns= df_pred_pre.columns.str.lower()
df_pred_pre.rename(columns={"dia":"data", "hora":"hora_completa"}, inplace=True)

df_pred_pre.loc[:, "dia"] = pd.to_datetime(df_pred_pre.loc[:, "data"]).dt.day.astype(int)
df_pred_pre.loc[:, "periodo_mes"] = df_pred_pre.loc[:, "dia"].apply(lambda x: get_month_period(x))
df_pred_pre.loc[:, "dia_semana"] = pd.to_datetime(df_pred_pre.loc[:, "data"]).dt.day_of_week.astype(int)
df_pred_pre.loc[:, "periodo_semana"] = df_pred_pre.loc[:, "dia_semana"].apply(lambda x: get_week_period(x))
df_pred_pre.loc[:, "hora"] = df_pred_pre.loc[:, "hora_completa"].apply(lambda x: x.hour)
df_pred_pre.loc[:, "hora_completa_segundos"] = df_pred_pre.loc[:, "hora_completa"].apply(lambda x: (x.hour * 60 + x.minute) * 60 + x.second)
df_pred_pre.loc[:, "periodo_dia"] = df_pred_pre.loc[:, "hora"].apply(lambda x: get_day_period(x))


df_pred_pre_final = df_pred_pre.loc[:, ['dia','dia_semana', 'periodo_semana', 'periodo_mes', 'hora_completa_segundos', 'periodo_dia', 'valor', 'cbk']] 

In [None]:
df_pred_pre_final

#### Previsão do modelo

In [None]:
# label encoding
le = LabelEncoder()
for x in ['periodo_semana', 'periodo_mes', 'periodo_dia', 'cbk']:
    df_pred_pre_final[x] = le.fit_transform(df_pred_pre_final[x])

X = df_pred_pre_final.loc[:,:'valor']


# Fazendo previsões
threshold = 0.43

predicted_proba = rf.predict_proba(X)
y_pred = (predicted_proba [:,1] >= threshold).astype('int')

In [None]:
len(predicted_proba)

In [None]:
import collections
collections.Counter(y_pred)

In [None]:
# Populando 
df_pred.loc[:,'CBK'] = y_pred
df_pred.replace({"CBK": {0.0: "Não",
                         1.0: "Sim",}},inplace=True)

In [None]:
df_pred

In [None]:
df_pred['CBK'].value_counts(normalize=True)

In [None]:
df_pred['CBK'].value_counts(normalize=True).plot(kind="bar")
plt.title('Quantidade de chargeback')
plt.xlabel('chargeback')
plt.xticks(rotation=0)
plt.ylabel("Count")
plt.show()

In [None]:
df_pred.to_csv("../data/predicted_chargeback_data.csv", index=False)