In [1]:
import pandas as pd, numpy as np, matplotlib.pyplot as plt, seaborn as sns, plotly.express as px
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import (RandomForestClassifier, AdaBoostClassifier,
                              GradientBoostingClassifier)
from sklearn.metrics import (accuracy_score, precision_score, confusion_matrix, recall_score,
                             f1_score)

In [3]:
# Load the training CSV from the project-relative data/ directory.
# NOTE(review): the preview printed after data.head() lists health-screening columns
# ending in `smoking`; no `Amount` or `Class` column is visible there, yet later cells
# read both. Confirm this is the intended input file.
data = pd.read_csv('data/train_dataset.csv')

In [4]:
# Preview the first rows to check column names and value ranges.
data.head()

Unnamed: 0,age,height(cm),weight(kg),waist(cm),eyesight(left),eyesight(right),hearing(left),hearing(right),systolic,relaxation,...,HDL,LDL,hemoglobin,Urine protein,serum creatinine,AST,ALT,Gtp,dental caries,smoking
0,35,170,85,97.0,0.9,0.9,1,1,118,78,...,70,142,19.8,1,1.0,61,115,125,1,1
1,20,175,110,110.0,0.7,0.9,1,1,119,79,...,71,114,15.9,1,1.1,19,25,30,1,0
2,45,155,65,86.0,0.9,0.9,1,1,110,80,...,57,112,13.7,3,0.6,1090,1400,276,0,0
3,45,165,80,94.0,0.8,0.7,1,1,158,88,...,46,91,16.9,1,0.9,32,36,36,0,0
4,20,165,60,81.0,1.5,0.1,1,1,109,64,...,47,92,14.9,1,1.2,26,28,15,0,0


In [None]:
# Summary statistics of the `Amount` column, rounded to two decimals.
data["Amount"].describe().round(2)

In [None]:
# Distribution of amounts: one point per row (x = amount, y = row index),
# coloured by the value of the `Class` column.

fig = px.scatter(
    data,
    x="Amount",
    y=data.index,
    color=data["Class"],
    title='Distribuição dos valores',
)
fig.update_layout(
    xaxis_title="Valor da transação (em €)",
    yaxis_title="Transações",
)
fig.show()

In [None]:
# Share of each `Class` value among all rows; every slice shows its percentage and label.
# NOTE(review): `color_discrete_map` may only take effect when `color=` is also passed —
# verify that the blue/red mapping is actually applied.
fig = px.pie(
    data,
    names="Class",
    title="Fraudulent x Genuine Transactions in the Dataset",
    color_discrete_map={0: 'blue', 1: 'red'},
)
fig.update_traces(textinfo="percent+label")
fig.show()

In [None]:
# Summary statistics of `Amount` restricted to the rows where Class == 1.
data.loc[data["Class"] == 1, "Amount"].describe()

In [None]:
# Amounts of the Class == 1 rows only, plotted against their original row index.
# The subset is computed once and reused for both the data and the y values.
fraud_rows = data.query("Class == 1")
fig = px.scatter(
    fraud_rows,
    x="Amount",
    y=fraud_rows.index,
    title='Distribution de valores fraudulentos',
)
fig.update_layout(
    xaxis_title='Valor da transação (em €)',
    yaxis_title='Transações',
)
fig.show()

In [None]:
# Split the data into training (70%) and test (30%) partitions.
#
# The split is stratified on `Class` so both partitions keep the class proportions of
# the full frame. Without stratification a random split can leave the test set with a
# different share of Class == 1 rows than the training set, which distorts the
# precision/recall figures computed later. `random_state` keeps the split reproducible.

train, test = train_test_split(
    data,
    test_size=0.3,
    random_state=42,
    stratify=data['Class'],
)

# Features: every column except the target.
X_train = train.drop(columns=['Class'])
X_test = test.drop(columns=['Class'])

# Target: the `Class` column.
y_train = train['Class']
y_test = test['Class']

In [None]:
# Report the shape of each partition and the test share as a whole-number percentage.
print('Tamanho do X_train: ', X_train.shape)
print('Tamanho do X_test: ', X_test.shape)
print('Proporção do X_test:', f"{round(len(X_test) / (len(X_train) + len(X_test)) * 100)}%")
print('Tamanho do y_train: ', y_train.shape)
print('Tamanho do y_test:', y_test.shape)
print('Proporção do y_test:', f"{round(len(y_test) / (len(y_train) + len(y_test)) * 100)}%")

In [None]:
# Scale the `Amount` column of the training features.
# The scaler is fitted here, on the training values, and the column is overwritten
# with the scaled values. The input is passed as a 2-D (n_rows, 1) array.

scaler = StandardScaler()
X_train['Amount'] = scaler.fit_transform(X_train[['Amount']].to_numpy())
X_train

In [None]:
# Scale the `Amount` column of the test features.
#
# The scaler fitted on the training set is reused (`transform`, not `fit_transform`).
# Fitting a second scaler on the test set would scale train and test with different
# parameters and would let test-set statistics leak into preprocessing.
#
# The raw values are read from `test` (which is never modified) rather than from
# `X_test`, so re-running this cell does not scale the column twice.

X_test['Amount'] = scaler.transform(test[['Amount']].to_numpy())
X_test

In [None]:
# Fit three ensemble classifiers on the training split and predict the test split.
# Each model uses 100 estimators and a fixed random_state of 42.
# `fit` returns the fitted estimator, so construction and fitting are chained.

# Random Forest
random_forest = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)
y_predictions_rf = random_forest.predict(X_test)

# AdaBoost
ada_boost = AdaBoostClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)
y_predictions_ab = ada_boost.predict(X_test)

# Gradient Boosting
gradiente_boosting = GradientBoostingClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)
y_prediction_gb = gradiente_boosting.predict(X_test)

In [None]:
# Evaluation metrics for the Random Forest predictions on the test split.
# Each row is [metric name, score]; the 'Accuracy' label replaces the misspelled
# 'Accurary' that previously appeared in the displayed table.

random_forest_metric = [
    ['Accuracy', accuracy_score(y_test, y_predictions_rf)],
    ['Precision', precision_score(y_test, y_predictions_rf)],
    ['Recall', recall_score(y_test, y_predictions_rf)],
    ['F1_score', f1_score(y_test, y_predictions_rf)],
]

metricas_rf = pd.DataFrame(random_forest_metric, columns=["Métrica", "Resultados"])
metricas_rf

In [None]:
# Confusion matrix for the Random Forest predictions on the test split.

confusion_matrix_rf = confusion_matrix(y_test, y_predictions_rf)

# Heatmap of the counts: actual values on the y axis, predicted values on the x axis.
# NOTE(review): the tick labels assume two classes ordered genuine, then fraud — confirm.

ax = plt.figure(figsize=(10, 7)).add_subplot()
sns.heatmap(confusion_matrix_rf, annot=True, fmt="g", ax=ax)
ax.set(
    xlabel='Valores Previstos',
    ylabel='Valores Reais',
    title='Matriz de Confusão - Floresta Aleatória',
)
ax.set_xticklabels(['Genuíno', 'Fraude'])
ax.set_yticklabels(['Genuíno', 'Fraude'])
plt.show()

In [None]:
# Evaluation metrics for the AdaBoost predictions on the test split.
# Each row is [metric name, score]; the 'Accuracy' label replaces the misspelled
# 'Accurary' that previously appeared in the displayed table.

ab_metric = [
    ['Accuracy', accuracy_score(y_test, y_predictions_ab)],
    ['Precision', precision_score(y_test, y_predictions_ab)],
    ['Recall', recall_score(y_test, y_predictions_ab)],
    ['F1_score', f1_score(y_test, y_predictions_ab)],
]

metricas_ab = pd.DataFrame(ab_metric, columns=["Métrica", "Resultados"])
metricas_ab

In [None]:
# Confusion matrix for the AdaBoost predictions on the test split.

confusion_matrix_ab = confusion_matrix(y_test, y_predictions_ab)

# Heatmap of the counts: actual values on the y axis, predicted values on the x axis.
# NOTE(review): the tick labels assume two classes ordered genuine, then fraud — confirm.

ax = plt.figure(figsize=(10, 7)).add_subplot()
sns.heatmap(confusion_matrix_ab, annot=True, fmt="g", ax=ax)
ax.set(
    xlabel='Valores Previstos',
    ylabel='Valores Reais',
    title='Matriz de Confusão - Ada Boost',
)
ax.set_xticklabels(['Genuíno', 'Fraude'])
ax.set_yticklabels(['Genuíno', 'Fraude'])
plt.show()

In [None]:
# Evaluation metrics for the Gradient Boosting predictions on the test split.
# Each row is [metric name, score]; the 'Accuracy' label replaces the misspelled
# 'Accurary' that previously appeared in the displayed table.

gradiente_boosting_metric = [
    ['Accuracy', accuracy_score(y_test, y_prediction_gb)],
    ['Precision', precision_score(y_test, y_prediction_gb)],
    ['Recall', recall_score(y_test, y_prediction_gb)],
    ['F1_score', f1_score(y_test, y_prediction_gb)],
]

metricas_gb = pd.DataFrame(gradiente_boosting_metric, columns=["Métrica", "Resultados"])
metricas_gb

In [None]:
# Confusion matrix for the Gradient Boosting predictions on the test split.

confusion_matrix_gb = confusion_matrix(y_test, y_prediction_gb)

# Heatmap of the counts: actual values on the y axis, predicted values on the x axis.
# NOTE(review): the tick labels assume two classes ordered genuine, then fraud — confirm.

ax = plt.figure(figsize=(10, 7)).add_subplot()
sns.heatmap(confusion_matrix_gb, annot=True, fmt="g", ax=ax)
ax.set(
    xlabel='Valores Previstos',
    ylabel='Valores Reais',
    title='Matriz de Confusão - Gradient Boost',
)
ax.set_xticklabels(['Genuíno', 'Fraude'])
ax.set_yticklabels(['Genuíno', 'Fraude'])
plt.show()