# **Código Novo**

## Importando as bibliotecas e mudando opções do Pandas

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install plotly --upgrade
!pip install sklearn --upgrade

Requirement already up-to-date: plotly in /usr/local/lib/python3.7/dist-packages (4.14.3)
Requirement already up-to-date: sklearn in /usr/local/lib/python3.7/dist-packages (0.0)


In [3]:
# Dataframe e matemática
import numpy as np
import pandas as pd

# Visualização

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go

# Machine Learning: Random Forest
from sklearn.ensemble import RandomForestClassifier

# Machine Learning: split, grid search e cross validation
from sklearn.model_selection import train_test_split
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV

# Métricas
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import roc_auc_score



In [4]:
pd.options.display.max_colwidth = 5000

In [5]:
pd.options.display.float_format= "{:.5f}".format

In [6]:
!pip install ipython-autotime

%load_ext autotime

time: 254 µs (started: 2021-04-14 12:36:16 +00:00)


## Importando DF, pesquisando hiperparâmetros e treinando modelos

In [7]:
df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/Trabalho em grupo/Limpa_G4")
df = df.iloc[:,1:]
df

Unnamed: 0,Months since Last Donation,Number of Donations,Months since First Donation,Made Donation in March 2007
0,2,50,98,1
1,0,13,28,1
2,1,16,35,1
3,2,20,45,1
4,1,24,77,0
...,...,...,...,...
743,23,2,38,0
744,21,2,52,0
745,23,3,62,0
746,39,1,39,0


time: 27 ms (started: 2021-04-14 12:36:16 +00:00)


In [8]:
# Separando X e y 
X = df.iloc[:,:-1]
y = df.iloc[:,-1]

#Separando os dados em teste e treino

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, stratify = y,  random_state= 6)

#Separando os dados de treinamento em treinamento (final) e validação

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, stratify = y_train, random_state= 6)

# Calculando as proporções do valor 1 entre os splits
prop_real = df[df["Made Donation in March 2007"] == 1]["Made Donation in March 2007"].count()/df["Made Donation in March 2007"].count()
prop_treino = y_train[y_train == 1].count()/y_train.count()
prop_teste = y_test[y_test == 1].count()/y_test.count()
prop_validação = y_val[y_val == 1].count()/y_val.count()

# Criando um Data Frame para avaliar se a proporção de valor 1 (doou sangue em março de 2007) é identica nos datasets.
df_train_test_prop = pd.DataFrame([prop_real,prop_treino,prop_teste, prop_validação]).T
df_train_test_prop.rename(columns = {0:"Proporção Real", 1: "Proporção de Treinamento", 2: "Proporção de Teste", 3: "Proporção de Validação"}, 
                          inplace = True)

df_train_test_prop

Unnamed: 0,Proporção Real,Proporção de Treinamento,Proporção de Teste,Proporção de Validação
0,0.23797,0.23819,0.23894,0.23622


time: 40.8 ms (started: 2021-04-14 12:36:16 +00:00)


In [9]:
# Normalizando X_train

scaler = MinMaxScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)

# Normalizando o resto

X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

time: 7.68 ms (started: 2021-04-14 12:36:16 +00:00)


In [10]:
# Separando as métricas
métricas = ["accuracy", "precision", "recall", "f1", "roc_auc"]
range(len(métricas))

range(0, 5)

time: 10.6 ms (started: 2021-04-14 12:36:16 +00:00)


In [11]:
# Criando o modelo
RF = RandomForestClassifier(random_state = 6)

#Definindo as métricas
métricas = ["accuracy", "precision", "recall", "f1_macro", "roc_auc"]

# Separando os hiperparâmetros
hiperparametros = {"n_estimators": [x for x in range(50,550,50)],
                   "max_features": ["auto", "sqrt"],
                   "min_samples_split": [2,6,8,10],
                   "min_samples_leaf": [1,3,5],
                   "class_weight": ["balanced"]}

#Realizando o GridSearch
resultados = {}

for i in range(len(métricas)):
  
  GS = GridSearchCV(RF, hiperparametros, scoring = métricas[i])

  GS.fit(X_train, y_train)

  resultados[i] = pd.DataFrame(GS.cv_results_)



time: 41min 46s (started: 2021-04-14 12:36:16 +00:00)


In [12]:
# Renomeando o dicionário
resultados["accuracy"] = resultados.pop(0)
resultados["precision"] = resultados.pop(1)
resultados["recall"] = resultados.pop(2)
resultados["f1"] = resultados.pop(3)
resultados["roc_auc"] = resultados.pop(4)


time: 3.27 ms (started: 2021-04-14 13:18:03 +00:00)


## Avaliação dos hiperparâmetros Métricas

In [13]:
print(resultados["accuracy"].sort_values("mean_test_score", ascending = False).head()["params"])
print("\n")
resultados["accuracy"].sort_values("mean_test_score", ascending = False).head()

1      {'class_weight': 'balanced', 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
121    {'class_weight': 'balanced', 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
0       {'class_weight': 'balanced', 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
120     {'class_weight': 'balanced', 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
125    {'class_weight': 'balanced', 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 300}
Name: params, dtype: object




Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_class_weight,param_max_features,param_min_samples_leaf,param_min_samples_split,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
1,0.14844,0.00637,0.00967,0.0002,balanced,auto,1,2,100,"{'class_weight': 'balanced', 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}",0.73529,0.77451,0.71569,0.72277,0.77228,0.74411,0.02473,1
121,0.15004,0.00557,0.0101,0.00053,balanced,sqrt,1,2,100,"{'class_weight': 'balanced', 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}",0.73529,0.77451,0.71569,0.72277,0.77228,0.74411,0.02473,1
0,0.07984,0.00853,0.00568,0.00071,balanced,auto,1,2,50,"{'class_weight': 'balanced', 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}",0.71569,0.77451,0.72549,0.73267,0.76238,0.74215,0.02247,3
120,0.07502,0.00201,0.0053,0.00018,balanced,sqrt,1,2,50,"{'class_weight': 'balanced', 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}",0.71569,0.77451,0.72549,0.73267,0.76238,0.74215,0.02247,3
125,0.44304,0.00485,0.02874,0.00278,balanced,sqrt,1,2,300,"{'class_weight': 'balanced', 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 300}",0.71569,0.78431,0.69608,0.73267,0.77228,0.74021,0.0334,5


time: 51.9 ms (started: 2021-04-14 13:18:03 +00:00)


In [14]:
print(resultados["precision"].sort_values("mean_test_score", ascending = False).head()["params"])
print("\n")
resultados["precision"].sort_values("mean_test_score", ascending = False).head()

22      {'class_weight': 'balanced', 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 8, 'n_estimators': 150}
142     {'class_weight': 'balanced', 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 8, 'n_estimators': 150}
39     {'class_weight': 'balanced', 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 500}
159    {'class_weight': 'balanced', 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 500}
23      {'class_weight': 'balanced', 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 8, 'n_estimators': 200}
Name: params, dtype: object




Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_class_weight,param_max_features,param_min_samples_leaf,param_min_samples_split,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
22,0.21648,0.00849,0.01493,0.00109,balanced,auto,1,8,150,"{'class_weight': 'balanced', 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 8, 'n_estimators': 150}",0.4,0.57143,0.40741,0.37931,0.5,0.45163,0.07286,1
142,0.21959,0.00317,0.0148,0.0002,balanced,sqrt,1,8,150,"{'class_weight': 'balanced', 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 8, 'n_estimators': 150}",0.4,0.57143,0.40741,0.37931,0.5,0.45163,0.07286,1
39,0.70642,0.00381,0.04579,0.00287,balanced,auto,1,10,500,"{'class_weight': 'balanced', 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 500}",0.42308,0.53333,0.42308,0.35484,0.52381,0.45163,0.06765,3
159,0.7149,0.00906,0.04666,0.0026,balanced,sqrt,1,10,500,"{'class_weight': 'balanced', 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 500}",0.42308,0.53333,0.42308,0.35484,0.52381,0.45163,0.06765,3
23,0.28617,0.00517,0.01907,0.00094,balanced,auto,1,8,200,"{'class_weight': 'balanced', 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 8, 'n_estimators': 200}",0.41667,0.55172,0.42308,0.36667,0.5,0.45163,0.06575,5


time: 50.6 ms (started: 2021-04-14 13:18:03 +00:00)


In [15]:
print(resultados["recall"].sort_values("mean_test_score", ascending = False).head()["params"])
print("\n")
resultados["recall"].sort_values("mean_test_score", ascending = False).head()

110    {'class_weight': 'balanced', 'max_features': 'auto', 'min_samples_leaf': 5, 'min_samples_split': 10, 'n_estimators': 50}
90      {'class_weight': 'balanced', 'max_features': 'auto', 'min_samples_leaf': 5, 'min_samples_split': 6, 'n_estimators': 50}
100     {'class_weight': 'balanced', 'max_features': 'auto', 'min_samples_leaf': 5, 'min_samples_split': 8, 'n_estimators': 50}
210     {'class_weight': 'balanced', 'max_features': 'sqrt', 'min_samples_leaf': 5, 'min_samples_split': 6, 'n_estimators': 50}
220     {'class_weight': 'balanced', 'max_features': 'sqrt', 'min_samples_leaf': 5, 'min_samples_split': 8, 'n_estimators': 50}
Name: params, dtype: object




Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_class_weight,param_max_features,param_min_samples_leaf,param_min_samples_split,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
110,0.07358,0.00145,0.00595,0.00029,balanced,auto,5,10,50,"{'class_weight': 'balanced', 'max_features': 'auto', 'min_samples_leaf': 5, 'min_samples_split': 10, 'n_estimators': 50}",0.45833,0.75,0.56,0.58333,0.54167,0.57867,0.09547,1
90,0.07173,0.0019,0.00573,0.00016,balanced,auto,5,6,50,"{'class_weight': 'balanced', 'max_features': 'auto', 'min_samples_leaf': 5, 'min_samples_split': 6, 'n_estimators': 50}",0.45833,0.75,0.56,0.58333,0.54167,0.57867,0.09547,1
100,0.07321,0.00187,0.00572,0.00011,balanced,auto,5,8,50,"{'class_weight': 'balanced', 'max_features': 'auto', 'min_samples_leaf': 5, 'min_samples_split': 8, 'n_estimators': 50}",0.45833,0.75,0.56,0.58333,0.54167,0.57867,0.09547,1
210,0.07138,0.00236,0.00565,3e-05,balanced,sqrt,5,6,50,"{'class_weight': 'balanced', 'max_features': 'sqrt', 'min_samples_leaf': 5, 'min_samples_split': 6, 'n_estimators': 50}",0.45833,0.75,0.56,0.58333,0.54167,0.57867,0.09547,1
220,0.07052,0.00131,0.00569,7e-05,balanced,sqrt,5,8,50,"{'class_weight': 'balanced', 'max_features': 'sqrt', 'min_samples_leaf': 5, 'min_samples_split': 8, 'n_estimators': 50}",0.45833,0.75,0.56,0.58333,0.54167,0.57867,0.09547,1


time: 55.8 ms (started: 2021-04-14 13:18:04 +00:00)


In [16]:
print(resultados["f1"].sort_values("mean_test_score", ascending = False).head()["params"])
print("\n")
resultados["f1"].sort_values("mean_test_score", ascending = False).head()

159    {'class_weight': 'balanced', 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 500}
39     {'class_weight': 'balanced', 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 500}
22      {'class_weight': 'balanced', 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 8, 'n_estimators': 150}
142     {'class_weight': 'balanced', 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 8, 'n_estimators': 150}
143     {'class_weight': 'balanced', 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 8, 'n_estimators': 200}
Name: params, dtype: object




Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_class_weight,param_max_features,param_min_samples_leaf,param_min_samples_split,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
159,0.70947,0.01175,0.04823,0.0055,balanced,sqrt,1,10,500,"{'class_weight': 'balanced', 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 500}",0.62909,0.72296,0.62092,0.58776,0.6712,0.64638,0.04662,1
39,0.70623,0.01107,0.04635,0.00134,balanced,auto,1,10,500,"{'class_weight': 'balanced', 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 500}",0.62909,0.72296,0.62092,0.58776,0.6712,0.64638,0.04662,1
22,0.21138,0.00227,0.01485,0.00079,balanced,auto,1,8,150,"{'class_weight': 'balanced', 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 8, 'n_estimators': 150}",0.61053,0.7419,0.61285,0.60352,0.66221,0.6462,0.05219,3
142,0.21646,0.00265,0.01474,0.00042,balanced,sqrt,1,8,150,"{'class_weight': 'balanced', 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 8, 'n_estimators': 150}",0.61053,0.7419,0.61285,0.60352,0.66221,0.6462,0.05219,3
143,0.28502,0.00311,0.0203,0.00219,balanced,sqrt,1,8,200,"{'class_weight': 'balanced', 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 8, 'n_estimators': 200}",0.61859,0.73235,0.62092,0.5956,0.66221,0.64593,0.04825,5


time: 46.4 ms (started: 2021-04-14 13:18:04 +00:00)


In [17]:
print(resultados["roc_auc"].sort_values("mean_test_score", ascending = False).head()["params"])
print("\n")
resultados["roc_auc"].sort_values("mean_test_score", ascending = False).head()

210     {'class_weight': 'balanced', 'max_features': 'sqrt', 'min_samples_leaf': 5, 'min_samples_split': 6, 'n_estimators': 50}
110    {'class_weight': 'balanced', 'max_features': 'auto', 'min_samples_leaf': 5, 'min_samples_split': 10, 'n_estimators': 50}
100     {'class_weight': 'balanced', 'max_features': 'auto', 'min_samples_leaf': 5, 'min_samples_split': 8, 'n_estimators': 50}
220     {'class_weight': 'balanced', 'max_features': 'sqrt', 'min_samples_leaf': 5, 'min_samples_split': 8, 'n_estimators': 50}
80      {'class_weight': 'balanced', 'max_features': 'auto', 'min_samples_leaf': 5, 'min_samples_split': 2, 'n_estimators': 50}
Name: params, dtype: object




Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_class_weight,param_max_features,param_min_samples_leaf,param_min_samples_split,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
210,0.07152,0.0018,0.00597,0.0001,balanced,sqrt,5,6,50,"{'class_weight': 'balanced', 'max_features': 'sqrt', 'min_samples_leaf': 5, 'min_samples_split': 6, 'n_estimators': 50}",0.63996,0.82399,0.69065,0.63312,0.69372,0.69629,0.06857,1
110,0.07256,0.0014,0.00598,0.00014,balanced,auto,5,10,50,"{'class_weight': 'balanced', 'max_features': 'auto', 'min_samples_leaf': 5, 'min_samples_split': 10, 'n_estimators': 50}",0.63996,0.82399,0.69065,0.63312,0.69372,0.69629,0.06857,1
100,0.07173,0.0022,0.00603,0.00012,balanced,auto,5,8,50,"{'class_weight': 'balanced', 'max_features': 'auto', 'min_samples_leaf': 5, 'min_samples_split': 8, 'n_estimators': 50}",0.63996,0.82399,0.69065,0.63312,0.69372,0.69629,0.06857,1
220,0.07306,0.00292,0.00627,0.00049,balanced,sqrt,5,8,50,"{'class_weight': 'balanced', 'max_features': 'sqrt', 'min_samples_leaf': 5, 'min_samples_split': 8, 'n_estimators': 50}",0.63996,0.82399,0.69065,0.63312,0.69372,0.69629,0.06857,1
80,0.07095,0.00152,0.00595,7e-05,balanced,auto,5,2,50,"{'class_weight': 'balanced', 'max_features': 'auto', 'min_samples_leaf': 5, 'min_samples_split': 2, 'n_estimators': 50}",0.63996,0.82399,0.69065,0.63312,0.69372,0.69629,0.06857,1


time: 54.1 ms (started: 2021-04-14 13:18:04 +00:00)


## Seleção dos melhores modelos (por métrica) e resultados preliminares

Existe uma contradição entre as métricas: modelos mais "leves" (menor número de árvores e menor tempo de fit) apresentam bons resultados com acurácia, sensibilidade e precisão. Já modelos mais mais "pesados" (maior número de árvores e maior tempo de fit) apresentam melhores resultados com as scores f1, ROC.

Como existe uma preocupação grande em acertar as classes de doadores, optei por escolher o melhor modelo que segundo os resultados da pontuação F1.

In [18]:
#Separando os hiperparâmetros dos "melhores" modelos
melhores_modelos = {"accuracy": resultados["accuracy"].sort_values("mean_test_score", ascending = False).iloc[0,9],
                    "precision": resultados["precision"].sort_values("mean_test_score", ascending = False).iloc[0,9],
                    "recall":resultados["recall"].sort_values("mean_test_score", ascending = False).iloc[0,9],
                    "f1": resultados["f1"].sort_values("mean_test_score", ascending = False).iloc[0,9],
                    "roc_auc": resultados["roc_auc"].sort_values("mean_test_score", ascending = False).iloc[0,9]}


time: 10.5 ms (started: 2021-04-14 13:18:04 +00:00)


In [19]:
# Visualisando os hiperparâmetros
melhores_modelos

{'accuracy': {'class_weight': 'balanced',
  'max_features': 'auto',
  'min_samples_leaf': 1,
  'min_samples_split': 2,
  'n_estimators': 100},
 'f1': {'class_weight': 'balanced',
  'max_features': 'sqrt',
  'min_samples_leaf': 1,
  'min_samples_split': 10,
  'n_estimators': 500},
 'precision': {'class_weight': 'balanced',
  'max_features': 'auto',
  'min_samples_leaf': 1,
  'min_samples_split': 8,
  'n_estimators': 150},
 'recall': {'class_weight': 'balanced',
  'max_features': 'auto',
  'min_samples_leaf': 5,
  'min_samples_split': 10,
  'n_estimators': 50},
 'roc_auc': {'class_weight': 'balanced',
  'max_features': 'sqrt',
  'min_samples_leaf': 5,
  'min_samples_split': 6,
  'n_estimators': 50}}

time: 7.55 ms (started: 2021-04-14 13:18:04 +00:00)


##  Novo grid search: n_estaimadores

In [33]:
# Criando o modelo
RF = RandomForestClassifier(class_weight = 'balanced', 
                            max_features = "sqrt", 
                            min_samples_leaf = 1,
                            min_samples_split = 10,
                            random_state = 6)

# Separando os hiperparâmetros
hiperparametros = {"n_estimators": [x for x in range(500,1050,50)]}

#Realizando o GridSearch

GS = GridSearchCV(RF, hiperparametros, scoring = "f1_macro")

GS.fit(X_train, y_train)

resultados = pd.DataFrame(GS.cv_results_)



time: 1min 3s (started: 2021-04-14 13:29:00 +00:00)


In [34]:
resultados.sort_values("mean_test_score")

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
3,0.91838,0.01517,0.06412,0.00906,650,{'n_estimators': 650},0.62092,0.72296,0.62092,0.5956,0.6712,0.64632,0.04551,10
4,0.98735,0.00697,0.06487,0.00254,700,{'n_estimators': 700},0.62092,0.72296,0.62092,0.5956,0.6712,0.64632,0.04551,10
0,0.71986,0.0046,0.04542,0.00186,500,{'n_estimators': 500},0.62909,0.72296,0.62092,0.58776,0.6712,0.64638,0.04662,9
5,1.07797,0.02937,0.06739,0.00192,750,{'n_estimators': 750},0.62092,0.72296,0.62092,0.60352,0.6712,0.6479,0.04382,6
8,1.27636,0.0087,0.08245,0.00362,900,{'n_estimators': 900},0.62092,0.72296,0.62092,0.60352,0.6712,0.6479,0.04382,6
9,1.3517,0.01942,0.08487,0.00165,950,{'n_estimators': 950},0.62092,0.72296,0.62092,0.60352,0.6712,0.6479,0.04382,6
1,0.78844,0.01199,0.0494,0.00092,550,{'n_estimators': 550},0.62909,0.72296,0.62092,0.5956,0.6712,0.64795,0.04471,4
2,0.85605,0.01414,0.05271,0.0005,600,{'n_estimators': 600},0.62909,0.72296,0.62092,0.5956,0.6712,0.64795,0.04471,4
6,1.14265,0.01574,0.07597,0.00626,800,{'n_estimators': 800},0.62092,0.72296,0.62092,0.61154,0.6712,0.64951,0.04229,1
7,1.206,0.01715,0.07825,0.0037,850,{'n_estimators': 850},0.62092,0.72296,0.62092,0.61154,0.6712,0.64951,0.04229,1


time: 36.5 ms (started: 2021-04-14 13:30:06 +00:00)


## Treinamento e teste

In [35]:
# Treinando e testando a RandomForestClassifier com os "melhores" hiperparâmetros
RF = RandomForestClassifier(class_weight = 'balanced', 
                            max_features = "sqrt", 
                            min_samples_leaf = 1,
                            min_samples_split = 10,
                            n_estimators = 650,  
                            random_state = 6).fit(X_train,y_train)

# Prevendo y
y_pred = RF.predict(X_val)

time: 1.04 s (started: 2021-04-14 13:30:30 +00:00)


In [36]:
# Separando as métricas
Acurácia = accuracy_score(y_val, y_pred)
Precisão =  precision_score(y_val, y_pred)
Recall = recall_score(y_val, y_pred)
F1 = f1_score(y_val, y_pred, average = "macro")
ROC = roc_auc_score(y_val, y_pred)


time: 13.2 ms (started: 2021-04-14 13:30:31 +00:00)


In [37]:
# Organizando as métricas em um DataFrame
resultados_val = pd.DataFrame({"Acurácia":Acurácia,
                                "Precisão": Precisão,
                                "Recall": Recall,
                                "F1": F1,
                                "ROC": ROC}, index = [0])

time: 4.11 ms (started: 2021-04-14 13:30:32 +00:00)


In [38]:
# Visualizando os resultados
resultados_val

Unnamed: 0,Acurácia,Precisão,Recall,F1,ROC
0,0.76378,0.5,0.46667,0.66485,0.66117


time: 16.7 ms (started: 2021-04-14 13:30:33 +00:00)


# **Código Morto**

In [24]:
Acurácia = np.zeros((10,10))
Precisão = np.zeros((10,10))
Sensibilidade = np.zeros((10,10))

métricas = ["accuracy", "precision", "recall"]

for j in range(10):
  for i in range (1,10):
    RF = RandomForestClassifier(n_estimators = 50, class_weight = {0:(i*10), 1: 100 - (i*10)} = {0:(i*10), 1: 100 - (i*10)})
    CV = ShuffleSplit(n_splits=5, test_size=0.25)

    resultados = cross_validate(RF, X, y, scoring=métricas, cv = CV)
    Acurácia[i,j] = np.mean(resultados["test_accuracy"])
    Precisão[i,j] = np.mean(resultados["test_precision"])
    Sensibilidade[i,j] = np.mean(resultados["test_recall"])




SyntaxError: ignored

In [None]:
# Transformando em Data Frame
Acurácia = pd.DataFrame(Acurácia, columns = np.arange(0,10))
Acurácia.fillna(0, inplace = True)

Precisão = pd.DataFrame(Precisão, columns = np.arange(0,10))
Precisão.fillna(0, inplace = True)

Sensibilidade = pd.DataFrame(Sensibilidade, columns = np.arange(0,10))
Sensibilidade.fillna(0,inplace = True)

In [None]:
# Criando listas para as médias e Desvios Padrões 

Acurácia_média_peso = []
Acurácia_dsvp_peso = []
Precisão_média_peso = []
Precisão_dsvp_peso = []
Sensibilidade_média_peso = []
Sensibilidade_dsvp_peso = []

# Preenchendo as listas

for i in range(10):
  Acurácia_média_peso.append(np.mean(Acurácia.iloc[i,:]))
  Acurácia_dsvp_peso.append(np.std(Acurácia.iloc[i,:]))

  Precisão_média_peso.append(np.mean(Precisão.iloc[i,:]))
  Precisão_dsvp_peso.append(np.std(Precisão.iloc[i,:]))

  Sensibilidade_média_peso.append(np.mean(Sensibilidade.iloc[i,:]))
  Sensibilidade_dsvp_peso.append(np.std(Sensibilidade.iloc[i,:]))

# Transpondo as listas

Acurácia_média_peso = np.transpose(Acurácia_média_peso)
Acurácia_dsvp_peso = np.transpose(Acurácia_dsvp_peso)

Precisão_média_peso = np.transpose(Precisão_média_peso)
Precisão_dsvp_peso = np.transpose(Precisão_dsvp_peso)

Sensibilidade_média_peso = np.transpose(Sensibilidade_média_peso)
Sensibilidade_dsvp_peso = np.transpose(Sensibilidade_dsvp_peso)

Métricas = pd.DataFrame(data = [Acurácia_média_peso, 
                                Precisão_média_peso, 
                                Sensibilidade_média_peso,
                                Acurácia_dsvp_peso*2,
                                Precisão_dsvp_peso*2,
                                Sensibilidade_dsvp_peso*2]).T.rename(columns = {
                                    0: "Acurácia Média", 
                                    1: "Precisão Média", 
                                    2: "Sensibilidade Média",
                                    3: "Desvio Padrão Acurácia em dobro",
                                    4: "Desvio Padrão Precisão em dobro",
                                    5: "Desvio Padrão Sensibilidade em dobro"})                                

In [None]:
Métricas

In [None]:
# Fazendo um gráfico para verificar a média e o IC das métricas
fig = go.Figure()
fig.add_trace(go.Scatter(x=Métricas.index, y=Métricas["Acurácia Média"],
                    mode='lines+markers',
                    name='Acurácia Média', 
                    error_y=dict(type='data', 
                                 array=Métricas["Desvio Padrão Acurácia em dobro"], 
                                 visible=True)))

fig.add_trace(go.Scatter(x=Métricas.index, y=Métricas["Precisão Média"],
                    mode='lines+markers',
                    name='Precisão Média',
                    error_y=dict(type='data', 
                                 array=Métricas["Desvio Padrão Precisão em dobro"], 
                                 visible=True)))

fig.add_trace(go.Scatter(x=Métricas.index, y=Métricas["Sensibilidade Média"],
                    mode='lines+markers', 
                    name='Sensibilidade',
                    error_y=dict(type='data', 
                                 array=Métricas["Desvio Padrão Sensibilidade em dobro"], 
                                 visible=True)))

fig.update_layout(height=900, width=1700,
                  title_text="Valores médios de Acurácia, Precisão e Sensibilidade por número de pesos de zero",
                  )
fig.update_xaxes(title = "Peso (dezenas)")


fig.show()


In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
scaler = MinMaxScaler()
# Só fazer um fit com os dados de treino (não fazer outros).
scaler.fit(X_train)

In [None]:
X_train = scaler.transform(X_train)



In [None]:
X_val = scaler.transform(X_val)

In [None]:
Acurácia = np.zeros((10,10))
Precisão = np.zeros((10,10))
Sensibilidade = np.zeros((10,10))

métricas = ["accuracy", "precision", "recall"]

for j in range(10):
  for i in range (1,10):
    RF = RandomForestClassifier(n_estimators = (i*50), class_weight = {0:30, 1: 70})
    CV = ShuffleSplit(n_splits=5, test_size=0.25)

    resultados = cross_validate(RF, X, y, scoring=métricas, cv = CV)
    Acurácia[i,j] = np.mean(resultados["test_accuracy"])
    Precisão[i,j] = np.mean(resultados["test_precision"])
    Sensibilidade[i,j] = np.mean(resultados["test_recall"])


In [None]:
Acurácia = pd.DataFrame(Acurácia, columns = np.arange(0,10))
Acurácia.fillna(0, inplace = True)

Precisão = pd.DataFrame(Precisão, columns = np.arange(0,10))
Precisão.fillna(0, inplace = True)

Sensibilidade = pd.DataFrame(Sensibilidade, columns = np.arange(0,10))
Sensibilidade.fillna(0,inplace = True)

In [None]:
# Criando listas para as médias e Desvios Padrões 

Acurácia_média_estimadores = []
Acurácia_dsvp_estimadores = []
Precisão_média_estimadores = []
Precisão_dsvp_estimadores = []
Sensibilidade_média_estimadores = []
Sensibilidade_dsvp_estimadores = []

# Preenchendo as listas

for i in range(10):
  Acurácia_média_estimadores.append(np.mean(Acurácia.iloc[i,:]))
  Acurácia_dsvp_estimadores.append(np.std(Acurácia.iloc[i,:]))

  Precisão_média_estimadores.append(np.mean(Precisão.iloc[i,:]))
  Precisão_dsvp_estimadores.append(np.std(Precisão.iloc[i,:]))

  Sensibilidade_média_estimadores.append(np.mean(Sensibilidade.iloc[i,:]))
  Sensibilidade_dsvp_estimadores.append(np.std(Sensibilidade.iloc[i,:]))

# Transpondo as listas

Acurácia_média_estimadores = np.transpose(Acurácia_média_estimadores)
Acurácia_dsvp_estimadores = np.transpose(Acurácia_dsvp_estimadores)

Precisão_média_estimadores = np.transpose(Precisão_média_estimadores)
Precisão_dsvp_estimadores = np.transpose(Precisão_dsvp_estimadores)

Sensibilidade_média_estimadores = np.transpose(Sensibilidade_média_estimadores)
Sensibilidade_dsvp_estimadores = np.transpose(Sensibilidade_dsvp_estimadores)

Métricas = pd.DataFrame(data = [Acurácia_média_estimadores, 
                                Precisão_média_estimadores, 
                                Sensibilidade_média_estimadores,
                                Acurácia_dsvp_estimadores*2,
                                Precisão_dsvp_estimadores*2,
                                Sensibilidade_dsvp_estimadores*2]).T.rename(columns = {
                                    0: "Acurácia Média", 
                                    1: "Precisão Média", 
                                    2: "Sensibilidade Média",
                                    3: "Desvio Padrão Acurácia em dobro",
                                    4: "Desvio Padrão Precisão em dobro",
                                    5: "Desvio Padrão Sensibilidade em dobro"})                                

In [None]:
Métricas

In [None]:
# Fazendo um gráfico para verificar a média e o IC das métricas
fig = go.Figure()
fig.add_trace(go.Scatter(x=Métricas.index*50, y=Métricas["Acurácia Média"],
                    mode='lines+markers',
                    name='Acurácia Média', 
                    error_y=dict(type='data', 
                                 array=Métricas["Desvio Padrão Acurácia em dobro"], 
                                 visible=True)))

fig.add_trace(go.Scatter(x=Métricas.index*50, y=Métricas["Precisão Média"],
                    mode='lines+markers',
                    name='Precisão Média',
                    error_y=dict(type='data', 
                                 array=Métricas["Desvio Padrão Precisão em dobro"], 
                                 visible=True)))

fig.add_trace(go.Scatter(x=Métricas.index*50, y=Métricas["Sensibilidade Média"],
                    mode='lines+markers', 
                    name='Sensibilidade',
                    error_y=dict(type='data', 
                                 array=Métricas["Desvio Padrão Sensibilidade em dobro"], 
                                 visible=True)))

fig.update_layout(height=900, width=1700,
                  title_text="Valores médios de Acurácia, Precisão e Sensibilidade por número de árvores",
                  )
fig.update_xaxes(title = "Árvores")


fig.show()
