<a href="https://colab.research.google.com/github/RPGraciotti/BootCampAlura/blob/main/Projeto_final/Evaluating.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
pip install tpot

Collecting tpot
  Downloading TPOT-0.11.7-py3-none-any.whl (87 kB)
[?25l[K     |███▊                            | 10 kB 16.6 MB/s eta 0:00:01[K     |███████▌                        | 20 kB 12.1 MB/s eta 0:00:01[K     |███████████▎                    | 30 kB 8.9 MB/s eta 0:00:01[K     |███████████████                 | 40 kB 8.3 MB/s eta 0:00:01[K     |██████████████████▉             | 51 kB 7.3 MB/s eta 0:00:01[K     |██████████████████████▋         | 61 kB 7.4 MB/s eta 0:00:01[K     |██████████████████████████▎     | 71 kB 6.0 MB/s eta 0:00:01[K     |██████████████████████████████  | 81 kB 6.7 MB/s eta 0:00:01[K     |████████████████████████████████| 87 kB 2.6 MB/s 
Collecting xgboost>=1.1.0
  Downloading xgboost-1.4.2-py3-none-manylinux2010_x86_64.whl (166.7 MB)
[K     |████████████████████████████████| 166.7 MB 14 kB/s 
[?25hCollecting stopit>=1.1.1
  Downloading stopit-1.1.2.tar.gz (18 kB)
Collecting deap>=1.2
  Downloading deap-1.3.1-cp37-cp37m-manylinux_2_5_x

In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_validate
from sklearn.pipeline import make_pipeline

from tpot.builtins import StackingEstimator
from tpot.builtins import ZeroCount
from tpot.export_utils import set_param_recursive

from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectPercentile, f_classif, SelectFwe

from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.metrics import auc
from sklearn.metrics import precision_recall_curve

import warnings

In [6]:
def multi_score_cv(model, x, y, cv, model_title, set_context = "talk", figsize = (10, 6)):
  """Cross-validate `model` on four metrics and draw one boxplot per metric.

  Parameters
  ----------
  model : estimator handed to `cross_validate`.
  x, y : features and target handed to `cross_validate`.
  cv : cross-validation strategy handed to `cross_validate`.
  model_title : text appended to the plot title.
  set_context : seaborn plotting context applied to this figure only.
  figsize : (width, height) of the figure.

  Returns
  -------
  pandas.DataFrame with the columns Precision, ROC_AUC, Recall and F1,
  holding the test scores returned by `cross_validate`.
  """
  # Column label in the returned frame -> scorer name understood by cross_validate.
  metrics = {"Precision": "precision", "ROC_AUC": "roc_auc", "Recall": "recall", "F1": "f1"}

  with warnings.catch_warnings():
    warnings.simplefilter('ignore') # silence warnings raised while cross-validating
    cv_results = cross_validate(model, x, y,
                                cv = cv, scoring = list(metrics.values()))

  scores = pd.DataFrame({label: cv_results[f"test_{scorer}"] for label, scorer in metrics.items()})
  scores_melt = scores.melt()

  # The context manager restores the previous plotting parameters on exit,
  # even if plotting raises, instead of resetting every rcParam afterwards.
  with sns.plotting_context(set_context):
    plt.figure(figsize = figsize)
    sns.boxplot(data = scores_melt, x = "value", y = "variable", linewidth = 2.5)
    plt.title(f"Distribuição de valores de score por CV - Modelo {model_title}")
    plt.xlabel("")
    plt.xlim(-0.1, 1.1)
    plt.ylabel("Métrica")
    plt.show()

  return scores

In [7]:
def set_split(data, test_size = 0.2, random_state = 78329):
  """Shuffle `data` and return a stratified train/test split.

  Parameters
  ----------
  data : DataFrame containing the columns "ICU", "PATIENT_VISIT_IDENTIFIER"
    and "WINDOW".
  test_size : value forwarded to `train_test_split` as `test_size`.
  random_state : seed used both for the shuffle and for the split.

  Returns
  -------
  The list produced by `train_test_split(x, y, ...)`: train features,
  test features, train target, test target. The target is the "ICU"
  column renamed to "target".
  """
  shuffled = data.sample(frac = 1, random_state = random_state).reset_index(drop = True)
  y = shuffled.loc[:, "ICU"].rename("target")
  x = shuffled.drop(["PATIENT_VISIT_IDENTIFIER", "ICU", "WINDOW"], axis = 1)

  return train_test_split(x, y, stratify = y, test_size = test_size, random_state = random_state)

In [8]:
# Função curva ROC-AUC de https://machinelearningmastery.com/roc-curves-and-precision-recall-curves-for-classification-in-python/

def roc_auc_curve(data, model, title):
  """Plot the ROC curve of `model` next to a constant-score baseline.

  Parameters
  ----------
  data : indexable where `data[1]` holds the features passed to
    `model.predict_proba` and `data[3]` the matching true labels
    (callers in this notebook pass the result of `set_split`).
  model : fitted estimator exposing `predict_proba`.
  title : text used in the legend entry of the model curve.
  """
  x_eval, y_true = data[1], data[3]

  baseline_scores = [0] * len(y_true)              # "no skill" reference: same score for every sample
  model_scores = model.predict_proba(x_eval)[:, 1] # second predict_proba column is used as the score

  base_fpr, base_tpr, _ = roc_curve(y_true, baseline_scores)
  model_fpr, model_tpr, _ = roc_curve(y_true, model_scores)

  plt.plot(base_fpr, base_tpr, linestyle = '--', label = "Modelo neutro")
  plt.plot(model_fpr, model_tpr, marker = ".", label = f"Modelo {title}")
  plt.xlabel("Taxa de falso positivo")
  plt.ylabel("Taxa de verdadeiro positivo")
  plt.legend()
  plt.show()

In [9]:
# Função de curva - precision recall de https://machinelearningmastery.com/roc-curves-and-precision-recall-curves-for-classification-in-python/

def prec_rec_curve(data, model, title):
  """Plot the precision-recall curve of `model` with a constant baseline.

  Parameters
  ----------
  data : indexable where `data[1]` holds the features passed to
    `model.predict_proba` and `data[3]` the matching true labels
    (callers in this notebook pass the result of `set_split`).
  model : fitted estimator exposing `predict_proba`.
  title : text used in the legend entry of the model curve.
  """
  x_eval, y_true = data[1], data[3]

  model_scores = model.predict_proba(x_eval)[:, 1] # second predict_proba column is used as the score
  precision, recall, _ = precision_recall_curve(y_true, model_scores)

  # Baseline precision: share of samples whose label equals 1.
  no_skill = len(y_true[y_true == 1]) / len(y_true)

  plt.plot([0, 1], [no_skill, no_skill], linestyle = '--', label = "Modelo neutro")
  plt.plot(recall, precision, marker = '.', label = f"Modelo {title}")
  plt.xlabel("Recall")
  plt.ylabel("Precisão")
  plt.legend()
  plt.show()

In [10]:
# URL of the CSV file read into `df_clean` by the next cell.
path = "https://raw.githubusercontent.com/RPGraciotti/BootCampAlura/main/Data/data_clean_ohe.csv"

In [11]:
# Read the dataset from the URL stored in `path`.
df_clean = pd.read_csv(path)

In [12]:
# Hold-out split used by the evaluation cells below:
# main_split[0] / main_split[2] are passed to `fit`,
# main_split[1] / main_split[3] are used for the confusion matrix and the curves.
main_split = set_split(df_clean)

In [13]:
# Features and target taken from the whole dataset; these are the inputs
# given to multi_score_cv in the cells below.
y = df_clean["ICU"].rename("target")
x = df_clean.drop(columns = ["PATIENT_VISIT_IDENTIFIER", "ICU", "WINDOW"])

In [14]:
# Seed the splitter so the repeated folds, and therefore the reported CV
# scores, are the same on every run of the notebook.
cv = RepeatedStratifiedKFold(n_splits = 5, n_repeats = 10, random_state = 78329)

AVALIAÇÃO DOS MODELOS

PRECISION

In [15]:
# Random forest evaluated in the "PRECISION" section. random_state is fixed,
# as it is for the forest inside `m2`, so repeated runs give the same model.
m1 = RandomForestClassifier(bootstrap=True, criterion="gini",
                            max_features=0.1, min_samples_leaf=15, min_samples_split=9, n_estimators=100,
                            random_state=78329)

In [None]:
# Cross-validated score distributions for the Precision model.
m1_scores = multi_score_cv(
    model = m1,
    x = x,
    y = y,
    cv = cv,
    model_title = "Precision",
)

In [16]:
# Fit on main_split[0] (features) and main_split[2] (target).
m1_eval = m1.fit(main_split[0], main_split[2])

In [None]:
# plot_confusion_matrix opens a figure of its own when no `ax` is given, which
# left the (10, 6) figure empty; draw on that figure's axes instead.
plt.figure(figsize = (10, 6))
plot_confusion_matrix(m1_eval, X = main_split[1], y_true = main_split[3], cmap = plt.cm.PuBu, normalize = "all",
                      ax = plt.gca())

In [None]:
# ROC curve, then precision-recall curve, for the fitted Precision model.
roc_auc_curve(main_split, m1_eval, "Precision")
prec_rec_curve(main_split, m1_eval, "Precision")

In [None]:
# Label each importance with the column it belongs to; `m1_eval` was fitted
# on main_split[0], so its columns line up with feature_importances_.
forest_importances = pd.Series(m1_eval.feature_importances_, index = main_split[0].columns)

MODELO 2 - MAXIMIZAÇÃO DO ROC

In [None]:
# NOTE(review): this rebinds `m1`, replacing the plain random forest assigned
# to the same name earlier in the notebook.
# random_state is fixed, as it is for the forest inside `m2`, so repeated runs
# give the same model.
m1 = make_pipeline(
    SelectPercentile(score_func=f_classif, percentile=60),
    RandomForestClassifier(bootstrap=True, criterion="gini", max_features=0.1, min_samples_leaf=18, min_samples_split=9, n_estimators=100,
                           random_state=78329)
)

In [None]:
# Fit the pipeline on main_split[0] (features) and main_split[2] (target).
m1_eval = m1.fit(X = main_split[0], y = main_split[2])

In [None]:
# Confusion matrix on main_split[1] / main_split[3], normalised over all cells.
plot_confusion_matrix(
    m1_eval,
    X = main_split[1],
    y_true = main_split[3],
    cmap = plt.cm.PuBu,
    normalize = "all",
)

ROC_AUC e PRECISION-RECALL CURVES

In [None]:
# Cross-validated score distributions for the ROC model.
m1_scores = multi_score_cv(
    model = m1_eval,
    x = x,
    y = y,
    cv = cv,
    model_title = "ROC",
)

RECALL

In [None]:
# Pipeline evaluated in the "RECALL" section: a random forest wrapped in a
# StackingEstimator, followed by standard scaling and a Bernoulli naive Bayes classifier.
m2 = make_pipeline(
    StackingEstimator(
        estimator = RandomForestClassifier(
            bootstrap = True,
            criterion = "gini",
            max_features = 0.7500000000000001,
            min_samples_leaf = 2,
            min_samples_split = 14,
            n_estimators = 100,
            random_state = 78329,
        )
    ),
    StandardScaler(),
    BernoulliNB(alpha = 0.01, fit_prior = False),
)

In [None]:
# `m2_eval` is not assigned anywhere above in the notebook, so this cell raised
# NameError on a fresh kernel. Fit the pipeline on the training split first,
# as is done for m1.
m2_eval = m2.fit(main_split[0], main_split[2])
m2_scores = multi_score_cv(model = m2_eval, x = x, y = y, cv = cv, model_title = "Recall")

F1

In [None]:
# Pipeline evaluated in the "F1" section. random_state is fixed, as it is for
# the forest inside `m2`, so repeated runs give the same model.
m3 = make_pipeline(
    ZeroCount(),
    RandomForestClassifier(bootstrap=True, criterion="gini",
                           max_features=0.7500000000000001, min_samples_leaf=15, min_samples_split=13, n_estimators=100,
                           random_state=78329)
)

In [None]:
# `m3_eval` is not assigned anywhere above in the notebook, so this cell raised
# NameError on a fresh kernel. Fit the pipeline on the training split first,
# as is done for m1.
m3_eval = m3.fit(main_split[0], main_split[2])
m3_scores = multi_score_cv(model = m3_eval, x = x, y = y, cv = cv,
                           model_title = "F1")