In [1]:
# Ignora os avisos
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd

# Load the CSV file that sits next to the notebook into a DataFrame.
CAMINHO_DADOS = 'fetal_health.csv'
fetal_health_df = pd.read_csv(CAMINHO_DADOS)

# Preview the first five rows as a quick sanity check of the load.
fetal_health_df.head(n=5)

Unnamed: 0,baseline value,accelerations,fetal_movement,uterine_contractions,light_decelerations,severe_decelerations,prolongued_decelerations,abnormal_short_term_variability,mean_value_of_short_term_variability,percentage_of_time_with_abnormal_long_term_variability,...,histogram_min,histogram_max,histogram_number_of_peaks,histogram_number_of_zeroes,histogram_mode,histogram_mean,histogram_median,histogram_variance,histogram_tendency,fetal_health
0,120.0,0.0,0.0,0.0,0.0,0.0,0.0,73.0,0.5,43.0,...,62.0,126.0,2.0,0.0,120.0,137.0,121.0,73.0,1.0,2.0
1,132.0,0.006,0.0,0.006,0.003,0.0,0.0,17.0,2.1,0.0,...,68.0,198.0,6.0,1.0,141.0,136.0,140.0,12.0,0.0,1.0
2,133.0,0.003,0.0,0.008,0.003,0.0,0.0,16.0,2.1,0.0,...,68.0,198.0,5.0,1.0,141.0,135.0,138.0,13.0,0.0,1.0
3,134.0,0.003,0.0,0.008,0.003,0.0,0.0,16.0,2.4,0.0,...,53.0,170.0,11.0,0.0,137.0,134.0,137.0,13.0,1.0,1.0
4,132.0,0.007,0.0,0.008,0.0,0.0,0.0,16.0,2.4,0.0,...,53.0,170.0,9.0,0.0,137.0,136.0,138.0,11.0,1.0,1.0


In [4]:
# Work on an independent (deep) copy so the frame loaded from disk
# keeps its original contents.
fetal_health = fetal_health_df.copy(deep=True)

In [5]:
# Dependent variable: the 'fetal_health' column.
y = fetal_health['fetal_health']

# Independent variables (features): every column except the target.
X = fetal_health.drop(columns=['fetal_health'])

In [6]:
from sklearn.preprocessing import StandardScaler

# Standardise the features: learn the scaling statistics from X, then
# apply them to X.
# NOTE(review): the scaler is fitted on the complete feature matrix X. If a
# train/test split is made from the scaled data afterwards, the test rows have
# already contributed to these statistics — confirm whether that is acceptable.
escala = StandardScaler()
X_escalonado = escala.fit(X).transform(X)

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Split the *unscaled* features first (80% train / 20% test, fixed seed).
# Splitting X_escalonado would leak information: that matrix was standardised
# with a mean/variance computed over every row, including the ones that end up
# in the test set. The row assignment depends only on the number of samples,
# test_size and random_state, so the same rows land in each partition as before.
X_train_bruto, X_test_bruto, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Learn the scaling statistics from the training rows only, then apply that
# same transformation to both partitions. `escala` is rebound to this
# training-only scaler so that any later transform of new data matches what
# the models were trained on.
escala = StandardScaler()
X_train = escala.fit_transform(X_train_bruto)
X_test = escala.transform(X_test_bruto)

### Random Forest

In [8]:
from sklearn.ensemble import RandomForestClassifier

# Base random forest estimator with library defaults; the fixed seed keeps
# the fitted forest repeatable between runs.
SEMENTE = 42
modelo_random_forest = RandomForestClassifier(random_state=SEMENTE)

#### Hiperparâmetros com GridSearchCV

- O Grid Search busca a melhor combinação de hiperparâmetros, avaliando exaustivamente todas as combinações possíveis dos valores informados.
- Na prática, o conjunto de valores a pesquisar para cada hiperparâmetro é especificado manualmente.
- Este é um método caro e demorado, porém funciona bem quando o número de hiperparâmetros (e de valores por hiperparâmetro) é relativamente pequeno.

In [12]:
from sklearn.model_selection import GridSearchCV

# Hyperparameter grid for the grid search: every listed value of every key
# is combined with every other one.
param_grid = dict(
    n_estimators=[50, 100, 200],         # number of trees
    max_depth=[None, 10, 20, 30],        # maximum depth of each tree
    min_samples_split=[2, 5, 10],        # minimum samples required to split a node
    min_samples_leaf=[1, 2, 4],          # minimum samples required in a leaf
    max_features=['sqrt', 'log2'],       # features considered at each split
    bootstrap=[True, False],             # whether to sample with replacement
)

# 'sqrt': the number of features considered is the square root of the total
#         number of features.
# 'log2': the number of features considered is the base-2 logarithm of the
#         total number of features. Can be a good choice when even more
#         randomness in the model is desired.

In [10]:
# Grid search over param_grid: 5-fold cross-validation, all CPU cores,
# per-fit progress messages.
grid_search = GridSearchCV(
    modelo_random_forest,
    param_grid,
    cv=5,
    n_jobs=-1,
    verbose=2,
)

# Run the search on the training partition.
grid_search.fit(X_train, y_train)

# Report the best hyperparameter combination found.
melhores_parametros_grid = grid_search.best_params_
print(f"Melhores parâmetros: {melhores_parametros_grid}")

Fitting 5 folds for each of 432 candidates, totalling 2160 fits
[CV] END bootstrap=True, max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.3s
[CV] END bootstrap=True, max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   1.1s
[CV] END bootstrap=True, max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   0.5s
[CV] END bootstrap=True, max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=10, n_estimators=50; total time=   0.3s
[CV] END bootstrap=True, max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=10, n_estimators=100; total time=   0.5s
[CV] END bootstrap=True, max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=10, n_estimators=100; total time=   0.5s
[CV] END bootstrap=True, max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_esti

#### Hiperparâmetros com RandomizedSearchCV



In [13]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

# Sampling distributions for the randomized search.
# randint(low, high) draws integers uniformly from low up to high - 1;
# plain lists are sampled uniformly from their elements.
param_dist = dict(
    n_estimators=randint(50, 500),        # number of trees
    max_depth=randint(10, 50),            # maximum depth of each tree
    min_samples_split=randint(2, 20),     # minimum samples required to split a node
    min_samples_leaf=randint(1, 20),      # minimum samples required in a leaf
    max_features=['sqrt', 'log2'],        # features tried at each split
    bootstrap=[True, False],              # whether to sample with replacement
)

In [14]:
# Settings for the randomized search.
opcoes_busca = dict(
    n_iter=100,         # number of random combinations to sample
    cv=5,               # K-fold cross-validation
    verbose=2,          # detailed progress messages
    random_state=42,    # reproducible sampling
    n_jobs=-1,          # use every CPU core
)
random_search = RandomizedSearchCV(modelo_random_forest, param_dist, **opcoes_busca)

# Run the search on the training partition.
random_search.fit(X_train, y_train)

# Report the best sampled hyperparameter combination.
melhores_parametros_random = random_search.best_params_
print(f"Melhores parâmetros: {melhores_parametros_random}")

Fitting 5 folds for each of 100 candidates, totalling 500 fits
[CV] END bootstrap=False, max_depth=20, max_features=log2, min_samples_leaf=2, min_samples_split=10, n_estimators=100; total time=   0.3s
[CV] END bootstrap=False, max_depth=20, max_features=log2, min_samples_leaf=2, min_samples_split=10, n_estimators=200; total time=   0.7s
[CV] END bootstrap=False, max_depth=20, max_features=log2, min_samples_leaf=4, min_samples_split=2, n_estimators=200; total time=   0.6s
[CV] END bootstrap=False, max_depth=20, max_features=log2, min_samples_leaf=4, min_samples_split=5, n_estimators=100; total time=   0.3s
[CV] END bootstrap=False, max_depth=20, max_features=log2, min_samples_leaf=4, min_samples_split=10, n_estimators=50; total time=   0.2s
[CV] END bootstrap=False, max_depth=20, max_features=log2, min_samples_leaf=4, min_samples_split=10, n_estimators=50; total time=   0.2s
[CV] END bootstrap=False, max_depth=20, max_features=log2, min_samples_leaf=4, min_samples_split=10, n_estimators