In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

cleaned_data_path = "datas/cleanedData.csv"
labels_data_path = "datas/labels.csv"

cleaned_data = pd.read_csv(cleaned_data_path)
labels_data = pd.read_csv(labels_data_path)

x_train, x_test, y_train, y_test = train_test_split(cleaned_data.drop(columns=["Attrition", "EmployeeID"], axis=1), labels_data["Attrition"], test_size=0.3, shuffle=True)


In [9]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

depth_list = [1, 3, 5, 10, 15, 20, 25, 30, 35, 40, None]

tree_grid = [
    {'max_depth': depth_list, 'criterion': ['gini', 'entropy']}
  ]

forest_grid = [
    # essaye 12 (3×4) combinaisons des hyperparametres
    {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
    # puis essaye 6 (2×3) combinaisons avec bootstrap à False (True étant la valeur par défaut)
    {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},
  ]

forest_csf = RandomForestClassifier(random_state=42)
tree_csf = DecisionTreeClassifier(random_state=42)

# 5 sous-jeux de cross-val, ça fait en tout (12+6)*5=90 tours d'entraînement 
forest_search = GridSearchCV(forest_csf, forest_grid, cv=10,
                           scoring='neg_mean_squared_error', return_train_score=True)
forest_search.fit(x_train, y_train)
tree_search = GridSearchCV(tree_csf, tree_grid, cv=10,
                           scoring='neg_mean_squared_error', return_train_score=True)
tree_search.fit(x_train, y_train)

In [7]:
print(forest_search.best_estimator_.get_params())
print(tree_search.best_estimator_.get_params())


{'bootstrap': False, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': 3, 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 10, 'n_jobs': None, 'oob_score': False, 'random_state': 42, 'verbose': 0, 'warm_start': False}
{'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'entropy', 'max_depth': 15, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'random_state': 42, 'splitter': 'best'}


In [10]:
print(forest_search.best_params_)
print(tree_search.best_params_)

{'bootstrap': False, 'max_features': 3, 'n_estimators': 10}
{'criterion': 'entropy', 'max_depth': 25}
