# Test avec RandomForestRegressor

In [1]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.ensemble import RandomForestRegressor

In [2]:
df = pd.read_csv("../data/silver.csv")

In [3]:
# column_titles = df.columns.tolist()
# column_titles

In [4]:
X = df.drop(["prix_median"], axis=1)
y = df['prix_median']
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, shuffle=True, random_state=42)

categorial_features = ["proximité_autoroute", "riviere"]

numerical_features = ['tx_crim', 'tx_residence','tx_commerce',
       'tx_nitriq', 'nb_piece', 'tx_ancienneté_parc_immo', 'distance_centre_emploi',
        'indice_impot_foncier',  'ratio_eleve_enseignant', 'tx_person_couleur',  'tx_status_sociaux_eco_inf']

categorical_transformer = OneHotEncoder(sparse_output=True)
numerical_transformer = StandardScaler()

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorial_features),
        ('num', numerical_transformer, numerical_features)
    ],
    remainder="passthrough" 
)

# Créer un objet CatBoostRegressor
cb_reg = RandomForestRegressor(random_state=42, verbose=False)


# Créer un pipeline avec le préprocesseur et le modèle CatBoostRegressor
pipe = Pipeline([
     ('preprocessor', preprocessor),
     ('cb_reg', cb_reg)
])

# Entraîner le pipeline sur les données d'entraînement
pipe.fit(X_train, y_train)
y_pred_train = pipe.predict(X_train)
y_pred_test = pipe.predict(X_test)

print("######## Mean squared error : ")
print("TRAIN :",mean_squared_error(y_train, y_pred_train))
print("TEST :",mean_squared_error(y_test, y_pred_test))
print("/////////////////////////////////////////////////////")

print("######## Mean absolute error : ")
print("TRAIN :",mean_absolute_error(y_train, y_pred_train))
print("TEST :",mean_absolute_error(y_test, y_pred_test))
print("/////////////////////////////////////////////////////")

print("######## R2 score : ")    
print("TRAIN :",r2_score(y_train, y_pred_train))
print("TEST :",r2_score(y_test, y_pred_test))

######## Mean squared error : 
TRAIN : 1972396.5693069308
TEST : 7806928.617647059
/////////////////////////////////////////////////////
######## Mean absolute error : 
TRAIN : 908.1782178217821
TEST : 2022.3823529411766
/////////////////////////////////////////////////////
######## R2 score : 
TRAIN : 0.9772957374491794
TEST : 0.8935425354971871


### Optuna (recherche des hyperparametres)

In [12]:
import optuna
from sklearn.metrics import r2_score

In [17]:
def objective(trial):
    # Définition des hyperparamètres à optimiser
    n_estimators = trial.suggest_int('n_estimators', 100, 1000, step=100)
    max_depth = trial.suggest_int('max_depth', 5, 15)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 10)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 5)
    criterion = trial.suggest_categorical('criterion', ['squared_error', 'absolute_error', 'friedman_mse', 'poisson'])
    max_features = trial.suggest_categorical('max_features', ['auto', 'sqrt', 'log2'])

    # Création du modèle avec les hyperparamètres suggérés
    model = RandomForestRegressor(n_estimators=n_estimators,
                                  max_depth=max_depth,
                                  min_samples_split=min_samples_split,
                                  min_samples_leaf=min_samples_leaf,
                                  criterion=criterion,
                                  max_features=max_features,
                                  random_state=42)
    
    # Entraînement du modèle
    model.fit(X_train, y_train)
    
    # Prédiction sur les données de test
    y_pred = model.predict(X_test)
    
    # Calcul du score R2
    r2 = r2_score(y_test, y_pred)
    
    return r2


In [18]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

[I 2023-06-12 14:15:56,021] A new study created in memory with name: no-name-6f53d227-8325-40ba-928a-05fabb5bfe2b
[I 2023-06-12 14:15:56,799] Trial 0 finished with value: 0.8304564044416839 and parameters: {'n_estimators': 500, 'max_depth': 6, 'min_samples_split': 4, 'min_samples_leaf': 1, 'criterion': 'squared_error', 'max_features': 'log2'}. Best is trial 0 with value: 0.8304564044416839.
[I 2023-06-12 14:15:57,659] Trial 1 finished with value: 0.8428654816460646 and parameters: {'n_estimators': 600, 'max_depth': 14, 'min_samples_split': 7, 'min_samples_leaf': 2, 'criterion': 'squared_error', 'max_features': 'log2'}. Best is trial 1 with value: 0.8428654816460646.
  warn(
[I 2023-06-12 14:15:58,538] Trial 2 finished with value: 0.8549666621983543 and parameters: {'n_estimators': 400, 'max_depth': 7, 'min_samples_split': 8, 'min_samples_leaf': 3, 'criterion': 'friedman_mse', 'max_features': 'auto'}. Best is trial 2 with value: 0.8549666621983543.
[I 2023-06-12 14:15:59,057] Trial 3 fi

In [19]:
best_params = study.best_params
best_value = study.best_value

In [20]:
print(best_params)
print(best_value)

{'n_estimators': 100, 'max_depth': 14, 'min_samples_split': 7, 'min_samples_leaf': 1, 'criterion': 'poisson', 'max_features': 'auto'}
0.8949142921897552
