In [6]:
import optuna
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error

from sklearn.preprocessing import StandardScaler

from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
import xgboost as xgb
import catboost as cb



In [3]:
data = pd.read_excel('Donnees.xlsx')

In [4]:
X = data.drop(columns=['y'])
y = data['y']

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.15, random_state = 2)

In [36]:
def objective(trial):
    n_estimators = trial.suggest_int('n_estimators', 100, 2000)
    max_depth = trial.suggest_int('max_depth', 4, 10)
    subsample = trial.suggest_float('subsample', 0.01, 1)
    tree_method = trial.suggest_categorical('tree_method', ['auto', 'hist', 'approx'])
    colsample_bytree = trial.suggest_float('colsample_bytree', 0.01, 1)
    eta = trial.suggest_float('eta', 0.01, 1)
    min_child_weight = trial.suggest_int('min_child_weight', 1, 50)

    
    model = xgb.XGBRegressor( n_estimators= n_estimators,
                                  max_depth=max_depth,
                                  subsample=subsample,
                                  tree_method=tree_method,
                                  colsample_bytree=colsample_bytree,
                                  eta=eta,
                                  min_child_weight=min_child_weight
                                  )
    
    score = cross_val_score(model, X_train, y_train, cv=3, scoring= 'neg_mean_squared_error')
    
    return score.mean()

study = optuna.create_study(direction='minimize', sampler= optuna.samplers.RandomSampler())

[I 2024-07-12 13:55:30,192] A new study created in memory with name: no-name-1219bc90-da37-4da9-999a-a0b56c511650


In [37]:
study.optimize(objective, n_trials=100)

[I 2024-07-12 13:55:34,320] Trial 0 finished with value: -3055.5110724410238 and parameters: {'n_estimators': 1192, 'max_depth': 10, 'subsample': 0.9316476086308488, 'tree_method': 'approx', 'colsample_bytree': 0.5526602840403889, 'eta': 0.31424354424651557, 'min_child_weight': 20}. Best is trial 0 with value: -3055.5110724410238.
[I 2024-07-12 13:55:34,736] Trial 1 finished with value: -464295.7117161951 and parameters: {'n_estimators': 614, 'max_depth': 6, 'subsample': 0.13744326104035492, 'tree_method': 'hist', 'colsample_bytree': 0.43946113268829917, 'eta': 0.3257526567264982, 'min_child_weight': 29}. Best is trial 1 with value: -464295.7117161951.
[I 2024-07-12 13:55:36,828] Trial 2 finished with value: -1570.8973879612731 and parameters: {'n_estimators': 1904, 'max_depth': 5, 'subsample': 0.939036534756173, 'tree_method': 'auto', 'colsample_bytree': 0.901738304247258, 'eta': 0.7328730119980018, 'min_child_weight': 10}. Best is trial 1 with value: -464295.7117161951.
[I 2024-07-12

In [38]:
optuna.visualization.plot_parallel_coordinate(study)

In [39]:
best_params =study.best_params

best_n_estimators = best_params['n_estimators']
best_max_depth = best_params['max_depth']
best_subsample = best_params['subsample']
best_tree_method = best_params['tree_method']
best_colsample_bytree = best_params['colsample_bytree']
best_eta = best_params['eta']
best_min_child_weight = best_params['min_child_weight']
    
best_model = xgb.XGBRegressor(n_estimators= best_n_estimators,
                                  max_depth=best_max_depth,
                                  min_samples_split=best_subsample,
                                  tree_method = best_tree_method,
                                  colsample_bytree = best_colsample_bytree,
                                  eta = best_eta,
                                  min_child_weight = best_min_child_weight
                                  )

In [40]:
best_model.fit(X_train,y_train)


Parameters: { "min_samples_split" } are not used.




In [41]:
y_hat = best_model.predict(X_test)

In [42]:
mean_squared_error(y_test, y_hat)

3716.1324695728517

In [43]:
mean_absolute_percentage_error(y_test, y_hat)*100

8.680115232780853

In [50]:
X_scaler = StandardScaler()
X_scaled = X_scaler.fit_transform(X)
y_scaler = StandardScaler()
y_scaled = y_scaler.fit_transform(y.values.reshape(-1,1))

In [51]:
X_train_scaled, X_test_scaled, y_train_scaled, y_test_scaled = train_test_split(X_scaled, y_scaled, test_size = 0.15, random_state = 2)

In [52]:


study = optuna.create_study(direction='minimize', sampler= optuna.samplers.RandomSampler())

[I 2024-07-12 14:27:59,527] A new study created in memory with name: no-name-f8637542-6f37-4919-8bd6-e924bd35daa9


In [53]:
study.optimize(objective, n_trials=50)

[I 2024-07-12 14:28:07,970] Trial 0 finished with value: -2087.445558102233 and parameters: {'first_hidden_layer': 128, 'second_hidden_layer': 16, 'alpha': 0.006051402598993809}. Best is trial 0 with value: -2087.445558102233.
[I 2024-07-12 14:28:09,467] Trial 1 finished with value: -6633.191672410191 and parameters: {'first_hidden_layer': 4, 'second_hidden_layer': 64, 'alpha': 0.003331629330688833}. Best is trial 1 with value: -6633.191672410191.

Stochastic Optimizer: Maximum iterations (500) reached and the optimization hasn't converged yet.

[I 2024-07-12 14:28:10,548] Trial 2 finished with value: -575487.7696493791 and parameters: {'first_hidden_layer': 2, 'second_hidden_layer': 4, 'alpha': 0.00834341932513804}. Best is trial 2 with value: -575487.7696493791.
[I 2024-07-12 14:28:11,692] Trial 3 finished with value: -1946.8765598918555 and parameters: {'first_hidden_layer': 128, 'second_hidden_layer': 32, 'alpha': 0.0016198830360719247}. Best is trial 2 with value: -575487.76964937

In [54]:
optuna.visualization.plot_parallel_coordinate(study)

In [56]:
best_params =study.best_params

best_first_hidden_layer = best_params['first_hidden_layer']
best_second_hidden_layer = best_params['second_hidden_layer']
best_alpha = best_params['alpha']


best_model = MLPRegressor(hidden_layer_sizes=[best_first_hidden_layer, best_second_hidden_layer],
                            activation='relu',
                            solver='adam',
                            alpha=best_alpha,
                            learning_rate='adaptive',
                            max_iter=500,
                            validation_fraction=0.15,
                            early_stopping=True)


In [57]:
best_model.fit(X_train,y_train)

y_hat = best_model.predict(X_test)

mse = mean_squared_error(y_test, y_hat)

mape = mean_absolute_percentage_error(y_test, y_hat)*100

print('the mean squared error is: ', mse)
print('the mean absolute percentage error is: ', mape)

the mean squared error is:  2745.6379795869248
the mean absolute percentage error is:  35.721584834034736
