In [2]:
import seaborn as sns
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error,mean_absolute_error,r2_score
import optuna
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt
import plotly

In [3]:
healthexp = sns.load_dataset('healthexp')
healthexp = pd.get_dummies(healthexp)
X = healthexp.drop(['Life_Expectancy'],axis = 1)
y = healthexp['Life_Expectancy']
X_train, X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=10)

In [4]:
rfr = RandomForestRegressor(random_state=10)

In [5]:
rfr.fit(X, y)

In [6]:
y_pred = rfr.predict(X_test)

In [7]:
mean_squared_error(y_test,y_pred)

np.float64(0.020587381818181764)

In [8]:
mean_absolute_error(y_test,y_pred)

np.float64(0.10316363636364011)

In [9]:
r2_score(y_test,y_pred)

0.9979479667097655

In [10]:
def objective(trial):
    n_estimators = trial.suggest_int('n_estimators', 100, 1000)
    max_depth = trial.suggest_int('max_depth',10,15)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 32)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 32)
    
    model = RandomForestRegressor(n_estimators=n_estimators,
                                  max_depth=max_depth,
                                  min_samples_split= min_samples_split,
                                  min_samples_leaf= min_samples_leaf)
    
    score = cross_val_score(model, X_train, y_train, cv= 5, scoring= 'neg_mean_squared_error').mean()
    
    return score

In [11]:
study = optuna.create_study(direction = 'maximize', sampler = optuna.samplers.RandomSampler(seed = 10))

[I 2024-11-04 23:38:22,810] A new study created in memory with name: no-name-4d1935a8-0620-4694-b9fb-57d58fbd52ed


In [15]:
study.optimize(objective,n_trials = 200)

[I 2024-11-04 23:39:03,432] Trial 2 finished with value: -3.284343275829557 and parameters: {'n_estimators': 252, 'max_depth': 10, 'min_samples_split': 23, 'min_samples_leaf': 31}. Best is trial 0 with value: -2.4106863909222307.
[I 2024-11-04 23:39:04,924] Trial 3 finished with value: -2.1989724944807163 and parameters: {'n_estimators': 103, 'max_depth': 13, 'min_samples_split': 27, 'min_samples_leaf': 20}. Best is trial 3 with value: -2.1989724944807163.
[I 2024-11-04 23:39:15,701] Trial 4 finished with value: -2.3226829137249885 and parameters: {'n_estimators': 750, 'max_depth': 11, 'min_samples_split': 30, 'min_samples_leaf': 23}. Best is trial 3 with value: -2.1989724944807163.
[I 2024-11-04 23:39:21,193] Trial 5 finished with value: -2.2759861646693196 and parameters: {'n_estimators': 588, 'max_depth': 10, 'min_samples_split': 13, 'min_samples_leaf': 22}. Best is trial 3 with value: -2.1989724944807163.
[I 2024-11-04 23:39:26,051] Trial 6 finished with value: -2.1026134822151126 

In [16]:
best_params = study.best_params

In [17]:
optuna.visualization.plot_optimization_history(study)

In [18]:
optuna.visualization.plot_parallel_coordinate(study)

In [19]:
optuna.visualization.plot_slice(study,params = ['n_estimators','max_depth','min_samples_split','min_samples_leaf'])

In [20]:
optuna.visualization.plot_param_importances(study)

In [21]:
best_n_estimators = best_params['n_estimators']
best_max_depth = best_params['max_depth']
best_min_samples_split = best_params['min_samples_split']
best_min_samples_leaf = best_params['min_samples_leaf']

In [22]:
best_model = RandomForestRegressor(n_estimators=best_n_estimators,
                                   max_depth=best_max_depth,
                                   min_samples_leaf= best_min_samples_leaf,
                                   min_samples_split= best_min_samples_split)

In [23]:
best_model.fit(X_train, y_train)

In [26]:
y_pred = best_model.predict(X_test)

In [27]:
r2_score(y_test,y_pred)

0.9796014130432554

In [28]:
mean_absolute_error(y_test,y_pred)

np.float64(0.3600282103314219)

In [29]:
mean_squared_error(y_test,y_pred)

np.float64(0.20465238075250547)