In [1]:
!pip install --quiet optuna

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m380.1/380.1 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.4/233.4 kB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.8/78.8 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import fetch_california_housing
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import mean_squared_error
import joblib
import optuna

In [3]:
# precios de casas
data = fetch_california_housing()
X = data.data
y = data.target

### Attribute Information:
  - MedInc:        median income in block group
  - HouseAge:      median house age in block group
  - AveRooms:      average number of rooms per household
  - AveBedrms:     average number of bedrooms per household
  - Population:    block group population
  - AveOccup:      average number of household members
  - Latitude:      block group latitude
  - Longitude:     block group longitude

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# función para entrenar y evaluar un modelo con hiperparámetros dados
def train_evaluate_model(params, X_train, y_train, X_test, y_test):
    model = RandomForestRegressor(**params, random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    return mse

In [5]:
df=pd.DataFrame(np.c_[data['data'], data['target']],
                  columns= np.append(data['feature_names'], ['target']))
df.describe()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,target
count,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0
mean,3.870671,28.639486,5.429,1.096675,1425.476744,3.070655,35.631861,-119.569704,2.068558
std,1.899822,12.585558,2.474173,0.473911,1132.462122,10.38605,2.135952,2.003532,1.153956
min,0.4999,1.0,0.846154,0.333333,3.0,0.692308,32.54,-124.35,0.14999
25%,2.5634,18.0,4.440716,1.006079,787.0,2.429741,33.93,-121.8,1.196
50%,3.5348,29.0,5.229129,1.04878,1166.0,2.818116,34.26,-118.49,1.797
75%,4.74325,37.0,6.052381,1.099526,1725.0,3.282261,37.71,-118.01,2.64725
max,15.0001,52.0,141.909091,34.066667,35682.0,1243.333333,41.95,-114.31,5.00001


In [6]:
# grid search

grid_params = {
    'n_estimators': [100, 200],
    'max_depth': [5, 20],
    'min_samples_split': [2, 10],
    'min_samples_leaf': [1, 4]
}

grid_search = GridSearchCV(RandomForestRegressor(random_state=42), grid_params, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
grid_search.fit(X_train, y_train)
best_grid_params = grid_search.best_params_
best_grid_model = RandomForestRegressor(**best_grid_params, random_state=42)
best_grid_model.fit(X_train, y_train)
grid_search_mse = mean_squared_error(y_test, best_grid_model.predict(X_test))

El objetivo es emplear las técnicas de búsqueda de hiperparámetros, NO hacer un análisis exploratorio. Tener en cuenta que primero hay que hacer ese análisis.

In [7]:
# random search
random_params = {
    'n_estimators': [100, 200],
    'max_depth': [5, 20],
    'min_samples_split': [2, 10],
    'min_samples_leaf': [1, 4]
}

random_search = RandomizedSearchCV(RandomForestRegressor(random_state=42), random_params, n_iter=4, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
random_search.fit(X_train, y_train)
best_random_params = random_search.best_params_
best_random_model = RandomForestRegressor(**best_random_params, random_state=42)
best_random_model.fit(X_train, y_train)
random_search_mse = mean_squared_error(y_test, best_random_model.predict(X_test))

In [8]:
def objective(trial):
    n_estimators = trial.suggest_int('n_estimators', 100, 200)
    max_depth = trial.suggest_int('max_depth', 5, 20)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 10)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 4)

    params = {
        'n_estimators': n_estimators,
        'max_depth': max_depth,
        'min_samples_split': min_samples_split,
        'min_samples_leaf': min_samples_leaf
    }

    mse = train_evaluate_model(params, X_train, y_train, X_test, y_test)
    return mse

study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=10)
best_optuna_params = study.best_params
best_optuna_model = RandomForestRegressor(**best_optuna_params, random_state=42)
best_optuna_model.fit(X_train, y_train)
optuna_mse = mean_squared_error(y_test, best_optuna_model.predict(X_test))


[I 2024-05-13 22:07:13,431] A new study created in memory with name: no-name-2faea2ad-2d4e-440d-a1ae-d0364820fb35
[I 2024-05-13 22:07:26,020] Trial 0 finished with value: 0.2565200035512326 and parameters: {'n_estimators': 102, 'max_depth': 19, 'min_samples_split': 8, 'min_samples_leaf': 2}. Best is trial 0 with value: 0.2565200035512326.
[I 2024-05-13 22:07:37,178] Trial 1 finished with value: 0.37748468952318137 and parameters: {'n_estimators': 166, 'max_depth': 7, 'min_samples_split': 9, 'min_samples_leaf': 1}. Best is trial 0 with value: 0.2565200035512326.
[I 2024-05-13 22:07:49,344] Trial 2 finished with value: 0.3410909965267795 and parameters: {'n_estimators': 160, 'max_depth': 8, 'min_samples_split': 9, 'min_samples_leaf': 3}. Best is trial 0 with value: 0.2565200035512326.
[I 2024-05-13 22:08:07,351] Trial 3 finished with value: 0.25938507150878315 and parameters: {'n_estimators': 158, 'max_depth': 16, 'min_samples_split': 7, 'min_samples_leaf': 3}. Best is trial 0 with value

In [9]:
best_grid_params

{'max_depth': 20,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 200}

In [10]:
best_random_params

{'n_estimators': 200,
 'min_samples_split': 2,
 'min_samples_leaf': 1,
 'max_depth': 20}

In [11]:
best_optuna_params

{'n_estimators': 143,
 'max_depth': 18,
 'min_samples_split': 4,
 'min_samples_leaf': 2}

In [12]:
print("Grid Search MSE:", grid_search_mse)
print("Random Search MSE:", random_search_mse)
print("Optuna MSE:", optuna_mse)

Grid Search MSE: 0.2545922861286096
Random Search MSE: 0.2545922861286096
Optuna MSE: 0.2556043781408411
