# Optymalizacja hiperparamterów

In [1]:
import pandas as pd
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.metrics import mean_squared_error
import numpy as np
import optuna
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.model_selection import cross_val_score
import random
from sklearn.metrics import r2_score

### sztuczne pogorszenie wyników modelu bazowego poprzez usunięcie mocno skorelowanych cech

Z wszystkimi cechami R^2 było na poziomie 0.96, dopiero po usunięciu 6 cech spadło do 0.86.

In [2]:
data = pd.read_csv('mean_min_max.csv')

X = data.drop(columns=['sellingprice', 'car_age', 'odometer', 'sale_year', 'condition', 'mmr', 'sale_month'])
y = data['sellingprice']

In [3]:
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

print(f"Rozmiar zbioru treningowego: {X_train.shape}")
print(f"Rozmiar zbioru walidacyjnego: {X_val.shape}")
print(f"Rozmiar zbioru testowego: {X_test.shape}")

Rozmiar zbioru treningowego: (307333, 14)
Rozmiar zbioru walidacyjnego: (102445, 14)
Rozmiar zbioru testowego: (102445, 14)


In [4]:
base_model = HistGradientBoostingRegressor(random_state=42)
base_model.fit(X_train, y_train)

y_test_pred_base = base_model.predict(X_test)
mse_base = mean_squared_error(y_test, y_test_pred_base)
r2_base = r2_score(y_test, y_test_pred_base)

print("=== Model Bazowy ===")
print(f"Mean Squared Error (Testowy): {mse_base:.4f}")
print(f"R² (Testowy): {r2_base:.4f}\n")

=== Model Bazowy ===
Mean Squared Error (Testowy): 0.0070
R² (Testowy): 0.8572



## GridSearchCV

In [5]:
model = HistGradientBoostingRegressor(random_state=42)

param_grid = {
    'learning_rate': [0.01, 0.1, 0.2],
    'max_iter': [100, 200, 300],
    'max_depth': [None, 3, 5],
    'min_samples_leaf': [20, 30, 50],
    'l2_regularization': [0.0, 1.0, 10.0],
}

grid_search = GridSearchCV(
    model, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1, verbose=1
)
grid_search.fit(X_train, y_train)

print("GridSearchCV Najlepsze parametry:", grid_search.best_params_)

Fitting 5 folds for each of 243 candidates, totalling 1215 fits
GridSearchCV Najlepsze parametry: {'l2_regularization': 0.0, 'learning_rate': 0.2, 'max_depth': None, 'max_iter': 300, 'min_samples_leaf': 20}


## RandomSearchCV

In [6]:
random_search = RandomizedSearchCV(
    model, param_distributions=param_grid, n_iter=20, cv=5, scoring='neg_mean_squared_error', n_jobs=-1, verbose=1
)
random_search.fit(X_train, y_train)

print("RandomizedSearchCV Najlepsze parametry:", random_search.best_params_)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
RandomizedSearchCV Najlepsze parametry: {'min_samples_leaf': 50, 'max_iter': 300, 'max_depth': None, 'learning_rate': 0.2, 'l2_regularization': 0.0}


In [7]:
grid_best_model = grid_search.best_estimator_
random_best_model = random_search.best_estimator_

for name, model in [("GridSearchCV", grid_best_model), ("RandomizedSearchCV", random_best_model)]:
    y_val_pred = model.predict(X_val)
    mse = mean_squared_error(y_val, y_val_pred)
    print(f"{name} Mean Squared Error na zbiorze walidacyjnym:", mse)

GridSearchCV Mean Squared Error na zbiorze walidacyjnym: 0.004641173589074727
RandomizedSearchCV Mean Squared Error na zbiorze walidacyjnym: 0.0046466008004473125


## Optuna

In [8]:
def objective(trial):
    params = {
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'max_iter': trial.suggest_int('max_iter', 100, 500),
        'max_depth': trial.suggest_categorical('max_depth', [None, 3, 5, 7]),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 20, 50),
        'l2_regularization': trial.suggest_float('l2_regularization', 1e-3, 10.0),
    }
    model = HistGradientBoostingRegressor(random_state=42, **params)
    scores = cross_val_score(model, X_train, y_train, cv=5, scoring='neg_mean_squared_error')
    return -scores.mean()

study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50)

print("Optuna Najlepsze parametry:", study.best_params)

[I 2024-12-19 20:54:01,906] A new study created in memory with name: no-name-7a2e6100-1b06-44ec-8445-063048da0a3b
[I 2024-12-19 20:54:47,789] Trial 0 finished with value: 0.008118560870203444 and parameters: {'learning_rate': 0.11391289828319251, 'max_iter': 411, 'max_depth': 3, 'min_samples_leaf': 41, 'l2_regularization': 2.099510944204836}. Best is trial 0 with value: 0.008118560870203444.
[I 2024-12-19 20:55:27,641] Trial 1 finished with value: 0.005700753329242929 and parameters: {'learning_rate': 0.13163906624379165, 'max_iter': 286, 'max_depth': 5, 'min_samples_leaf': 25, 'l2_regularization': 6.855954268434842}. Best is trial 1 with value: 0.005700753329242929.
[I 2024-12-19 20:56:33,345] Trial 2 finished with value: 0.007079048114057289 and parameters: {'learning_rate': 0.028432253552642173, 'max_iter': 428, 'max_depth': 7, 'min_samples_leaf': 34, 'l2_regularization': 4.601235396076859}. Best is trial 1 with value: 0.005700753329242929.
[I 2024-12-19 20:57:06,030] Trial 3 finish

[I 2024-12-19 21:19:09,622] Trial 29 finished with value: 0.0052294577304131135 and parameters: {'learning_rate': 0.09412118075352532, 'max_iter': 405, 'max_depth': 7, 'min_samples_leaf': 43, 'l2_regularization': 2.4988721580962068}. Best is trial 12 with value: 0.0043317076499916046.
[I 2024-12-19 21:19:59,684] Trial 30 finished with value: 0.006478786123574135 and parameters: {'learning_rate': 0.23224015950551113, 'max_iter': 471, 'max_depth': 3, 'min_samples_leaf': 28, 'l2_regularization': 5.938684212638169}. Best is trial 12 with value: 0.0043317076499916046.
[I 2024-12-19 21:21:05,861] Trial 31 finished with value: 0.004353202743395257 and parameters: {'learning_rate': 0.2541254739899914, 'max_iter': 499, 'max_depth': None, 'min_samples_leaf': 22, 'l2_regularization': 4.784592921871652}. Best is trial 12 with value: 0.0043317076499916046.
[I 2024-12-19 21:22:04,349] Trial 32 finished with value: 0.004404395375158023 and parameters: {'learning_rate': 0.26420288899484473, 'max_iter'

Optuna Najlepsze parametry: {'learning_rate': 0.2930895840551921, 'max_iter': 487, 'max_depth': None, 'min_samples_leaf': 20, 'l2_regularization': 3.941533331112236}


In [9]:
best_params = study.best_params
optuna_model = HistGradientBoostingRegressor(random_state=42, **best_params)
optuna_model.fit(X_train, y_train)

HistGradientBoostingRegressor(l2_regularization=3.941533331112236,
                              learning_rate=0.2930895840551921, max_iter=487,
                              random_state=42)

In [10]:
y_val_pred = optuna_model.predict(X_val)
mse = mean_squared_error(y_val, y_val_pred)
print("Optuna Mean Squared Error na zbiorze walidacyjnym:", mse)

Optuna Mean Squared Error na zbiorze walidacyjnym: 0.004184684140353185


## Algorytm genetyczny

In [22]:
def evaluate(params):
    learning_rate, max_iter, max_depth, min_samples_leaf, l2_regularization = params
    model = HistGradientBoostingRegressor(
        learning_rate=learning_rate,
        max_iter=int(max_iter),
        max_depth=None if max_depth < 1 else int(max_depth),
        min_samples_leaf=int(min_samples_leaf),
        l2_regularization=l2_regularization,
        random_state=42
    )
    scores = cross_val_score(model, X_train, y_train, cv=5, scoring='neg_mean_squared_error').mean()
    return -scores

In [23]:
def initialize_population(size):
    return [
        [
            random.uniform(0.01, 0.3),
            random.randint(100, 500),
            random.uniform(1, 7),
            random.randint(20, 50), 
            random.uniform(1e-3, 10)
        ]
        for _ in range(size)
    ]

In [24]:
def crossover(parent1, parent2):
    return [random.choice([p1, p2]) for p1, p2 in zip(parent1, parent2)]

In [25]:
def mutate(individual, mutation_rate=0.1):
    for i in range(len(individual)):
        if random.random() < mutation_rate:
            if i == 0: individual[i] = random.uniform(0.01, 0.3)
            if i == 1: individual[i] = random.randint(100, 500)
            if i == 2: individual[i] = random.uniform(1, 7)
            if i == 3: individual[i] = random.randint(20, 50)
            if i == 4: individual[i] = random.uniform(1e-3, 10)
    return individual

In [26]:
def genetic_algorithm(generations, population_size):
    population = initialize_population(population_size)
    for gen in range(generations):
        scores = [(evaluate(ind), ind) for ind in population]
        scores.sort()
        best = scores[:population_size // 2]
        
        next_gen = [ind for _, ind in best]
        while len(next_gen) < population_size:
            parent1, parent2 = random.sample(next_gen, 2)
            child = mutate(crossover(parent1, parent2))
            next_gen.append(child)
        population = next_gen
    
    return scores[0][1]

In [27]:
best_params_ga = genetic_algorithm(20, 10)


In [28]:
ga_model = HistGradientBoostingRegressor(
    learning_rate=best_params_ga[0],
    max_iter=int(best_params_ga[1]),
    max_depth=None if best_params_ga[2] < 1 else int(best_params_ga[2]),
    min_samples_leaf=int(best_params_ga[3]),
    l2_regularization=best_params_ga[4],
    random_state=42
)
ga_model.fit(X_train, y_train)

HistGradientBoostingRegressor(l2_regularization=2.30621555056688,
                              learning_rate=0.29080400772472587, max_depth=6,
                              max_iter=497, min_samples_leaf=34,
                              random_state=42)

In [29]:
y_val_pred = ga_model.predict(X_val)
mse = mean_squared_error(y_val, y_val_pred)
print("Genetic Algorithm Mean Squared Error na zbiorze walidacyjnym:", mse)

Genetic Algorithm Mean Squared Error na zbiorze walidacyjnym: 0.004342958569084839


## Podsumowanie

In [30]:
def evaluate_test_set(model, X_test, y_test):
    y_test_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_test_pred)
    r2 = r2_score(y_test, y_test_pred)
    return mse, r2

In [32]:
results = {}

for method, model, params in [
    ("GridSearchCV", grid_best_model, grid_search.best_params_),
    ("RandomizedSearchCV", random_best_model, random_search.best_params_),
    ("Optuna", optuna_model, best_params),
    ("Genetic Algorithm", ga_model, {
        'learning_rate': best_params_ga[0],
        'max_iter': int(best_params_ga[1]),
        'max_depth': None if best_params_ga[2] < 0.5 else int(best_params_ga[2]),
        'min_samples_leaf': int(best_params_ga[3]),
        'l2_regularization': best_params_ga[4],
    }),
]:
    mse, r2 = evaluate_test_set(model, X_test, y_test)
    results[method] = {
        "MSE": mse,
        "R2": r2,
        "Params": params,
    }

print("=== Podsumowanie Metod ===")
print(f"Model Bazowy - Mean Squared Error: {mse_base:.4f}, R²: {r2_base:.4f}\n")
for method, metrics in results.items():
    print(f"{method}:")
    print(f"  Mean Squared Error (Testowy): {metrics['MSE']:.4f}")
    print(f"  R² (Testowy): {metrics['R2']:.4f}")
    print(f"  Najlepsze Hiperparametry: {metrics['Params']}\n")

=== Podsumowanie Metod ===
Model Bazowy - Mean Squared Error: 0.0070, R²: 0.8572

GridSearchCV:
  Mean Squared Error (Testowy): 0.0046
  R² (Testowy): 0.9063
  Najlepsze Hiperparametry: {'l2_regularization': 0.0, 'learning_rate': 0.2, 'max_depth': None, 'max_iter': 300, 'min_samples_leaf': 20}

RandomizedSearchCV:
  Mean Squared Error (Testowy): 0.0046
  R² (Testowy): 0.9061
  Najlepsze Hiperparametry: {'min_samples_leaf': 50, 'max_iter': 300, 'max_depth': None, 'learning_rate': 0.2, 'l2_regularization': 0.0}

Optuna:
  Mean Squared Error (Testowy): 0.0042
  R² (Testowy): 0.9151
  Najlepsze Hiperparametry: {'learning_rate': 0.2930895840551921, 'max_iter': 487, 'max_depth': None, 'min_samples_leaf': 20, 'l2_regularization': 3.941533331112236}

Genetic Algorithm:
  Mean Squared Error (Testowy): 0.0043
  R² (Testowy): 0.9130
  Najlepsze Hiperparametry: {'learning_rate': 0.29080400772472587, 'max_iter': 497, 'max_depth': 6, 'min_samples_leaf': 34, 'l2_regularization': 2.30621555056688}

