####  Finding optimal hyperparameters for one of the algorithms (LightGBM in this case)

In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor

import seaborn as sns
import xgboost as xgb
import lightgbm as lgb
import catboost as cb

import tensorflow as tf
import keras
from keras import layers

#### Load the data

In [2]:
df = pd.read_csv("processed_regression_housing.csv")
df.head()

Unnamed: 0,housing_median_age,total_rooms,population,median_income,median_house_value,distance_to_nearest_city,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY
0,52.0,1627.0,565.0,3.8462,342200.0,17.06,0,0,0,1
1,52.0,919.0,413.0,4.0368,269700.0,17.06,0,0,0,1
2,52.0,2535.0,1094.0,3.6591,299200.0,16.55,0,0,0,1
3,52.0,3104.0,1157.0,3.12,241400.0,16.55,0,0,0,1
4,42.0,2555.0,1206.0,2.0804,226700.0,15.76,0,0,0,1


####  Same X/y + train/test split

In [3]:
categorical_variables = [ 
    "ocean_proximity_<1H OCEAN",
    "ocean_proximity_INLAND",
    "ocean_proximity_ISLAND",
    "ocean_proximity_NEAR BAY"]

# continuous variables also into a list
continuous_variables = [ 
    "housing_median_age",
    "total_rooms",
    "population",
    "median_income",
    "distance_to_nearest_city",
    "median_house_value"]

# the usual X/y -split
X = df.drop("median_house_value", axis=1)
y = df['median_house_value']

# usual train/test -split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# NOTE! SCALING => some of the algorithms require this
scaler = StandardScaler()

# create separate versions for the scaled data
# because we need both unscaled and scaled versions later
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

####  In this example, use RandomizedSearchCV to search better hyperparameters for our algorithm -  CatBoost

#### This example was taken from AI

In [None]:
from catboost import CatBoostRegressor
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform

# Define parameter distributions for CatBoost
param_dist = {
    'depth': randint(4, 10),                 # tree depth
    'learning_rate': uniform(0.03, 0.15),    # learning rate range
    'iterations': randint(300, 1200),        # number of trees
    'l2_leaf_reg': uniform(1, 9),             # L2 regularization
}

# Setup RandomizedSearchCV
random_search_cat = RandomizedSearchCV(
    estimator=CatBoostRegressor(
        loss_function='RMSE',
        verbose=0,
        random_seed=42
    ),
    param_distributions=param_dist,
    n_iter=100,          # CatBoost is slower → 100 is realistic
    cv=3,
    n_jobs=-1,
    verbose=1,
    scoring='neg_mean_squared_error'
)

# Fit search
random_search_cat.fit(X_train, y_train)

# Best parameters
print("Best parameters:", random_search_cat.best_params_)
print("Best CV score (neg MSE):", random_search_cat.best_score_)


Fitting 3 folds for each of 100 candidates, totalling 300 fits


In [None]:
# Taken from AI to understand if the model is overfitting or not. According to AI if train > test → overfitting and i need to reduce max_depth
# If train and test are about the same than i am good. 
# In my case -df without dropping rows that are higher than 500 001 -  train_score-1900641404.2566435, test_score-2604742485.6073427) i am good with these parameters, because test is not not wildly higher than train.
# PS1 - using df before optimization, i got train_score = -1968491890.4575675 and test_score = -2972746424.5103645. According to Ai, 
# I need to  change max_depth from 'max_depth': randint(-1, 40) to "max_depth": randint(4, 10) for example.
# PS2 - df before optimization, train_score = -2191108649.1476135 and test_score = -3034239139.642399. A bit better but still quite a big gap.

train_score = random_search.score(X_train, y_train)
test_score  = random_search.score(X_test, y_test)

print(train_score, test_score)

NameError: name 'random_search' is not defined