In [6]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor

import seaborn as sns
import xgboost as xgb
import lightgbm as lgb
import catboost as cb

import tensorflow as tf
import keras
from keras import layers

In [7]:
# Read the housing dataset from the working directory into a DataFrame,
# then preview the first five rows as the cell output.
df = pd.read_csv(filepath_or_buffer="housing.csv")
df.head(n=5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [8]:
from sklearn.preprocessing import LabelEncoder


In [9]:
# Encode the ocean_proximity strings as integer codes. The fitted encoder is kept
# (instead of being thrown away) so the code -> label mapping can still be looked
# up through ocean_proximity_encoder.classes_.
# NOTE(review): scikit-learn documents LabelEncoder as a target encoder;
# OrdinalEncoder is the feature-side equivalent - consider switching.
ocean_proximity_encoder = LabelEncoder()
df['ocean_proximity'] = ocean_proximity_encoder.fit_transform(df['ocean_proximity'])

# CatBoost requires we save our continuous and categorical variables separately into a list
categorical_variables = ['ocean_proximity']

# continuous variables also into a list
# (the 'total_rooms' entry previously carried a stray trailing tab, which would
# not match any column name when the list is used for selection)
continuous_variables = [
    'longitude',
    'latitude',
    'housing_median_age',
    'total_rooms',
    'total_bedrooms',
    'population',
    'households',
    'median_income',
]

# X/y: every column except the target is a feature
X = df.drop(columns="median_house_value")
y = df['median_house_value']

# 80/20 hold-out split. random_state is fixed so that Restart & Run All yields the
# same train/test rows (and therefore comparable metrics) on every execution.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# SCALING => some of the algorithms require this
# The scaler is fitted on the training split only and then applied to the test
# split, so no test-set statistics leak into the transformation. Note that it is
# applied to every column of X, including the integer-coded ocean_proximity.
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [10]:
from sklearn.model_selection import RandomizedSearchCV
import numpy as np
from scipy.stats import randint

# Define the parameter distributions with integer ranges
# Notice how we use NumPy and scipy in order to have ranges of values
# for the RandomizedSearch, which it tries to combine in a random manner
# => based on luck, you might stumble upon a very good combination of parameters
#
# scipy's randint(low, high) excludes `high`, so num_leaves is drawn from 20..44
# and max_depth from 3..14. learning_rate is picked uniformly from 20 evenly
# spaced values between 0.01 and 0.1.
# NOTE(review): a recorded run selected learning_rate=0.1, the upper edge of this
# grid - consider extending the range upwards if that keeps happening.
param_dist = {
    'learning_rate': np.linspace(0.01, 0.1, 20),
    'num_leaves': randint(20, 45),
    'max_depth': randint(3, 15),
}


# Setup RandomizedSearchCV
# search 300 times (n_iter)
# n_jobs => -1 => use all CPU cores
# cv = cross-validation strategy (higher is often better, but takes more time), typical values 3-5
# random_state => fixed on both the estimator and the search so the sampled
# candidates (and hence the reported best parameters) are reproducible between runs
random_search = RandomizedSearchCV(
    estimator=lgb.LGBMRegressor(random_state=42),
    param_distributions=param_dist,
    n_iter=300,  # Number of random combinations to test
    cv=5,
    n_jobs=-1,
    verbose=1,
    scoring='neg_mean_squared_error',
    random_state=42,
)

# Fit the randomized search
# (the unscaled training features are used here, as in the original cell)
random_search.fit(X_train, y_train)

# Get the best parameters
best_params_random = random_search.best_params_
print("\nBest parameters from RandomizedSearchCV:", best_params_random)

Fitting 5 folds for each of 300 candidates, totalling 1500 fits
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000184 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1843
[LightGBM] [Info] Number of data points in the train set: 16512, number of used features: 9
[LightGBM] [Info] Start training from score 206376.438287

Best parameters from RandomizedSearchCV: {'learning_rate': 0.1, 'max_depth': 13, 'num_leaves': 43}
