In [17]:
# all needed installation are done
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor

import seaborn as sns
import xgboost as xgb
import lightgbm as lgb
import catboost as cb

import tensorflow as tf
import keras
from keras import layers

####  Phase 1: Load the data

In [18]:
df = pd.read_csv("processed_regression_housing.csv")
df.head()

Unnamed: 0,housing_median_age,total_rooms,population,median_income,median_house_value,distance_to_nearest_city,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY
0,52.0,1627.0,565.0,3.8462,342200.0,17.06,0,0,0,1
1,52.0,919.0,413.0,4.0368,269700.0,17.06,0,0,0,1
2,52.0,2535.0,1094.0,3.6591,299200.0,16.55,0,0,0,1
3,52.0,3104.0,1157.0,3.12,241400.0,16.55,0,0,0,1
4,42.0,2555.0,1206.0,2.0804,226700.0,15.76,0,0,0,1


####  X/y + train/test -splits + other data setups

In [19]:
df.describe()

Unnamed: 0,housing_median_age,total_rooms,population,median_income,median_house_value,distance_to_nearest_city,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY
count,16646.0,16646.0,16646.0,16646.0,16646.0,16646.0,16646.0,16646.0,16646.0,16646.0
mean,28.525952,2365.672954,1355.58867,3.4032,170451.285354,96.937193,0.448036,0.361889,0.00012,0.069386
std,11.956154,1443.732793,812.110795,1.259225,75183.944439,87.01934,0.497307,0.480561,0.010961,0.254117
min,1.0,2.0,6.0,0.4999,14999.0,0.42,0.0,0.0,0.0,0.0
25%,19.0,1424.0,809.0,2.4375,110500.0,24.4225,0.0,0.0,0.0,0.0
50%,29.0,2048.0,1181.0,3.26235,162500.0,68.54,0.0,0.0,0.0,0.0
75%,37.0,2949.0,1701.75,4.2386,222000.0,155.0675,1.0,1.0,0.0,0.0
max,52.0,13670.0,7228.0,6.7395,395300.0,489.12,1.0,1.0,1.0,1.0


In [20]:
df.columns

Index(['housing_median_age', 'total_rooms', 'population', 'median_income',
       'median_house_value', 'distance_to_nearest_city',
       'ocean_proximity_<1H OCEAN', 'ocean_proximity_INLAND',
       'ocean_proximity_ISLAND', 'ocean_proximity_NEAR BAY'],
      dtype='str')

In [21]:
# CatBoost requires we save our continuous and categorical variables separately into lists
categorical_variables = [ 
    "ocean_proximity_<1H OCEAN",
    "ocean_proximity_INLAND",
    "ocean_proximity_ISLAND",
    "ocean_proximity_NEAR BAY"]

# continuous variables also into a list
continuous_variables = [ 
    "housing_median_age",
    "total_rooms",
    "population",
    "median_income",
    "distance_to_nearest_city",
    "median_house_value"]

# the usual X/y -split
X = df.drop("median_house_value", axis=1)
y = df['median_house_value']

# usual train/test -split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# NOTE! SCALING => some of the algorithms require this
scaler = StandardScaler()

# create separate versions for the scaled data
# because we need both unscaled and scaled versions later
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

####  List all models that we want to benchmark

In [22]:
# define our model test dictionary

# our data has 17 697 rows, so it is not a huge dataset.

# RandomizedSearchCV results LightGBM:
# 1. Best parameters from RandomizedSearchCV: {'learning_rate': np.float64(0.08948275862068966), 'max_depth': 17, 'num_leaves': 48}
# 2. Best parameters from RandomizedSearchCV: {'learning_rate': np.float64(0.13120689655172416), 'max_depth': 23, 'num_leaves': 46}
# 3. Best parameters: {'depth': 7, 'iterations': 1199, 'l2_leaf_reg': np.float64(5.778919229638712), 'learning_rate': np.float64(0.045241217646849234)}
# Best CV score (neg MSE): -1539596482.1394653
# 4. Best parameters: {'depth': 8, 'iterations': 944, 'l2_leaf_reg': np.float64(5.08618378675995), 'learning_rate': np.float64(0.05250387504557257)} 
# Best CV score (neg MSE): -2547064787.5312743
models = {
    "Linear Regression": LinearRegression(),
    "SVM": SVR(),
    "KNN": KNeighborsRegressor(),
    "Random Forest": RandomForestRegressor(),
    "XGBoost": xgb.XGBRegressor(enable_categorical=True, objective="reg:squarederror"),
    "CatBoost-default": cb.CatBoostRegressor(verbose=0, random_seed=42),
    "CatBoost-opt": cb.CatBoostRegressor(
        verbose=0,
        random_seed=42,
        depth=7,
        iterations=1199,
        l2_leaf_reg=5.778919229638712,
        learning_rate=0.045241217646849234
    ),
    'LightGBM-default': lgb.LGBMRegressor(objective='regression'),
    'LightGBM-opt': lgb.LGBMRegressor(objective='regression', learning_rate=0.13120689655172416, max_depth=23, num_leaves=46),
}

####  We need a for-loop to train all the models in one go + gather metrics

In [23]:
# our benchmarking code!

# this will contain our results later after the benchmarks
results = []

# loop through the models
for name, model in models.items():
    # we have to react to certain algorithms
    # because they have special requirements for fit()

    print("Starting..." + name)

    # SVM / KNN require scaled data
    if name in ["SVM", "KNN"]:
        model.fit(X_train_scaled, y_train)
        predictions = model.predict(X_test_scaled)
    elif "CatBoost" in name:
        # CatBoost requires we specify which columns are categories
        # this includes both ordinal and nominal categories
        model.fit(X_train, y_train, cat_features=categorical_variables)
        predictions = model.predict(X_test)
    else:
        # all the other algorithms
        model.fit(X_train, y_train)
        predictions = model.predict(X_test)

    # after training this particular algorithm, gather the metrics
    mae = mean_absolute_error(y_test, predictions)
    mse = mean_squared_error(y_test, predictions)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, predictions)

    # save the metrics for this model into the results
    results.append([name, mae, mse, rmse, r2])

Starting...Linear Regression
Starting...SVM
Starting...KNN
Starting...Random Forest
Starting...XGBoost
Starting...CatBoost-default
Starting...CatBoost-opt
Starting...LightGBM-default
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000229 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1079
[LightGBM] [Info] Number of data points in the train set: 13316, number of used features: 8
[LightGBM] [Info] Start training from score 170768.909357
Starting...LightGBM-opt
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000209 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1079
[LightGBM] [Info] Number of data points in the train set: 13316, number of used features: 8
[LightGBM] [Info] Start training from score 170768.909357


####  Let's visualize the results

In [24]:
metrics_df = pd.DataFrame(results, columns=['Model', 'MAE', 'MSE', 'RMSE', 'R2'])

# change these highlight colors as you wish
# default style is dart theme / mode, modify
# variable default_highlight_style if you wish 
# to use light theme instead
light_theme = 'background: yellow'
dark_theme = 'background: #8C670A'

# you can alternate the highlight styling based on your theme
default_highlight_style = dark_theme

# helper function that highlights the best model of each metric
def highlight_best_metrics(row):
    # default styles for everything is empty in the beginning
    styles = ['' for _ in row]

    # index 1 => MAE (index 0 => model name)
    if row['MAE'] == metrics_df['MAE'].min():
        styles[1] = default_highlight_style

    # index 2 = > MSE
    if row['MSE'] == metrics_df['MSE'].min():
        styles[2] = default_highlight_style

    # index 3 = > RMSE
    if row['RMSE'] == metrics_df['RMSE'].min():
        styles[3] = default_highlight_style

    # index 4 = > R-squared (R2)
    if row['R2'] == metrics_df['R2'].max():
        styles[4] = default_highlight_style

    return styles


# apply the custom styles based on the min/max metrics
highlight_df = metrics_df.style.apply(highlight_best_metrics, axis=1)


highlight_df

Unnamed: 0,Model,MAE,MSE,RMSE,R2
0,Linear Regression,35592.266319,2102541801.965029,45853.481896,0.625315
1,SVM,59857.042189,5553526866.823964,74521.98915,0.010331
2,KNN,31799.441682,1882234975.74806,43384.732058,0.664575
3,Random Forest,30328.614441,1675219547.341729,40929.445969,0.701467
4,XGBoost,29107.025732,1567242792.428391,39588.417402,0.720709
5,CatBoost-default,28175.714453,1481038772.110498,38484.266553,0.736071
6,CatBoost-opt,28098.008981,1472847518.695074,38377.695589,0.73753
7,LightGBM-default,28988.168406,1535084969.101021,39180.160402,0.726439
8,LightGBM-opt,28722.940712,1533491393.608109,39159.818611,0.726723
