In [4]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor

import seaborn as sns
import xgboost as xgb
import lightgbm as lgb
import catboost as cb

import tensorflow as tf
import keras
from keras import layers

In [5]:
df = pd.read_csv("housing.csv")
df.head()


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [6]:
from sklearn.preprocessing import LabelEncoder


In [7]:
df['ocean_proximity'] = LabelEncoder().fit_transform(df['ocean_proximity'])

# CatBoost requires we save our continuous and categorical variables separately into a list
categorical_variables = ['ocean_proximity']

# continous variables also into a list
continuous_variables = ['longitude', 'latitude', 'housing_median_age', 'total_rooms	', 'total_bedrooms', 'population', 'households', 'median_income']

# X/y 
X = df.drop("median_house_value", axis=1)
y = df['median_house_value']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# SCALING => some of the algorithms require this
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [8]:
from sklearn.ensemble import StackingRegressor

# 'Random Forest': RandomForestRegressor(),
# 'XGBoost': xgb.XGBRegressor(enable_categorical=True, objective='reg:squarederror'),
# 'LightGBM': lgb.LGBMRegressor(objective='regression')

# meta model is the judge or optimizer of the stacking algorithm
# => it assigns weights to the models above based on their performance
meta_model = LinearRegression()

# stacking regression algorithm
stacking_model = StackingRegressor(
    estimators=[('xgb', xgb.XGBRegressor(enable_categorical=True, objective='reg:squarederror')),
                ('randomforest', RandomForestRegressor()),
                ('lgbm', lgb.LGBMRegressor(objective='regression'))
                ],
    final_estimator = meta_model,
    cv=5
)

# fit the model
stacking_model.fit(X_train, y_train)

# test predictions and metrics
predictions = stacking_model.predict(X_test)

mae = mean_absolute_error(y_test, predictions)
mse = mean_squared_error(y_test, predictions)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, predictions)

print()
print(f"MAE: {mae}")
print(f"MSE: {mse}")
print(f"RMSE: {rmse}")
print(f"R2: {r2}")

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000355 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1843
[LightGBM] [Info] Number of data points in the train set: 16512, number of used features: 9
[LightGBM] [Info] Start training from score 206495.116824
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000277 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1843
[LightGBM] [Info] Number of data points in the train set: 13209, number of used features: 9
[LightGBM] [Info] Start training from score 206578.516845
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000457 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1842
[LightGBM] [Info] Number of data points in the train set: 13209, number of used features: 9
[LightGBM] [Info] St