### Stacking algorithm -> classification models

In [12]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import xgboost as xgb
import catboost as cb
import lightgbm as lgb
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import seaborn as sns
from imblearn.under_sampling import TomekLinks
from sklearn.ensemble import IsolationForest
from scipy import stats
from sklearn.preprocessing import label_binarize

In [13]:
df = pd.read_csv("housing_classification_processed.csv")

In [14]:
# X/y split etc.
target = "grade_category"

# catboost expects data either in int or str, we convert it to integer
df['floors'] = df['floors'].astype('int64')

# we loose some decimal bathrooms but it should not be a problem
df['bathrooms'] = df['bathrooms'].astype('int64')

# categorical features need to be separated for certain algorithms, like CatBoost
categorical_features = ['bedrooms', 'floors', 'view']

X = df.drop(columns=[target])
y = df[target]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# SCALING => some of the algorithms require this
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

### Stacking Classifier -> we use XGBoost + RF + LGBM again

In [15]:
from sklearn.ensemble import StackingClassifier

# 'Random Forest': RandomForestRegressor(),
# 'XGBoost': xgb.XGBRegressor(enable_categorical=True, objective='reg:squarederror'),
# 'LightGBM': lgb.LGBMRegressor(objective='regression')

# Best Parameters: {'learning_rate': 0.06142857142857143, 'iterations': 600.0, 'depth': 5}

# meta model is the judge or optimizer of the stacking algorithm
# => it assigns weights to the models above based on their performance
#meta_model = LogisticRegression()
meta_model = RandomForestClassifier()

# stacking regression algorithm
stacking_model = StackingClassifier(
    estimators=[('xgb', xgb.XGBClassifier(enable_categorical=True, objective='multi:softprobar', num_class=len(X.columns))),
                ('CatBoost-optimized', cb.CatBoostClassifier(iterations=600, learning_rate=0.06142857142857143, depth=5, verbose=0)),
                ('randomforest', RandomForestClassifier(n_estimators=100)),
                ('lgbm', lgb.LGBMClassifier(objective='multiclass'))
                ],
    final_estimator = meta_model,
    cv=5
)

# fit the model
stacking_model.fit(X_train, y_train)

# test predictions and metrics
predictions = stacking_model.predict(X_test)

accuracy = stacking_model.score(X_test, y_test)
print()
print(f"Accuracy {accuracy}")


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000763 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 445
[LightGBM] [Info] Number of data points in the train set: 13484, number of used features: 6
[LightGBM] [Info] Start training from score -2.226498
[LightGBM] [Info] Start training from score -0.821648
[LightGBM] [Info] Start training from score -0.940993
[LightGBM] [Info] Start training from score -2.778241
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000564 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 437
[LightGBM] [Info] Number of data points in the train set: 10787, number of used features: 6
[LightGBM] [Info] Start training from score -2.226479
[LightGBM] [Info] Start trai

More Anayalsis


In [None]:

# we used LogisticRegression as meta model and XGBClassifier, CatBoostClassifier, RandomForestClassifier and LGBMClassifier were stacking in model list
# According to Accuracy value, stacking is slightly better than benchmarking

In [None]:
# tried Random Forest as meta model with the same stacked models list
# Accuracy 0.7019572953736655 is almost the same

In [None]:
# According to accuracy I also saw Catboot optimized algo also good one, hut Random would be the best option
# Because accuracy is better then this as compared with other model by using Staking approach.