## MMA 860 Team Project: Predicting Housing Prices

Team Istanbul

In [90]:
# %pip install openpyxl
# %matplotlib inline
# %pip install statsmodels
# %pip install scikit-learn seaborn
# %pip install jupyter_contrib_nbextensions
# %pip install --upgrade scikit-learn
# %pip install lightgbm xgboost scikit-learn --quiet
# %pip install missingno

In [91]:
import os

import pandas as pd
import numpy as np

import missingno as msno

import statsmodels.imputation.mice as mice
import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.stats.diagnostic import het_breuschpagan

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, KFold
from sklearn.linear_model import Ridge, Lasso
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn import linear_model
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.impute import SimpleImputer
from sklearn.ensemble import StackingRegressor

from patsy import dmatrices
import scipy.stats as stats

import matplotlib.pyplot as plt
import seaborn as sns

from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

In [92]:
# Load the train and test CSVs into DataFrames
file_path_test  = "test.csv"
file_path_train = "train.csv"

test  = pd.read_csv(file_path_test)
train = pd.read_csv(file_path_train)

# Stack train on top of test so both receive identical preprocessing.
# ignore_index=True gives the combined frame a fresh 0..n-1 index; without it
# each input keeps its own index and the result carries duplicate row labels,
# which makes any later label-based selection or alignment ambiguous.
# The train rows still occupy positions (and labels) 0..len(train)-1.
data = pd.concat([train, test], sort=False, ignore_index=True)

### Data Cleaning and EDA

In [93]:
# Categorical (object-dtype) columns: replace missing values with the most
# frequent level, then map each level to an integer code.
cat_cols = data.select_dtypes(include='object').columns
for cat_col in cat_cols:
    most_frequent = data[cat_col].mode()[0]
    filled_as_text = data[cat_col].fillna(most_frequent).astype(str)
    data[cat_col] = LabelEncoder().fit_transform(filled_as_text)

# Numeric (int64 / float64) columns: replace missing values with the column median.
# NOTE(review): this runs on the combined train+test frame, so it presumably also
# fills SalePrice for the test rows - confirm that column is never read from them.
num_cols = data.select_dtypes(include=['int64', 'float64']).columns
for num_col in num_cols:
    col_median = data[num_col].median()
    data[num_col] = data[num_col].fillna(col_median)

### Feature Engineering

In [94]:
# Total square footage: basement + first floor + second floor
data['TotalSF'] = (
    data['TotalBsmtSF']
    .add(data['1stFlrSF'])
    .add(data['2ndFlrSF'])
)

# Combined bathroom count (above grade + basement); half baths count as 0.5
data['TotalBath'] = (
    data['FullBath']
    .add(data['HalfBath'].mul(0.5))
    .add(data['BsmtFullBath'])
    .add(data['BsmtHalfBath'].mul(0.5))
)

# Combined porch area across the four porch columns
data['TotalPorchSF'] = (
    data['OpenPorchSF']
    .add(data['EnclosedPorch'])
    .add(data['3SsnPorch'])
    .add(data['ScreenPorch'])
)

# Binary presence flags: 1 when the source quantity is greater than zero, else 0
presence_flags = {
    'HasPool': 'PoolArea',
    'HasGarage': 'GarageArea',
    'HasFireplace': 'Fireplaces',
}
for flag_col, source_col in presence_flags.items():
    data[flag_col] = data[source_col].gt(0).astype(int)


### Regression Modelling

In [95]:
# Undo the earlier concat: the first len(train) rows are the training set,
# everything after them is the test set.
n_train_rows = len(train)
train_clean = data.iloc[:n_train_rows].copy()
test_clean = data.iloc[n_train_rows:].copy()

# Put back the target as read from train.csv (assignment aligns on the row index)
train_clean['SalePrice'] = train['SalePrice']

In [96]:
# Target: log(1 + SalePrice), to reduce skewness in the sale prices
y = np.log1p(train_clean['SalePrice'])

# Features: every column except the row identifier and the target
non_feature_cols = ['Id', 'SalePrice']
X = train_clean.drop(columns=non_feature_cols)
X_test = test_clean.drop(columns=non_feature_cols)

In [97]:
# Defining base models
# Using the pipeline method to chain steps, so each linear model receives
# features that RobustScaler has put on a similar scale.
ridge = make_pipeline(RobustScaler(), Ridge(alpha=15))
lasso = make_pipeline(RobustScaler(), Lasso(alpha=0.0005))

# Using XGBoost & LightGBM models to handle complex feature and target relationships
xgb = XGBRegressor(n_estimators=1000, learning_rate=0.05, max_depth=3,
                   subsample=0.7, colsample_bytree=0.7, random_state=42)
lgbm = LGBMRegressor(objective='regression', num_leaves=5, learning_rate=0.05,
                     n_estimators=1000, random_state=42)

In [98]:
# Stack the four base learners; a Ridge meta-learner combines their predictions
base_learners = [
    ('ridge', ridge),
    ('lasso', lasso),
    ('xgb', xgb),
    ('lgbm', lgbm),
]
meta_learner = Ridge(alpha=10)

stacked_model = StackingRegressor(estimators=base_learners,
                                  final_estimator=meta_learner)

### Model Evaluation

In [100]:
# Calculating Root Mean Square Error
def rmse_cv(model):
    """Return the mean cross-validated RMSE of ``model``.

    Scores ``model`` on the notebook-level ``X`` and ``y`` using 10 shuffled
    folds with a fixed seed, so repeated calls use the same splits.

    Parameters
    ----------
    model : estimator
        Any estimator accepted by ``cross_val_score``.

    Returns
    -------
    float
        Mean of the per-fold RMSE values, in the units of ``y``.
    """
    kf = KFold(n_splits=10, shuffle=True, random_state=42)
    # The scorer returns negated RMSE (higher is better), so flip the sign back.
    rmse = -cross_val_score(model, X, y, scoring="neg_root_mean_squared_error", cv=kf)
    return rmse.mean()

# "\n" is the newline escape; the previous "/n" printed the two characters literally.
print(f"Stacked Model CV RMSE: {rmse_cv(stacked_model):.5f}\n")

### Predict Values for Competition

In [101]:
# Train the stacked ensemble on the full training set
stacked_model.fit(X, y)

# The model was fitted on log1p(SalePrice), so invert with expm1 to get prices
log_price_preds = stacked_model.predict(X_test)
final_preds = np.expm1(log_price_preds)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000594 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3889
[LightGBM] [Info] Number of data points in the train set: 1460, number of used features: 78
[LightGBM] [Info] Start training from score 12.024057
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000534 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3596
[LightGBM] [Info] Number of data points in the train set: 1168, number of used features: 76
[LightGBM] [Info] Start training from score 12.021409
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000397 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enoug

In [102]:
# Export a CSV with predictions for the competition submission:
# the test-set Id column next to the predicted SalePrice.
submission = test[['Id']].assign(SalePrice=final_preds)
submission.to_csv('submission.csv', index=False)
print("Submission file saved: submission.csv")

Submission file saved: submission.csv
