# House Price Prediction

In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer
import scipy
from scipy.special import boxcox1p
from scipy.stats import boxcox
from scipy.special import inv_boxcox
from sklearn.preprocessing import StandardScaler
from pycaret.regression import setup, compare_models

from sklearn.linear_model import BayesianRidge, OrthogonalMatchingPursuit
from sklearn.ensemble import GradientBoostingRegressor
from lightgbm import LGBMRegressor


import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Read the training set, the test set and the sample submission (in that order)
# from CSV files located in the current working directory.
train_df, test_df, sample = (
    pd.read_csv(csv_name)
    for csv_name in ("train.csv", "test.csv", "sample_submission.csv")
)

In [3]:
# Keep the test-set ids aside; they are joined back to the predictions when the
# submission frame is built.
test_ids = test_df.loc[:, "Id"]

In [4]:
# Save the training ids and the target before both columns are removed from the
# feature frame.
train_ids = train_df["Id"]
target = train_df["SalePrice"]

# Stack train on top of test so every preprocessing step sees both sets.
# ignore_index=True yields a fresh 0..n-1 index directly, so no in-place
# reset_index (and no helper "index" column to drop afterwards) is needed.
# The first len(train_df) rows are the training rows; the split cell relies on this.
df = pd.concat([train_df, test_df], axis=0, ignore_index=True).drop(
    columns=["SalePrice", "Id"]
)

## Proper DTypes

In [5]:
# Cast MSSubClass to str so it is picked up with the object columns
# (select_dtypes("object") / get_dummies) rather than with the numeric ones.
df = df.astype({"MSSubClass": str})

## Handle Missing Values

In [6]:
# Names of the object-dtype columns that contain at least one missing value.
df.select_dtypes("object").isna().any().loc[lambda has_na: has_na].index

Index(['MSZoning', 'Alley', 'Utilities', 'Exterior1st', 'Exterior2nd',
       'MasVnrType', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1',
       'BsmtFinType2', 'Electrical', 'KitchenQual', 'Functional',
       'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
       'PoolQC', 'Fence', 'MiscFeature', 'SaleType'],
      dtype='object')

#### Impute Categorical Columns

In [7]:
# NaN means something for these columns: replace it with the literal "None".
none_fill_cols = [
    "Alley", "BsmtQual", "BsmtCond", "BsmtExposure", "BsmtFinType1", "BsmtFinType2",
    "FireplaceQu", "GarageType", "GarageFinish", "GarageQual", "GarageCond",
    "PoolQC", "Fence", "MiscFeature",
]
df[none_fill_cols] = df[none_fill_cols].fillna("None")

# NaN doesn't mean anything for these columns: replace it with the column's
# most frequent value (first row of DataFrame.mode()).
mode_fill_cols = [
    "MSZoning", "Utilities", "Exterior1st", "Exterior2nd", "Electrical",
    "KitchenQual", "Functional", "SaleType", "MasVnrType",
]
df[mode_fill_cols] = df[mode_fill_cols].fillna(df[mode_fill_cols].mode().iloc[0])

In [8]:
# Map every column that still has missing values to [missing count, dtype].
na_counts = df.isna().sum()
missing_columns = {
    name: [na_counts[name], df[name].dtype]
    for name in df.columns
    if na_counts[name] > 0
}

In [9]:
# Names of the numeric columns that contain at least one missing value.
df.select_dtypes(np.number).isna().any().loc[lambda has_na: has_na].index

Index(['LotFrontage', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF',
       'TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath', 'GarageYrBlt',
       'GarageCars', 'GarageArea'],
      dtype='object')

#### Impute Numerical Columns

In [10]:
# Numeric slice of the feature frame, plus the KNN imputer (5 neighbours) that
# will be fitted on it in the next cell.
numeric_df = df.select_dtypes(include=np.number)
imputer = KNNImputer(n_neighbors=5)

In [11]:
# Fit the imputer on the numeric columns and wrap the resulting array back into
# a DataFrame that carries the original row index and column names.
imputed_df = pd.DataFrame(
    imputer.fit_transform(numeric_df),
    index=numeric_df.index,
    columns=numeric_df.columns,
)

In [12]:
# Replace each numeric column of the feature frame with its imputed counterpart.
imputed_cols = numeric_df.columns
df = df.assign(**{name: imputed_df[name] for name in imputed_cols})

In [13]:
# Sanity check: total number of missing values left in the whole feature frame
# (per-column counts summed again over all columns).
df.isna().sum().sum()

0

## Feature Transformation

#### Skewed Numeric Features Transformation

In [14]:
# Summary statistics restricted to the numeric columns.
df.describe(include=[np.number])

Unnamed: 0,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold
count,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,...,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0
mean,69.994519,10168.11408,6.089072,5.564577,1971.312778,1984.264474,102.456389,441.30024,49.576841,560.845152,...,472.897568,93.709832,47.486811,23.098321,2.602261,16.06235,2.251799,50.825968,6.213087,2007.792737
std,22.58823,7886.996359,1.409947,1.113131,30.291442,20.894344,178.918648,455.581216,169.176867,439.486058,...,215.361488,126.526589,67.575493,64.244246,25.188169,56.184365,35.663946,567.402211,2.714762,1.314964
min,21.0,1300.0,1.0,1.0,1872.0,1950.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0
25%,60.0,7478.0,5.0,5.0,1953.5,1965.0,0.0,0.0,0.0,220.0,...,320.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,2007.0
50%,70.0,9453.0,6.0,5.0,1973.0,1993.0,0.0,368.0,0.0,467.0,...,480.0,0.0,26.0,0.0,0.0,0.0,0.0,0.0,6.0,2008.0
75%,80.0,11570.0,7.0,6.0,2001.0,2004.0,165.0,733.0,0.0,805.0,...,576.0,168.0,70.0,0.0,0.0,0.0,0.0,0.0,8.0,2009.0
max,313.0,215245.0,10.0,9.0,2010.0,2010.0,1600.0,5644.0,1526.0,2336.0,...,1488.0,1424.0,742.0,1012.0,508.0,576.0,800.0,17000.0,12.0,2010.0


In [15]:
# One row per numeric feature: its absolute skewness and whether that exceeds 0.4.
numeric_features = df.select_dtypes(np.number)
skew_df = pd.DataFrame({"Feature": numeric_features.columns})
skew_df["Skew"] = [
    abs(scipy.stats.skew(numeric_features[name])) for name in skew_df["Feature"]
]
skew_df["Skewed"] = skew_df["Skew"].gt(0.4)

In [16]:
# Display the skewness table (columns: Feature, Skew, Skewed).
skew_df

Unnamed: 0,Feature,Skew,Skewed
0,LotFrontage,1.353444,True
1,LotArea,12.822431,True
2,OverallQual,0.19711,False
3,OverallCond,0.570312,True
4,YearBuilt,0.599806,True
5,YearRemodAdd,0.45102,True
6,MasVnrArea,2.597595,True
7,BsmtFinSF1,1.425421,True
8,BsmtFinSF2,4.146111,True
9,BsmtUnfSF,0.918938,True


In [17]:
# Fixed Box-Cox(1 + x) power applied to the skewed features. It deliberately has
# its own name: the target-transformation cell assigns the fitted target lambda
# to `lam`, and the inverse transform of the predictions reads `lam`. Sharing
# the name meant that re-running this cell would silently overwrite the fitted
# target lambda.
feature_boxcox_lam = 0.15

# Features flagged as skewed in skew_df (boolean mask, no `== True` comparison).
skewed_cols = skew_df.loc[skew_df["Skewed"], "Feature"].values

# NOTE: this overwrites df's columns, so running the cell twice applies the
# transform twice; re-run from the imputation step if it needs to be repeated.
for col in skewed_cols:
    df[col] = boxcox1p(df[col], feature_boxcox_lam)

#### Categorical Features Transformation

In [18]:
# One-hot encode the frame with pandas' defaults; the scaled frame shown further
# down contains the resulting indicator columns (e.g. SaleType_WD).
df = pd.get_dummies(df)

#### Target Transformation

In [19]:
# Box-Cox transform the sale price. `lam` is the lambda returned by scipy and is
# needed later to invert the transform on the predictions.
target_values, lam = boxcox(target)
boxcox_target = pd.Series(target_values, name="SalePrice")

#### Scale Data

In [20]:
# Standardise every column of the feature frame (train and test rows together),
# keeping the original index and column labels.
scaler = StandardScaler().fit(df)
df = pd.DataFrame(scaler.transform(df), index=df.index, columns=df.columns)
df.head()

Unnamed: 0,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,-0.08518,-0.14236,0.646183,-0.449532,1.041483,0.895888,1.165548,0.800175,-0.358469,-0.548233,...,-0.052423,-0.298629,-0.049029,0.394439,-0.263861,-0.064249,-0.09105,-0.126535,0.463937,-0.302693
1,0.536508,0.110415,-0.063185,1.957283,0.160515,-0.391011,-0.789452,0.974354,-0.358469,-0.132447,...,-0.052423,-0.298629,-0.049029,0.394439,-0.263861,-0.064249,-0.09105,-0.126535,0.463937,-0.302693
2,0.048143,0.431443,0.646183,-0.449532,0.976573,0.848489,1.065461,0.610875,-0.358469,0.167138,...,-0.052423,-0.298629,-0.049029,0.394439,-0.263861,-0.064249,-0.09105,-0.126535,0.463937,-0.302693
3,-0.319292,0.099975,0.646183,-0.449532,-1.868103,-0.679016,-0.789452,0.234987,-0.358469,0.329226,...,-0.052423,-0.298629,-0.049029,0.394439,3.789876,-0.064249,-0.09105,-0.126535,-2.155466,-0.302693
4,0.685631,0.925808,1.355551,-0.449532,0.944098,0.75363,1.488837,0.7613,-0.358469,0.256492,...,-0.052423,-0.298629,-0.049029,0.394439,-0.263861,-0.064249,-0.09105,-0.126535,0.463937,-0.302693


## Split Data

In [21]:
# The first len(train_df) rows of the combined frame are the training rows,
# everything after that is the test set.
n_train = len(train_df)
train, test = df.iloc[:n_train].copy(), df.iloc[n_train:].copy()

## Compare Models

In [22]:
# Initialise the PyCaret regression experiment on the training features joined
# with the Box-Cox transformed target.
# session_id pins PyCaret's random seed so the internal train/test split and the
# model comparison are repeatable across runs; 1147 is the session id shown in
# this notebook's recorded output.
_ = setup(
    data=pd.concat([train, boxcox_target], axis=1),
    target="SalePrice",
    session_id=1147,
)

Unnamed: 0,Description,Value
0,Session id,1147
1,Target,SalePrice
2,Target type,Regression
3,Original data shape,"(1460, 318)"
4,Transformed data shape,"(1460, 318)"
5,Transformed train set shape,"(1021, 318)"
6,Transformed test set shape,"(439, 318)"
7,Numeric features,317
8,Preprocess,True
9,Imputation type,simple


In [23]:
# Run PyCaret's model comparison on the experiment configured by setup(); the
# table below is its output (one row per model with MAE, MSE, RMSE, R2, ...).
compare_models()

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
gbr,Gradient Boosting Regressor,0.0349,0.0027,0.051,0.8891,0.0058,0.0045,0.109
lightgbm,Light Gradient Boosting Machine,0.0367,0.0029,0.0537,0.8782,0.0061,0.0047,0.117
br,Bayesian Ridge,0.0345,0.0032,0.0552,0.8667,0.0063,0.0044,0.021
omp,Orthogonal Matching Pursuit,0.0355,0.0033,0.0557,0.8622,0.0063,0.0045,0.018
rf,Random Forest Regressor,0.0386,0.0034,0.0572,0.8609,0.0065,0.0049,0.278
et,Extra Trees Regressor,0.0395,0.0035,0.0584,0.8544,0.0067,0.0051,0.247
ridge,Ridge Regression,0.0362,0.0037,0.0594,0.8462,0.0067,0.0046,0.018
ada,AdaBoost Regressor,0.0525,0.005,0.0707,0.7887,0.008,0.0067,0.069
knn,K Neighbors Regressor,0.0571,0.0065,0.0802,0.7286,0.0091,0.0073,0.03
dt,Decision Tree Regressor,0.0583,0.0073,0.0847,0.6927,0.0097,0.0075,0.021


In [24]:
# Estimators blended for the final prediction, keyed by the short ids used in
# the prediction and weighting cells.
models = {
    # Hyperparameters are hard-coded; presumably from an earlier tuning run - TODO confirm.
    "br": BayesianRidge(
        alpha_1=0.010685598696613362,
        alpha_2=0.047812312259566246,
        lambda_1=0.04543456004541208,
        lambda_2=6.1054020906826185e-06,
    ),
    # random_state fixes the seed handed to the underlying trees, so repeated
    # runs of the notebook produce the same fitted model and predictions.
    "gbr": GradientBoostingRegressor(random_state=0),
    # The LightGBM bagging / feature-fraction seeds are already pinned here.
    "lgbm": LGBMRegressor(
        objective="regression",
        num_leaves=5,
        learning_rate=0.05,
        n_estimators=720,
        max_bin=55,
        bagging_fraction=0.8,
        bagging_freq=5,
        feature_fraction=0.2319,
        feature_fraction_seed=9,
        bagging_seed=9,
        min_data_in_leaf=6,
        min_sum_hessian_in_leaf=11,
    ),
    "omp": OrthogonalMatchingPursuit(),
}

In [25]:
# Fit every estimator on the full training set against the transformed target,
# reporting each one as it finishes.
for name in models:
    models[name].fit(train, boxcox_target)
    print(f"{name} Trained")

br Trained
gbr Trained
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001082 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1769
[LightGBM] [Info] Number of data points in the train set: 1460, number of used features: 261
[LightGBM] [Info] Start training from score 7.842252
lgbm Trained
omp Trained


In [26]:
# Predict the (Box-Cox scale) target on the test set with each fitted model.
br_predictions, omp_predictions, gbr_predictions, lgbm_predictions = (
    models[key].predict(test) for key in ("br", "omp", "gbr", "lgbm")
)



In [27]:
# Undo the target's Box-Cox transform on every model's predictions, using the
# lambda (`lam`) returned when the target was transformed.
(
    br_pred_transformed,
    omp_pred_transformed,
    gbr_pred_transformed,
    lgbm_pred_transformed,
) = (
    inv_boxcox(raw_predictions, lam)
    for raw_predictions in (
        br_predictions,
        omp_predictions,
        gbr_predictions,
        lgbm_predictions,
    )
)

In [28]:
# Weighted blend of the four models' price predictions (weights add up to 1).
weighted_predictions = (
    (0.3, br_pred_transformed),
    (0.3, omp_pred_transformed),
    (0.2, gbr_pred_transformed),
    (0.2, lgbm_pred_transformed),
)
final_predictions = sum(weight * preds for weight, preds in weighted_predictions)

In [29]:
# Pair each test id with its blended prediction (joined on the row index) and
# store the id as a nullable 32-bit integer.
prediction_column = pd.Series(final_predictions, name="SalePrice")
submission = pd.concat([test_ids, prediction_column], axis=1).astype({"Id": "Int32"})
submission

Unnamed: 0,Id,SalePrice
0,1461,123520.549213
1,1462,159145.344312
2,1463,185880.568079
3,1464,197728.654812
4,1465,195883.014634
...,...,...
1454,2915,85873.041328
1455,2916,84382.360956
1456,2917,168115.999744
1457,2918,120024.563200


In [31]:
# Write the submission file with a header row and without the row index.
# `header` takes a bool (or a list of column aliases); the previous `header=1`
# only worked because the integer is truthy.
submission.to_csv("submission001.csv", index=False, header=True)