# House price prediction [🔗](https://www.kaggle.com/competitions/home-data-for-ml-course/data)
by kaggle

### Importing required libraries

In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from xgboost import XGBRegressor
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder

# Approach:
1. Load data
2. Select useful features
3. Extract numerical columns and categorical columns, for `OrdinalEncoding` and `OneHotEncoding`
4. Make pipeline using `XGBRegressor` model
5. Use cross-validation (10 folds)
6. Check MAE    (if more, then repeat step 2)
7. Do hyperparameter tuning with default parameters
8. Find best parameters
9. Train model with best parameters
10. Predict price on `test_data`
11. Submit predictions

## 1. Loading data

In [3]:
house_data = pd.read_csv("../input/home-data-for-ml-course/train.csv", index_col="Id")
test_data = pd.read_csv("../input/home-data-for-ml-course/test.csv", index_col="Id")

pd.set_option("display.max_columns", house_data.shape[1])
house_data.head()

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1
1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,2003,2003,Gable,CompShg,VinylSd,VinylSd,BrkFace,196.0,Gd,TA,PConc,Gd,TA,No,GLQ,706,Unf,0,150,856,GasA,Ex,Y,SBrkr,856,854,0,1710,1,0,2,1,3,1,Gd,8,Typ,0,,Attchd,2003.0,RFn,2,548,TA,TA,Y,0,61,0,0,0,0,,,,0,2,2008,WD,Normal,208500
2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,Gtl,Veenker,Feedr,Norm,1Fam,1Story,6,8,1976,1976,Gable,CompShg,MetalSd,MetalSd,,0.0,TA,TA,CBlock,Gd,TA,Gd,ALQ,978,Unf,0,284,1262,GasA,Ex,Y,SBrkr,1262,0,0,1262,0,1,2,0,3,1,TA,6,Typ,1,TA,Attchd,1976.0,RFn,2,460,TA,TA,Y,298,0,0,0,0,0,,,,0,5,2007,WD,Normal,181500
3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,2001,2002,Gable,CompShg,VinylSd,VinylSd,BrkFace,162.0,Gd,TA,PConc,Gd,TA,Mn,GLQ,486,Unf,0,434,920,GasA,Ex,Y,SBrkr,920,866,0,1786,1,0,2,1,3,1,Gd,6,Typ,1,TA,Attchd,2001.0,RFn,2,608,TA,TA,Y,0,42,0,0,0,0,,,,0,9,2008,WD,Normal,223500
4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,Norm,Norm,1Fam,2Story,7,5,1915,1970,Gable,CompShg,Wd Sdng,Wd Shng,,0.0,TA,TA,BrkTil,TA,Gd,No,ALQ,216,Unf,0,540,756,GasA,Gd,Y,SBrkr,961,756,0,1717,1,0,1,0,3,1,Gd,7,Typ,1,Gd,Detchd,1998.0,Unf,3,642,TA,TA,Y,0,35,272,0,0,0,,,,0,2,2006,WD,Abnorml,140000
5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,Norm,Norm,1Fam,2Story,8,5,2000,2000,Gable,CompShg,VinylSd,VinylSd,BrkFace,350.0,Gd,TA,PConc,Gd,TA,Av,GLQ,655,Unf,0,490,1145,GasA,Ex,Y,SBrkr,1145,1053,0,2198,1,0,2,1,4,1,Gd,9,Typ,1,TA,Attchd,2000.0,RFn,3,836,TA,TA,Y,192,84,0,0,0,0,,,,0,12,2008,WD,Normal,250000


## 2. Selecting useful features

In [4]:
useful_features = ['MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street', 'LotShape', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'RoofMatl', 
                   'BldgType', 'HouseStyle', 'OverallQual', 'OverallCond', 'Condition1', 'YearBuilt', 'RoofStyle', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'BsmtQual', 
                   'ExterQual', 'ExterCond', 'Foundation', 'TotalBsmtSF', 'Heating', 'CentralAir', '1stFlrSF', '2ndFlrSF', 'LandContour', 'GrLivArea', 'GarageType',
                   'FullBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd', 'Fireplaces', 'FireplaceQu', 'GarageCars', 'GarageArea',
                   'PavedDrive', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', 'PoolArea', 'MiscFeature', 'MiscVal', 'BsmtFinType1']

house_data = house_data[useful_features + ["SalePrice"]]
test_data = test_data[useful_features]
X = house_data[useful_features]
y = house_data.SalePrice
len(useful_features)

50

In [5]:
X.head()

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,Utilities,LotConfig,LandSlope,Neighborhood,RoofMatl,BldgType,HouseStyle,OverallQual,OverallCond,Condition1,YearBuilt,RoofStyle,Exterior1st,Exterior2nd,MasVnrType,BsmtQual,ExterQual,ExterCond,Foundation,TotalBsmtSF,Heating,CentralAir,1stFlrSF,2ndFlrSF,LandContour,GrLivArea,GarageType,FullBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Fireplaces,FireplaceQu,GarageCars,GarageArea,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,PoolArea,MiscFeature,MiscVal,BsmtFinType1
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1
1,60,RL,65.0,8450,Pave,Reg,AllPub,Inside,Gtl,CollgCr,CompShg,1Fam,2Story,7,5,Norm,2003,Gable,VinylSd,VinylSd,BrkFace,Gd,Gd,TA,PConc,856,GasA,Y,856,854,Lvl,1710,Attchd,2,3,1,Gd,8,0,,2,548,Y,0,61,0,0,,0,GLQ
2,20,RL,80.0,9600,Pave,Reg,AllPub,FR2,Gtl,Veenker,CompShg,1Fam,1Story,6,8,Feedr,1976,Gable,MetalSd,MetalSd,,Gd,TA,TA,CBlock,1262,GasA,Y,1262,0,Lvl,1262,Attchd,2,3,1,TA,6,1,TA,2,460,Y,298,0,0,0,,0,ALQ
3,60,RL,68.0,11250,Pave,IR1,AllPub,Inside,Gtl,CollgCr,CompShg,1Fam,2Story,7,5,Norm,2001,Gable,VinylSd,VinylSd,BrkFace,Gd,Gd,TA,PConc,920,GasA,Y,920,866,Lvl,1786,Attchd,2,3,1,Gd,6,1,TA,2,608,Y,0,42,0,0,,0,GLQ
4,70,RL,60.0,9550,Pave,IR1,AllPub,Corner,Gtl,Crawfor,CompShg,1Fam,2Story,7,5,Norm,1915,Gable,Wd Sdng,Wd Shng,,TA,TA,TA,BrkTil,756,GasA,Y,961,756,Lvl,1717,Detchd,1,3,1,Gd,7,1,Gd,3,642,Y,0,35,272,0,,0,ALQ
5,60,RL,84.0,14260,Pave,IR1,AllPub,FR2,Gtl,NoRidge,CompShg,1Fam,2Story,8,5,Norm,2000,Gable,VinylSd,VinylSd,BrkFace,Gd,Gd,TA,PConc,1145,GasA,Y,1145,1053,Lvl,2198,Attchd,2,4,1,Gd,9,1,TA,3,836,Y,192,84,0,0,,0,GLQ


## 3. Extracting numerical columns and categorical columns, for `Imputing`, `Ordinal` and `OneHot Encoding`

In [6]:
num_cols = X.select_dtypes(exclude="object").columns
cat_cols = X.select_dtypes(exclude=(int, float)).columns
oh_cols = ["Street", "LotShape", "LotConfig", "BldgType", "RoofStyle", "Foundation", "Heating", "MiscFeature"]
ord_cols = ["MSZoning", "Utilities", "LandSlope", "Neighborhood", "Condition1", "HouseStyle", "RoofMatl", "Exterior1st", "Exterior2nd", "MasVnrType", 
            "ExterCond", "ExterQual", "BsmtQual",  "CentralAir", "KitchenQual", "FireplaceQu", "GarageType", "PavedDrive", "BsmtFinType1"]

## 4. Making Pipeline (using `XGBRegressor`)

In [7]:
class CreatePipeline:
    
    def create(self, *, preprocessor, model, verbose=False):
        steps = [("preprocessor", preprocessor),
                 ("model", model)]
        return Pipeline(steps=steps, verbose=verbose)
    
    
    def numerical_transformer(self, *, strategy="mean", **params):
        """Transform numerical columns using `SimpleImputer`.
        params:
            strategy: "mean" | "median" | "most_frequent" | "constant"
            **params: extra keyword args for SimpleImputer"""
        
        transformer = SimpleImputer(strategy=strategy, **params)
        return transformer
    
    
    def categorical_transformer(self, *, imp_strategy="most_frequent", encoder_type="Ordinal", imp_params={}, encoder_params={}):
        """Transform categorical columns by making Pipeline
        `SimpleImputer` | `OneHotEncoder` | `OrdinalEncoder`.
        args:
            imp_strategy: strategy for imputer values can be
                "most_frequent" | "constant"
            encoder_type: encoder type,
                "Ordinal" | "OneHot"
        kwargs:
            imp_params: keyword args for `SimpleImputer`.
            encoder_params: keyword args for encoder.`
        """
        if not encoder_type in ("Ordinal", "OneHot"):
            raise ValueError(f"Inappropriate value for encoder_type passed: {encoder_type}\
            Takes one of 'Ordinal' | 'OneHot'.")
        
        encoder = OrdinalEncoder if encoder_type=="Ordinal" else OneHotEncoder
        transformer = Pipeline(steps=[
            ("imputer", SimpleImputer(strategy=imp_strategy, **imp_params)),
            (encoder_type, encoder(**encoder_params))
        ])
        return transformer
    
    def data_preprocessor(self, *, transformers):
        """Preprocess the data using `ColumnTransformer`.
        Pass extact list of transformers
        to be passed in `ColumnTransformer`.
        each tuple consist of: (transformer_name,
                                transformer,
                                list_of_columns)."""
        preprocessor = ColumnTransformer(transformers=transformers)
        return preprocessor
    
    def XGBRegressor_model(self, *, random_state=0, n_estimators=1000, **kwargs):
        """Create `XGBRegressor` model.
        **kwargs: keyword args for XGBRegressor."""
        model = XGBRegressor(random_state=random_state, n_estimators=n_estimators, **kwargs)
        return model

In [8]:
cp = CreatePipeline()

num_transformer = cp.numerical_transformer()
cat_ord_transformer = cp.categorical_transformer(encoder_params={"handle_unknown":"use_encoded_value", "unknown_value":-1})
cat_oh_transformer = cp.categorical_transformer(encoder_type="OneHot", encoder_params={"handle_unknown":"ignore"})

preprocessor = cp.data_preprocessor(
    transformers=[("num", num_transformer, num_cols),
                  ("cat_ord", cat_ord_transformer, ord_cols),
                  ("cat_oh", cat_oh_transformer, oh_cols)
                 ])

my_model = cp.XGBRegressor_model(learning_rate=0.05)

pipeline_1 = cp.create(preprocessor=preprocessor, model=my_model)

## 5. Doing cross validation (10 folds)

In [9]:
scores = -1 * cross_val_score(pipeline_1, X, y, cv=10, verbose=True,
                             scoring="neg_mean_absolute_error")
scores

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:  1.1min finished


array([15195.98721104, 17015.75305009, 15575.79232663, 18267.4460884 ,
       19745.83128211, 15499.66663099, 15307.10686002, 14926.47241545,
       17860.43771404, 16516.01452804])

In [10]:
scores.mean()

16591.05081068065

## 6. Doing hyperparameter tuning with defualt parameters

In [11]:
n_estimators = [100, 500, 800, 1000]
learning_rates = [0.5, 0.1, 0.08, 0.05]
maes = {}
for i, (n_estimator, learning_rate) in enumerate(zip(n_estimators, learning_rates), start=1):
    model = cp.XGBRegressor_model(n_estimators=n_estimator, learning_rate=learning_rate)
    pipeline = cp.create(preprocessor=preprocessor, model=model)
    mae = -1 * cross_val_score(pipeline, X, y, cv=10, verbose=True,
                             scoring="neg_mean_absolute_error")
    maes[i] = (n_estimator, learning_rate, mae.mean())


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    7.1s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:   33.4s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:   54.1s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:  1.1min finished


In [12]:
for key in maes:
    n_estimator, learning_rate, mae = maes[key]
    print(f"{key}. N_estimators={n_estimator}, Learning_rate: {learning_rate}, Score: {mae}")

1. N_estimators=100, Learning_rate: 0.5, Score: 19268.648151220037
2. N_estimators=500, Learning_rate: 0.1, Score: 16495.916165453767
3. N_estimators=800, Learning_rate: 0.08, Score: 16496.006555008564
4. N_estimators=1000, Learning_rate: 0.05, Score: 16591.05081068065


## 8. Best parameter: `n_estimator=1000` and `learning_rate=0.08`
1. N_estimators=100, Learning_rate: 0.5, Score: 19268.648151220037
2. N_estimators=500, Learning_rate: 0.1, Score: 16495.916165453767
3. N_estimators=800, Learning_rate: 0.08, Score: 16496.006555008564
4. N_estimators=1000, Learning_rate: 0.05, Score: 16591.05081068065

In [13]:
best_n_estimators = 500
best_learning_rate = 0.1

## 9. Training model with best parameters

In [14]:
model = cp.XGBRegressor_model(n_estimators=best_n_estimators, learning_rate=best_learning_rate)
final_model = cp.create(preprocessor=preprocessor, model=model)
final_model.fit(X, y)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num', SimpleImputer(),
                                                  Index(['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond',
       'YearBuilt', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'GrLivArea',
       'FullBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd',
       'Fireplaces', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF',
       'Enclosed...
                              enable_categorical=False, eval_metric=None,
                              gamma=0, gpu_id=-1, grow_policy='depthwise',
                              importance_type=None, interaction_constraints='',
                              learning_rate=0.1, max_bin=256,
                              max_cat_to_onehot=4, max_delta_step=0,
                              max_depth=6, max_leaves=0, min_child_weight=1,
                              missing=nan, monotone_constraints='()',
                      

## 10. Predicting price on `test_data`

In [15]:
test_preds = final_model.predict(test_data)
test_preds

array([121662.85, 171334.02, 181721.36, ..., 173966.62, 111360.09,
       209294.9 ], dtype=float32)

## 11. Submitting Predictions

In [16]:
output = pd.DataFrame({"Id" : test_data.index, 
                       "SalePrice" : test_preds})
output.to_csv("./submission_7.csv", index=False)