# House price prediction [🔗](https://www.kaggle.com/competitions/home-data-for-ml-course/data)
by kaggle

## Importing required libraries

In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import mutual_info_regression
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from xgboost import XGBRegressor

%matplotlib inline

# Approach
1. Load data
2. Check Mutual Information
3. Select useful features
4. Extract numerical and categorical features
5. Create Pipeline
6. Do hyperparameter tuning with cross-validation
7. Check MAE, select best parameters
8. Training model on best parameters
9. Predict prices on `test_data`
10. Submit predictions

## 1. Loading data

In [3]:
# Read the training and test sets; both use the "Id" column as the row index.
house_data, test_data = (
    pd.read_csv(csv_path, index_col="Id")
    for csv_path in (
        "../input/home-data-for-ml-course/train.csv",
        "../input/home-data-for-ml-course/test.csv",
    )
)

# Widen the display limit so every column of the training frame is shown.
pd.set_option("display.max_columns", len(house_data.columns))
house_data.head()

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1
1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,2003,2003,Gable,CompShg,VinylSd,VinylSd,BrkFace,196.0,Gd,TA,PConc,Gd,TA,No,GLQ,706,Unf,0,150,856,GasA,Ex,Y,SBrkr,856,854,0,1710,1,0,2,1,3,1,Gd,8,Typ,0,,Attchd,2003.0,RFn,2,548,TA,TA,Y,0,61,0,0,0,0,,,,0,2,2008,WD,Normal,208500
2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,Gtl,Veenker,Feedr,Norm,1Fam,1Story,6,8,1976,1976,Gable,CompShg,MetalSd,MetalSd,,0.0,TA,TA,CBlock,Gd,TA,Gd,ALQ,978,Unf,0,284,1262,GasA,Ex,Y,SBrkr,1262,0,0,1262,0,1,2,0,3,1,TA,6,Typ,1,TA,Attchd,1976.0,RFn,2,460,TA,TA,Y,298,0,0,0,0,0,,,,0,5,2007,WD,Normal,181500
3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,2001,2002,Gable,CompShg,VinylSd,VinylSd,BrkFace,162.0,Gd,TA,PConc,Gd,TA,Mn,GLQ,486,Unf,0,434,920,GasA,Ex,Y,SBrkr,920,866,0,1786,1,0,2,1,3,1,Gd,6,Typ,1,TA,Attchd,2001.0,RFn,2,608,TA,TA,Y,0,42,0,0,0,0,,,,0,9,2008,WD,Normal,223500
4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,Norm,Norm,1Fam,2Story,7,5,1915,1970,Gable,CompShg,Wd Sdng,Wd Shng,,0.0,TA,TA,BrkTil,TA,Gd,No,ALQ,216,Unf,0,540,756,GasA,Gd,Y,SBrkr,961,756,0,1717,1,0,1,0,3,1,Gd,7,Typ,1,Gd,Detchd,1998.0,Unf,3,642,TA,TA,Y,0,35,272,0,0,0,,,,0,2,2006,WD,Abnorml,140000
5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,Norm,Norm,1Fam,2Story,8,5,2000,2000,Gable,CompShg,VinylSd,VinylSd,BrkFace,350.0,Gd,TA,PConc,Gd,TA,Av,GLQ,655,Unf,0,490,1145,GasA,Ex,Y,SBrkr,1145,1053,0,2198,1,0,2,1,4,1,Gd,9,Typ,1,TA,Attchd,2000.0,RFn,3,836,TA,TA,Y,192,84,0,0,0,0,,,,0,12,2008,WD,Normal,250000


## 2. Checking `MI` (Mutual Information)

In [4]:
def impute_data(data, strategy="most_frequent", num_strategy="mean"):
    """Return a copy of ``data`` with missing values filled by ``SimpleImputer``.

    Columns are split by dtype: object-dtype columns are treated as
    categorical and every other column as numerical.

    params:
        data: DataFrame to impute; it is copied, never modified in place.
        strategy: SimpleImputer strategy for the categorical columns.
        num_strategy: SimpleImputer strategy for the numerical columns.
    returns:
        A new DataFrame with the same index and columns as ``data``.
    """
    imputed = data.copy()
    cat_cols = imputed.select_dtypes("object").columns
    num_cols = imputed.select_dtypes(exclude="object").columns

    for cols, col_strategy in ((cat_cols, strategy), (num_cols, num_strategy)):
        # Skip empty groups: fitting an imputer on a frame with zero columns
        # raises, which would break all-numeric or all-categorical inputs.
        if len(cols) == 0:
            continue
        imputer = SimpleImputer(strategy=col_strategy)
        imputed[cols] = pd.DataFrame(
            imputer.fit_transform(imputed[cols]),
            index=imputed.index,
            columns=cols,
        )
    return imputed

In [5]:
def get_mi_score(X, y):
    """Rank the columns of ``X`` by their mutual information with ``y``.

    Missing values are imputed first, then object-dtype columns are replaced
    by integer codes so that ``mutual_info_regression`` receives numeric input.

    params:
        X: feature DataFrame (left unmodified; work happens on a copy).
        y: target values aligned with the rows of ``X``.
    returns:
        A Series named "MI Scores", indexed by column name and sorted from
        the highest score to the lowest.
    """
    # impute_data already returns a copy, so no extra copy is needed here.
    features = impute_data(X)

    # Converting values of discrete features to numerical values
    for col in features.select_dtypes(["object"]):
        features[col] = features[col].factorize()[0]

    # Flag integer-typed columns as discrete. Checking the dtype kind instead
    # of `dtypes == int` avoids depending on the platform's default integer
    # width, which can differ from the width of the factorize codes
    # (e.g. on Windows with NumPy < 2).
    discrete_features = np.array(
        [pd.api.types.is_integer_dtype(dtype) for dtype in features.dtypes],
        dtype=bool,
    )
    scores = mutual_info_regression(
        features, y, discrete_features=discrete_features, random_state=0
    )
    ranked = pd.Series(scores, name="MI Scores", index=features.columns)
    return ranked.sort_values(ascending=False)

In [6]:
# Every column except the last one is a feature; the target is "SalePrice".
X, y = house_data[house_data.columns[:-1]], house_data["SalePrice"]

# Score all features and show the 20 strongest.
mi_scores = get_mi_score(X, y)
mi_scores.head(20)

OverallQual     0.560282
Neighborhood    0.530819
GrLivArea       0.482077
YearBuilt       0.368010
TotalBsmtSF     0.366441
GarageArea      0.362350
GarageCars      0.361032
ExterQual       0.324114
BsmtQual        0.315680
KitchenQual     0.314715
1stFlrSF        0.309644
GarageYrBlt     0.297827
MSSubClass      0.275832
YearRemodAdd    0.257843
FullBath        0.256921
GarageFinish    0.249278
TotRmsAbvGrd    0.218532
Foundation      0.198219
2ndFlrSF        0.194843
LotFrontage     0.191295
Name: MI Scores, dtype: float64

## 3. Selecting useful features
- Keeping every feature, ordered by MI score (the table above shows only the top 20; no features are dropped in this step)

In [7]:
# mi_scores.index holds every feature name, ordered from the highest MI score
# to the lowest, so this reorders the columns without dropping any of them.
useful_features = mi_scores.index
y = house_data.loc[:, "SalePrice"]
X = X.loc[:, useful_features]
# Give the test set the same columns, in the same order, as the training set.
test_data = test_data.loc[:, useful_features]
useful_features

Index(['OverallQual', 'Neighborhood', 'GrLivArea', 'YearBuilt', 'TotalBsmtSF',
       'GarageArea', 'GarageCars', 'ExterQual', 'BsmtQual', 'KitchenQual',
       '1stFlrSF', 'GarageYrBlt', 'MSSubClass', 'YearRemodAdd', 'FullBath',
       'GarageFinish', 'TotRmsAbvGrd', 'Foundation', '2ndFlrSF', 'LotFrontage',
       'HeatingQC', 'OpenPorchSF', 'Exterior2nd', 'Fireplaces', 'LotArea',
       'BsmtFinSF1', 'BsmtFinType1', 'Exterior1st', 'MSZoning', 'OverallCond',
       'GarageType', 'BsmtUnfSF', 'WoodDeckSF', 'MasVnrType', 'LotShape',
       'HalfBath', 'MasVnrArea', 'SaleType', 'SaleCondition', 'HouseStyle',
       'FireplaceQu', 'BsmtExposure', 'BedroomAbvGr', 'CentralAir',
       'Electrical', 'PavedDrive', 'BldgType', 'BsmtFullBath', 'LandContour',
       'BsmtCond', 'KitchenAbvGr', 'EnclosedPorch', 'ScreenPorch',
       'BsmtHalfBath', 'GarageCond', 'Condition1', 'Fence', 'GarageQual',
       'Heating', 'RoofStyle', 'ExterCond', 'LotConfig', 'Alley',
       'BsmtFinType2', 'Functiona

In [8]:
# Count how many feature columns there are of each dtype.
X.dtypes.value_counts()

object     43
int64      33
float64     3
dtype: int64

## 4. Extracting numerical and categorical columns

In [9]:
# Object-dtype columns are treated as categorical, all the others as numerical.
cat_cols = X.select_dtypes(include="object").columns
num_cols = X.select_dtypes(exclude="object").columns

## 5. Creating Pipeline

In [10]:
class CreatePipeline:
    """Helpers for building a scikit-learn preprocessing + model ``Pipeline``.

    methods:
        pipeline: Create the final Pipeline (preprocessor followed by model).

        create_model: Instantiate the provided model class.

        numerical_transformer: Build the imputer for numerical cols.

        categorical_transformer: Build an imputer + encoder Pipeline for
        categorical cols (OneHotEncoding / OrdinalEncoding).

        data_preprocessor: Combine transformers using ColumnTransformer.
    """

    def pipeline(self, *, preprocessor, model, verbose=False):
        """Chain ``preprocessor`` and ``model`` into a two-step Pipeline.

        params:
            preprocessor: first step, registered under the name "preprocessor".
            model: final step, registered under the name "model".
            verbose: forwarded to ``Pipeline``.
        returns:
            The (unfitted) ``Pipeline``.
        """
        steps = [("preprocessor", preprocessor), ("model", model)]
        return Pipeline(steps=steps, verbose=verbose)

    def numerical_transformer(self, *, strategy="mean", **params):
        """Build the transformer for numerical columns, a `SimpleImputer`.

        params:
            strategy: "mean" | "median" | "most_frequent" | "constant"
            **params: extra keyword args for `SimpleImputer`.
        returns:
            The configured `SimpleImputer`.
        """
        return SimpleImputer(strategy=strategy, **params)

    def categorical_transformer(self, *,
                                imp_strategy="most_frequent",
                                encoder_type="Ordinal",
                                imp_params=None, encoder_params=None):
        """Build the transformer for categorical columns.

        The result is a Pipeline of a `SimpleImputer` followed by either an
        `OrdinalEncoder` or a `OneHotEncoder`.

        args:
            imp_strategy: strategy for the imputer, one of
                "most_frequent" | "constant"
            encoder_type: encoder type,
                "Ordinal" | "OneHot"
        kwargs:
            imp_params: keyword args for `SimpleImputer` (None means none).
            encoder_params: keyword args for the encoder (None means none).
        returns:
            A Pipeline whose steps are named "imputer" and ``encoder_type``.
        raises:
            ValueError: if ``encoder_type`` is not "Ordinal" or "OneHot".
        """
        if encoder_type not in ("Ordinal", "OneHot"):
            raise ValueError(
                f"Inappropriate value for encoder_type passed: {encoder_type}. "
                "Takes one of 'Ordinal' | 'OneHot'."
            )

        # None defaults avoid sharing one mutable dict between calls.
        imp_params = {} if imp_params is None else imp_params
        encoder_params = {} if encoder_params is None else encoder_params

        encoder = OrdinalEncoder if encoder_type == "Ordinal" else OneHotEncoder
        transformer = Pipeline(steps=[
            ("imputer", SimpleImputer(strategy=imp_strategy, **imp_params)),
            (encoder_type, encoder(**encoder_params))
        ])
        return transformer

    def data_preprocessor(self, *, transformers):
        """Preprocess the data using `ColumnTransformer`.

        params:
            transformers: the exact list of tuples to pass to
                `ColumnTransformer`; each tuple consists of
                (transformer_name, transformer, list_of_columns).
        returns:
            The configured `ColumnTransformer`.
        """
        return ColumnTransformer(transformers=transformers)

    def create_model(self, *, model, random_state=0, n_estimators=1000, **kwargs):
        """Instantiate ``model`` with the given hyperparameters.

        params:
            model: model class (or any callable) accepting ``random_state``
                and ``n_estimators`` keyword arguments.
            random_state: seed forwarded to the model.
            n_estimators: number of estimators forwarded to the model.
            **kwargs: extra keyword args for the model.
        returns:
            The instantiated model.
        """
        return model(random_state=random_state, n_estimators=n_estimators, **kwargs)

In [11]:
cp = CreatePipeline()

# Numerical columns: default imputer settings.
num_transformer = cp.numerical_transformer()

# Categorical columns: default imputer + ordinal encoder; categories not seen
# during fitting are encoded as -1.
cat_transformer = cp.categorical_transformer(
    encoder_params=dict(handle_unknown="use_encoded_value", unknown_value=-1)
)

# Route each group of columns to its own transformer.
preprocessor = cp.data_preprocessor(
    transformers=[
        ("num", num_transformer, num_cols),
        ("cat", cat_transformer, cat_cols),
    ]
)

## 6. Doing hyperparameter tuning with cross-validation

In [12]:
n_estimators = [500, 750, 1000]
learning_rate = [0.05, 0.1]

# Flatten the grid: n_estimators is the outer axis, learning_rate the inner one.
param_grid = [(n, rate) for n in n_estimators for rate in learning_rate]

# maes maps a 1-based run number to [n_estimators, learning_rate, mean MAE].
maes = {}
for i, (n, rate) in enumerate(param_grid, start=1):
    model = cp.create_model(model=XGBRegressor, n_estimators=n, learning_rate=rate)
    pipeline = cp.pipeline(preprocessor=preprocessor, model=model)
    # The scorer returns negated MAE, so flip the sign back before averaging.
    scores = -1 * cross_val_score(
        pipeline, X, y,
        cv=10,
        verbose=True,
        scoring="neg_mean_absolute_error",
    )
    mae = scores.mean()
    maes[i] = [n, rate, mae]

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:   37.7s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:   38.2s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:   54.6s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:   55.6s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:  1.2min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:  1.2min finished


## 7. Checking MAE, selecting best parameters
### Best parameters: `n_estimators=500`, `learning_rate=0.05`
1. N_estimators: 500	Learning_rate: 0.05	MAE: 15520
2. N_estimators: 500	Learning_rate: 0.1	MAE: 15772
3. N_estimators: 750	Learning_rate: 0.05	MAE: 15554
4. N_estimators: 750	Learning_rate: 0.1	MAE: 15781
5. N_estimators: 1000	Learning_rate: 0.05	MAE: 15574
6. N_estimators: 1000	Learning_rate: 0.1	MAE: 15781

In [13]:
# One line per grid run: run number, the two hyperparameters and the mean MAE.
for i, (estimators, rate, mae) in maes.items():
    print(f"{i}. N_estimators: {estimators}\tLearning_rate: {rate}\tMAE: {mae:.0f}")

1. N_estimators: 500	Learning_rate: 0.05	MAE: 15520
2. N_estimators: 500	Learning_rate: 0.1	MAE: 15772
3. N_estimators: 750	Learning_rate: 0.05	MAE: 15554
4. N_estimators: 750	Learning_rate: 0.1	MAE: 15781
5. N_estimators: 1000	Learning_rate: 0.05	MAE: 15574
6. N_estimators: 1000	Learning_rate: 0.1	MAE: 15781


In [14]:
# Take the grid entry with the lowest cross-validated MAE rather than copying
# the values by hand, so the choice stays correct if the grid or data changes.
# Each entry of maes is [n_estimators, learning_rate, mae].
best_n_estimators, best_learning_rate, best_mae = min(
    maes.values(), key=lambda run: run[2]
)

## 8. Training model on best parameters

In [15]:
# Rebuild the model with the selected hyperparameters.
model = cp.create_model(
    model=XGBRegressor,
    n_estimators=best_n_estimators,
    learning_rate=best_learning_rate,
)

# Fit preprocessing and model together on the full training set.
pipeline = cp.pipeline(preprocessor=preprocessor, model=model)
pipeline.fit(X, y)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num', SimpleImputer(),
                                                  Index(['OverallQual', 'GrLivArea', 'YearBuilt', 'TotalBsmtSF', 'GarageArea',
       'GarageCars', '1stFlrSF', 'GarageYrBlt', 'MSSubClass', 'YearRemodAdd',
       'FullBath', 'TotRmsAbvGrd', '2ndFlrSF', 'LotFrontage', 'OpenPorchSF',
       'Fireplaces', 'LotArea', 'BsmtFinSF1', 'OverallCond', 'BsmtUnfSF...
                              gamma=0, gpu_id=-1, grow_policy='depthwise',
                              importance_type=None, interaction_constraints='',
                              learning_rate=0.05, max_bin=256,
                              max_cat_to_onehot=4, max_delta_step=0,
                              max_depth=6, max_leaves=0, min_child_weight=1,
                              missing=nan, monotone_constraints='()',
                              n_estimators=500, n_jobs=0, num_parallel_tree=1,
                       

## 9. Predicting prices of `test_data`

In [16]:
# Run the fitted pipeline (preprocessing + model) on the test features.
test_preds = pipeline.predict(test_data)
test_preds

array([125650.336, 152583.88 , 186216.7  , ..., 159991.2  , 113692.56 ,
       219900.81 ], dtype=float32)

## 10. Submitting predictions

In [17]:
# Pair each test-set Id with its predicted price.
output = pd.DataFrame(
    {
        "Id": test_data.index,
        "SalePrice": test_preds,
    }
)

# Write the two columns only; the frame's own row index is left out.
output.to_csv("./submission_11.csv", index=False)