# House price prediction [🔗](https://www.kaggle.com/competitions/home-data-for-ml-course/data)
by kaggle

## Importing required libraries

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import OrdinalEncoder

# Steps I am using:
1. Load Data
2. Find useful features
3. Extract numerical features
4. Make pipeline for them    (Imputer)
5. Extract categorical features
6. Make pipeline for them too (Imputer, Ordinal Encoding)
7. Use 10-fold cross-validation (cv=10)
8. Find the best `n_estimators` for RandomForestRegressor
9. Select the best parameter
10. Train the final model on the full training data
11. Predict price on test data
12. Submit final predictions

## Loading data

In [3]:
# Read the competition's test and train CSVs, using the "Id" column as the row index.
test_data = pd.read_csv("../input/home-data-for-ml-course/test.csv", index_col="Id")
house_data = pd.read_csv("../input/home-data-for-ml-course/train.csv", index_col="Id")

# Raise the display limit to the training frame's column count so no column is elided.
pd.set_option("display.max_columns", len(house_data.columns))
house_data.head()

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1
1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,2003,2003,Gable,CompShg,VinylSd,VinylSd,BrkFace,196.0,Gd,TA,PConc,Gd,TA,No,GLQ,706,Unf,0,150,856,GasA,Ex,Y,SBrkr,856,854,0,1710,1,0,2,1,3,1,Gd,8,Typ,0,,Attchd,2003.0,RFn,2,548,TA,TA,Y,0,61,0,0,0,0,,,,0,2,2008,WD,Normal,208500
2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,Gtl,Veenker,Feedr,Norm,1Fam,1Story,6,8,1976,1976,Gable,CompShg,MetalSd,MetalSd,,0.0,TA,TA,CBlock,Gd,TA,Gd,ALQ,978,Unf,0,284,1262,GasA,Ex,Y,SBrkr,1262,0,0,1262,0,1,2,0,3,1,TA,6,Typ,1,TA,Attchd,1976.0,RFn,2,460,TA,TA,Y,298,0,0,0,0,0,,,,0,5,2007,WD,Normal,181500
3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,2001,2002,Gable,CompShg,VinylSd,VinylSd,BrkFace,162.0,Gd,TA,PConc,Gd,TA,Mn,GLQ,486,Unf,0,434,920,GasA,Ex,Y,SBrkr,920,866,0,1786,1,0,2,1,3,1,Gd,6,Typ,1,TA,Attchd,2001.0,RFn,2,608,TA,TA,Y,0,42,0,0,0,0,,,,0,9,2008,WD,Normal,223500
4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,Norm,Norm,1Fam,2Story,7,5,1915,1970,Gable,CompShg,Wd Sdng,Wd Shng,,0.0,TA,TA,BrkTil,TA,Gd,No,ALQ,216,Unf,0,540,756,GasA,Gd,Y,SBrkr,961,756,0,1717,1,0,1,0,3,1,Gd,7,Typ,1,Gd,Detchd,1998.0,Unf,3,642,TA,TA,Y,0,35,272,0,0,0,,,,0,2,2006,WD,Abnorml,140000
5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,Norm,Norm,1Fam,2Story,8,5,2000,2000,Gable,CompShg,VinylSd,VinylSd,BrkFace,350.0,Gd,TA,PConc,Gd,TA,Av,GLQ,655,Unf,0,490,1145,GasA,Ex,Y,SBrkr,1145,1053,0,2198,1,0,2,1,4,1,Gd,9,Typ,1,TA,Attchd,2000.0,RFn,3,836,TA,TA,Y,192,84,0,0,0,0,,,,0,12,2008,WD,Normal,250000


## Finding useful features

In [4]:
# Hand-picked subset of columns used as model inputs (order is preserved downstream).
useful_features = [
    "MSSubClass", "MSZoning", "LotFrontage", "LotArea", "Street", "LotShape",
    "Utilities", "LotConfig", "LandSlope", "Neighborhood", "RoofMatl", "BldgType",
    "HouseStyle", "OverallQual", "OverallCond", "Condition1", "YearBuilt", "RoofStyle",
    "Exterior1st", "MasVnrType", "BsmtQual", "BsmtFinType1", "ExterQual", "ExterCond",
    "Foundation", "TotalBsmtSF", "Heating", "CentralAir", "1stFlrSF", "2ndFlrSF",
    "LandContour", "GrLivArea", "GarageType", "FullBath", "BedroomAbvGr", "KitchenAbvGr",
    "KitchenQual", "TotRmsAbvGrd", "Fireplaces", "FireplaceQu", "GarageCars", "GarageArea",
    "PavedDrive", "WoodDeckSF", "OpenPorchSF", "EnclosedPorch", "PoolArea", "MiscVal",
]

# Narrow both frames to the chosen columns; the training frame also keeps the target.
test_data = test_data[useful_features]
house_data = house_data[useful_features + ["SalePrice"]]

# Split the training frame into the feature matrix and the target series.
y = house_data["SalePrice"]
X = house_data[useful_features]
len(useful_features)

48

In [5]:
# Preview the first rows of the feature matrix to confirm the column selection.
X.head()

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,Utilities,LotConfig,LandSlope,Neighborhood,RoofMatl,BldgType,HouseStyle,OverallQual,OverallCond,Condition1,YearBuilt,RoofStyle,Exterior1st,MasVnrType,BsmtQual,BsmtFinType1,ExterQual,ExterCond,Foundation,TotalBsmtSF,Heating,CentralAir,1stFlrSF,2ndFlrSF,LandContour,GrLivArea,GarageType,FullBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Fireplaces,FireplaceQu,GarageCars,GarageArea,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,PoolArea,MiscVal
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1
1,60,RL,65.0,8450,Pave,Reg,AllPub,Inside,Gtl,CollgCr,CompShg,1Fam,2Story,7,5,Norm,2003,Gable,VinylSd,BrkFace,Gd,GLQ,Gd,TA,PConc,856,GasA,Y,856,854,Lvl,1710,Attchd,2,3,1,Gd,8,0,,2,548,Y,0,61,0,0,0
2,20,RL,80.0,9600,Pave,Reg,AllPub,FR2,Gtl,Veenker,CompShg,1Fam,1Story,6,8,Feedr,1976,Gable,MetalSd,,Gd,ALQ,TA,TA,CBlock,1262,GasA,Y,1262,0,Lvl,1262,Attchd,2,3,1,TA,6,1,TA,2,460,Y,298,0,0,0,0
3,60,RL,68.0,11250,Pave,IR1,AllPub,Inside,Gtl,CollgCr,CompShg,1Fam,2Story,7,5,Norm,2001,Gable,VinylSd,BrkFace,Gd,GLQ,Gd,TA,PConc,920,GasA,Y,920,866,Lvl,1786,Attchd,2,3,1,Gd,6,1,TA,2,608,Y,0,42,0,0,0
4,70,RL,60.0,9550,Pave,IR1,AllPub,Corner,Gtl,Crawfor,CompShg,1Fam,2Story,7,5,Norm,1915,Gable,Wd Sdng,,TA,ALQ,TA,TA,BrkTil,756,GasA,Y,961,756,Lvl,1717,Detchd,1,3,1,Gd,7,1,Gd,3,642,Y,0,35,272,0,0
5,60,RL,84.0,14260,Pave,IR1,AllPub,FR2,Gtl,NoRidge,CompShg,1Fam,2Story,8,5,Norm,2000,Gable,VinylSd,BrkFace,Gd,GLQ,Gd,TA,PConc,1145,GasA,Y,1145,1053,Lvl,2198,Attchd,2,4,1,Gd,9,1,TA,3,836,Y,192,84,0,0,0


### Extracting `numerical` & `categorical` features

In [6]:
# Partition the feature columns by dtype using ONE criterion ("number") so the two
# groups are complementary: every column lands in exactly one of them.
# Filtering with two unrelated rules (exclude "object" for one group, exclude
# int/float for the other) would place e.g. a bool or category column in both
# groups, feeding it to the numeric imputer and the categorical encoder at once.
num_cols = X.select_dtypes(include="number").columns
cat_cols = X.select_dtypes(exclude="number").columns

## Making pipelines

In [7]:
# Numeric columns: fill missing values with the column median.
num_transformer = SimpleImputer(strategy="median")

# Categorical columns: fill missing values with the most frequent category, then
# map categories to integer codes; categories unseen at fit time are encoded as -1.
cat_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("ord_encoder", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)),
])

# Route each column group through its own transformer.
data_preprocessor = ColumnTransformer([
    ("num", num_transformer, num_cols),
    ("cat", cat_transformer, cat_cols),
])

# Baseline model: preprocessing followed by a 200-tree random forest.
model_1 = RandomForestRegressor(random_state=0, n_estimators=200)
pipeline_1 = Pipeline([
    ("preprocessor", data_preprocessor),
    ("model", model_1),
])

## Running 10-fold cross-validation on `pipeline_1`

In [8]:
# The scorer returns negated MAE (so that higher is better); flip the sign to get
# one positive MAE value per fold.
scores = -cross_val_score(
    pipeline_1,
    X,
    y,
    scoring="neg_mean_absolute_error",
    cv=10,
    verbose=True,
)
scores

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:   26.3s finished


array([16920.27561644, 18176.96646119, 16275.41031963, 19147.06833333,
       20671.44746575, 14868.75150522, 16274.52046804, 16994.86481735,
       19451.6894227 , 17872.26126712])

In [9]:
# Collapse the per-fold errors into one average, shown rounded to a whole number.
summary_line = f"Average MAE of 10 folds: {scores.mean():.0f}"
print(summary_line)

Average MAE of 10 folds: 17665


### Finding the best value for `n_estimators` using 6 different values

In [10]:
# Candidate forest sizes, each evaluated with the same 10-fold cross-validation.
trees = [50, 100, 200, 250, 500, 1000]

# Mean MAE per candidate. Kept so the winner is picked from the numbers themselves
# instead of being read off the interleaved log output by eye.
mae_by_n_estimators = {}
for tree in trees:
    model = RandomForestRegressor(n_estimators=tree, random_state=0)
    pipeline = Pipeline(steps=[("preprocessor", data_preprocessor),
                               ("model", model)
                              ])
    # The scorer returns negated MAE; multiply by -1 to get positive per-fold MAE.
    scores = -1 * cross_val_score(pipeline, X, y, cv=10, verbose=True,
                                  scoring="neg_mean_absolute_error")
    mae_by_n_estimators[tree] = scores.mean()
    print(f"N_estimators: {tree}\t\tAverage MAE: {scores.mean():.0f}")

# Report the candidate with the lowest cross-validated MAE.
best_n_estimators = min(mae_by_n_estimators, key=mae_by_n_estimators.get)
print(f"Best n_estimators: {best_n_estimators}\t\tAverage MAE: {mae_by_n_estimators[best_n_estimators]:.0f}")

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    6.8s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


N_estimators: 50		Average MAE: 17879


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:   13.3s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


N_estimators: 100		Average MAE: 17791


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:   26.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


N_estimators: 200		Average MAE: 17665


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:   32.5s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


N_estimators: 250		Average MAE: 17662


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:  1.1min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


N_estimators: 500		Average MAE: 17613
N_estimators: 1000		Average MAE: 17623


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:  2.2min finished


### Best value for `n_estimators` is `500` (lowest average MAE: 17613).

## Training Final model with best `n_estimators` value

In [11]:
# Refit on the entire training set using the forest size selected above.
final_model = RandomForestRegressor(random_state=0, n_estimators=500)
pipeline = Pipeline([
    ("preprocessor", data_preprocessor),
    ("model", final_model),
])
pipeline.fit(X, y)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  SimpleImputer(strategy='median'),
                                                  Index(['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond',
       'YearBuilt', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'GrLivArea',
       'FullBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd',
       'Fireplaces', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenP...
                                                  Index(['MSZoning', 'Street', 'LotShape', 'Utilities', 'LotConfig', 'LandSlope',
       'Neighborhood', 'RoofMatl', 'BldgType', 'HouseStyle', 'Condition1',
       'RoofStyle', 'Exterior1st', 'MasVnrType', 'BsmtQual', 'BsmtFinType1',
       'ExterQual', 'ExterCond', 'Foundation', 'Heating', 'CentralAir',
       'LandContour', 'GarageType', 'KitchenQual', 'FireplaceQu',
       'PavedDrive'],
      dtype='object'))])),
                ('

In [12]:
# Run the fitted pipeline (preprocessing + forest) on the competition test features.
test_preds = pipeline.predict(test_data)
# Display the predicted values as the cell output.
test_preds

array([123556.386, 156527.36 , 172320.08 , ..., 157677.788, 111330.828,
       227882.378])

## Submitting predictions

In [13]:
# Pair each test-row Id with its predicted price, then write the table to CSV
# without the DataFrame's own row index.
submission_columns = {'Id': test_data.index, 'SalePrice': test_preds}
output = pd.DataFrame(submission_columns)
output.to_csv('./submission_6.csv', index=False)