In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from scipy.stats import randint

In [3]:
df = pd.read_csv('C:\\Users\\shawx\\OneDrive\\Desktop\\House_Price_Prediction\\notebooks\\data\\houseprice.csv')

df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [4]:
df.shape

(1460, 81)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [6]:
df.columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

In [23]:
df.isnull().sum()

Id                 0
MSSubClass         0
MSZoning           0
LotFrontage      259
LotArea            0
                ... 
MoSold             0
YrSold             0
SaleType           0
SaleCondition      0
SalePrice          0
Length: 81, dtype: int64

In [24]:
df = df.drop(columns=['Id'])
df.head(2)

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,2,2008,WD,Normal,208500
1,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,,,,0,5,2007,WD,Normal,181500


In [25]:
threshold = len(df) * 0.5
df = df.dropna(thresh=threshold, axis=1)

In [26]:
df.shape

(1460, 75)

In [27]:
X = df.drop(columns=['SalePrice'])
y = df['SalePrice']

In [28]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [29]:
num_cols = X_train.select_dtypes(include=['int64', 'float64']).columns.tolist()
cat_cols = X_train.select_dtypes(include=['object']).columns.tolist()

In [30]:
num_transformer = SimpleImputer(strategy='median')

In [31]:
cat_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

In [32]:
preprocessor = ColumnTransformer(transformers=[
    ('num', num_transformer, num_cols),
    ('cat', cat_transformer, cat_cols)
])

In [33]:
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(random_state=42))
])

In [34]:
param_dist = {
    'regressor__n_estimators': randint(50, 100),
    'regressor__max_depth': randint(5, 30),
    'regressor__min_samples_split': randint(2, 10),
    'regressor__min_samples_leaf': randint(1, 10),
}

random_search = RandomizedSearchCV(
    model,
    param_distributions=param_dist,
    n_iter=15,
    cv=2,
    scoring='neg_mean_absolute_error',
    random_state=42,
    n_jobs=1,
    verbose=2
)

random_search.fit(X_train, y_train)

Fitting 2 folds for each of 15 candidates, totalling 30 fits
[CV] END regressor__max_depth=11, regressor__min_samples_leaf=4, regressor__min_samples_split=6, regressor__n_estimators=64; total time=   1.4s
[CV] END regressor__max_depth=11, regressor__min_samples_leaf=4, regressor__min_samples_split=6, regressor__n_estimators=64; total time=   1.4s
[CV] END regressor__max_depth=15, regressor__min_samples_leaf=8, regressor__min_samples_split=6, regressor__n_estimators=70; total time=   1.0s
[CV] END regressor__max_depth=15, regressor__min_samples_leaf=8, regressor__min_samples_split=6, regressor__n_estimators=70; total time=   1.0s
[CV] END regressor__max_depth=11, regressor__min_samples_leaf=3, regressor__min_samples_split=8, regressor__n_estimators=60; total time=   1.4s
[CV] END regressor__max_depth=11, regressor__min_samples_leaf=3, regressor__min_samples_split=8, regressor__n_estimators=60; total time=   1.4s
[CV] END regressor__max_depth=15, regressor__min_samples_leaf=8, regressor_

In [35]:
print('BEST PARAMETERS : ',random_search.best_params_)

BEST PARAMETERS :  {'regressor__max_depth': 29, 'regressor__min_samples_leaf': 1, 'regressor__min_samples_split': 4, 'regressor__n_estimators': 91}


In [36]:
y_pred = random_search.predict(X_test)

In [39]:
print('MEAN ABSOLUTE ERROR : ',mean_absolute_error(y_test, y_pred))
print('MEAN SQUARED ERROR : ',mean_squared_error(y_test, y_pred, squared=False))
print('R2 SCORE : ',r2_score(y_test, y_pred))

MEAN ABSOLUTE ERROR :  17653.053468403203
MEAN SQUARED ERROR :  29038.29934792478
R2 SCORE :  0.8900668881292707
