In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import numpy as np

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import plot_tree
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error, mean_absolute_error

from joblib import dump, load

In [2]:
df = pd.read_csv("https://raw.githubusercontent.com/digipodium/Datasets/main/house_pricing.csv")
df.info()
df.head(2)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 814 entries, 0 to 813
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   City        814 non-null    object 
 1   Type        814 non-null    object 
 2   Beds        814 non-null    int64  
 3   Baths       814 non-null    int64  
 4   SquareFeet  814 non-null    int64  
 5   Price       814 non-null    float64
dtypes: float64(1), int64(3), object(2)
memory usage: 38.3+ KB


Unnamed: 0,City,Type,Beds,Baths,SquareFeet,Price
0,SACRAMENTO,Residential,2,1,836,138159.85
1,SACRAMENTO,Residential,3,1,1167,167541.46


In [3]:
# Feature selection (X, y) & Split into xtrain,xtest,ytrain,ytest
X = df[['Beds','Baths','SquareFeet']]
y = df['Price']
xtrain,xtest,ytrain,ytest = train_test_split(X,y,test_size=.2, random_state=1)

In [4]:
print('Random Forest')
model2 = RandomForestRegressor()
model2.fit(xtrain,ytrain)
print('score:', model2.score(xtest, ytest) * 100)
pred = model2.predict(X)
print('mse:', mean_squared_error(y,pred))
print('mae:', mean_absolute_error(y,pred))

Random Forest
score: 72.48084505601993
mse: 824386758.4534308
mae: 15244.083094266876


### Grid Search

In [5]:
# We are going to create a dictionary with all the parameteer and their value options

In [6]:
params = {
    'n_estimators' : list(range(100,501,100)),
    'criterion': ['squared_error','absolute_error','poisson'],
    'max_depth': list(range(5,51,15))
}
params

{'n_estimators': [100, 200, 300, 400, 500],
 'criterion': ['squared_error', 'absolute_error', 'poisson'],
 'max_depth': [5, 20, 35, 50]}

In [7]:
grid = GridSearchCV(estimator=RandomForestRegressor(),param_grid=params,cv=3,n_jobs=-1,verbose=2)


In [8]:
grid.fit(X,y)

Fitting 3 folds for each of 60 candidates, totalling 180 fits


GridSearchCV(cv=3, estimator=RandomForestRegressor(), n_jobs=-1,
             param_grid={'criterion': ['squared_error', 'absolute_error',
                                       'poisson'],
                         'max_depth': [5, 20, 35, 50],
                         'n_estimators': [100, 200, 300, 400, 500]},
             verbose=2)

In [9]:
grid.cv_results_

{'mean_fit_time': array([0.52445904, 1.00696127, 1.41687298, 1.96230141, 1.78897587,
        0.56748923, 0.53239473, 0.80258449, 1.05935677, 1.43348606,
        0.26072049, 0.84992496, 1.62994138, 2.36841639, 2.93191727,
        0.60978715, 1.14047615, 1.12909166, 1.14878058, 1.42105548,
        0.55540331, 1.72494626, 3.26951559, 4.47354349, 4.29348548,
        1.14009476, 2.19166787, 4.40182527, 6.0207332 , 7.0047524 ,
        1.09280133, 3.10807657, 3.28276443, 4.77830831, 6.18655252,
        1.64709504, 2.33022388, 3.31132746, 5.7435232 , 7.30924869,
        0.45871822, 0.90047375, 1.03435334, 0.85362299, 1.99458758,
        0.74754175, 1.42842897, 2.17721891, 2.2173152 , 2.16671316,
        0.44722374, 0.82403652, 1.46665589, 2.96475736, 3.99485175,
        1.01239014, 2.02703325, 2.4909145 , 2.23380423, 2.20556474]),
 'std_fit_time': array([0.02332155, 0.02694099, 0.02067539, 0.15146491, 0.11860954,
        0.0530902 , 0.00673103, 0.00872643, 0.01839393, 0.1657482 ,
        0.003

In [10]:
gf = pd.DataFrame(grid.cv_results_)

In [11]:
gf.sort_values(by='rank_test_score', inplace=True)
gf

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
23,4.473543,0.29747,0.060051,0.003664,absolute_error,5,400,"{'criterion': 'absolute_error', 'max_depth': 5...",0.825996,0.767703,0.613279,0.735659,0.089749,1
22,3.269516,0.106721,0.114464,0.026177,absolute_error,5,300,"{'criterion': 'absolute_error', 'max_depth': 5...",0.824965,0.767552,0.613263,0.73526,0.089393,2
24,4.293485,0.377817,0.134634,0.043027,absolute_error,5,500,"{'criterion': 'absolute_error', 'max_depth': 5...",0.823719,0.767032,0.614405,0.735052,0.088393,3
21,1.724946,0.11563,0.073255,0.009514,absolute_error,5,200,"{'criterion': 'absolute_error', 'max_depth': 5...",0.821666,0.768712,0.614097,0.734825,0.088063,4
20,0.555403,0.008063,0.016792,0.000702,absolute_error,5,100,"{'criterion': 'absolute_error', 'max_depth': 5...",0.821056,0.764548,0.6134,0.733001,0.087661,5
0,0.524459,0.023322,0.038057,0.004297,squared_error,5,100,"{'criterion': 'squared_error', 'max_depth': 5,...",0.809097,0.768083,0.618339,0.73184,0.081985,6
2,1.416873,0.020675,0.109065,0.00494,squared_error,5,300,"{'criterion': 'squared_error', 'max_depth': 5,...",0.800823,0.769916,0.618107,0.729615,0.079851,7
1,1.006961,0.026941,0.080143,0.008634,squared_error,5,200,"{'criterion': 'squared_error', 'max_depth': 5,...",0.802758,0.765328,0.618878,0.728988,0.079345,8
4,1.788976,0.11861,0.069777,0.003772,squared_error,5,500,"{'criterion': 'squared_error', 'max_depth': 5,...",0.800019,0.768145,0.61492,0.727695,0.080798,9
3,1.962301,0.151465,0.06353,0.003875,squared_error,5,400,"{'criterion': 'squared_error', 'max_depth': 5,...",0.800788,0.76702,0.613176,0.726995,0.081654,10


In [12]:
grid.best_estimator_

RandomForestRegressor(criterion='absolute_error', max_depth=5, n_estimators=400)

In [13]:
dump(grid.best_estimator_,"house_pricing_73.pkl")

['house_pricing_73.pkl']