In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler 
import seaborn as sns
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split,cross_val_score,GridSearchCV
from sklearn.metrics import mean_absolute_error,mean_squared_error
from sklearn.linear_model import LinearRegression
import plotly.express as px
from joblib import dump

In [2]:
df=pd.read_csv("https://raw.githubusercontent.com/digipodium/Datasets/main/house_pricing.csv")
df

Unnamed: 0,City,Type,Beds,Baths,SquareFeet,Price
0,SACRAMENTO,Residential,2,1,836,138159.85
1,SACRAMENTO,Residential,3,1,1167,167541.46
2,SACRAMENTO,Residential,2,1,796,119095.12
3,SACRAMENTO,Residential,2,1,852,130904.95
4,SACRAMENTO,Residential,2,1,797,120266.19
...,...,...,...,...,...,...
809,SACRAMENTO,Residential,4,3,2280,308248.47
810,SACRAMENTO,Residential,3,2,1477,212857.63
811,CITRUS HEIGHTS,Residential,3,2,1216,181746.98
812,ELK GROVE,Residential,4,2,1685,245385.59


In [3]:
x=df[['Beds','Baths','SquareFeet']]
y=df['Price']
xtrain,xtest,ytrain,ytest= train_test_split(x,y,test_size=0.2,random_state=1)

In [4]:
print("Random forest")
model1=RandomForestRegressor()
model1.fit(xtrain,ytrain)
print("Score:",model1.score(xtest,ytest)*100)
pred=model1.predict(x)
print("Mean Absolute Error:",mean_absolute_error(y,pred))
print("Mean Squared Error:",mean_squared_error(y,pred))

Random forest
Score: 71.78019506681817
Mean Absolute Error: 15250.941631000855
Mean Squared Error: 879151686.1558949


In [5]:
tree_score=cross_val_score(model1,x,y)
print("Random Forest")
print(tree_score,"Average:",tree_score.mean())

Random Forest
[0.80330219 0.71148733 0.67366283 0.51513916 0.63377049] Average: 0.6674724001469793


grid search

#we are going to create a dictionary with all parameters 

In [6]:
RandomForestRegressor?

[1;31mInit signature:[0m
[0mRandomForestRegressor[0m[1;33m([0m[1;33m
[0m    [0mn_estimators[0m[1;33m=[0m[1;36m100[0m[1;33m,[0m[1;33m
[0m    [1;33m*[0m[1;33m,[0m[1;33m
[0m    [0mcriterion[0m[1;33m=[0m[1;34m'squared_error'[0m[1;33m,[0m[1;33m
[0m    [0mmax_depth[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mmin_samples_split[0m[1;33m=[0m[1;36m2[0m[1;33m,[0m[1;33m
[0m    [0mmin_samples_leaf[0m[1;33m=[0m[1;36m1[0m[1;33m,[0m[1;33m
[0m    [0mmin_weight_fraction_leaf[0m[1;33m=[0m[1;36m0.0[0m[1;33m,[0m[1;33m
[0m    [0mmax_features[0m[1;33m=[0m[1;34m'auto'[0m[1;33m,[0m[1;33m
[0m    [0mmax_leaf_nodes[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mmin_impurity_decrease[0m[1;33m=[0m[1;36m0.0[0m[1;33m,[0m[1;33m
[0m    [0mbootstrap[0m[1;33m=[0m[1;32mTrue[0m[1;33m,[0m[1;33m
[0m    [0moob_score[0m[1;33m=[0m[1;32mFalse[0m[1;33m,[0m[1;33m
[0m    [0mn_jobs[0m[1;33m=[0m

In [7]:
params={
    'n_estimators':list(range(100,501,100)),
    'criterion':['squared_error','absolute_error','poisson'],
    'max_depth':list(range(5,50,20))
}
params

{'n_estimators': [100, 200, 300, 400, 500],
 'criterion': ['squared_error', 'absolute_error', 'poisson'],
 'max_depth': [5, 25, 45]}

In [8]:
grid=GridSearchCV(estimator=RandomForestRegressor(),param_grid=params,cv=3,n_jobs=-1,verbose=3)

In [9]:
grid.fit(x,y)

Fitting 3 folds for each of 45 candidates, totalling 135 fits


GridSearchCV(cv=3, estimator=RandomForestRegressor(), n_jobs=-1,
             param_grid={'criterion': ['squared_error', 'absolute_error',
                                       'poisson'],
                         'max_depth': [5, 25, 45],
                         'n_estimators': [100, 200, 300, 400, 500]},
             verbose=3)

In [10]:
gf=pd.DataFrame(grid.cv_results_)

In [11]:
gf

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,0.597999,0.066716,0.045001,0.003266,squared_error,5,100,"{'criterion': 'squared_error', 'max_depth': 5,...",0.801477,0.768729,0.614334,0.72818,0.081604,9
1,1.199332,0.042459,0.088334,0.004785,squared_error,5,200,"{'criterion': 'squared_error', 'max_depth': 5,...",0.804023,0.76808,0.614281,0.728795,0.082292,8
2,2.003331,0.124196,0.128336,0.011815,squared_error,5,300,"{'criterion': 'squared_error', 'max_depth': 5,...",0.804233,0.767697,0.619911,0.730613,0.079687,6
3,2.239998,0.061747,0.166669,0.030005,squared_error,5,400,"{'criterion': 'squared_error', 'max_depth': 5,...",0.804081,0.766794,0.617244,0.729373,0.080735,7
4,2.865997,0.114814,0.208668,0.008497,squared_error,5,500,"{'criterion': 'squared_error', 'max_depth': 5,...",0.801061,0.7678,0.612671,0.727177,0.082099,10
5,0.775999,0.047377,0.062001,0.018403,squared_error,25,100,"{'criterion': 'squared_error', 'max_depth': 25...",0.760389,0.754058,0.576831,0.697093,0.085077,22
6,1.65733,0.044011,0.139336,0.027011,squared_error,25,200,"{'criterion': 'squared_error', 'max_depth': 25...",0.759118,0.753202,0.578857,0.697059,0.083616,23
7,2.323998,0.085515,0.151334,0.004641,squared_error,25,300,"{'criterion': 'squared_error', 'max_depth': 25...",0.752733,0.753976,0.579094,0.695268,0.082149,25
8,3.075998,0.073542,0.190999,0.013949,squared_error,25,400,"{'criterion': 'squared_error', 'max_depth': 25...",0.753784,0.752243,0.58182,0.695949,0.080704,24
9,3.891667,0.006184,0.297,0.084914,squared_error,25,500,"{'criterion': 'squared_error', 'max_depth': 25...",0.758392,0.755369,0.578589,0.69745,0.084057,21


In [12]:
gf.sort_values(by='rank_test_score',inplace=True)
gf

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
19,7.545,0.135477,0.187,0.019593,absolute_error,5,500,"{'criterion': 'absolute_error', 'max_depth': 5...",0.824991,0.767756,0.615117,0.735955,0.088583,1
18,6.189996,0.079717,0.160666,0.009743,absolute_error,5,400,"{'criterion': 'absolute_error', 'max_depth': 5...",0.824286,0.767629,0.615251,0.735722,0.08827,2
16,3.12033,0.08617,0.082666,0.006181,absolute_error,5,200,"{'criterion': 'absolute_error', 'max_depth': 5...",0.824188,0.767052,0.614568,0.735269,0.088479,3
15,1.589333,0.10769,0.046,0.001633,absolute_error,5,100,"{'criterion': 'absolute_error', 'max_depth': 5...",0.823251,0.767422,0.612055,0.734243,0.089356,4
17,4.59633,0.055259,0.111336,0.008498,absolute_error,5,300,"{'criterion': 'absolute_error', 'max_depth': 5...",0.82381,0.766413,0.611563,0.733929,0.089643,5
2,2.003331,0.124196,0.128336,0.011815,squared_error,5,300,"{'criterion': 'squared_error', 'max_depth': 5,...",0.804233,0.767697,0.619911,0.730613,0.079687,6
3,2.239998,0.061747,0.166669,0.030005,squared_error,5,400,"{'criterion': 'squared_error', 'max_depth': 5,...",0.804081,0.766794,0.617244,0.729373,0.080735,7
1,1.199332,0.042459,0.088334,0.004785,squared_error,5,200,"{'criterion': 'squared_error', 'max_depth': 5,...",0.804023,0.76808,0.614281,0.728795,0.082292,8
0,0.597999,0.066716,0.045001,0.003266,squared_error,5,100,"{'criterion': 'squared_error', 'max_depth': 5,...",0.801477,0.768729,0.614334,0.72818,0.081604,9
4,2.865997,0.114814,0.208668,0.008497,squared_error,5,500,"{'criterion': 'squared_error', 'max_depth': 5,...",0.801061,0.7678,0.612671,0.727177,0.082099,10


In [15]:
dump(grid.best_estimator_,"house_pricing_model_73.pkl")

['house_pricing_model_73.pkl']