<h1 style="color:navy"><i><strong>Random Forest Regression Model</strong></i></h1>

In [48]:
import pandas as pd
import numpy as np

In [49]:
X_train_ds = pd.read_csv('train.csv' , index_col ='Id') 
X_test_ds = pd.read_csv('test.csv', index_col='Id')

In [50]:
target = X_train_ds.SalePrice
target

Id
1       208500
2       181500
3       223500
4       140000
5       250000
         ...  
1456    175000
1457    210000
1458    266500
1459    142125
1460    147500
Name: SalePrice, Length: 1460, dtype: int64

In [51]:
features = ['LotArea', 'YearBuilt', '1stFlrSF', '2ndFlrSF', 'FullBath', 'BedroomAbvGr', 'TotRmsAbvGrd']
X = X_train_ds[features].copy()
X

Unnamed: 0_level_0,LotArea,YearBuilt,1stFlrSF,2ndFlrSF,FullBath,BedroomAbvGr,TotRmsAbvGrd
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,8450,2003,856,854,2,3,8
2,9600,1976,1262,0,2,3,6
3,11250,2001,920,866,2,3,6
4,9550,1915,961,756,1,3,7
5,14260,2000,1145,1053,2,4,9
...,...,...,...,...,...,...,...
1456,7917,1999,953,694,2,3,7
1457,13175,1978,2073,0,2,3,7
1458,9042,1941,1188,1152,2,4,9
1459,9717,1950,1078,0,1,2,5


In [52]:
from sklearn.model_selection import train_test_split
train_X , test_X , train_y, test_y = train_test_split(X,target,test_size=0.4,random_state=1)

In [53]:
train_X.head(10)

Unnamed: 0_level_0,LotArea,YearBuilt,1stFlrSF,2ndFlrSF,FullBath,BedroomAbvGr,TotRmsAbvGrd
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1025,15498,1976,2898,0,2,2,10
458,53227,1954,1663,0,1,2,6
176,12615,1950,2158,0,2,4,7
550,9375,2003,912,1182,2,4,8
301,15750,1953,1336,0,1,2,5
460,7015,1950,979,224,1,3,5
684,11248,2002,1668,0,2,3,7
615,1491,1972,630,0,1,1,3
1257,14303,1994,1987,0,2,2,7
1202,10400,1998,866,913,2,3,6


In [54]:
test_y.head(10)

Id
259     231500
268     179500
289     122000
650      84500
1234    142000
168     325624
927     285000
832     151000
1238    195000
427     275000
Name: SalePrice, dtype: int64

<h1 style="color:skyblue">Evaluating Using Several Models</h1>

In [55]:
from sklearn.ensemble import RandomForestRegressor

model_1 = RandomForestRegressor(n_estimators = 50 , random_state=0)
model_2 = RandomForestRegressor(n_estimators = 100, random_state=0)
model_3 = RandomForestRegressor(n_estimators = 200 , criterion='mae' , random_state=0)
model_4 = RandomForestRegressor(n_estimators = 100 , criterion='mae' , random_state=0)
model_5 = RandomForestRegressor(n_estimators = 300 , criterion='mae' ,random_state = 0)
model_6 = RandomForestRegressor(n_estimators =200, min_samples_split=20, random_state=0)
model_7 = RandomForestRegressor(n_estimators =100, max_depth=7, random_state=0)

models = [model_1,model_2,model_3,model_4,model_5,model_6,model_7] 

<h2 style="color:pink">Mean Absolute Error</h2>

In [56]:
from sklearn.metrics import mean_absolute_error

<i style="color:red">Function For Comparing Different Models</i>

In [57]:
def model_score(model ,t_X=train_X , ts_X=test_X , t_y=train_y , ts_y=test_y):
    model.fit(t_X,t_y)
    predicted_values = model.predict(ts_X)
    return mean_absolute_error(predicted_values,ts_y)

In [58]:
for i in range(0,7):
    mae = model_score(models[i])
    print("Model %d MEAN ABSOLUTE ERROR: %d" % (i+1, mae))

Model 1 MEAN ABSOLUTE ERROR: 23181
Model 2 MEAN ABSOLUTE ERROR: 22943
Model 3 MEAN ABSOLUTE ERROR: 22993
Model 4 MEAN ABSOLUTE ERROR: 23062
Model 5 MEAN ABSOLUTE ERROR: 23143
Model 6 MEAN ABSOLUTE ERROR: 23285
Model 7 MEAN ABSOLUTE ERROR: 23056


<h2 style="color:green">By Observing Above models : The Model Which has Lowest Mean Absolute Error is the Best Model, So in this Case it's model_2</h2>

In [59]:
best_model = model_2

<h1 style="color:red">Generating Test Predictions</h1>

In [60]:
my_model = RandomForestRegressor()

In [62]:
my_model.fit(X,target)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

In [64]:
test_predictions = my_model.predict(test_X)
test_predictions

array([214979.        , 176457.75      , 125613.        ,  85020.        ,
       145603.        , 306605.96      , 300890.09      , 148046.93      ,
       201193.35      , 254646.8       , 173869.5       ,  77125.        ,
       188355.53      , 333682.48      , 235805.54      ,  98822.26      ,
       121641.83      , 116164.        , 233727.46      , 135582.5       ,
       129957.        , 139671.58      , 234913.        , 329543.23      ,
        85668.5       , 179923.5       , 126079.5       , 184042.        ,
       501506.69      , 122606.        , 118128.06      , 111507.33333333,
       125307.5       ,  90253.        , 138521.75      , 354563.96      ,
       124420.5       ,  92013.        , 280311.1       , 112020.        ,
       138196.7       , 134770.75      ,  93444.        , 121201.        ,
       182051.        , 167577.1       , 109120.        , 170353.03      ,
       243430.4       , 269556.27      , 103555.        , 347003.        ,
       112498.        , 2

In [68]:
my_model.score(test_X,test_y)

0.975188180607493

In [69]:
Output = pd.DataFrame({'Id':test_X.index , 'SalePrice':test_predictions})
Output

Unnamed: 0,Id,SalePrice
0,259,214979.00
1,268,176457.75
2,289,125613.00
3,650,85020.00
4,1234,145603.00
...,...,...
579,1350,129733.40
580,346,138563.00
581,1406,268193.10
582,764,318118.00


In [70]:
Output.to_csv('prediction.csv',index=False)