In [1]:
import numpy as np
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn import model_selection

In [2]:
data = pd.read_csv("vadodara_house_price_dataset_new.csv")
data.head()

Unnamed: 0,h_type,location,society,size,bathroom,balcony,total_sqft,yr_built,furniture,sale_type,...,college,hospital,population,railway,airport,on_road,air_quality,restaurant,park,price
0,apartment,Maneja,Bakeri Swara,3 BHK,3,1,1550.0,2011.0,0,new,...,0,1,2,0,0,1,1,1,1,4361705
1,apartment,Maneja,Bakeri Swara,3 BHK,3,1,1550.0,2012.0,1,new,...,0,1,2,0,0,1,1,1,1,5001905
2,apartment,Maneja,Bakeri Swara,3 BHK,3,1,1860.0,2010.0,1,new,...,0,1,2,0,0,1,1,1,1,5588795
3,apartment,Maneja,Bakeri Swara,2 BHK,2,1,1015.0,2016.0,1,new,...,0,1,2,0,0,1,1,1,1,3184740
4,apartment,Maneja,Bakeri Swara,2 BHK,2,1,1210.0,2019.0,0,new,...,0,1,2,0,0,1,1,1,1,3498895


In [3]:
data = data.drop(['society', 'yr_built', 'furniture', 'sale_type', 'amenities', 'market', 'office', 'school', 'college', 'hospital', 'population', 'railway', 'airport', 'on_road', 'air_quality', 'restaurant', 'park'], axis=1)
data.head()

Unnamed: 0,h_type,location,size,bathroom,balcony,total_sqft,price
0,apartment,Maneja,3 BHK,3,1,1550.0,4361705
1,apartment,Maneja,3 BHK,3,1,1550.0,5001905
2,apartment,Maneja,3 BHK,3,1,1860.0,5588795
3,apartment,Maneja,2 BHK,2,1,1015.0,3184740
4,apartment,Maneja,2 BHK,2,1,1210.0,3498895


In [4]:
data.isnull().sum()

h_type        0
location      0
size          0
bathroom      0
balcony       0
total_sqft    0
price         0
dtype: int64

In [5]:
df = data.iloc[ : , :-1].values
df

array([['apartment', 'Maneja', '3 BHK', 3, 1, 1550.0],
       ['apartment', 'Maneja', '3 BHK', 3, 1, 1550.0],
       ['apartment', 'Maneja', '3 BHK', 3, 1, 1860.0],
       ...,
       ['apartment', 'Gotri', '3 BHK', 3, 3, 1550.0],
       ['apartment', 'Gotri', '3 BHK', 3, 2, 1750.0],
       ['apartment', 'Gotri', '2 BHK', 2, 3, 1200.0]], dtype=object)

In [6]:
label_encoder = LabelEncoder()

In [7]:
df[ : ,0] = label_encoder.fit_transform(df[ : ,0])
df

array([[0, 'Maneja', '3 BHK', 3, 1, 1550.0],
       [0, 'Maneja', '3 BHK', 3, 1, 1550.0],
       [0, 'Maneja', '3 BHK', 3, 1, 1860.0],
       ...,
       [0, 'Gotri', '3 BHK', 3, 3, 1550.0],
       [0, 'Gotri', '3 BHK', 3, 2, 1750.0],
       [0, 'Gotri', '2 BHK', 2, 3, 1200.0]], dtype=object)

In [8]:
mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
mapping

{'apartment': 0,
 'duplex': 1,
 'pent house': 2,
 'tenament': 3,
 'triplex': 4,
 'villa': 5}

In [9]:
df[ : ,1] = label_encoder.fit_transform(df[ : ,1])
df

array([[0, 15, '3 BHK', 3, 1, 1550.0],
       [0, 15, '3 BHK', 3, 1, 1550.0],
       [0, 15, '3 BHK', 3, 1, 1860.0],
       ...,
       [0, 8, '3 BHK', 3, 3, 1550.0],
       [0, 8, '3 BHK', 3, 2, 1750.0],
       [0, 8, '2 BHK', 2, 3, 1200.0]], dtype=object)

In [10]:
mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
mapping

{'Ajwa Road': 0,
 'Akota': 1,
 'Alkapuri': 2,
 'Atladra': 3,
 'Bhayli': 4,
 'Chhani': 5,
 'Fatehgunj': 6,
 'Gorwa': 7,
 'Gotri': 8,
 'Harni': 9,
 'Karelibaug': 10,
 'Khodiyar Nagar': 11,
 'Laxmipura': 12,
 'Madhav Pura': 13,
 'Mandvi': 14,
 'Maneja': 15,
 'Manjalpur': 16,
 'Navapura': 17,
 'New Alkapuri': 18,
 'New Karelibaugh': 19,
 'New Sama': 20,
 'New VIP Road': 21,
 'Sama': 22,
 'Sayajipura': 23,
 'Soma Talav': 24,
 'Vasant Vihar': 25,
 'Vasna Road': 26,
 'Vasna-Bhayli Road': 27,
 'Waghodia Road': 28}

In [11]:
df[ : ,2] = label_encoder.fit_transform(df[ : ,2])
df

array([[0, 15, 2, 3, 1, 1550.0],
       [0, 15, 2, 3, 1, 1550.0],
       [0, 15, 2, 3, 1, 1860.0],
       ...,
       [0, 8, 2, 3, 3, 1550.0],
       [0, 8, 2, 3, 2, 1750.0],
       [0, 8, 1, 2, 3, 1200.0]], dtype=object)

In [12]:
mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
mapping

{'1 BHK': 0, '2 BHK': 1, '3 BHK': 2, '4 BHK': 3, '5 BHK': 4}

In [13]:
y = data.price.values
X = df

In [14]:
X_train, X_val, y_train, y_val = train_test_split(X, y,test_size=0.2)

In [15]:
standard_x = StandardScaler()

In [16]:
X_train = standard_x.fit_transform(X_train)
X_val = standard_x.transform(X_val)

In [17]:
print('Train_Shape: ',X_train.shape)
print("\nX_train:")
X_train

Train_Shape:  (272, 6)

X_train:


array([[-0.40489619,  1.6113351 , -0.78202346, -0.65275545,  0.55804276,
        -0.50747977],
       [-0.40489619, -0.45706525, -0.78202346, -0.65275545, -0.57470075,
        -0.77806073],
       [ 0.46228305,  0.63156652, -1.93805814, -1.72881292, -0.57470075,
        -0.92838349],
       ...,
       [-0.40489619, -0.45706525,  0.37401122,  0.42330202,  0.55804276,
        -0.09750862],
       [-0.40489619,  1.6113351 , -1.93805814, -1.72881292, -0.57470075,
        -1.19076502],
       [-0.40489619, -0.45706525,  0.37401122,  0.42330202,  0.55804276,
        -0.43641811]])

In [18]:
print('Val_Shape: ',X_val.shape)
print("\nX_val:")
X_val

Val_Shape:  (68, 6)

X_val:


array([[-0.40489619, -0.13047572,  0.37401122,  0.42330202,  0.55804276,
         0.44911958],
       [-0.40489619,  0.63156652,  0.37401122, -0.65275545,  1.69078628,
        -0.23416567],
       [-0.40489619, -1.21910748,  0.37401122,  1.49935949,  1.69078628,
         0.3876239 ],
       [-0.40489619,  1.6113351 , -0.78202346, -0.65275545, -0.57470075,
        -0.93384977],
       [-0.40489619, -1.00138113,  1.5300459 ,  1.49935949,  0.55804276,
         0.24554157],
       [-0.40489619, -1.21910748,  1.5300459 ,  3.65147443,  0.55804276,
         4.20718844],
       [-0.40489619, -0.34820207,  0.37401122,  0.42330202,  0.55804276,
         0.244134  ],
       [-0.40489619, -0.34820207,  1.5300459 ,  1.49935949,  2.82352979,
         0.98212307],
       [-0.40489619,  1.6113351 , -0.78202346, -0.65275545,  0.55804276,
        -1.06367396],
       [-0.40489619, -0.89251795, -0.78202346, -0.65275545,  0.55804276,
        -0.27242965],
       [-0.40489619,  1.6113351 ,  0.37401122,  0.

In [19]:
regressor = RandomForestRegressor(n_estimators = 10, random_state = 0)

In [20]:
regressor.fit(X_train, y_train)

RandomForestRegressor(n_estimators=10, random_state=0)

In [21]:
filename = 'vadodara_house_model.pkl'

In [22]:
pickle.dump(regressor, open(filename, 'wb'))

In [23]:
loaded_model = pickle.load(open(filename, 'rb'))

In [24]:
accuracy = loaded_model.score(X_val, y_val)

In [25]:
print(accuracy*100, '%')

67.7149520790453 %


In [26]:
predictions = loaded_model.predict(X_val)

In [27]:
predictions

array([ 4501300.        ,  3214400.        ,  8115500.        ,
        2253900.        ,  6771900.        , 22315300.        ,
        5539400.        ,  9320400.        ,  2740800.        ,
        2910000.        ,  5307200.        ,  2550000.        ,
        4934800.        ,  3344100.        ,  3233333.33333333,
        7897900.        ,  9218000.        ,  4963900.        ,
        7569200.        ,  2522400.        ,  2424500.        ,
       15160900.        ,  4207400.        ,  4381500.        ,
        3915000.        ,  3285100.        ,  4708000.        ,
        4442300.        ,  2770000.        ,  3420000.        ,
        6001200.        ,  7801000.        , 16740300.        ,
        5225300.        ,  3453489.5       ,  5664000.        ,
        8614500.        ,  1591200.        ,  4580300.        ,
        3351200.        ,  5930000.        ,  1880500.        ,
        6110500.        ,  5440400.        , 10278200.        ,
        3536400.        ,  2830100.     