In [1]:
import numpy as np
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn import model_selection

In [2]:
data = pd.read_csv("house_price_dataset_new.csv")
data.head()

Unnamed: 0,h_type,location,society,size,bathroom,balcony,total_sqft,yr_built,furniture,sale_type,...,college,hospital,population,railway,airport,on_road,air_quality,restaurant,park,price
0,apartment,Maneja,Bakeri Swara,3 BHK,3,1,1550.0,2011.0,0,new,...,0,1,2,0,0,1,1,1,1,4361705
1,apartment,Maneja,Bakeri Swara,3 BHK,3,1,1550.0,2012.0,1,new,...,0,1,2,0,0,1,1,1,1,5001905
2,apartment,Maneja,Bakeri Swara,3 BHK,3,1,1860.0,2010.0,1,new,...,0,1,2,0,0,1,1,1,1,5588795
3,apartment,Maneja,Bakeri Swara,2 BHK,2,1,1015.0,2016.0,1,new,...,0,1,2,0,0,1,1,1,1,3184740
4,apartment,Maneja,Bakeri Swara,2 BHK,2,1,1210.0,2019.0,0,new,...,0,1,2,0,0,1,1,1,1,3498895


In [3]:
data = data.drop(['society', 'yr_built', 'furniture', 'sale_type', 'amenities', 'market', 'office', 'school', 'college', 'hospital', 'population', 'railway', 'airport', 'on_road', 'air_quality', 'restaurant', 'park'], axis=1)
data.head()

Unnamed: 0,h_type,location,size,bathroom,balcony,total_sqft,price
0,apartment,Maneja,3 BHK,3,1,1550.0,4361705
1,apartment,Maneja,3 BHK,3,1,1550.0,5001905
2,apartment,Maneja,3 BHK,3,1,1860.0,5588795
3,apartment,Maneja,2 BHK,2,1,1015.0,3184740
4,apartment,Maneja,2 BHK,2,1,1210.0,3498895


In [4]:
data.isnull().sum()

h_type        0
location      0
size          0
bathroom      0
balcony       0
total_sqft    0
price         0
dtype: int64

In [5]:
df = data.iloc[ : , :-1].values
df

array([['apartment', 'Maneja', '3 BHK', 3, 1, 1550.0],
       ['apartment', 'Maneja', '3 BHK', 3, 1, 1550.0],
       ['apartment', 'Maneja', '3 BHK', 3, 1, 1860.0],
       ...,
       ['apartment', 'Gotri', '3 BHK', 3, 3, 1550.0],
       ['apartment', 'Gotri', '3 BHK', 3, 2, 1750.0],
       ['apartment', 'Gotri', '2 BHK', 2, 3, 1200.0]], dtype=object)

In [6]:
label_encoder = LabelEncoder()

In [7]:
df[ : ,0] = label_encoder.fit_transform(df[ : ,0])
df

array([[0, 'Maneja', '3 BHK', 3, 1, 1550.0],
       [0, 'Maneja', '3 BHK', 3, 1, 1550.0],
       [0, 'Maneja', '3 BHK', 3, 1, 1860.0],
       ...,
       [0, 'Gotri', '3 BHK', 3, 3, 1550.0],
       [0, 'Gotri', '3 BHK', 3, 2, 1750.0],
       [0, 'Gotri', '2 BHK', 2, 3, 1200.0]], dtype=object)

In [8]:
mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
mapping

{'apartment': 0,
 'duplex': 1,
 'pent house': 2,
 'tenament': 3,
 'triplex': 4,
 'villa': 5}

In [9]:
df[ : ,1] = label_encoder.fit_transform(df[ : ,1])
df

array([[0, 15, '3 BHK', 3, 1, 1550.0],
       [0, 15, '3 BHK', 3, 1, 1550.0],
       [0, 15, '3 BHK', 3, 1, 1860.0],
       ...,
       [0, 8, '3 BHK', 3, 3, 1550.0],
       [0, 8, '3 BHK', 3, 2, 1750.0],
       [0, 8, '2 BHK', 2, 3, 1200.0]], dtype=object)

In [10]:
mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
mapping

{'Ajwa Road': 0,
 'Akota': 1,
 'Alkapuri': 2,
 'Atladra': 3,
 'Bhayli': 4,
 'Chhani': 5,
 'Fatehgunj': 6,
 'Gorwa': 7,
 'Gotri': 8,
 'Harni': 9,
 'Karelibaug': 10,
 'Khodiyar Nagar': 11,
 'Laxmipura': 12,
 'Madhav Pura': 13,
 'Mandvi': 14,
 'Maneja': 15,
 'Manjalpur': 16,
 'Navapura': 17,
 'New Alkapuri': 18,
 'New Karelibaugh': 19,
 'New Sama': 20,
 'New VIP Road': 21,
 'Sama': 22,
 'Sayajipura': 23,
 'Soma Talav': 24,
 'Vasant Vihar': 25,
 'Vasna Road': 26,
 'Vasna-Bhayli Road': 27,
 'Waghodia Road': 28}

In [11]:
df[ : ,2] = label_encoder.fit_transform(df[ : ,2])
df

array([[0, 15, 2, 3, 1, 1550.0],
       [0, 15, 2, 3, 1, 1550.0],
       [0, 15, 2, 3, 1, 1860.0],
       ...,
       [0, 8, 2, 3, 3, 1550.0],
       [0, 8, 2, 3, 2, 1750.0],
       [0, 8, 1, 2, 3, 1200.0]], dtype=object)

In [12]:
mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
mapping

{'1 BHK': 0, '2 BHK': 1, '3 BHK': 2, '4 BHK': 3, '5 BHK': 4}

In [13]:
y = data.price.values
X = df

In [14]:
X_train, X_val, y_train, y_val = train_test_split(X, y,test_size=0.2)

In [15]:
standard_x = StandardScaler()

In [16]:
X_train = standard_x.fit_transform(X_train)
X_val = standard_x.transform(X_val)

In [17]:
print('Train_Shape: ',X_train.shape)
print("\nX_train:")
X_train

Train_Shape:  (272, 6)

X_train:


array([[ 0.42570195,  1.56164038, -0.87214457, -0.68451085,  0.46550312,
        -0.30954234],
       [ 3.7578151 , -1.0231437 ,  1.51200943,  0.36739283, -0.63551296,
         1.00737405],
       [-0.40732633, -1.23854237,  0.31993243,  0.36739283, -0.63551296,
         0.72983635],
       ...,
       [-0.40732633,  1.56164038, -0.87214457, -0.68451085,  0.46550312,
        -1.08387252],
       [-0.40732633, -1.23854237,  1.51200943,  3.52310387,  0.46550312,
         4.26844202],
       [-0.40732633, -1.23854237, -0.87214457, -0.68451085,  2.66753528,
        -0.65785215]])

In [18]:
print('Val_Shape: ',X_val.shape)
print("\nX_val:")
X_val

Val_Shape:  (68, 6)

X_val:


array([[-0.40732633, -1.23854237,  0.31993243,  1.41929651,  1.5665192 ,
         0.38985267],
       [-0.40732633, -1.0231437 ,  0.31993243,  0.36739283,  0.46550312,
         0.78534389],
       [-0.40732633,  1.45394105,  1.51200943,  1.41929651,  1.5665192 ,
         1.1461429 ],
       [-0.40732633, -0.48464702,  0.31993243,  0.36739283, -0.63551296,
         2.39506255],
       [-0.40732633, -0.91544436, -0.87214457, -0.68451085, -1.73652904,
        -0.0611461 ],
       [-0.40732633, -1.45394105, -0.87214457, -0.68451085,  0.46550312,
        -0.38031445],
       [-0.40732633, -0.91544436,  0.31993243,  0.36739283, -0.63551296,
         1.09063536],
       [ 3.7578151 , -0.37694768,  1.51200943,  1.41929651,  0.46550312,
        -0.43582199],
       [ 3.7578151 , -1.23854237,  0.31993243,  0.36739283, -0.63551296,
         1.7012183 ],
       [-0.40732633, -1.34624171,  0.31993243,  0.36739283, -0.63551296,
         0.20945316],
       [-0.40732633, -1.23854237,  0.31993243, -0.

In [19]:
regressor = RandomForestRegressor(n_estimators = 10, random_state = 0)

In [20]:
regressor.fit(X_train, y_train)

In [21]:
filename = 'vadodara_house_model.pkl'

In [22]:
pickle.dump(regressor, open(filename, 'wb'))

In [23]:
loaded_model = pickle.load(open(filename, 'rb'))

In [24]:
accuracy = loaded_model.score(X_val, y_val)

In [25]:
print(accuracy*100, '%')

64.83637850373344 %


In [26]:
predictions = loaded_model.predict(X_val)

In [27]:
predictions

array([ 8565300. ,  5793300. ,  7395400. ,  8867500. ,  6091900. ,
        2360200. ,  6043200. ,  7581000. , 12505708.4,  4825400. ,
        3005100. ,  6990000. ,  2787900. ,  3983000. ,  3650400. ,
        5858600. , 32400000. ,  5233489.5,  7107100. ,  2762500. ,
        3610400. ,  3415100. ,  4794000. ,  7053600. ,  7294400. ,
        4540700. ,  4912000. ,  2859779. , 10260000. ,  2437000. ,
        5241400. ,  8950300. ,  4100000. ,  3619500. ,  7129500. ,
        4050000. ,  9025400. , 11917200. ,  6234400. ,  7123200. ,
        2755300. ,  1901300. ,  2570100. ,  2055200. ,  5940800. ,
        2982500. ,  4365400. ,  2865000. ,  2810200. ,  3540000. ,
        7303500. ,  2186200. ,  2780300. ,  1690100. ,  5260200. ,
        7100000. ,  2635300. ,  2250800. ,  4278711.5,  2529950. ,
        2860000. , 12041233.6,  3670300. ,  2725000. ,  1987700. ,
        3399979. ,  5726000. ,  2665100. ])