In [1]:
import numpy as np
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn import model_selection

In [2]:
data = pd.read_csv("house_price_dataset_new.csv")
data.head()

Unnamed: 0,h_type,location,society,size,bathroom,balcony,total_sqft,yr_built,furniture,sale_type,...,college,hospital,population,railway,airport,on_road,air_quality,restaurant,park,price
0,apartment,Maneja,Bakeri Swara,3 BHK,3,1,1550.0,2011.0,0,new,...,0,1,2,0,0,1,1,1,1,4361705
1,apartment,Maneja,Bakeri Swara,3 BHK,3,1,1550.0,2012.0,1,new,...,0,1,2,0,0,1,1,1,1,5001905
2,apartment,Maneja,Bakeri Swara,3 BHK,3,1,1860.0,2010.0,1,new,...,0,1,2,0,0,1,1,1,1,5588795
3,apartment,Maneja,Bakeri Swara,2 BHK,2,1,1015.0,2016.0,1,new,...,0,1,2,0,0,1,1,1,1,3184740
4,apartment,Maneja,Bakeri Swara,2 BHK,2,1,1210.0,2019.0,0,new,...,0,1,2,0,0,1,1,1,1,3498895


In [3]:
data = data.drop(['society', 'yr_built', 'furniture', 'sale_type', 'amenities', 'market', 'office', 'school', 'college', 'hospital', 'population', 'railway', 'airport', 'on_road', 'air_quality', 'restaurant', 'park'], axis=1)
data.head()

Unnamed: 0,h_type,location,size,bathroom,balcony,total_sqft,price
0,apartment,Maneja,3 BHK,3,1,1550.0,4361705
1,apartment,Maneja,3 BHK,3,1,1550.0,5001905
2,apartment,Maneja,3 BHK,3,1,1860.0,5588795
3,apartment,Maneja,2 BHK,2,1,1015.0,3184740
4,apartment,Maneja,2 BHK,2,1,1210.0,3498895


In [4]:
data.isnull().sum()

h_type        0
location      0
size          0
bathroom      0
balcony       0
total_sqft    0
price         0
dtype: int64

In [5]:
df = data.iloc[ : , :-1].values
df

array([['apartment', 'Maneja', '3 BHK', 3, 1, 1550.0],
       ['apartment', 'Maneja', '3 BHK', 3, 1, 1550.0],
       ['apartment', 'Maneja', '3 BHK', 3, 1, 1860.0],
       ...,
       ['apartment', 'Gotri', '3 BHK', 3, 3, 1550.0],
       ['apartment', 'Gotri', '3 BHK', 3, 2, 1750.0],
       ['apartment', 'Gotri', '2 BHK', 2, 3, 1200.0]], dtype=object)

In [6]:
label_encoder = LabelEncoder()

In [7]:
df[ : ,0] = label_encoder.fit_transform(df[ : ,0])
df

array([[0, 'Maneja', '3 BHK', 3, 1, 1550.0],
       [0, 'Maneja', '3 BHK', 3, 1, 1550.0],
       [0, 'Maneja', '3 BHK', 3, 1, 1860.0],
       ...,
       [0, 'Gotri', '3 BHK', 3, 3, 1550.0],
       [0, 'Gotri', '3 BHK', 3, 2, 1750.0],
       [0, 'Gotri', '2 BHK', 2, 3, 1200.0]], dtype=object)

In [8]:
mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
mapping

{'apartment': 0,
 'duplex': 1,
 'pent house': 2,
 'tenament': 3,
 'triplex': 4,
 'villa': 5}

In [9]:
df[ : ,1] = label_encoder.fit_transform(df[ : ,1])
df

array([[0, 15, '3 BHK', 3, 1, 1550.0],
       [0, 15, '3 BHK', 3, 1, 1550.0],
       [0, 15, '3 BHK', 3, 1, 1860.0],
       ...,
       [0, 8, '3 BHK', 3, 3, 1550.0],
       [0, 8, '3 BHK', 3, 2, 1750.0],
       [0, 8, '2 BHK', 2, 3, 1200.0]], dtype=object)

In [10]:
mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
mapping

{'Ajwa Road': 0,
 'Akota': 1,
 'Alkapuri': 2,
 'Atladra': 3,
 'Bhayli': 4,
 'Chhani': 5,
 'Fatehgunj': 6,
 'Gorwa': 7,
 'Gotri': 8,
 'Harni': 9,
 'Karelibaug': 10,
 'Khodiyar Nagar': 11,
 'Laxmipura': 12,
 'Madhav Pura': 13,
 'Mandvi': 14,
 'Maneja': 15,
 'Manjalpur': 16,
 'Navapura': 17,
 'New Alkapuri': 18,
 'New Karelibaugh': 19,
 'New Sama': 20,
 'New VIP Road': 21,
 'Sama': 22,
 'Sayajipura': 23,
 'Soma Talav': 24,
 'Vasant Vihar': 25,
 'Vasna Road': 26,
 'Vasna-Bhayli Road': 27,
 'Waghodia Road': 28}

In [11]:
df[ : ,2] = label_encoder.fit_transform(df[ : ,2])
df

array([[0, 15, 2, 3, 1, 1550.0],
       [0, 15, 2, 3, 1, 1550.0],
       [0, 15, 2, 3, 1, 1860.0],
       ...,
       [0, 8, 2, 3, 3, 1550.0],
       [0, 8, 2, 3, 2, 1750.0],
       [0, 8, 1, 2, 3, 1200.0]], dtype=object)

In [12]:
mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
mapping

{'1 BHK': 0, '2 BHK': 1, '3 BHK': 2, '4 BHK': 3, '5 BHK': 4}

In [13]:
y = data.price.values
X = df

In [14]:
X_train, X_val, y_train, y_val = train_test_split(X, y,test_size=0.2)

In [15]:
standard_x = StandardScaler()

In [16]:
X_train = standard_x.fit_transform(X_train)
X_val = standard_x.transform(X_val)

In [17]:
print('Train_Shape: ',X_train.shape)
print("\nX_train:")
X_train

Train_Shape:  (272, 6)

X_train:


array([[-0.4349866 , -1.42249897, -1.98005874, -1.7095864 , -0.60745531,
        -0.96062469],
       [-0.4349866 , -1.20451603, -0.8243163 , -1.7095864 , -0.60745531,
        -0.82875915],
       [-0.4349866 , -1.20451603, -0.8243163 , -0.65753323,  2.67521711,
        -0.63096083],
       ...,
       [-0.4349866 , -0.98653309,  0.33142614,  0.39451994,  0.48676883,
        -0.0150828 ],
       [-0.4349866 , -0.76855015,  0.33142614,  0.39451994,  1.58099297,
         0.22616521],
       [-0.4349866 ,  1.52027073,  0.33142614, -0.65753323, -0.60745531,
        -0.14305831]])

In [18]:
print('Val_Shape: ',X_val.shape)
print("\nX_val:")
X_val

Val_Shape:  (68, 6)

X_val:


array([[-0.4349866 , -0.33258427,  1.48716858,  1.44657311,  2.67521711,
         1.42469116],
       [-0.4349866 , -0.44157574, -1.98005874, -1.7095864 , -0.60745531,
        -0.99359108],
       [-0.4349866 , -0.44157574, -0.8243163 , -0.65753323, -0.60745531,
        -0.70216822],
       [-0.4349866 , -1.3135075 ,  0.33142614,  0.39451994,  0.48676883,
         0.05474001],
       [-0.4349866 , -0.76855015,  0.33142614,  0.39451994,  0.48676883,
         0.0283669 ],
       [-0.4349866 , -0.76855015, -0.8243163 , -0.65753323, -0.60745531,
        -0.89469192],
       [-0.4349866 ,  0.53934749, -0.8243163 , -0.65753323,  1.58099297,
         0.04814673],
       [ 0.33330142,  1.6292622 , -0.8243163 , -0.65753323,  0.48676883,
        -1.0463373 ],
       [-0.4349866 ,  0.86632191, -0.8243163 , -0.65753323, -0.60745531,
        -0.2617373 ],
       [-0.4349866 , -0.76855015, -0.8243163 , -0.65753323, -0.60745531,
        -0.63096083],
       [-0.4349866 , -0.98653309,  0.33142614,  0.

In [19]:
regressor = RandomForestRegressor(n_estimators = 10, random_state = 0)

In [20]:
regressor.fit(X_train, y_train)

In [29]:
filename = 'vadodara_house_model.pkl'

In [30]:
pickle.dump(regressor, open(filename, 'wb'))

In [31]:
loaded_model = pickle.load(open(filename, 'rb'))

In [32]:
accuracy = loaded_model.score(X_val, y_val)

In [33]:
print(accuracy*100, '%')

66.38652868872579 %


In [34]:
predictions = loaded_model.predict(X_val)

In [35]:
predictions

array([12670000.        ,  1991200.        ,  3125000.        ,
        6022400.        ,  5951900.        ,  3470000.        ,
        5328800.        ,  2753450.        ,  3035500.        ,
        2711000.        ,  6766400.        ,  4695571.5       ,
        3125300.        ,  2812000.        ,  2305000.        ,
        2768800.        ,  3195000.        ,  5240333.5       ,
        7750000.        ,  3430200.        ,  5363800.        ,
       14280000.        ,  3455789.5       ,  8534000.        ,
        5335190.5       ,  5407300.        ,  6404600.        ,
        2765093.33333333,  2837500.        ,  2547543.33333333,
        3004800.        ,  1512100.        ,  6597100.        ,
       16990000.        ,  3400000.        ,  4582000.        ,
        3734000.        , 10063000.        ,  3290000.        ,
        3085000.        ,  3950000.        ,  3660200.        ,
        7198800.        ,  8616000.        ,  3395000.        ,
        3219000.        ,  7325000.     