In [1]:
# imports

import pandas as pd
import numpy as np
import pickle
import json
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Lasso
from sklearn.tree import DecisionTreeRegressor
from src.viz import visualize_cross_validation


import warnings
warnings.filterwarnings("ignore")
%matplotlib inline
sns.set_style('whitegrid')
plt.rcParams['figure.dpi'] = 300

In [2]:
df = pd.read_csv("..\Data\house_prices_cleaned.csv")

In [3]:
df.shape

(6907, 6)

**Applying one-hot encoding on location attribute**

In [4]:
dummy = pd.get_dummies(df['location'])\
          .drop('other', axis=1)

# joining it back to the original dataframe
print(df.shape)
df = pd.concat([df, dummy], axis=1)\
        .drop('location', axis=1)
print(df.shape)
df.head()

(6907, 6)
(6907, 195)


Unnamed: 0,total_sqft,bath,price,bhk,price_per_sqft,Devarachikkanahalli,1st Phase JP Nagar,5th Phase JP Nagar,6th Phase JP Nagar,7th Phase JP Nagar,...,Vasanthapura,Vidyaranyapura,Vijayanagar,Vittasandra,Whitefield,Yelachenahalli,Yelahanka,Yelahanka New Town,Yelenahalli,Yeshwanthpur
0,1056.0,2.0,39.07,2,3699.810606,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,1440.0,2.0,62.0,3,4305.555556,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,1521.0,3.0,95.0,3,6245.890861,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,1200.0,2.0,51.0,2,4250.0,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,1170.0,2.0,38.0,2,3247.863248,False,False,False,False,False,...,False,False,False,False,True,False,False,False,False,False


# Splitting the data

In [5]:
# creating independent and dependent variable
X = df.drop('price', axis=1)
y = df['price']
# splitting the data into train and test
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2,random_state=10)

In [6]:

# creating a linear model
model = LinearRegression()
# training the model using training data
model.fit(X_train,y_train)
# checking the R^2 score
model.score(X_test,y_test)

0.9867420403939049

The model has given suprisingly good result, let's see if we can increase it

In [9]:
# testing the model
def predict_price(location, sqft, bath, bhk):
    location_index = np.where(X.columns==location)[0][0]

    x = np.zeros(len(X.columns))
    x[0] = sqft
    x[1] = bath
    x[2] = bhk
    if location_index >= 0:
        x[location_index] = 1
    
    return model.predict([x])[0]

In [11]:
predict_price('Vijayanagar',10000,2,4)

464.7768992320787

# Exporting the model

In [13]:
# exporting the model into pickle file
with open('../Data/final/hpp-lm.pickle','wb') as f:
    pickle.dump(model,f)

In [14]:
columns = {'data_columns': [col.lower() for col in X.columns]}   
# writing the column names in the jsonfile
with open('../data/final/columns.json','w') as f:
    f.write(json.dumps(columns)) 