In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# NOTE: despite the name, this is the path of the CSV file itself, not a directory.
data_dir = './data/processed_data.csv'

In [3]:
# Load the processed listings and preview the first five rows.
df = pd.read_csv(filepath_or_buffer=data_dir)

df.head(n=5)

Unnamed: 0,location,total_sqft,bath,balcony,price,bhk
0,1st Block Jayanagar,2850.0,4.0,1.0,428.0,4
1,1st Block Jayanagar,1630.0,3.0,2.0,194.0,3
2,1st Block Jayanagar,1875.0,2.0,3.0,235.0,3
3,1st Block Jayanagar,1200.0,2.0,0.0,130.0,3
4,1st Block Jayanagar,1235.0,2.0,2.0,148.0,2


## Data Transformation

### 1. Converting location values to **one-hot encoded** vectors

* Using pandas get_dummies method the location can be one-hot encoded into a new DF.
* Drop one dummy column (here `other`): a row with 0 in every remaining dummy column then represents that dropped value.
* Append the new DF to the existing DF.
* Drop the location column.

In [4]:
# One indicator column per distinct location value.
dummies = pd.get_dummies(data=df['location'])

dummies.head(n=3)

Unnamed: 0,1st Block Jayanagar,1st Phase JP Nagar,2nd Phase Judicial Layout,2nd Stage Nagarbhavi,5th Block Hbr Layout,5th Phase JP Nagar,6th Phase JP Nagar,7th Phase JP Nagar,8th Phase JP Nagar,9th Phase JP Nagar,...,Vishveshwarya Layout,Vishwapriya Layout,Vittasandra,Whitefield,Yelachenahalli,Yelahanka,Yelahanka New Town,Yelenahalli,Yeshwanthpur,other
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
# Leave out the 'other' indicator (it becomes the all-zeros baseline) and
# place the remaining indicators next to the original columns.
location_indicators = dummies.drop(columns=['other'])
df1 = pd.concat([df, location_indicators], axis=1)

df1.head(n=5)

Unnamed: 0,location,total_sqft,bath,balcony,price,bhk,1st Block Jayanagar,1st Phase JP Nagar,2nd Phase Judicial Layout,2nd Stage Nagarbhavi,...,Vijayanagar,Vishveshwarya Layout,Vishwapriya Layout,Vittasandra,Whitefield,Yelachenahalli,Yelahanka,Yelahanka New Town,Yelenahalli,Yeshwanthpur
0,1st Block Jayanagar,2850.0,4.0,1.0,428.0,4,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1st Block Jayanagar,1630.0,3.0,2.0,194.0,3,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1st Block Jayanagar,1875.0,2.0,3.0,235.0,3,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1st Block Jayanagar,1200.0,2.0,0.0,130.0,3,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1st Block Jayanagar,1235.0,2.0,2.0,148.0,2,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
# Sanity check: a row whose location is 'other' should have 0 in every
# location indicator column.
is_other = df1['location'] == 'other'
df1.loc[is_other].head(n=1)

Unnamed: 0,location,total_sqft,bath,balcony,price,bhk,1st Block Jayanagar,1st Phase JP Nagar,2nd Phase Judicial Layout,2nd Stage Nagarbhavi,...,Vijayanagar,Vishveshwarya Layout,Vishwapriya Layout,Vittasandra,Whitefield,Yelachenahalli,Yelahanka,Yelahanka New Town,Yelenahalli,Yeshwanthpur
6177,other,2770.0,4.0,2.0,290.0,3,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
# The text column is no longer needed once the indicator columns exist.
df2 = df1.drop(columns=['location'])

df2.shape

(7320, 245)

### 2. Split the data into X(all - price) and y(price) for training

In [10]:
# Features: every column except the target.
X = df2.drop(columns=['price'])

X.shape

(7320, 244)

In [11]:
# Target: the price column.
y = df2.loc[:, 'price']

y.shape

(7320,)

### 3. Split X and y into training and testing

* Used sklearn train_test_split
* Split ratio train:test :: 80:20
* random state to be 10

In [13]:
from sklearn.model_selection import train_test_split

In [23]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=10,
)

divider = '-' * 50
print(f'  Training set shape: X={X_train.shape} | y={y_train.shape}')
print(divider)
print(f'  Test set shape:     X={X_test.shape} | y={y_test.shape}')

  Training set shape: X=(5856, 244) | y=(5856,)
--------------------------------------------------
  Test set shape:     X=(1464, 244) | y=(1464,)


## Model Building

### 1. Building a Baseline Linear Regression model

In [24]:
from sklearn.linear_model import LinearRegression

In [25]:
# Fit the baseline on the training split, then score it on the held-out split.
model_lr = LinearRegression().fit(X_train, y_train)

model_lr.score(X_test, y_test)

0.8783116278465859

##### So, the baseline has an R² score of about 0.878 on the test set, which is decent.
##### Now, building multiple models to achieve results better than baseline model

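### 2. Trying repeated random-split cross validation with Linear Regression

* Used ShuffleSplit and cross_val_score from sklearn
* Number of splits = 5 (ShuffleSplit draws 5 independent random 80/20 splits rather than K disjoint folds)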

In [26]:
from sklearn.model_selection import ShuffleSplit, cross_val_score

# Five random 80/20 shuffles; the fixed seed keeps the splits reproducible.
cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)

cross_val_score(estimator=LinearRegression(), X=X, y=y, cv=cv)

array([0.85488175, 0.85752843, 0.87218901, 0.82540903, 0.85418445])

### 3. Trying Lasso and Decision tree regressions

* Used GridSearchCV from sklearn to enable comparison of different models with relevant hyperparameter tuning.
* Used the same ShuffleSplit cross-validation scheme (5 random 80/20 splits)

In [28]:
from sklearn.linear_model import Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV

In [29]:
# Method to perform Grid Search with Cross Validation
# Compares Linear, Lasso and DecisionTree Regression
def grid_search_cv(X, y):
    """Grid-search three regressors and report the best configuration of each.

    Each candidate (LinearRegression, Lasso, DecisionTreeRegressor) is tuned
    with GridSearchCV over a small hyperparameter grid, using five random
    80/20 ShuffleSplit splits with a fixed seed.

    Parameters
    ----------
    X : feature matrix passed straight to ``GridSearchCV.fit``.
    y : target values passed straight to ``GridSearchCV.fit``.

    Returns
    -------
    pandas.DataFrame
        One row per candidate with the columns ``model``, ``best_score``
        (``GridSearchCV.best_score_``) and ``best_params``
        (``GridSearchCV.best_params_``).
    """
    # Function-scope imports: only the version check below needs them.
    import re
    import sklearn

    version_match = re.match(r'(\d+)\.(\d+)', sklearn.__version__)
    sk_version = tuple(int(part) for part in version_match.groups()) if version_match else (0, 0)

    # `normalize` was removed from LinearRegression in scikit-learn 1.2.
    # Keep the original grid where the keyword still exists, otherwise tune
    # `fit_intercept` so the search still runs.
    if 'normalize' in LinearRegression().get_params():
        linear_params = {'normalize': [True, False]}
    else:
        linear_params = {'fit_intercept': [True, False]}

    # The squared-error criterion is spelled 'squared_error' from
    # scikit-learn 1.0 on; the old 'mse' alias was removed in 1.2.
    squared_error_name = 'squared_error' if sk_version >= (1, 0) else 'mse'

    algorithms = {
        'linear_regression': {
            'model': LinearRegression(),
            'params': linear_params
        },
        'lasso': {
            'model': Lasso(),
            'params': {
                'alpha': [1, 2],
                'selection': ['random', 'cyclic']
            }
        },
        'decision_tree': {
            'model': DecisionTreeRegressor(),
            'params': {
                'criterion': [squared_error_name, 'friedman_mse'],
                'splitter': ['best', 'random']
            }
        }
    }

    scores = []

    # Same splitter for every candidate so their scores are comparable.
    cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)

    for alg_name, alg_config in algorithms.items():
        grid = GridSearchCV(alg_config['model'], alg_config['params'], cv=cv, return_train_score=False)
        grid.fit(X, y)
        scores.append({
            'model': alg_name,
            'best_score': grid.best_score_,
            'best_params': grid.best_params_
        })

    return pd.DataFrame(scores, columns=['model', 'best_score', 'best_params'])

In [30]:
# Compare the three candidate models on the full feature matrix and target.
grid_search_cv(X, y)

Unnamed: 0,model,best_score,best_params
0,linear_regression,0.852839,{'normalize': True}
1,lasso,0.682179,"{'alpha': 1, 'selection': 'cyclic'}"
2,decision_tree,0.732495,"{'criterion': 'mse', 'splitter': 'best'}"


#### From the comparison, LinearRegression with normalize set to True looks like the best model choice for this dataset

In [31]:
# The grid search picked normalize=True, but that keyword was removed from
# LinearRegression in scikit-learn 1.2, so pass it only where it still exists.
final_kwargs = {'normalize': True} if 'normalize' in LinearRegression().get_params() else {}
model_final = LinearRegression(**final_kwargs)

# Train the final model on the entire dataset
model_final.fit(X, y)

LinearRegression(normalize=True)

In [32]:
# Preview the feature matrix: predict_house_price fills its first four positions by index.
X.head()

Unnamed: 0,total_sqft,bath,balcony,bhk,1st Block Jayanagar,1st Phase JP Nagar,2nd Phase Judicial Layout,2nd Stage Nagarbhavi,5th Block Hbr Layout,5th Phase JP Nagar,...,Vijayanagar,Vishveshwarya Layout,Vishwapriya Layout,Vittasandra,Whitefield,Yelachenahalli,Yelahanka,Yelahanka New Town,Yelenahalli,Yeshwanthpur
0,2850.0,4.0,1.0,4,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1630.0,3.0,2.0,3,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1875.0,2.0,3.0,3,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1200.0,2.0,0.0,3,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1235.0,2.0,2.0,2,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [33]:
def predict_house_price(location, sqft, bath, balcony, bhk, model=None, columns=None):
    """Predict the price of a single listing.

    Parameters
    ----------
    location : str
        Name of a location indicator column. A location with no indicator
        column (for example 'other', whose indicator was dropped) is encoded
        as all zeros instead of raising ``IndexError``.
    sqft, bath, balcony, bhk : numbers
        Numeric features, written to positions 0-3 of the feature vector.
    model : object with a ``predict`` method, optional
        Defaults to the notebook-level ``model_final``.
    columns : sequence of str, optional
        Feature names in training order. Defaults to ``X.columns``.

    Returns
    -------
    The first element of ``model.predict`` for the one-row input.
    """
    if model is None:
        model = model_final
    if columns is None:
        columns = X.columns
    columns = list(columns)

    x = np.zeros(len(columns))

    x[0] = sqft
    x[1] = bath
    x[2] = balcony
    x[3] = bhk

    # Positions 0-3 hold the numeric features, so only later columns can be
    # location indicators; anything else stays at the all-zeros baseline.
    if location in columns[4:]:
        x[columns.index(location, 4)] = 1

    return model.predict([x])[0]

In [36]:
# Example query for a listing in Rajaji Nagar.
features = dict(
    location='Rajaji Nagar',
    sqft=1000,
    bath=2,
    balcony=1,
    bhk=2,
)

predict_house_price(**features)

224.93574204306367

In [56]:
# Example query for a listing in 1st Phase JP Nagar.
features = dict(
    location='1st Phase JP Nagar',
    sqft=1200,
    bath=2,
    balcony=1,
    bhk=2,
)

predict_house_price(**features)

101.54052522181915

## Export the model to a pickle file for later usage 

In [57]:
import pickle

In [58]:
# Serialise the fitted estimator to disk.
model_path = 'bangalore_housing_price_model.pickle'
with open(model_path, 'wb') as model_file:
    pickle.dump(model_final, model_file)

## Export the input data structure like columns and their indices to a json file for later usage

In [60]:
import json

In [61]:
# Store the lower-cased feature names in the same order as X's columns.
columns = {'data_columns': [name.lower() for name in X.columns]}

with open('columns.json', 'w') as columns_file:
    json.dump(columns, columns_file)