In [2]:
import pandas as pd 
df1 = pd.read_csv("bengaluru_house_prices.csv")
df1.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


### Data Cleaning

In [3]:
df2 = df1.drop(['availability', 'society', 'balcony'], axis=1)
df2.head()

Unnamed: 0,area_type,location,size,total_sqft,bath,price
0,Super built-up Area,Electronic City Phase II,2 BHK,1056,2.0,39.07
1,Plot Area,Chikka Tirupathi,4 Bedroom,2600,5.0,120.0
2,Built-up Area,Uttarahalli,3 BHK,1440,2.0,62.0
3,Super built-up Area,Lingadheeranahalli,3 BHK,1521,3.0,95.0
4,Super built-up Area,Kothanur,2 BHK,1200,2.0,51.0


In [4]:
df3 = df2.dropna()
df3.head()

Unnamed: 0,area_type,location,size,total_sqft,bath,price
0,Super built-up Area,Electronic City Phase II,2 BHK,1056,2.0,39.07
1,Plot Area,Chikka Tirupathi,4 Bedroom,2600,5.0,120.0
2,Built-up Area,Uttarahalli,3 BHK,1440,2.0,62.0
3,Super built-up Area,Lingadheeranahalli,3 BHK,1521,3.0,95.0
4,Super built-up Area,Kothanur,2 BHK,1200,2.0,51.0


In [5]:
df3['bhk'] = df3['size'].apply(lambda x: int(x.split(' ')[0]))
df3.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df3['bhk'] = df3['size'].apply(lambda x: int(x.split(' ')[0]))


Unnamed: 0,area_type,location,size,total_sqft,bath,price,bhk
0,Super built-up Area,Electronic City Phase II,2 BHK,1056,2.0,39.07,2
1,Plot Area,Chikka Tirupathi,4 Bedroom,2600,5.0,120.0,4
2,Built-up Area,Uttarahalli,3 BHK,1440,2.0,62.0,3
3,Super built-up Area,Lingadheeranahalli,3 BHK,1521,3.0,95.0,3
4,Super built-up Area,Kothanur,2 BHK,1200,2.0,51.0,2


In [6]:
def range_to_num(x):
    tokens = x.split('-')
    if(len(tokens)==2):
        return (float(tokens[0]) + float(tokens[1]))/2
    try:
        return float(x)
    except:
        return None

df4 = df3.copy()
df4['total_sqft'] = df4['total_sqft'].apply(range_to_num)
df4.head()

Unnamed: 0,area_type,location,size,total_sqft,bath,price,bhk
0,Super built-up Area,Electronic City Phase II,2 BHK,1056.0,2.0,39.07,2
1,Plot Area,Chikka Tirupathi,4 Bedroom,2600.0,5.0,120.0,4
2,Built-up Area,Uttarahalli,3 BHK,1440.0,2.0,62.0,3
3,Super built-up Area,Lingadheeranahalli,3 BHK,1521.0,3.0,95.0,3
4,Super built-up Area,Kothanur,2 BHK,1200.0,2.0,51.0,2


### Feature Engineering

In [7]:
df5 = df4.copy()
df5['price_per_sqft'] = ((df5['price'] * 100000)/df5['total_sqft'])

In [8]:
df5['location'] = df5['location'].apply(lambda x: x.strip())

locations = df5.groupby('location')['location'].agg('count').sort_values(ascending = False)

locations_lessthan_10 = locations[locations<=10]

df5['location'] = df5['location'].apply(lambda x: "other" if x in locations_lessthan_10 else x)

df5

Unnamed: 0,area_type,location,size,total_sqft,bath,price,bhk,price_per_sqft
0,Super built-up Area,Electronic City Phase II,2 BHK,1056.0,2.0,39.07,2,3699.810606
1,Plot Area,Chikka Tirupathi,4 Bedroom,2600.0,5.0,120.00,4,4615.384615
2,Built-up Area,Uttarahalli,3 BHK,1440.0,2.0,62.00,3,4305.555556
3,Super built-up Area,Lingadheeranahalli,3 BHK,1521.0,3.0,95.00,3,6245.890861
4,Super built-up Area,Kothanur,2 BHK,1200.0,2.0,51.00,2,4250.000000
...,...,...,...,...,...,...,...,...
13315,Built-up Area,Whitefield,5 Bedroom,3453.0,4.0,231.00,5,6689.834926
13316,Super built-up Area,other,4 BHK,3600.0,5.0,400.00,4,11111.111111
13317,Built-up Area,Raja Rajeshwari Nagar,2 BHK,1141.0,2.0,60.00,2,5258.545136
13318,Super built-up Area,Padmanabhanagar,4 BHK,4689.0,4.0,488.00,4,10407.336319


### Outlier Removal

In [9]:
df6 = df5[(df5['total_sqft']/df5['bhk'])>=300]
df6.shape

(12456, 8)

In [10]:
import numpy as np

def remove_outliers(df):
    df_out = pd.DataFrame()

    for key, subdf in df.groupby('location'):
        m = np.mean(subdf['price_per_sqft'])
        std = np.std(subdf['price_per_sqft'])

        reduceddf = subdf[(subdf['price_per_sqft']>(m-std)) & (subdf['price_per_sqft']<=(m+std))]
        df_out = pd.concat([df_out, reduceddf], ignore_index=True)
    
    return df_out

df7 = remove_outliers(df6)
df7.shape

(10241, 8)

In [11]:
def remove_bhk_outliers(df):
    exclude_indices = np.array([])
    for location, locationdf in df.groupby('location'):
        bhk_stats = {}
        for bhk, bhkdf in locationdf.groupby('bhk'):
            bhk_stats[bhk] = {
                'mean' : np.mean(bhkdf['price_per_sqft']),
                'std' : np.std(bhkdf['price_per_sqft']),
                'count' : bhkdf.shape[0]
            }
        for bhk, bhkdf in locationdf.groupby('bhk'):
            stats = bhk_stats.get(bhk-1)
            if stats and stats['count']>5 :
                exclude_indices = np.append(exclude_indices, bhkdf[bhkdf['price_per_sqft']<stats['mean']].index.values)
    return df.drop(exclude_indices, axis=0)

df8 = remove_bhk_outliers(df7)
df8.shape

(7329, 8)

In [12]:
df9 = df8[df8['bath']<df8['bhk']+2]
df9.shape

(7251, 8)

In [13]:
df10 = df9.drop(['price_per_sqft', 'size', 'area_type'], axis=1)
df10.isnull().sum()

location      0
total_sqft    0
bath          0
price         0
bhk           0
dtype: int64

In [14]:
dummies = pd.get_dummies(df10['location'], dtype=int)
df11 = pd.concat([df10, dummies], axis=1)
df11.drop('location', axis=1, inplace=True)
df11.head()

Unnamed: 0,total_sqft,bath,price,bhk,1st Block Jayanagar,1st Phase JP Nagar,2nd Phase Judicial Layout,2nd Stage Nagarbhavi,5th Block Hbr Layout,5th Phase JP Nagar,...,Vishveshwarya Layout,Vishwapriya Layout,Vittasandra,Whitefield,Yelachenahalli,Yelahanka,Yelahanka New Town,Yelenahalli,Yeshwanthpur,other
0,2850.0,4.0,428.0,4,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1630.0,3.0,194.0,3,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1875.0,2.0,235.0,3,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1200.0,2.0,130.0,3,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1235.0,2.0,148.0,2,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
x = df11.drop('price', axis=1)
y = df11['price']

In [16]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ShuffleSplit

def find_best_model(x, y):
    algos = {
        'linear_regression': {
            'model' : LinearRegression(),
            'params' : {
                'fit_intercept' : [True, False]
            }
        },

        'lasso': {
            'model' : Lasso(),
            'params' : {
                'alpha': [1, 2],
                'selection': ['random', 'cyclic']
            }
        },

        'decision_tree': {
            'model' : DecisionTreeRegressor(),
            'params' : {
                'criterion': ['friedman_mse', 'poisson', 'absolute_error', 'squared_error'],
                'splitter': ['best', 'random']
            }
        }
    }
    scores = []
    cv = ShuffleSplit(n_splits=5, test_size=0.25, random_state=10)
    for algo_name, config in algos.items():
        gs = GridSearchCV(config['model'], config['params'], cv=cv, return_train_score=False)
        gs.fit(x, y)
        scores.append({
            'model': algo_name,
            'best_score': gs.best_score_,
            'best_params': gs.best_params_
        })

    return pd.DataFrame(scores, columns=['model', 'best_score', 'best_params'])

find_best_model(x, y)

Unnamed: 0,model,best_score,best_params
0,linear_regression,0.852154,{'fit_intercept': True}
1,lasso,0.720564,"{'alpha': 1, 'selection': 'cyclic'}"
2,decision_tree,0.742068,"{'criterion': 'friedman_mse', 'splitter': 'ran..."


In [17]:
from sklearn.model_selection import train_test_split
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.25, random_state=20)
model = LinearRegression()
model.fit(xtrain, ytrain)
model.score(xtest, ytest)

0.8088074415676425

In [18]:
def predict_price(location, sqft, bath, bhk):
    loc_index = np.where(x.columns==location)[0][0]

    x = np.zeros(len(x.columns))
    x[0] = sqft
    x[1] = bath
    x[2] = bhk
    
    if loc_index>=0:
        x[loc_index] = 1
    
    return model.predict([x])[0]

In [19]:
import pickle

with open("House_Prediction_model.pickle", "wb") as f:
    pickle.dump(model, f)

In [20]:
import json

columns = {
    'data_columns': [col.strip() for col in x.columns]
}

with open("columns.json", "w") as f:
    f.write(json.dumps(columns))