## **Importing Python Libraries**

In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline
import matplotlib 
import re
matplotlib.rcParams["figure.figsize"] = (20,10)

## **Importing DataSet**

In [2]:
df = pd.read_csv("Bengaluru_House_Data.csv")
df.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056.0,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600.0,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440.0,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521.0,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200.0,2.0,1.0,51.0


## **Dropping Unnecessary Features (Columns)**

In [3]:
df = df.drop(['area_type','society','balcony','availability'],axis='columns')
df.shape

(13320, 5)

## **Handling Missing Values.**

In [4]:
df.isnull().sum()

location       1
size          16
total_sqft     0
bath          73
price          0
dtype: int64

In [5]:
df['location']=df['location'].fillna(df['location'].mode()[0])
df['size']=df['size'].fillna(df['size'].mode()[0])
df['bath']=df['bath'].fillna(df['bath'].mean())


In [6]:
df.shape

(13320, 5)

## **Feature Engineering**

### **Converting the Object-Type 'size' Column into a Numeric 'BHK' Column**

In [7]:
df['BHK'] = pd.to_numeric(df['size'].str.split(' ').str[0])

### **Converting Impure Data from the 'total_sqft' Column into Number Format**

In [8]:
def is_float(x):
    """Return True if ``x`` can be converted with ``float()``, otherwise False.

    Used to find ``total_sqft`` entries that are not plain numbers, such as
    ranges (``'700 - 900'``) or values that carry a unit suffix.
    """
    try:
        float(x)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures mean "not a float"; anything else
        # (e.g. KeyboardInterrupt) must propagate instead of being swallowed.
        return False
    return True

In [9]:
df[~df['total_sqft'].apply(is_float)].sample(5)

Unnamed: 0,location,size,total_sqft,bath,price,BHK
7607,Bommenahalli,3 Bedroom,15Acres,3.0,217.0,3
8289,Kanakpura Road,2 BHK,700 - 900,2.69261,41.145,2
9476,Hoodi,1 BHK,706 - 716,1.0,42.65,1
6481,Chandapura,3 BHK,1100 - 1225,2.69261,30.215,3
2862,Devanahalli,2 BHK,1500 - 2400,2.69261,46.8,2


In [10]:
def convert_rangeTo_sqft(x):
    """Convert one raw ``total_sqft`` entry to a single square-foot value.

    Supported inputs:
      * a plain number (``'1056'`` or an actual int/float) -> that number as float
      * a two-part range ``'low - high'``                  -> midpoint of the bounds
      * a number followed by ``'Sq. Meter'`` or ``'Sq. Feet'`` -> square feet

    Everything else (unsupported units such as ``'15Acres'``, malformed text,
    ``None``) returns ``None`` so the caller can drop the row.
    """
    # Multipliers to square feet for the units that are converted.
    # Other units were deliberately left disabled in the original code
    # (Sq. Yards 9, Acres 43560, Cents 435.56, Guntha 1089, Grounds 2190,
    # Perch 272.25); add them here to enable them.
    unit_to_sqft = {'Sq. Meter': 10.7639, 'Sq. Feet': 1.0}

    if not isinstance(x, str):
        # Non-string input has no .split(); try a direct numeric conversion.
        try:
            return float(x)
        except (TypeError, ValueError):
            return None

    val = x.split('-')
    if len(val) == 2:
        try:
            return (float(val[0]) + float(val[1])) / 2
        except ValueError:
            # Not a purely numeric range; fall through to the other parsers
            # instead of raising out of DataFrame.apply.
            pass

    # "<number><unit>" form, e.g. "34.46Sq. Meter".
    match = re.search(r'(\d+(\.\d+)?)(\D+)', x)
    if match:
        factor = unit_to_sqft.get(match.group(3).strip())
        if factor is not None:
            return float(match.group(1)) * factor

    try:
        return float(x)
    except ValueError:
        return None

In [11]:
# Normalise every total_sqft entry to a float; entries that cannot be
# converted come back as None and are dropped on the next line.
df['total_sqft'] = df['total_sqft'].apply(convert_rangeTo_sqft)
# .copy() makes the filtered frame independent of the frame it was sliced
# from, so later column assignments (e.g. price_per_sqft) do not write to a
# slice and do not raise SettingWithCopyWarning.
df = df[df.total_sqft.notnull()].copy()

### **Add new feature called price per square feet**

In [12]:
df['price_per_sqft'] = df['price']*100000/df['total_sqft']
df.head()

Unnamed: 0,location,size,total_sqft,bath,price,BHK,price_per_sqft
0,Electronic City Phase II,2 BHK,1056.0,2.0,39.07,2,3699.810606
1,Chikka Tirupathi,4 Bedroom,2600.0,5.0,120.0,4,4615.384615
2,Uttarahalli,3 BHK,1440.0,2.0,62.0,3,4305.555556
3,Lingadheeranahalli,3 BHK,1521.0,3.0,95.0,3,6245.890861
4,Kothanur,2 BHK,1200.0,2.0,51.0,2,4250.0


In [13]:
df.to_csv("bhp.csv",index=False)

## **Dimensionality Reduction**

**Examine `location`, which is a categorical variable. We need to apply a dimensionality reduction technique here to reduce the number of locations.**

In [14]:
# df.location = df.location.apply(lambda x: x.strip())
location_stats = df['location'].value_counts(ascending=False)
location_stats

location
Whitefield               539
Sarjapur  Road           399
Electronic City          302
Kanakpura Road           271
Thanisandra              234
                        ... 
BEML Layout 5th stage      1
Kannur                     1
singapura paradise         1
Uvce Layout                1
Abshot Layout              1
Name: count, Length: 1299, dtype: int64

In [15]:
# The second group uses <= 10, so it is "10 or fewer", not "less than 10".
print("No of location count more than 10",len(location_stats[location_stats>10]))
print("No of location count 10 or fewer",len(location_stats[location_stats<=10]))

No of location count more than 10 240
No of location count less than 10 1059


**Any location having 10 or fewer data points should be tagged as the "other" location. This way the number of categories can be reduced by a huge amount. Later on, when we do one-hot encoding, it will give us fewer dummy columns.**

In [16]:
# Build the set of rare locations (10 or fewer rows) once, instead of
# re-filtering location_stats for every row inside the lambda.
rare_locations = set(location_stats[location_stats <= 10].index)
df['location'] = df['location'].apply(lambda x: 'other' if x in rare_locations else x)

## **Handling Outliers**

### **Outlier Removal Using Standard Deviation and Mean**

In [17]:
df.price_per_sqft.describe()

count    1.329100e+04
mean     7.904671e+03
std      1.063618e+05
min      2.678298e+02
25%      4.265734e+03
50%      5.433830e+03
75%      7.312469e+03
max      1.200000e+07
Name: price_per_sqft, dtype: float64

**Here we find that min price per sqft is 267 rs/sqft whereas max is 12000000, this shows a wide variation in property prices. We should remove outliers per location using mean and one standard deviation**

In [18]:
df = df[~(df.total_sqft/df.BHK<300)]


In [19]:
def remove_pps_outliers(df):
    """Drop rows whose price_per_sqft is far from their location's mean.

    For each ``location`` group, a row is kept only when
    ``mean - std < price_per_sqft <= mean + std`` (``np.std`` default,
    i.e. population standard deviation). The surviving rows of all groups are
    returned as one frame with a fresh 0..n-1 index, ordered by location.

    Note: the lower bound is strict, so a location whose prices are all
    identical (std == 0) loses every row.
    """
    kept = []
    for _, subdf in df.groupby('location'):
        m = np.mean(subdf.price_per_sqft)
        st = np.std(subdf.price_per_sqft)
        kept.append(subdf[(subdf.price_per_sqft>(m-st)) & (subdf.price_per_sqft<=(m+st))])
    if not kept:
        # No groups at all (empty input): pd.concat([]) would raise.
        return df.iloc[0:0].reset_index(drop=True)
    # Concatenate once; concatenating inside the loop copies the accumulated
    # frame on every iteration.
    return pd.concat(kept, ignore_index=True)
df = remove_pps_outliers(df)
df.shape

(10321, 7)

In [20]:
def remove_bhk_outliers(df):
    """Drop homes that are cheaper per sqft than smaller homes nearby.

    Within each ``location``, a row with ``BHK == n`` is removed when its
    ``price_per_sqft`` is below the mean ``price_per_sqft`` of the
    ``BHK == n - 1`` group in the same location, provided that smaller group
    has more than 5 rows. Returns ``df`` without those rows; the index labels
    of the remaining rows are unchanged.
    """
    exclude_indices = []
    for _, location_df in df.groupby('location'):
        # price_per_sqft of every BHK group in this location, keyed by BHK.
        pps_by_bhk = {
            bhk: bhk_df.price_per_sqft
            for bhk, bhk_df in location_df.groupby('BHK')
        }
        for bhk, pps in pps_by_bhk.items():
            smaller = pps_by_bhk.get(bhk - 1)
            # Only compare against a smaller-BHK group large enough to trust.
            if smaller is not None and smaller.shape[0] > 5:
                # Keep the labels in a plain list so they retain their
                # original type instead of being coerced to float.
                exclude_indices.extend(pps[pps < np.mean(smaller)].index)
    return df.drop(exclude_indices, axis='index')
df = remove_bhk_outliers(df)
df.shape

(7369, 7)

### **Outlier Removal Using Bathrooms Feature**

In [21]:
# Keep only homes with fewer than BHK + 2 bathrooms.
# The comparison is strict, so rows where bath == BHK + 2 are removed too.
# (The two bare inspection expressions that preceded this line were discarded
# by the notebook: only a cell's last expression is displayed, and the last
# statement here is an assignment.)
df = df[df.bath<df.BHK+2]

In [22]:
df = df.drop(['size','price_per_sqft'],axis='columns')
df.head(3)

Unnamed: 0,location,total_sqft,bath,price,BHK
1,Devarachikkanahalli,1250.0,2.0,40.0,2
2,Devarachikkanahalli,1200.0,2.0,83.0,2
3,Devarachikkanahalli,1170.0,2.0,40.0,2


## **Handling Categorical Data**

In [23]:
dummies = pd.get_dummies(df.location)
df = pd.concat([df,dummies.drop('other',axis='columns')],axis='columns')
df = df.drop('location',axis='columns')

## **Splitting the Data**

In [24]:
X = df.drop(['price'],axis='columns')
y = df.price

In [25]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=10)

## **Model Training.**

In [26]:
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

### **Model 01 - Linear Regression** 

In [27]:
from sklearn.linear_model import LinearRegression
linear_reg = LinearRegression()
linear_reg.fit(X_train, y_train)
y_pred_test = linear_reg.predict(X_test)

In [28]:
r2_linear = r2_score(y_test, y_pred_test)
print("Linear Regression - R-squared Score:", r2_linear)

mse_linear = mean_squared_error(y_test, y_pred_test)
print("Linear Regression - Mean Squared Error:", mse_linear)

mae_linear= mean_absolute_error(y_test, y_pred_test)
print("Linear Regression - Mean Absolute Error:", mae_linear)

Linear Regression - R-squared Score: 0.8658163985208257
Linear Regression - Mean Squared Error: 777.4510745564617
Linear Regression - Mean Absolute Error: 16.37746189203427


### **Model 02 - Decision Tree Regression** 

In [29]:
from sklearn import tree

In [30]:
regtree = tree.DecisionTreeRegressor(max_depth=3)
regtree.fit(X_train, y_train)
y_pred_train = regtree.predict(X_train)
y_pred_test = regtree.predict(X_test)

In [31]:
from sklearn.metrics import r2_score, mean_squared_error
r2_regtree= r2_score(y_test, y_pred_test)
print("Decision Tree Regression - R-squared Score:", r2_regtree)

mse_regtree = mean_squared_error(y_test, y_pred_test)
print("Decision Tree Regression - Mean Squared Error:", mse_regtree)

mae_regtree= mean_absolute_error(y_test, y_pred_test)
print("Decision Tree Regression - Mean Absolute Error:", mae_regtree)


Decision Tree Regression - R-squared Score: 0.705446343565904
Decision Tree Regression - Mean Squared Error: 1706.6247602898366
Decision Tree Regression - Mean Absolute Error: 24.457205225585966


### **Model 03 -Random Forest With Grid Search CV** 

In [32]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

In [33]:
rf_model = RandomForestRegressor(random_state=15,n_jobs=-1,oob_score=True)
param_grid = {
    'n_estimators':[200,250],
    'min_samples_split': [5,15,20],
    'max_depth':[4,None]
}
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=5, scoring='r2',n_jobs=-1)
grid_search.fit(X_train, y_train)

In [34]:
rf_best_params = grid_search.best_params_
rf_best_model = grid_search.best_estimator_

y_pred_train = rf_best_model.predict(X_train)
y_pred_test = rf_best_model.predict(X_test)

In [35]:
rf_best_params

{'max_depth': None, 'min_samples_split': 20, 'n_estimators': 200}

In [36]:
r2_rf= r2_score(y_test, y_pred_test)
print("Random Forest Regression - R-squared Score:", r2_rf)

mse_rf = mean_squared_error(y_test, y_pred_test)
print("Random Forest Regression - Mean Squared Error:", mse_rf)

mae_rf= mean_absolute_error(y_test, y_pred_test)
print("Random Forest Regression - Mean Absolute Error:", mae_rf)

Random Forest Regression - R-squared Score: 0.8427498793311514
Random Forest Regression - Mean Squared Error: 911.0969890542386
Random Forest Regression - Mean Absolute Error: 16.83876748007856


### **Model 04 - Gradient Boosting Regression** 

In [37]:
from sklearn.ensemble import GradientBoostingRegressor

In [38]:
boosting_regressor = GradientBoostingRegressor(random_state=14)
boosting_regressor.fit(X_train, y_train)
y_pred_train = boosting_regressor.predict(X_train)
y_pred_test = boosting_regressor.predict(X_test)

In [39]:
r2_boosting_regressor= r2_score(y_test, y_pred_test)
print("Gradient Boosting Regression - R-squared Score:", r2_boosting_regressor)

mse_boosting_regressor = mean_squared_error(y_test, y_pred_test)
print("Gradient Boosting Regression - Mean Squared Error:", mse_boosting_regressor)

mae_boosting_regressor= mean_absolute_error(y_test, y_pred_test)
print("Gradient Boosting  Regression - Mean Absolute Error:", mae_boosting_regressor)

Gradient Boosting Regression - R-squared Score: 0.8494794403995135
Gradient Boosting Regression - Mean Squared Error: 872.106349168161
Gradient Boosting  Regression - Mean Absolute Error: 18.8066432500583


In [40]:
param_grid_grad = {
    'n_estimators':[180,200],
    'min_samples_split': [2,5,20],
    'max_depth':[5,7,None],
    'learning_rate':[0.01,0.179]
}
grid_search_GRD = GridSearchCV(estimator=boosting_regressor, param_grid=param_grid_grad, cv=5, scoring='r2',n_jobs=-1)
grid_search_GRD.fit(X_train, y_train)

In [41]:
GRD_best_params = grid_search_GRD.best_params_
GRD_best_params_best_model = grid_search_GRD.best_estimator_

y_pred_train = GRD_best_params_best_model.predict(X_train)
y_pred_test = GRD_best_params_best_model.predict(X_test)

In [42]:
GRD_best_params

{'learning_rate': 0.179,
 'max_depth': 5,
 'min_samples_split': 20,
 'n_estimators': 200}

In [43]:
r2_GRD_boosting_regressor= r2_score(y_test, y_pred_test)
print("Gradient Boosting Regression - R-squared Score:", r2_GRD_boosting_regressor)

mse_GRD_boosting_regressor = mean_squared_error(y_test, y_pred_test)
print("Gradient Boosting Regression - Mean Squared Error:", mse_GRD_boosting_regressor)

mae_GRD_boosting_regressor= mean_absolute_error(y_test, y_pred_test)
print("Gradient Boosting  Regression - Mean Absolute Error:", mae_GRD_boosting_regressor)

Gradient Boosting Regression - R-squared Score: 0.8868794915584447
Gradient Boosting Regression - Mean Squared Error: 655.4128811031345
Gradient Boosting  Regression - Mean Absolute Error: 15.475882716655265


### **Model 05 - XG Boost Regressor** 

In [44]:
import xgboost as xgb

In [45]:
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

In [46]:
params = {
    'objective': 'reg:squarederror',
    'eval_metric': 'rmse',
    'seed': 0
}

In [47]:
num_rounds = 100
xgb_model = xgb.train(params, dtrain, num_rounds)

In [48]:
y_pred_train = xgb_model.predict(dtrain)
y_pred_test = xgb_model.predict(dtest)

In [49]:
# Test-set metrics for the XGBoost model (y_pred_test comes from xgb_model).
r2_xgb= r2_score(y_test, y_pred_test)
print("XG Boost Regressor - R-squared Score:",r2_xgb)

mse_xgb = mean_squared_error(y_test, y_pred_test)
print("XG Boost Regressor - Mean Squared Error:", mse_xgb )

mae_xgb= mean_absolute_error(y_test, y_pred_test)
# Label corrected: this line previously reported the XGBoost MAE under a
# "Gradient Boosting" label.
print("XG Boost Regressor - Mean Absolute Error:",mae_xgb)

XG Boost Regressor - R-squared Score: 0.8811151531261098
XG Boost Regressor - Mean Squared Error: 688.8110837070602
Gradient Boosting  Regression - Mean Absolute Error: 15.564197224589533


### **Model 06 - Lasso Regressor** 

In [61]:
from sklearn.linear_model import Lasso
lasso_reg=Lasso()
lasso_reg.fit(X_train,y_train)
y_pred_test = lasso_reg.predict(X_test)

In [62]:
# Test-set metrics for the Lasso model (y_pred_test comes from lasso_reg).
# Labels corrected: these lines previously printed "Decision Tree Regression".
r2_lasso_reg= r2_score(y_test, y_pred_test)
print("Lasso Regression - R-squared Score:", r2_lasso_reg)

mse_lasso_reg = mean_squared_error(y_test, y_pred_test)
print("Lasso Regression - Mean Squared Error:", mse_lasso_reg)

mae_lasso_reg= mean_absolute_error(y_test, y_pred_test)
print("Lasso Regression - Mean Absolute Error:", mae_lasso_reg)


Decision Tree Regression - R-squared Score: 0.7276650284794373
Decision Tree Regression - Mean Squared Error: 1577.8911425389454
Decision Tree Regression - Mean Absolute Error: 23.326425212390053


### **Model 07 - Ridge Regressor** 

In [63]:
from sklearn.linear_model import Ridge
Ridge_reg=Ridge()
Ridge_reg.fit(X_train,y_train)
y_pred_test = Ridge_reg.predict(X_test)

In [64]:
r2_Ridge_reg= r2_score(y_test, y_pred_test)
print("Ridge Regression - R-squared Score:", r2_Ridge_reg)

mse_Ridge_reg = mean_squared_error(y_test, y_pred_test)
print("Ridge Regression - Mean Squared Error:", mse_Ridge_reg)

mae_Ridge_reg= mean_absolute_error(y_test, y_pred_test)
print("Ridge Regression - Mean Absolute Error:", mae_Ridge_reg)


Ridge Regression - R-squared Score: 0.8670880283178288
Ridge Regression - Mean Squared Error: 770.0833340783411
Ridge Regression - Mean Absolute Error: 16.33843469448033


## **Model Evaluation**

In [65]:
print("Linear Regression - R-squared Score:", r2_linear)
print("Decision Tree Regression - R-squared Score:", r2_regtree)
print("Random Forest Regression - R-squared Score:", r2_rf)
print("Gradient Boosting Regression - R-squared Score:", r2_GRD_boosting_regressor)
# print("Ada Boost Regressor - R-squared Score:",r2_adaboost_regressor)
print("XG Boost Regressor - R-squared Score:",r2_xgb)
print("Lasso Regression - R-squared Score:", r2_lasso_reg)
print("Ridge Regression - R-squared Score:", r2_Ridge_reg)

print('______________________________________________________________________')

print("Linear Regression - Mean Squared Error:", mse_linear)
print("Decision Tree Regression - Mean Squared Error:", mse_regtree)
print("Random Forest Regression - Mean Squared Error:", mse_rf)
print("Gradient Boosting Regression - Mean Squared Error:", mse_GRD_boosting_regressor)
# print("Ada Boost Regressor - Mean Squared Error:", mse_adaboost_regressor )
print("XG Boost Regressor - Mean Squared Error:", mse_xgb )
print("Lasso Regression - Mean Squared Error:", mse_lasso_reg)
print("Ridge Regression - Mean Squared Error:", mse_Ridge_reg)
print('______________________________________________________________________')

print("Linear Regression - Mean Absolute Error:", mae_linear)
print("Decision Tree Regression - Mean Absolute Error:", mae_regtree)
print("Random Forest Regression - Mean Absolute Error:", mae_rf)
print("Gradient Boosting Regression - Mean Absolute Error:", mae_GRD_boosting_regressor)
# print("Ada Boost Regressor - Mean Absolute Error:", mae_adaboost_regressor )
print("XG Boost Regressor - Mean Absolute Error:", mae_xgb )
print("Lasso Regression - Mean Absolute Error:", mae_lasso_reg)
print("Ridge Regression - Mean Absolute Error:", mae_Ridge_reg)
print('______________________________________________________________________')

Linear Regression - R-squared Score: 0.8658163985208257
Decision Tree Regression - R-squared Score: 0.705446343565904
Random Forest Regression - R-squared Score: 0.8427498793311514
Gradient Boosting Regression - R-squared Score: 0.8868794915584447
XG Boost Regressor - R-squared Score: 0.8811151531261098
Lasso Regression - R-squared Score: 0.7276650284794373
Ridge Regression - R-squared Score: 0.8670880283178288
______________________________________________________________________
Linear Regression - Mean Squared Error: 777.4510745564617
Decision Tree Regression - Mean Squared Error: 1706.6247602898366
Random Forest Regression - Mean Squared Error: 911.0969890542386
Gradient Boosting Regression - Mean Squared Error: 655.4128811031345
XG Boost Regressor - Mean Squared Error: 688.8110837070602
Lasso Regression - Mean Squared Error: 1577.8911425389454
Ridge Regression - Mean Squared Error: 770.0833340783411
______________________________________________________________________
Linear Regr

## **Model Testing**

In [None]:
def predict_price(location,sqft,bath,bhk):
    """Predict a home's price with the tuned gradient-boosting model.

    Parameters
    ----------
    location : str
        Location name. If it matches a column of ``X`` the corresponding
        one-hot column is set to 1; otherwise all location columns stay 0,
        which is the encoding of the dropped ``'other'`` category.
    sqft : float
        Total area, written to the first feature column (``total_sqft``).
    bath : float
        Number of bathrooms, written to the second feature column.
    bhk : float
        Number of bedrooms, written to the third feature column (``BHK``).

    Returns
    -------
    float
        Predicted price, in the same units as the ``price`` training column.
    """
    x = np.zeros(len(X.columns))
    x[0] = sqft
    x[1] = bath
    x[2] = bhk

    # np.where returns an empty array when the location is unknown; indexing
    # it with [0][0] directly would raise IndexError before any check ran.
    loc_matches = np.where(X.columns==location)[0]
    if loc_matches.size > 0:
        x[loc_matches[0]] = 1

    # Pass a DataFrame with the training column names so the feature names
    # match the ones the model was fitted with.
    features = pd.DataFrame([x], columns=X.columns)
    return GRD_best_params_best_model.predict(features)[0]

In [None]:
predict_price('1st Phase JP Nagar',1000, 2, 2)



67.77210313642897

In [None]:
predict_price('1st Phase JP Nagar',1000, 3, 3)



80.75534683840925

In [None]:
predict_price('Indira Nagar',1000, 2, 2)



95.81084794657342

In [None]:
predict_price('Indira Nagar',1500, 2, 2)



196.519587723197

In [None]:
predict_price('Indira Nagar',1000, 3, 3)



102.0699750379272

## **Export the tested model to a pickle file**

In [None]:
import pickle
# Serialise the tuned gradient-boosting estimator to disk in binary mode.
# NOTE(review): the target directory is not created here — presumably
# ../server/model already exists relative to the notebook; verify.
# NOTE(review): unpickling executes arbitrary code, so the resulting file
# should only ever be loaded from a trusted location.
with open('../server/model/final_model.pickle','wb') as f:
    pickle.dump(GRD_best_params_best_model,f)

## **Export location and column information to a file that will be useful later on in our prediction application**

In [None]:
import json
# Feature names in training order, lower-cased, stored under 'data_columns'.
columns = {'data_columns': [col.lower() for col in X.columns]}
# Stream the mapping straight into the file next to the exported model.
with open("../server/model/columns.json","w") as f:
    json.dump(columns, f)