In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import matplotlib 
matplotlib.rcParams["figure.figsize"] = (20,10)



In [8]:
data = pd.read_csv('data/Bengaluru_House_Data.csv')

In [9]:
data.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


In [10]:
#Checking size of dataframe

data.shape


(13320, 9)

In [11]:
#Checking information of data
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   area_type     13320 non-null  object 
 1   availability  13320 non-null  object 
 2   location      13319 non-null  object 
 3   size          13304 non-null  object 
 4   society       7818 non-null   object 
 5   total_sqft    13320 non-null  object 
 6   bath          13247 non-null  float64
 7   balcony       12711 non-null  float64
 8   price         13320 non-null  float64
dtypes: float64(3), object(6)
memory usage: 936.7+ KB


### As showing above there are 6 columns are categorical column and 3 columns are float

In [12]:
#Fecthing all Categorical Columns and num columns
cat_cols = []
num_cols = []
for col in data.columns:
    if(data[col].dtypes == 'object'):
        cat_cols.append(col)
    else:
        num_cols.append(col)
print('Categorical Colunms : ', cat_cols)  
print('Numerical Colunms : ',num_cols)   

Categorical Colunms :  ['area_type', 'availability', 'location', 'size', 'society', 'total_sqft']
Numerical Colunms :  ['bath', 'balcony', 'price']


In [13]:
#Checking different types of values each categorical column have

for col in cat_cols:
    print('Values are in {} column:'.format(col))
    print(data[col].value_counts())

Values are in area_type column:
Super built-up  Area    8790
Built-up  Area          2418
Plot  Area              2025
Carpet  Area              87
Name: area_type, dtype: int64
Values are in availability column:
Ready To Move    10581
18-Dec             307
18-May             295
18-Apr             271
18-Aug             200
                 ...  
15-Aug               1
17-Jan               1
16-Nov               1
16-Jan               1
14-Jul               1
Name: availability, Length: 81, dtype: int64
Values are in location column:
Whitefield                        540
Sarjapur  Road                    399
Electronic City                   302
Kanakpura Road                    273
Thanisandra                       234
                                 ... 
Bapuji Layout                       1
1st Stage Radha Krishna Layout      1
BEML Layout 5th stage               1
singapura paradise                  1
Abshot Layout                       1
Name: location, Length: 1305, dtype: int


I observed that there are few columns named as availability, balcony and society don't have that much importance also, society has more no. of null values so I am dropping the same from my data frame

In [14]:
data.drop(['availability','society','balcony'], inplace = True, axis = 1)

In [15]:
data.head()

Unnamed: 0,area_type,location,size,total_sqft,bath,price
0,Super built-up Area,Electronic City Phase II,2 BHK,1056,2.0,39.07
1,Plot Area,Chikka Tirupathi,4 Bedroom,2600,5.0,120.0
2,Built-up Area,Uttarahalli,3 BHK,1440,2.0,62.0
3,Super built-up Area,Lingadheeranahalli,3 BHK,1521,3.0,95.0
4,Super built-up Area,Kothanur,2 BHK,1200,2.0,51.0


# Data Cleaning

In [16]:
#Checking for null values
data.isnull().sum()

area_type      0
location       1
size          16
total_sqft     0
bath          73
price          0
dtype: int64

In [17]:
data1 = data.copy()

#### As above there are very less Na values so I am just dropping all na values

In [18]:
data1.dropna(inplace = True, axis = 0)

In [19]:
data1.isnull().sum()

area_type     0
location      0
size          0
total_sqft    0
bath          0
price         0
dtype: int64

In [20]:
data1.shape

(13246, 6)

#### I have dropped all the null values and now I have 13246 rows in my dataset which are sufficiant to train a model. 

#### Next I will work on my categorical features which are not in proper format. First I will work on size column, size column has some values like BHK and BEDROOM which tells us size of house so here no need of BHK and Bedroom. We can simply fetch no. and convert this column into integer


In [21]:
# Defining a function to fetch no. from size. As we have cheked first letter in string is a no. so we will fecth the same from the String
num = ''
def fetch_num(x):
        x= x.split(' ')
        return int(x[0])
 

In [22]:
data1['bhk'] = data1['size'].apply(lambda x : fetch_num(x))

In [23]:
data1.head()

Unnamed: 0,area_type,location,size,total_sqft,bath,price,bhk
0,Super built-up Area,Electronic City Phase II,2 BHK,1056,2.0,39.07,2
1,Plot Area,Chikka Tirupathi,4 Bedroom,2600,5.0,120.0,4
2,Built-up Area,Uttarahalli,3 BHK,1440,2.0,62.0,3
3,Super built-up Area,Lingadheeranahalli,3 BHK,1521,3.0,95.0,3
4,Super built-up Area,Kothanur,2 BHK,1200,2.0,51.0,2


In [24]:
data1[data1['bath']>10]

Unnamed: 0,area_type,location,size,total_sqft,bath,price,bhk
938,Plot Area,5th Phase JP Nagar,9 Bedroom,1260,11.0,290.0,9
1078,Plot Area,BTM 1st Stage,9 Bedroom,3300,14.0,500.0,9
1718,Super built-up Area,2Electronic City Phase II,27 BHK,8000,27.0,230.0,27
1768,Plot Area,1 Ramamurthy Nagar,11 Bedroom,1200,11.0,170.0,11
1953,Plot Area,KR Puram,8 Bedroom,1200,12.0,110.0,8
1979,Plot Area,Hongasandra,8 Bedroom,990,12.0,120.0,8
3096,Super built-up Area,Jp nagar 8th Phase .,10 BHK,12000,12.0,525.0,10
3379,Super built-up Area,1Hanuman Nagar,19 BHK,2000,16.0,490.0,19
3609,Super built-up Area,Koramangala Industrial Layout,16 BHK,10000,16.0,550.0,16
4684,Plot Area,Munnekollal,43 Bedroom,2400,40.0,660.0,43


In [25]:
#Now I will work on total_sqft column
data1.total_sqft.unique()


array(['1056', '2600', '1440', ..., '1133 - 1384', '774', '4689'],
      dtype=object)

 #### As above seen there are different type of values in sqft column like range between 2 values. We need to check more type of values in the same and for that I will write following function

In [26]:
def is_float(x):
    try:
        float(x)
    except:
        return False
    return True

In [27]:
#finding out all values which are not float
data1[~data1['total_sqft'].apply(is_float)]

Unnamed: 0,area_type,location,size,total_sqft,bath,price,bhk
30,Super built-up Area,Yelahanka,4 BHK,2100 - 2850,4.0,186.000,4
122,Super built-up Area,Hebbal,4 BHK,3067 - 8156,4.0,477.000,4
137,Super built-up Area,8th Phase JP Nagar,2 BHK,1042 - 1105,2.0,54.005,2
165,Super built-up Area,Sarjapur,2 BHK,1145 - 1340,2.0,43.490,2
188,Super built-up Area,KR Puram,2 BHK,1015 - 1540,2.0,56.800,2
...,...,...,...,...,...,...,...
12975,Super built-up Area,Whitefield,2 BHK,850 - 1060,2.0,38.190,2
12990,Super built-up Area,Talaghattapura,3 BHK,1804 - 2273,3.0,122.000,3
13059,Super built-up Area,Harlur,2 BHK,1200 - 1470,2.0,72.760,2
13265,Super built-up Area,Hoodi,2 BHK,1133 - 1384,2.0,59.135,2


#### As we have checked above most of the values are in range between 2 values so we will take avarage of these 2 values. Now I will write a function to convert all values into float and which are is not proper format (Range Format), I will take avarage of both the values in convert them into float

In [37]:
#Creating a copy of data

data2 = data1.copy()

In [38]:
num = ''
def convert_float(x):
    num = x.split('-')
    if(len(num) == 2):
        return (float(num[0])+float(num[1]))/2
    try:
        return float(x)
    except:
        return None

In [39]:
#applying above function on total_sqft column

data2['total_sqft'] = data2['total_sqft'].apply(convert_float)

In [40]:
data2.head()

Unnamed: 0,area_type,location,size,total_sqft,bath,price,bhk
0,Super built-up Area,Electronic City Phase II,2 BHK,1056.0,2.0,39.07,2
1,Plot Area,Chikka Tirupathi,4 Bedroom,2600.0,5.0,120.0,4
2,Built-up Area,Uttarahalli,3 BHK,1440.0,2.0,62.0,3
3,Super built-up Area,Lingadheeranahalli,3 BHK,1521.0,3.0,95.0,3
4,Super built-up Area,Kothanur,2 BHK,1200.0,2.0,51.0,2


In [41]:
# Again checking different values of total_sqft

data2[~data2['total_sqft'].apply(is_float)]

Unnamed: 0,area_type,location,size,total_sqft,bath,price,bhk


In [35]:
data2['total_sqft'] = data2['total_sqft'].apply(convert_float)


AttributeError: 'float' object has no attribute 'split'

In [42]:
data2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13246 entries, 0 to 13319
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   area_type   13246 non-null  object 
 1   location    13246 non-null  object 
 2   size        13246 non-null  object 
 3   total_sqft  13200 non-null  float64
 4   bath        13246 non-null  float64
 5   price       13246 non-null  float64
 6   bhk         13246 non-null  int64  
dtypes: float64(3), int64(1), object(3)
memory usage: 827.9+ KB


In [43]:
data2.head()

Unnamed: 0,area_type,location,size,total_sqft,bath,price,bhk
0,Super built-up Area,Electronic City Phase II,2 BHK,1056.0,2.0,39.07,2
1,Plot Area,Chikka Tirupathi,4 Bedroom,2600.0,5.0,120.0,4
2,Built-up Area,Uttarahalli,3 BHK,1440.0,2.0,62.0,3
3,Super built-up Area,Lingadheeranahalli,3 BHK,1521.0,3.0,95.0,3
4,Super built-up Area,Kothanur,2 BHK,1200.0,2.0,51.0,2


In [44]:
# Now I will work on location column, As checked above there are more no. of unique values in location column

len(data.location.unique())

1306

#### There are total 1265 unique location which is huge in number and if we will do one hot encoding then it will become dimentionality curse. For that we need to apply some dimentionality deduction alogorithms like PCA.

#### We have one more and effective solution for this, if there are many location coming between 1 to 10 then we can replace the same with "Other" location.

In [45]:
# Finding out occurence of location
data3 = data2.copy()
data3.location  = data3.location.apply(lambda x : x.strip())


In [46]:
loc_stats = data3.groupby('location')['location'].agg('count').sort_values(ascending = False)
loc_stats

location
Whitefield               535
Sarjapur  Road           392
Electronic City          304
Kanakpura Road           266
Thanisandra              236
                        ... 
1 Giri Nagar               1
Kanakapura Road,           1
Kanakapura main  Road      1
Karnataka Shabarimala      1
whitefiled                 1
Name: location, Length: 1293, dtype: int64

In [47]:
#I will give one threshold of 20 and less than 20 data point location will become Other location

len(loc_stats[loc_stats<20])

1145

In [None]:
#Creating a list which locations data points are less than 20 or 10
loc_less_than_20 = loc_stats[loc_stats<20]
loc_less_than_10 = loc_stats[loc_stats<10]

In [None]:
loc_less_than_20

In [None]:
data4 = data3.copy()

In [None]:
data4.location = data4.location.apply(lambda x : 'Other' if x in loc_less_than_20 else x)

In [None]:
data4.head()

In [None]:
#removing NUll values from total_sqft

data4.dropna(axis = 0, inplace = True)

In [None]:
data4.shape

In [None]:
data4.isnull().sum()

## Detecting Outlier and Removal

In [None]:
# I believe that we can create 1 bhk in 300 sqft so I am filtering data according to that

data4[data4.total_sqft/data4.bhk<300]

In [None]:
data5 = data4[~(data4.total_sqft/data4.bhk<300)]

In [None]:
data5.shape

In [None]:
#Calculating price per sqft

data5['price_per_sqft'] = (data5['price']*100000)/data5['total_sqft']

In [None]:
data5.head()

In [None]:
data5.price_per_sqft.describe()

#### Here we find that min price per sqft is 267 rs/sqft whereas max is 12000000, this shows a wide variation in property prices. We should remove outliers per location using mean and one standard deviation



In [None]:
def remove_pps_outliers(df):
    df_out = pd.DataFrame()
    for key, subdf in df.groupby('location'):
        m = np.mean(subdf.price_per_sqft)
        st = np.std(subdf.price_per_sqft)
        reduced_df = subdf[(subdf.price_per_sqft>(m-st)) & (subdf.price_per_sqft<=(m+st))]
        df_out = pd.concat([df_out,reduced_df],ignore_index=True)
    return df_out
data6 = remove_pps_outliers(data5)
data6.shape

In [None]:
data6.head()

#### Let's check if for a given location how does the 2 BHK and 3 BHK property prices look like

In [None]:
def plot_scatter_chart(df,location):
    bhk2 = df[(df.location==location) & (df.bhk==2)]
    bhk3 = df[(df.location==location) & (df.bhk==3)]
    matplotlib.rcParams['figure.figsize'] = (15,10)
    plt.scatter(bhk2.total_sqft,bhk2.price,color='blue',label='2 BHK', s=50)
    plt.scatter(bhk3.total_sqft,bhk3.price,marker='+', color='green',label='3 BHK', s=50)
    plt.xlabel("Total Square Feet Area")
    plt.ylabel("Price (Lakh Indian Rupees)")
    plt.title(location)
    plt.legend()
    
plot_scatter_chart(data6,"Rajaji Nagar")

In [None]:
plot_scatter_chart(data6,"Hebbal")

In [None]:
def remove_bhk_outliers(df):
    exclude_indices = np.array([])
    for location, location_df in df.groupby('location'):
        bhk_stats = {}
        for bhk, bhk_df in location_df.groupby('bhk'):
            bhk_stats[bhk] = {
                'mean': np.mean(bhk_df.price_per_sqft),
                'std': np.std(bhk_df.price_per_sqft),
                'count': bhk_df.shape[0]
            }
        for bhk, bhk_df in location_df.groupby('bhk'):
            stats = bhk_stats.get(bhk-1)
            if stats and stats['count']>5:
                exclude_indices = np.append(exclude_indices, bhk_df[bhk_df.price_per_sqft<(stats['mean'])].index.values)
    return df.drop(exclude_indices,axis='index')
data7 = remove_bhk_outliers(data6)



In [None]:
data7.shape

In [None]:
plot_scatter_chart(data7,"Rajaji Nagar")

In [None]:

matplotlib.rcParams["figure.figsize"] = (20,10)
plt.hist(data7.price_per_sqft,rwidth=0.8)
plt.xlabel("Price Per Square Feet")
plt.ylabel("Count")

In [None]:
data7.bath.unique()

#### As above checked there are max 16 bathroom house in our dataset which is unusual so we will explore bathroom feature in our dataset

In [None]:
#PLoting a histogram for bathcount
plt.hist(data7.bath,rwidth=0.8)
plt.xlabel("Number of bathrooms")
plt.ylabel("Count")

#### There more no. of data points are in between 2 to 4 bathroom

In [None]:
# Lets check how many data points are there above 10 bathrooms

data7[data7['bath']>10]

#### Let's suppose that if no. of bathroom is greater than no. of bhk+2 then we will remove those outliers

In [None]:
#CHecking no. data points which are satisfying above condition
data7[data7['bath']>data7['bhk']+2]

In [None]:
data8 = data7[data7['bath']<data7['bhk']+2]

In [None]:
data8.shape

In [None]:
# Lets drop extra feature from our dataset which are size and price_per_sqft(Used only for outlier detection)

data9 = data8.drop(['size','price_per_sqft'], axis = 1)

In [None]:
data9.head()

#### Converting categorical feature to numerical using one hot encoding

In [None]:
dummies = pd.get_dummies(data9, drop_first=True)

In [None]:
dummies.columns

In [None]:
df = dummies

In [None]:
df.shape

# Train Test Split

In [None]:
X = df.drop('price', axis = 1)

In [None]:
y = df['price']

In [None]:
X.head()

In [None]:
y.head()

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=10)

# Building a Linear Regression Model

In [None]:
from sklearn.linear_model import LinearRegression
lr_clf = LinearRegression()
lr_clf.fit(X_train,y_train)
lr_clf.score(X_test,y_test)

# Use K Fold cross validation to measure accuracy of our LinearRegression model

In [None]:
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score

cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)

cross_val_score(LinearRegression(), X, y, cv=cv)

### We can see that in 5 iterations we get a score more 80% all the time. This is pretty good but we want to test few other algorithms for regression to see if we can get even better score. We will use GridSearchCV for this purpose

In [None]:
##Find best model using GridSearchCV

from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import Lasso
from sklearn.tree import DecisionTreeRegressor

def find_best_model_using_gridsearchcv(X,y):
    algos = {
        'linear_regression' : {
            'model': LinearRegression(),
            'params': {
                'normalize': [True, False]
            }
        },
        'lasso': {
            'model': Lasso(),
            'params': {
                'alpha': [1,2],
                'selection': ['random', 'cyclic']
            }
        },
        'decision_tree': {
            'model': DecisionTreeRegressor(),
            'params': {
                'criterion' : ['mse','friedman_mse'],
                'splitter': ['best','random']
            }
        }
    }
    scores = []
    cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
    for algo_name, config in algos.items():
        gs =  GridSearchCV(config['model'], config['params'], cv=cv, return_train_score=False)
        gs.fit(X,y)
        scores.append({
            'model': algo_name,
            'best_score': gs.best_score_,
            'best_params': gs.best_params_
        })

    return pd.DataFrame(scores,columns=['model','best_score','best_params'])

find_best_model_using_gridsearchcv(X,y)

## Test the model for few properties

In [None]:
columns = X.columns
columns = list(columns)
columns

In [None]:
def predict_price(location,area_type,sqft,bath,bhk):    
    loc_index = columns.index(location)
    at_index =  columns.index(area_type)

    x = np.zeros(len(X.columns))
    x[0] = sqft
    x[1] = bath
    x[2] = bhk
    if at_index >= 0:
        x[at_index] = 1
    if loc_index >= 0:
        x[loc_index] = 1

    return lr_clf.predict([x])[0]

In [None]:
predict_price('location_Kaggadasapura','area_type_Plot  Area',1000, 3, 3)

In [None]:
import pickle
with open('banglore_home_prices_model.pickle','wb') as f:
    pickle.dump(lr_clf,f)

In [2]:
columns = ["total_sqft", "bath", "bhk", "area_type_Carpet  Area", "area_type_Plot  Area", "area_type_Super built-up  Area", "location_2nd Stage Nagarbhavi", "location_5th Phase JP Nagar", "location_6th Phase JP Nagar", "location_7th Phase JP Nagar", "location_8th Phase JP Nagar", "location_9th Phase JP Nagar", "location_Abbigere", "location_Akshaya Nagar", "location_Ambalipura", "location_Ambedkar Nagar", "location_Amruthahalli", "location_Anandapura", "location_Ananth Nagar", "location_Anekal", "location_Ardendale", "location_Attibele", "location_BTM 2nd Stage", "location_Babusapalaya", "location_Balagere", "location_Banashankari", "location_Banashankari Stage III", "location_Bannerghatta Road", "location_Basavangudi", "location_Basaveshwara Nagar", "location_Battarahalli", "location_Begur Road", "location_Bellandur", "location_Bhoganhalli", "location_Binny Pete", "location_Bisuvanahalli", "location_Bommanahalli", "location_Bommasandra", "location_Bommasandra Industrial Area", "location_Brookefield", "location_Budigere", "location_CV Raman Nagar", "location_Chandapura", "location_Channasandra", "location_Chikkalasandra", "location_Choodasandra", "location_Devanahalli", "location_Dodda Nekkundi", "location_Doddathoguru", "location_Domlur", "location_EPIP Zone", "location_Electronic City", "location_Electronic City Phase II", "location_Electronics City Phase 1", "location_Frazer Town", "location_Gottigere", "location_Green Glen Layout", "location_Gubbalala", "location_Gunjur", "location_HBR Layout", "location_HSR Layout", "location_Haralur Road", "location_Harlur", "location_Hebbal", "location_Hebbal Kempapura", "location_Hegde Nagar", "location_Hennur", "location_Hennur Road", "location_Hoodi", "location_Horamavu Agara", "location_Horamavu Banaswadi", "location_Hormavu", "location_Hosa Road", "location_Hosakerehalli", "location_Hoskote", "location_Hosur Road", "location_Hulimavu", "location_Iblur Village", "location_Indira Nagar", "location_JP Nagar", "location_Jakkur", "location_Jalahalli", "location_Jigani", "location_KR Puram", "location_Kadugodi", "location_Kaggadasapura", "location_Kalena Agrahara", "location_Kambipura", "location_Kammasandra", "location_Kanakapura", "location_Kanakpura Road", "location_Kasavanhalli", "location_Kathriguppe", "location_Kaval Byrasandra", "location_Kengeri", "location_Kengeri Satellite Town", "location_Kodichikkanahalli", "location_Kogilu", "location_Koramangala", "location_Kothannur", "location_Kothanur", "location_Kudlu", "location_Kudlu Gate", "location_Kumaraswami Layout", "location_Kundalahalli", "location_Lakshminarayana Pura", "location_Lingadheeranahalli", "location_Magadi Road", "location_Mahadevpura", "location_Malleshwaram", "location_Marathahalli", "location_Margondanahalli", "location_Munnekollal", "location_Mysore Road", "location_Nagarbhavi", "location_Old Airport Road", "location_Old Madras Road", "location_Other", "location_Padmanabhanagar", "location_Panathur", "location_Poorna Pragna Layout", "location_R.T. Nagar", "location_Rachenahalli", "location_Raja Rajeshwari Nagar", "location_Rajaji Nagar", "location_Ramagondanahalli", "location_Ramamurthy Nagar", "location_Rayasandra", "location_Sahakara Nagar", "location_Sanjay nagar", "location_Sarjapur", "location_Sarjapur  Road", "location_Seegehalli", "location_Singasandra", "location_Somasundara Palya", "location_Sonnenahalli", "location_Subramanyapura", "location_TC Palaya", "location_Talaghattapura", "location_Thanisandra", "location_Thigalarapalya", "location_Thubarahalli", "location_Tumkur Road", "location_Ulsoor", "location_Uttarahalli", "location_Varthur", "location_Vidyaranyapura", "location_Vijayanagar", "location_Vittasandra", "location_Whitefield", "location_Yelachenahalli", "location_Yelahanka", "location_Yelahanka New Town", "location_Yeshwanthpur"]
cols = []
import json
for i in columns:
    i = i.lower()
    cols.append(i)
with open("columns.json","w") as f:
    f.write(json.dumps(cols))