In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [3]:
import matplotlib
# Default figure size (width, height) applied to every figure created after this point.
matplotlib.rcParams["figure.figsize"] = (20,10)

In [4]:
# Load the raw listings from the Kaggle input mount and preview the first rows.
df1 = pd.read_csv("/kaggle/input/bengaluru-house-price-data/Bengaluru_House_Data.csv")
df1.head()

In [5]:
# (rows, columns) of the raw frame.
df1.shape

In [6]:
# Number of rows per area_type value.
df1.groupby('area_type')['area_type'].agg('count')

In [7]:
# Drop four columns that the rest of the notebook never references.
df2 = df1.drop(['area_type','society','balcony','availability'],axis='columns')
df2.head()

In [8]:
# Missing-value count for each remaining column.
df2.isnull().sum()

We can see that `bath`, `size`, and `location` have null values, but they are few compared to the size of the dataset, so we simply drop those rows.

In [9]:
# Drop every row that has at least one missing value, then confirm that no nulls remain.
df3 = df2.dropna()
df3.isnull().sum()

In [10]:
# Shape after dropping incomplete rows.
df3.shape

In [11]:
# Distinct raw values of the 'size' column.
df3['size'].unique()

We can see that some of the unique values, such as `4 BHK` and `4 Bedroom`, mean the same thing, so we normalise them into a single numeric bedroom count.

In [12]:
# The text before the first space in 'size' is the bedroom count.
# Rebind df3 through assign() rather than writing a column into the frame that
# dropna() returned: that in-place write is what triggers pandas'
# SettingWithCopyWarning, and assign() keeps the cell safe to re-run.
df3 = df3.assign(bhk=df3['size'].apply(lambda x : int(x.split(' ')[0])))

In [13]:
# Preview the frame with the new bhk column.
df3.head()

In [14]:
# Distinct bedroom counts.
df3['bhk'].unique()

In [15]:
# Listings that claim more than 20 bedrooms.
df3[df3.bhk>20]

In [16]:
# Distinct raw total_sqft values (still strings at this point, see the parsing helpers below).
df3.total_sqft.unique()

We can see values like '1133 - 1384', which is a range rather than a single number.

We use the `is_float` function to find which values are plain numbers and which are given as ranges or in some other format.

In [17]:
def is_float(x):
    """Return True if ``float(x)`` succeeds, False otherwise.

    Only conversion failures are treated as "not a float": ValueError for
    unparsable text, TypeError for unsupported types such as None, and
    OverflowError for integers too large for a float. Anything else
    (e.g. KeyboardInterrupt) propagates instead of being swallowed.
    """
    try:
        float(x)
    except (TypeError, ValueError, OverflowError):
        return False
    return True

In [18]:
# First 10 rows whose total_sqft does not parse as a plain float.
df3[~df3['total_sqft'].apply(is_float)].head(10)

All of these steps are part of data cleaning. Next we convert `total_sqft` to a number, replacing a range with the average of its two ends.

In [19]:
def convert_sqft_to_num(x):
    """Convert a raw total_sqft value to a float.

    Parameters
    ----------
    x : str or number
        Either a plain number ("2166"), or a range written as "low - high".

    Returns
    -------
    float or None
        The number itself, the midpoint of a "low - high" range, or None when
        the value cannot be interpreted (e.g. "34.46Sq.Meter", None).

    Never raises for malformed input: a two-part value whose parts are not
    both numeric yields None instead of a ValueError.
    """
    # Try the plain-number case first so strings that merely contain a '-'
    # (such as "1e-3") are not mistaken for ranges.
    try:
        return float(x)
    except (TypeError, ValueError, OverflowError):
        pass

    if not isinstance(x, str):
        return None

    tokens = x.split('-')
    if len(tokens) == 2:
        try:
            return (float(tokens[0]) + float(tokens[1])) / 2
        except ValueError:
            return None
    return None

In [20]:
# Spot check: a plain number string.
convert_sqft_to_num('2166')

In [21]:
# Spot check: a "low - high" range, which the helper averages.
convert_sqft_to_num('2100 - 2850')

In [22]:
# Spot check: a value with a unit suffix, which the helper cannot parse (returns None).
convert_sqft_to_num('34.46Sq.Meter')

In [23]:
# Work on a copy so df3 keeps the raw total_sqft strings.
df4 = df3.copy()

In [24]:
# Replace the raw strings with numbers; unparsable entries come back as missing values.
df4['total_sqft'] = df4['total_sqft'].apply(convert_sqft_to_num)
df4.head()

In [25]:
# Inspect the row labelled 30 after conversion.
df4.loc[30]

In [26]:
df4.head(3)

In [27]:
# Fresh copy for the feature-engineering steps that follow.
df5 = df4.copy()

Price per sqft is a key quantity for real-estate price analysis; we also use it below to detect outliers.

In [28]:
# price * 100000 / total_sqft.
# NOTE(review): presumably `price` is quoted in lakhs (1 lakh = 100,000), which would make this rupees per sqft — confirm against the dataset description.
df5['price_per_sqft'] = df5['price']*100000/df5['total_sqft']
df5.head()

In [29]:
# Number of distinct location values before any grouping.
len(df5.location.unique())

There are many distinct locations; one-hot encoding all of them would create too many columns (the curse of dimensionality).

In [30]:
# Trim stray leading/trailing whitespace so identical locations group together.
df5['location'] = df5['location'].apply(lambda name: name.strip())

# Listings per location, most frequent first.
location_stats = (
    df5.groupby('location')['location']
    .agg('count')
    .sort_values(ascending=False)
)
location_stats

We group every location with 10 or fewer listings under a single `other` label to reduce the number of unique values.

In [31]:
# How many locations have 10 or fewer listings.
len(location_stats[location_stats<=10])

Many locations have 10 or fewer listings, so we use `<= 10` as the threshold for relabelling a location as `other`.

In [32]:
# Locations with 10 or fewer listings (the filter is <=, despite the variable name).
location_stats_less_than_10 = location_stats[location_stats<=10]
location_stats_less_than_10

In [33]:
# Distinct locations before relabelling, for comparison with the next cell.
len(df5.location.unique())

In [34]:
# Relabel rare locations as 'other'. Membership is tested against the Series
# index (the location names), which is what `x in <Series>` checks implicitly.
df5['location'] = df5['location'].apply(
    lambda name: name if name not in location_stats_less_than_10.index else 'other'
)
len(df5['location'].unique())

In [35]:
# Preview the first 10 rows after the location relabelling.
df5.head(10)

***Outlier detection***

In this dataset we check the ratio of total square feet to the number of bedrooms.

In [36]:
# First five listings with under 300 sqft per bedroom.
df5[df5.total_sqft/df5.bhk<300].head()

The rows shown above are implausible: for example, 1020 sqft with 6 bedrooms, or 600 sqft with 8 bedrooms. Listings with less than 300 sqft per bedroom are treated as outliers.

We remove all these

In [37]:
# Shape before the sqft-per-bedroom filter.
df5.shape

In [38]:
# Keep every row NOT flagged as < 300 sqft per bedroom. Because the mask is
# negated, rows where the comparison is False for another reason (e.g. a
# missing total_sqft) are kept as well.
df6 = df5[~(df5.total_sqft/df5.bhk<300)]
df6

In [39]:
# Shape after the filter.
df6.shape

In [40]:
# Summary statistics of price_per_sqft across all remaining listings.
df6.price_per_sqft.describe()

In [41]:
def remove_pps_outliers(df):
    """Drop price-per-sqft outliers separately within each location.

    For every ``location`` group, rows are kept only when ``price_per_sqft``
    lies in the half-open interval (mean - std, mean + std], with mean and std
    computed by ``np.mean`` / ``np.std`` on that group alone.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``location`` and ``price_per_sqft`` columns.

    Returns
    -------
    pandas.DataFrame
        The surviving rows with a fresh 0..n-1 index. When ``df`` has no
        groups, an empty frame with the same columns as ``df`` is returned.
    """
    kept = []
    for _, subdf in df.groupby('location'):
        m = np.mean(subdf.price_per_sqft)
        st = np.std(subdf.price_per_sqft)
        kept.append(subdf[(subdf.price_per_sqft > (m - st)) & (subdf.price_per_sqft <= (m + st))])

    if not kept:
        # pd.concat raises on an empty list; preserve the input schema instead.
        return df.iloc[0:0].reset_index(drop=True)

    # Concatenate once: appending inside the loop re-copies all rows each time.
    return pd.concat(kept, ignore_index=True)
# Apply the per-location price-per-sqft filter and report the resulting shape.
df7 = remove_pps_outliers(df6)
df7.shape

Checking the variation of prices in every location

In [42]:
import matplotlib.pyplot as plt

In [43]:
def plot_scatter_chart(df,location):
    """Scatter price against total_sqft for the 2- and 3-BHK rows of one location.

    2 BHK rows are drawn in blue with the default marker and 3 BHK rows as
    green '+' markers. Note that this sets the global matplotlib
    ``figure.figsize`` to (15,10) as a side effect and draws through pyplot.

    df must provide location, bhk, total_sqft and price columns; location is
    the value matched against df.location and is also used as the title.
    """
    in_location = df.location == location
    two_bed, three_bed = (df[in_location & (df.bhk == rooms)] for rooms in (2, 3))

    matplotlib.rcParams['figure.figsize'] = (15,10)

    plt.scatter(two_bed.total_sqft, two_bed.price, color='blue', label='2 BHK', s=50)
    plt.scatter(three_bed.total_sqft, three_bed.price, marker='+', color='green', label='3 BHK', s=50)

    plt.xlabel("Total Square Feet Area")
    plt.ylabel("Price")
    plt.title(location)
    plt.legend()

# 2 BHK vs 3 BHK prices in Hebbal, before the bedroom-based outlier filter.
plot_scatter_chart(df7,'Hebbal')

In [44]:
def remove_bhk_outliers(df):
    """Drop listings priced per sqft below the mean of the next-smaller bedroom count.

    Within each ``location``, a row with ``bhk`` bedrooms is removed when its
    ``price_per_sqft`` is lower than the mean ``price_per_sqft`` of the rows
    with ``bhk - 1`` bedrooms in the same location, provided that smaller
    group has more than 5 rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``location``, ``bhk`` and ``price_per_sqft`` columns.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` without the excluded rows; the original index labels
        are preserved.
    """
    # Collect labels in a plain list: a float NumPy accumulator would coerce
    # integer index labels to float64 (lossy above 2**53) and re-copy itself
    # on every append.
    exclude_indices = []
    for _, location_df in df.groupby('location'):
        # Materialise the groups once; they are needed for both passes.
        bhk_groups = list(location_df.groupby('bhk'))
        bhk_stats = {
            bhk: {
                'mean': np.mean(bhk_df.price_per_sqft),
                'count': bhk_df.shape[0],
            }
            for bhk, bhk_df in bhk_groups
        }
        for bhk, bhk_df in bhk_groups:
            stats = bhk_stats.get(bhk - 1)
            # Only trust the smaller-bedroom mean when it rests on > 5 rows.
            if stats and stats['count'] > 5:
                exclude_indices.extend(bhk_df[bhk_df.price_per_sqft < stats['mean']].index)
    return df.drop(index=exclude_indices)

# Apply the bedroom-based outlier filter and report the resulting shape.
df8 = remove_bhk_outliers(df7)
df8.shape

In [45]:
# Same Hebbal scatter as before, now on the filtered frame, for a before/after comparison.
plot_scatter_chart(df8,'Hebbal')

In [46]:
# Restore the larger default figure size (plot_scatter_chart sets it to (15,10)).
matplotlib.rcParams["figure.figsize"] = (20,10)
# Histogram of price_per_sqft after outlier removal; rwidth=0.8 leaves gaps between bars.
plt.hist(df8.price_per_sqft,rwidth=0.8)
plt.xlabel("Price Per Square Feet")
plt.ylabel("Count")

The histogram is roughly bell-shaped, close to a normal distribution.

In [47]:
# Distinct bathroom counts.
df8.bath.unique()

In [48]:
# Listings with more than 10 bathrooms.
df8[df8.bath>10]

To reduce outliers, we keep only listings whose number of bathrooms is less than the number of bedrooms plus 2; listings with two or more bathrooms beyond the bedroom count are dropped. Listings with fewer bathrooms than bedrooms are kept.

In [49]:
# Histogram of bathroom counts.
plt.hist(df8.bath,rwidth=0.8)
plt.xlabel("Number of bathrooms")
plt.ylabel("Count")

In [50]:
# Rows with strictly more than bhk + 2 bathrooms.
df8[df8.bath > df8.bhk+2]

In [51]:
# Keep rows with strictly fewer than bhk + 2 bathrooms.
# NOTE(review): this also drops rows where bath == bhk + 2, which the preview
# cell above (bath > bhk + 2) does not show — confirm whether `<=` was intended.
df9 = df8[df8.bath < df8.bhk+2]
df9.shape

In [52]:
# Drop 'size' (bhk was derived from it) and price_per_sqft (computed from price, and used above only for outlier filtering).
df10 = df9.drop(['size','price_per_sqft'],axis = 'columns')
df10.head(3)

Creating dummy (one-hot) columns for `location`.

In [53]:
# One indicator column per distinct location value.
dummies = pd.get_dummies(df10.location)
dummies.head(3)

In [54]:
# Attach the indicators but leave out the 'other' column: a row whose remaining
# indicators are all 0 then stands for 'other'.
df11 = pd.concat([df10,dummies.drop('other',axis='columns')],axis = 'columns')
df11.head(3)

In [55]:
# The text location column is now redundant with the indicator columns.
df12 = df11.drop('location',axis='columns')
df12.head(2)

In [56]:
# Feature matrix: everything except the target column.
X = df12.drop('price',axis='columns')
X.head()

In [57]:
# Target vector.
y = df12.price
y.head()

In [58]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; random_state fixes the shuffle so the split is reproducible.
X_train,X_test, y_train, y_test = train_test_split(X,y,test_size = 0.2,random_state = 10)

In [59]:
from sklearn.linear_model import LinearRegression
lr_clf = LinearRegression()
lr_clf.fit(X_train,y_train)
# score() of a regressor is the R^2 coefficient, evaluated here on the held-out test split.
lr_clf.score(X_test,y_test)

In [69]:
# Hand-built query: 1000 sqft, 3 bathrooms, 3 bedrooms. Every location
# indicator stays 0, which corresponds to the dropped 'other' column.
# Features are addressed by column name rather than by position, so the query
# stays correct if the column order of X ever changes.
x = np.zeros(len(X.columns))
x[X.columns.get_loc('total_sqft')] = 1000
x[X.columns.get_loc('bath')] = 3
x[X.columns.get_loc('bhk')] = 3
# The model was fitted on a DataFrame, so predict on a one-row frame carrying
# the same column names; a bare array makes scikit-learn warn about missing
# feature names.
lr_clf.predict(pd.DataFrame([x], columns=X.columns))