In [None]:
# import all the packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# read the data
# The CSV path is relative, so the file must sit in the notebook's working directory.
df = pd.read_csv('Bengaluru_House_Data.csv')
# Preview the first rows (head() defaults to five).
df.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


In [None]:
# check the shape of the data as a (rows, columns) tuple
df.shape

(13320, 9)

In [None]:
# check the columns: list the column labels of the loaded frame
df.columns

Index(['area_type', 'availability', 'location', 'size', 'society',
       'total_sqft', 'bath', 'balcony', 'price'],
      dtype='object')

In [None]:
# check the info: per-column non-null counts and dtypes, plus memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   area_type     13320 non-null  object 
 1   availability  13320 non-null  object 
 2   location      13319 non-null  object 
 3   size          13304 non-null  object 
 4   society       7818 non-null   object 
 5   total_sqft    13320 non-null  object 
 6   bath          13247 non-null  float64
 7   balcony       12711 non-null  float64
 8   price         13320 non-null  float64
dtypes: float64(3), object(6)
memory usage: 936.7+ KB


In [None]:
# Count the missing entries in every column of the raw data.
df.isna().sum()

area_type          0
availability       0
location           1
size              16
society         5502
total_sqft         0
bath              73
balcony          609
price              0
dtype: int64

In [None]:
# How many distinct society names are there (missing values are not counted)?
df['society'].nunique()

2688

In [None]:
# Inspect the rows that have no society recorded.
df.loc[df['society'].isna()]

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.00
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.00
8,Super built-up Area,Ready To Move,Marathahalli,3 BHK,,1310,3.0,1.0,63.25
9,Plot Area,Ready To Move,Gandhi Bazar,6 Bedroom,,1020,6.0,,370.00
10,Super built-up Area,18-Feb,Whitefield,3 BHK,,1800,2.0,2.0,70.00
...,...,...,...,...,...,...,...,...,...
13310,Super built-up Area,Ready To Move,Rachenahalli,2 BHK,,1050,2.0,2.0,52.71
13311,Plot Area,Ready To Move,Ramamurthy Nagar,7 Bedroom,,1500,9.0,2.0,250.00
13312,Super built-up Area,Ready To Move,Bellandur,2 BHK,,1262,2.0,2.0,47.00
13316,Super built-up Area,Ready To Move,Richards Town,4 BHK,,3600,5.0,,400.00


In [None]:
# drop the column society
# 'society' is missing in 5502 of the 13320 rows and has 2688 distinct values
# (presumably why it is discarded instead of imputed).
# Reassigning with errors="ignore" keeps this cell safe to re-run: a second
# execution is a no-op, whereas drop(..., inplace=True) raises KeyError once the
# column is already gone.
df = df.drop(columns="society", errors="ignore")


In [None]:
# Re-count the missing entries per column now that 'society' has been removed.
df.isna().sum()

area_type         0
availability      0
location          1
size             16
total_sqft        0
bath             73
balcony         609
price             0
dtype: int64

In [None]:
# fill the missing value in balcony with 0
# Assign the filled Series back to the column. Calling fillna(inplace=True) on
# df["balcony"] mutates an intermediate object: pandas 2.x emits a FutureWarning
# for it, and with Copy-on-Write enabled df itself is left unchanged.
df["balcony"] = df["balcony"].fillna(0)

In [None]:
# Inspect the rows where the size description is missing.
df.loc[df['size'].isna()]

Unnamed: 0,area_type,availability,location,size,total_sqft,bath,balcony,price
579,Plot Area,Immediate Possession,Sarjapur Road,,1200 - 2400,,0.0,34.185
1775,Plot Area,Immediate Possession,IVC Road,,2000 - 5634,,0.0,124.0
2264,Plot Area,Immediate Possession,Banashankari,,2400,,0.0,460.0
2809,Plot Area,Immediate Possession,Sarjapur Road,,1200 - 2400,,0.0,28.785
2862,Plot Area,Immediate Possession,Devanahalli,,1500 - 2400,,0.0,46.8
5333,Plot Area,Immediate Possession,Devanahalli,,2100 - 5405,,0.0,177.115
6423,Plot Area,Immediate Possession,Whitefield,,2324,,0.0,26.73
6636,Plot Area,Immediate Possession,Jigani,,1500,,0.0,25.49
6719,Plot Area,Immediate Possession,Hoskote,,800 - 2660,,0.0,28.545
7680,Plot Area,Immediate Possession,Kasavanhalli,,5000,,0.0,400.0


In [None]:
# Replace missing size descriptions with 0.
# The result is assigned back to the column: fillna(inplace=True) on df['size']
# acts on an intermediate object, triggers a FutureWarning on pandas 2.x and
# does not update df when Copy-on-Write is enabled.
# NOTE: the placeholder is the integer 0 inside an otherwise text column;
# cleanbhk() stringifies every value before parsing, so it still converts cleanly.
df['size'] = df['size'].fillna(0)

In [None]:
# Re-count the missing entries per column after filling balcony and size.
df.isna().sum()

area_type        0
availability     0
location         1
size             0
total_sqft       0
bath            73
balcony          0
price            0
dtype: int64

In [None]:
# Collect the rows whose bathroom count is missing, then see how they are
# distributed across the area types.
mss_bath = df.loc[df['bath'].isna()]
mss_bath['area_type'].value_counts()

Super built-up  Area    49
Plot  Area              16
Built-up  Area           8
Name: area_type, dtype: int64

In [None]:
# Missing-bath rows of the 'Super built-up' area type.
# The label is written with two spaces before 'Area', as in the value_counts() listing.
mss_bath.loc[mss_bath['area_type'] == 'Super built-up  Area']

Unnamed: 0,area_type,availability,location,size,total_sqft,bath,balcony,price
224,Super built-up Area,19-Dec,Devanahalli,3 BHK,1520 - 1740,,0.0,74.82
344,Super built-up Area,21-Dec,Kanakpura Road,1 BHK,525,,0.0,21.53
669,Super built-up Area,18-Dec,JP Nagar,5 BHK,4400 - 6640,,0.0,375.0
702,Super built-up Area,18-Dec,JP Nagar,5 BHK,4400 - 6800,,0.0,548.5
801,Super built-up Area,18-Dec,JP Nagar,4 BHK,4000 - 5249,,0.0,453.0
941,Super built-up Area,Ready To Move,Whitefield,4 Bedroom,3606 - 5091,,0.0,304.0
1267,Super built-up Area,18-Jun,Yelahanka,3 BHK,1440 - 1884,,0.0,67.98
1686,Super built-up Area,21-Dec,Whitefield,1 BHK,660 - 670,,0.0,28.585
1724,Super built-up Area,Ready To Move,Thanisandra,1 BHK,620 - 933,,0.0,48.145
1765,Super built-up Area,19-Dec,Binny Pete,4 BHK,2695 - 2940,,0.0,204.0


In [None]:
# All rows of the 'Super built-up' area type, for comparison with the missing-bath subset.
# The label is written with two spaces before 'Area', as in the value_counts() listing.
df.loc[df['area_type'] == 'Super built-up  Area']

Unnamed: 0,area_type,availability,location,size,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,1056,2.0,1.0,39.07
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,1521,3.0,1.0,95.00
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,1200,2.0,1.0,51.00
5,Super built-up Area,Ready To Move,Whitefield,2 BHK,1170,2.0,1.0,38.00
6,Super built-up Area,18-May,Old Airport Road,4 BHK,2732,4.0,0.0,204.00
...,...,...,...,...,...,...,...,...
13313,Super built-up Area,Ready To Move,Uttarahalli,3 BHK,1345,2.0,1.0,57.00
13314,Super built-up Area,Ready To Move,Green Glen Layout,3 BHK,1715,3.0,3.0,112.00
13316,Super built-up Area,Ready To Move,Richards Town,4 BHK,3600,5.0,0.0,400.00
13318,Super built-up Area,18-Jun,Padmanabhanagar,4 BHK,4689,4.0,1.0,488.00


In [None]:
def cleanbhk(x):
    """Return the text that precedes the first space in ``str(x)``.

    The value is converted to a string first, so non-string inputs are
    accepted. When the string contains no space it is returned whole.

    Examples: '2 BHK' -> '2', '4 Bedroom' -> '4', 0 -> '0'.
    """
    leading, _, _ = str(x).partition(' ')
    return leading

In [None]:
# Keep only the leading token of every size entry (e.g. '2 BHK' -> '2').
# Series.map applies cleanbhk element-wise and returns a Series directly;
# DataFrame.applymap is deprecated since pandas 2.1 and needed a one-column
# DataFrame round-trip here.
df['size'] = df['size'].map(cleanbhk)

In [None]:
# Convert the extracted size tokens from text to 64-bit floats.
df['size'] = df['size'].astype(np.float64)

In [None]:
# Preview the frame after 'size' has been converted to a number.
df.head()

Unnamed: 0,area_type,availability,location,size,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2.0,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4.0,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3.0,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3.0,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2.0,1200,2.0,1.0,51.0


In [None]:
# Size and bath columns for the rows where the bathroom count is still missing.
s_bath_ms = df.loc[df['bath'].isna(), ['size', 'bath']]
s_bath_ms

Unnamed: 0,size,bath
56,4.0,
81,4.0,
224,3.0,
344,1.0,
579,0.0,
...,...,...
11496,1.0,
11569,0.0,
12768,5.0,
12861,4.0,


In [None]:
# Small toy frame: column A is complete, column B has gaps (NaN).
# The following cell fills B's gaps from A.
dd = pd.DataFrame({'A' : [1,2,3,4,5],
                   'B' : [np.nan,2,np.nan,np.nan,4]})
dd

Unnamed: 0,A,B
0,1,
1,2,2.0
2,3,
3,4,
4,5,4.0


In [None]:
# Where B is missing take the value of A from the same row; existing B values stay as they are.
dd['B'] = dd['B'].fillna(dd['A'])

In [None]:
# Show the toy frame again to inspect column B after the fill.
dd

Unnamed: 0,A,B
0,1,1.0
1,2,2.0
2,3,3.0
3,4,4.0
4,5,4.0


In [None]:
# Preview the housing frame before filling the missing bath values.
df.head()

Unnamed: 0,area_type,availability,location,size,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2.0,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4.0,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3.0,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3.0,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2.0,1200,2.0,1.0,51.0


In [None]:
# Impute each missing bathroom count with that row's numeric 'size';
# rows that already have a bath value are left untouched.
# NOTE: rows whose size was itself missing carry size 0, so they receive bath = 0.
df['bath'] = df['bath'].fillna(df['size'])

In [None]:
# Re-count the missing entries per column after imputing bath.
df.isna().sum()

area_type       0
availability    0
location        0
size            0
total_sqft      0
bath            0
balcony         0
price           0
dtype: int64

In [None]:
# Discard every row that still contains a missing value in any column.
df = df.dropna()

In [None]:
# Preview the frame after rows with missing values were dropped.
df.head()

Unnamed: 0,area_type,availability,location,size,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2.0,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4.0,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3.0,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3.0,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2.0,1200,2.0,1.0,51.0


In [None]:
# List the distinct values that occur in the availability column.
df['availability'].unique()

array(['19-Dec', 'Ready To Move', '18-May', '18-Feb', '18-Nov', '20-Dec',
       '17-Oct', '21-Dec', '19-Sep', '20-Sep', '18-Mar', '20-Feb',
       '18-Apr', '20-Aug', '18-Oct', '19-Mar', '17-Sep', '18-Dec',
       '17-Aug', '19-Apr', '18-Jun', '22-Dec', '22-Jan', '18-Aug',
       '19-Jan', '17-Jul', '18-Jul', '21-Jun', '20-May', '19-Aug',
       '18-Sep', '17-May', '17-Jun', '21-May', '18-Jan', '20-Mar',
       '17-Dec', '16-Mar', '19-Jun', '22-Jun', '19-Jul', '21-Feb',
       'Immediate Possession', '19-May', '17-Nov', '20-Oct', '20-Jun',
       '19-Feb', '21-Oct', '21-Jan', '17-Mar', '17-Apr', '22-May',
       '19-Oct', '21-Jul', '21-Nov', '21-Mar', '16-Dec', '22-Mar',
       '20-Jan', '21-Sep', '21-Aug', '14-Nov', '19-Nov', '15-Nov',
       '16-Jul', '15-Jun', '17-Feb', '20-Nov', '20-Jul', '16-Sep',
       '15-Oct', '15-Dec', '16-Oct', '22-Nov', '15-Aug', '17-Jan',
       '16-Nov', '20-Apr', '16-Jan', '14-Jul'], dtype=object)

In [None]:
# TODO: check the summary statistics (describe) of the numeric columns
# TODO: check the summary statistics of the numeric as well as the categorical columns
# TODO: check the unique values in area_type
# TODO: check the frequency of each area type (value_counts)


In [None]:
# TODO: check the average price for each area_type
# TODO: check the average price grouped by area_type and the size of the property
# TODO: check the correlation between total_sqft and price
# TODO: check the distribution of price
