In [1]:
import pandas as pd
df = pd.read_csv("Bengaluru_House_Data.csv")

In [2]:
df.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   area_type     13320 non-null  object 
 1   availability  13320 non-null  object 
 2   location      13319 non-null  object 
 3   size          13304 non-null  object 
 4   society       7818 non-null   object 
 5   total_sqft    13320 non-null  object 
 6   bath          13247 non-null  float64
 7   balcony       12711 non-null  float64
 8   price         13320 non-null  float64
dtypes: float64(3), object(6)
memory usage: 936.7+ KB


In [4]:
df_clean = df.drop(['availability', 'society', 'balcony'], axis='columns')

In [5]:
df_clean.head()

Unnamed: 0,area_type,location,size,total_sqft,bath,price
0,Super built-up Area,Electronic City Phase II,2 BHK,1056,2.0,39.07
1,Plot Area,Chikka Tirupathi,4 Bedroom,2600,5.0,120.0
2,Built-up Area,Uttarahalli,3 BHK,1440,2.0,62.0
3,Super built-up Area,Lingadheeranahalli,3 BHK,1521,3.0,95.0
4,Super built-up Area,Kothanur,2 BHK,1200,2.0,51.0


In [6]:
print(df_clean.isnull().sum())

area_type      0
location       1
size          16
total_sqft     0
bath          73
price          0
dtype: int64


In [7]:
df_clean = df_clean.dropna()

In [8]:
df_clean

Unnamed: 0,area_type,location,size,total_sqft,bath,price
0,Super built-up Area,Electronic City Phase II,2 BHK,1056,2.0,39.07
1,Plot Area,Chikka Tirupathi,4 Bedroom,2600,5.0,120.00
2,Built-up Area,Uttarahalli,3 BHK,1440,2.0,62.00
3,Super built-up Area,Lingadheeranahalli,3 BHK,1521,3.0,95.00
4,Super built-up Area,Kothanur,2 BHK,1200,2.0,51.00
...,...,...,...,...,...,...
13315,Built-up Area,Whitefield,5 Bedroom,3453,4.0,231.00
13316,Super built-up Area,Richards Town,4 BHK,3600,5.0,400.00
13317,Built-up Area,Raja Rajeshwari Nagar,2 BHK,1141,2.0,60.00
13318,Super built-up Area,Padmanabhanagar,4 BHK,4689,4.0,488.00


In [9]:
print(df_clean.isnull().sum())
print(f"\nOriginal shape: {df.shape}, New shape: {df_clean.shape}")

area_type     0
location      0
size          0
total_sqft    0
bath          0
price         0
dtype: int64

Original shape: (13320, 9), New shape: (13246, 6)


In [10]:
df_clean['bhk'] = df_clean['size'].apply(lambda x: int(x.split(' ')[0]))

In [11]:
df_clean = df_clean.drop('size', axis='columns')

In [12]:
df_clean

Unnamed: 0,area_type,location,total_sqft,bath,price,bhk
0,Super built-up Area,Electronic City Phase II,1056,2.0,39.07,2
1,Plot Area,Chikka Tirupathi,2600,5.0,120.00,4
2,Built-up Area,Uttarahalli,1440,2.0,62.00,3
3,Super built-up Area,Lingadheeranahalli,1521,3.0,95.00,3
4,Super built-up Area,Kothanur,1200,2.0,51.00,2
...,...,...,...,...,...,...
13315,Built-up Area,Whitefield,3453,4.0,231.00,5
13316,Super built-up Area,Richards Town,3600,5.0,400.00,4
13317,Built-up Area,Raja Rajeshwari Nagar,1141,2.0,60.00,2
13318,Super built-up Area,Padmanabhanagar,4689,4.0,488.00,4


In [13]:
def convert_sqft_to_num(x):
    areas = x.split('-')

    if len(areas) == 2:
        try:
            avg_area = (float(areas[0])+float(areas[1]))/2
        except ValueError:
            return None
    try: 
        return float(x)
    except:
        return None

df_clean = df_clean.dropna().copy()
df_clean['area'] = df_clean['total_sqft'].apply(convert_sqft_to_num)

In [14]:
df_clean = df_clean.drop('total_sqft', axis='columns')

In [15]:
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
Index: 13246 entries, 0 to 13319
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   area_type  13246 non-null  object 
 1   location   13246 non-null  object 
 2   bath       13246 non-null  float64
 3   price      13246 non-null  float64
 4   bhk        13246 non-null  int64  
 5   area       13056 non-null  float64
dtypes: float64(3), int64(1), object(2)
memory usage: 724.4+ KB


In [16]:
df_clean['price_per_sqft'] = (df_clean['price']*100000) / df_clean['area']

In [27]:
#Remove Outliers
#BHK outlier - let's see
df_step1 = df_clean[~(df_clean['area']/ df_clean['bhk'] < 300)]
print(f"Original shape: {df_clean.shape}")
print(f"Shape after removing BHK outliers: {df_step1.shape}")
#Bathroom outlier
df_step2 = df_step1[~(df_step1['bath'] > df_step1['bhk'] + 2)]
print(f"Original shape: {df_step1.shape}")
print(f"Shape after removing BHK outliers: {df_step2.shape}")

Original shape: (13246, 7)
Shape after removing BHK outliers: (12502, 7)
Original shape: (12502, 7)
Shape after removing BHK outliers: (12493, 7)


In [32]:
import numpy as np

def remove_pps_outliers(df):
    df_out = pd.DataFrame()
    # Group by location, and for each location...
    for key, subdf in df.groupby('location'):
        m = np.mean(subdf.price_per_sqft)
        st = np.std(subdf.price_per_sqft)
        
        # Keep only the data points within 1 standard deviation
        reduced_df = subdf[(subdf.price_per_sqft > (m - st)) & (subdf.price_per_sqft <= (m + st))]
        
        # Add this cleaned subset back to our final DataFrame
        df_out = pd.concat([df_out, reduced_df], ignore_index=True)
    return df_out

# Apply the function to our DataFrame
df_final = remove_pps_outliers(df_step2)

print(f"Shape before price outliers: {df_step2.shape}")
print(f"Shape after removing price outliers: {df_final.shape}")

Shape before price outliers: (12493, 7)
Shape after removing price outliers: (9148, 7)


In [33]:
output_filename = 'cleaned_bengaluru_house_data.csv'
df_final.to_csv(output_filename, index=False)