In [1]:
import pandas as pd
import numpy as np

# Load the housing data from a CSV file in the working directory,
# report its (rows, columns) shape, and preview the first five rows.
df = pd.read_csv("Bengaluru_House_Data.csv")
print("Shape of dataset:", df.shape)
df.head()


Shape of dataset: (13320, 9)


Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


In [2]:
# Normalise the column labels: trim surrounding whitespace, collapse every
# run of non-alphanumeric characters into a single underscore, lower-case.
df.columns = (
    df.columns
    .str.strip()
    .str.replace('[^A-Za-z0-9]+', '_', regex=True)
    .str.lower()
)
df.columns


Index(['area_type', 'availability', 'location', 'size', 'society',
       'total_sqft', 'bath', 'balcony', 'price'],
      dtype='object')

In [3]:
# 3. Check and handle missing values
# NOTE: Jupyter renders only the last expression of a cell, so this
# "before" count is computed but not displayed.
df.isnull().sum()

# Impute missing values: column median for the numeric columns,
# an explicit "Unknown" label for the text columns.
df['bath'] = df['bath'].fillna(df['bath'].median())
df['balcony'] = df['balcony'].fillna(df['balcony'].median())
df['size'] = df['size'].fillna("Unknown")
df['location'] = df['location'].fillna("Unknown")

# Drop 'society' column as it has many missing values.
# errors='ignore' keeps this cell re-runnable: on a second run the column
# is already gone and a plain drop would raise KeyError.
df = df.drop(columns=['society'], errors='ignore')
df.isnull().sum()


area_type       0
availability    0
location        0
size            0
total_sqft      0
bath            0
balcony         0
price           0
dtype: int64

In [4]:
# 4. Derive the numeric 'bhk' column from the leading token of 'size'
#    (e.g. "2 BHK" -> 2, "4 Bedroom" -> 4); the "Unknown" label maps to 0.
df['bhk'] = df['size'].map(
    lambda label: 0 if label == "Unknown" else int(label.split(' ')[0])
)

# Clean 'total_sqft' column: convert range values like '2100 - 2850' to average
def convert_sqft_to_num(x):
    """Convert one raw 'total_sqft' cell to a float.

    Parameters
    ----------
    x : object
        A plain number as text ('1056'), a range as text ('2100 - 2850'),
        or a value that is already numeric.

    Returns
    -------
    float or None
        * range string: the mean of its first two '-'-separated parts
        * plain numeric string: its float value
        * non-string value: ``float(x)`` (so NaN stays NaN), which keeps
          already-converted values intact when the cell is re-run
        * anything that cannot be parsed (e.g. '34.46Sq. Meter'): None
    """
    # Non-string cells: pass numbers through instead of discarding them.
    # None and other non-numeric objects still map to None.
    if not isinstance(x, str):
        try:
            return float(x)
        except (TypeError, ValueError):
            return None

    try:
        if '-' in x:
            tokens = x.split('-')
            return (float(tokens[0]) + float(tokens[1])) / 2
        return float(x)
    except ValueError:
        # Unparseable text; the caller decides how to fill the gap.
        return None

# Parse every 'total_sqft' cell, then replace the values that could not be
# parsed with the median of the parsed column.
df['total_sqft'] = (
    df['total_sqft']
    .apply(convert_sqft_to_num)
    .pipe(lambda parsed: parsed.fillna(parsed.median()))
)


In [7]:
# 5. Example filter: listings located in 'Whitefield' with more than 2 bedrooms
df_subset = df.loc[df['location'].eq('Whitefield') & df['bhk'].gt(2)]
df_subset.head()


Unnamed: 0,area_type,availability,location,size,total_sqft,bath,balcony,price,bhk
10,Super built-up Area,18-Feb,Whitefield,3 BHK,1800.0,2.0,2.0,70.0,3
11,Plot Area,Ready To Move,Whitefield,4 Bedroom,2785.0,5.0,3.0,295.0,4
27,Built-up Area,20-Dec,Whitefield,3 BHK,1610.0,3.0,2.0,81.0,3
52,Built-up Area,Ready To Move,Whitefield,3 BHK,2010.0,3.0,2.0,91.0,3
62,Plot Area,Ready To Move,Whitefield,4 Bedroom,5700.0,5.0,3.0,650.0,4


In [8]:
# One-hot encode the listed categorical columns; drop_first=True omits the
# first level of each column from the generated dummy columns.
df_encoded = pd.get_dummies(
    df,
    columns=['area_type', 'availability', 'location'],
    drop_first=True,
)
df_encoded.head()


Unnamed: 0,size,total_sqft,bath,balcony,price,bhk,area_type_Carpet Area,area_type_Plot Area,area_type_Super built-up Area,availability_14-Nov,...,location_rr nagar,location_sankeswari,location_sapthagiri Layout,location_sarjapura main road,location_singapura paradise,location_t.c palya,location_tc.palya,location_vinayakanagar,"location_white field,kadugodi",location_whitefiled
0,2 BHK,1056.0,2.0,1.0,39.07,2,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False
1,4 Bedroom,2600.0,5.0,3.0,120.0,4,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False
2,3 BHK,1440.0,2.0,3.0,62.0,3,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,3 BHK,1521.0,3.0,1.0,95.0,3,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False
4,2 BHK,1200.0,2.0,1.0,51.0,2,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False


In [9]:
# Mean 'price' per location, highest first; show the top ten.
avg_price_by_location = (
    df.groupby('location')['price']
    .mean()
    .sort_values(ascending=False)
)
avg_price_by_location.head(10)


location
Cubbon Road            1900.000000
Ashok Nagar            1486.000000
Defence Colony         1167.714286
Yemlur                 1093.388889
Church Street          1068.000000
D Souza Layout         1015.000000
Sadashiva Nagar        1011.100000
Sindhi Colony           988.000000
Srinivas Colony         922.000000
5th Block Jayanagar     905.000000
Name: price, dtype: float64

In [10]:
# Filter 'price' outliers with the IQR rule: keep rows whose price lies
# within [Q1 - 1.5*IQR, Q3 + 1.5*IQR], both ends inclusive.
Q1, Q3 = df['price'].quantile(0.25), df['price'].quantile(0.75)
IQR = Q3 - Q1
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR

df_no_outliers = df[df['price'].between(lower_fence, upper_fence)]
print("Shape before removing outliers:", df.shape)
print("Shape after removing outliers:", df_no_outliers.shape)


Shape before removing outliers: (13320, 9)
Shape after removing outliers: (12044, 9)
