In [67]:
import pandas as pd
import numpy as np

In [68]:
# Read the raw CSV into `df`, report its (rows, columns) shape, and preview
# the first rows.
df = pd.read_csv("Bengaluru_House_Data_Final.csv")
print("Shape:", df.shape)
# Last expression of the cell, so the notebook renders it as the cell output.
df.head()

Shape: (13320, 10)


Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price,Distance_to_nearest_MRT_station_m
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07,1249.45
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0,3901.43
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0,1678.39
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0,3197.32
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0,2312.04


In [69]:
# Report, per column, how many entries are missing.
print("Null Values:")
print(df.isnull().sum())

Null Values:
area_type                               0
availability                            0
location                                1
size                                   16
society                              5502
total_sqft                              0
bath                                   73
balcony                               609
price                                   0
Distance_to_nearest_MRT_station_m       0
dtype: int64


In [78]:
# Reload fresh dataset
# Re-imports pandas/numpy and re-reads the raw CSV, replacing whatever `df`
# currently holds.
# NOTE(review): presumably this exists so the cleaning steps that follow start
# from unmodified data after earlier experimentation — confirm, and consider
# relying on a single import cell and a single load cell at the top instead.
import pandas as pd
import numpy as np

df = pd.read_csv("Bengaluru_House_Data_Final.csv")
print("Dataset loaded fresh!")
print("Shape:", df.shape)
print("Columns:", df.columns.tolist())

Dataset loaded fresh!
Shape: (13320, 10)
Columns: ['area_type', 'availability', 'location', 'size', 'society', 'total_sqft', 'bath', 'balcony', 'price', 'Distance_to_nearest_MRT_station_m']


In [79]:
# 1. Drop society (41% missing)
# errors='ignore' keeps this cell re-runnable: if `df` has already been
# cleaned and no longer has the column, drop() is a no-op instead of raising
# KeyError.
df = df.drop(columns='society', errors='ignore')

# 2. Drop null location
# .copy() makes the filtered result an independent frame, so the column
# assignment on the next line is not a write into a slice of the previous
# `df` (which can raise SettingWithCopyWarning).
df = df.dropna(subset=['location']).copy()
# Normalise surrounding whitespace in the location labels.
df['location'] = df['location'].astype(str).str.strip()

# 3. Fix total_sqft ranges like "2100-2850"
def convert_sqft(val):
    """Convert a raw ``total_sqft`` entry to a float.

    Accepts a plain number ("1056") or a two-ended range ("2100 - 2850");
    a range is replaced by the midpoint of its two bounds.

    Parameters
    ----------
    val : object
        Raw cell value. It is stringified and stripped before parsing.

    Returns
    -------
    float
        The parsed number, the range midpoint, or ``np.nan`` when the text
        is neither a number nor a well-formed two-part range (for example
        it carries a unit suffix, or has more than one '-').
    """
    text = str(val).strip()

    if '-' in text:
        bounds = text.split('-')
        # Require exactly one separator: a value such as "1-2-3" is
        # malformed, not a range, so it must not be averaged.
        if len(bounds) != 2:
            return np.nan
        try:
            low, high = float(bounds[0]), float(bounds[1])
        except ValueError:
            return np.nan
        return (low + high) / 2

    try:
        return float(text)
    except ValueError:
        # Only parse failures map to NaN; anything else (e.g. a
        # KeyboardInterrupt) is allowed to propagate.
        return np.nan

df['total_sqft'] = df['total_sqft'].apply(convert_sqft)
# Rows whose area could not be parsed (NaN) are discarded. .copy() makes the
# result an independent frame, so the fills below are plain column
# assignments rather than writes into a slice (which can raise
# SettingWithCopyWarning).
df = df.dropna(subset=['total_sqft']).copy()

# 4. Fill missing values
# bath: median of the column; balcony and size: most frequent value
# (mode()[0] is the first of the modes if there are ties).
df['bath'] = df['bath'].fillna(df['bath'].median())
df['balcony'] = df['balcony'].fillna(df['balcony'].mode()[0])
df['size'] = df['size'].fillna(df['size'].mode()[0])

# 5. Remove bath outliers (>10)
# .copy() again yields an independent frame, so later column assignments on
# `df` do not target a filtered slice.
df = df[df['bath'] <= 10].copy()

# 6. Remove size outliers (>10 BHK)
def get_bhk(size):
    """Extract the leading room count from a size label such as "2 BHK".

    Parameters
    ----------
    size : object
        Raw ``size`` value. It is stringified and split on whitespace.

    Returns
    -------
    int or float
        The first whitespace-separated token as an ``int``, or ``np.nan``
        when the label is blank or does not start with an integer.
    """
    try:
        return int(str(size).split()[0])
    except (ValueError, IndexError):
        # IndexError: blank label (no tokens). ValueError: first token is not
        # an integer. Anything else is allowed to propagate.
        return np.nan

# Filter on the parsed room count directly instead of adding and then
# dropping a temporary 'bhk' column: `df` is a filtered frame at this point,
# and assigning a new column to it can raise SettingWithCopyWarning.
# Labels that fail to parse yield NaN, and NaN <= 10 is False, so those rows
# are removed as well. .copy() leaves `df` as an independent frame.
df = df[df['size'].apply(get_bhk) <= 10].copy()

print("Data Cleaning Done!")
print("Final Shape:", df.shape)

Data Cleaning Done!
Final Shape: (13250, 9)


In [None]:
# Summary of the current frame: dimensions, column names, and per-column
# missing-value counts, each printed beneath its own heading.
print("=== SHAPE ===", df.shape, sep="\n")

print("\n=== COLUMNS ===", list(df.columns), sep="\n")

print("\n=== NULL VALUES ===", df.isnull().sum(), sep="\n")

# The preview is the cell's final expression, so it is rendered as a table
# below the printed heading.
print("\n=== SAMPLE DATA ===")
df.head(n=10)

=== SHAPE ===
(13250, 9)

=== COLUMNS ===
['area_type', 'availability', 'location', 'size', 'total_sqft', 'bath', 'balcony', 'price', 'Distance_to_nearest_MRT_station_m']

=== NULL VALUES ===
area_type                            0
availability                         0
location                             0
size                                 0
total_sqft                           0
bath                                 0
balcony                              0
price                                0
Distance_to_nearest_MRT_station_m    0
dtype: int64

=== SAMPLE DATA ===


Unnamed: 0,area_type,availability,location,size,total_sqft,bath,balcony,price,Distance_to_nearest_MRT_station_m
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,1056.0,2.0,1.0,39.07,1249.45
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,2600.0,5.0,3.0,120.0,3901.43
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,1440.0,2.0,3.0,62.0,1678.39
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,1521.0,3.0,1.0,95.0,3197.32
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,1200.0,2.0,1.0,51.0,2312.04
5,Super built-up Area,Ready To Move,Whitefield,2 BHK,1170.0,2.0,1.0,38.0,987.19
6,Super built-up Area,18-May,Old Airport Road,4 BHK,2732.0,4.0,2.0,204.0,869.7
7,Super built-up Area,Ready To Move,Rajaji Nagar,4 BHK,3300.0,4.0,2.0,600.0,733.09
8,Super built-up Area,Ready To Move,Marathahalli,3 BHK,1310.0,3.0,1.0,63.25,1521.34
9,Plot Area,Ready To Move,Gandhi Bazar,6 Bedroom,1020.0,6.0,2.0,370.0,3416.15


In [None]:
# Save cleaned dataset
# index=False keeps the DataFrame's row index out of the file, so the CSV
# holds only the data columns.
df.to_csv("Bengaluru_House_Cleaned.csv", index=False)
print("Cleaned dataset saved as Bengaluru_House_Cleaned.csv")

Cleaned dataset saved as Bengaluru_House_Cleaned.csv
