In [1]:
import pandas as pd
import numpy as np
from google.colab import files

# Upload and Load Data
uploaded = files.upload()
file_name = list(uploaded.keys())[0]
df = pd.read_csv(file_name)

Saving Mumbai Real Estate.csv to Mumbai Real Estate.csv


  df = pd.read_csv(file_name)


In [2]:
# Shape and first 5 rows
print(f"Shape: {df.shape}")
print(df.head())

# Data types
print("\n=== Data Types ===")
print(df.dtypes)

# Missing values
print("\n=== Missing Values ===")
print(df.isnull().sum())

# Descriptive statistics
print("\n=== Descriptive Statistics ===")
print(df.describe(include='all'))

Shape: (12685, 145)
      ID   Possession Status Availability Starts From Floor No Commercial  \
0  12685  Under Construction                  Dec '25        5          N   
1  12684       Ready to Move                      NaN       20          Y   
2  12683       Ready to Move                      NaN       18          N   
3  12682  Under Construction                  Dec '25        5          N   
4  12681  Under Construction                  Dec '24        8          Y   

                               Developer Approved Authority Name  \
0                                    NaN                    KDMC   
1  TATA Housing Development Company Ltd.                     TMC   
2                  Sai Satyam Developers                    KDMC   
3                          Birla Estates                    KDMC   
4                      Godrej Properties                     NaN   

   Units Available      Price Price (English)  ... Rentable CommuniPfty Space  \
0              1.0  3150000

In [10]:

# Initial Data Inspection
print("\nMissing Values Summary:")
missing_percent = df.isnull().sum().sort_values(ascending=False)/len(df)*100
print(missing_percent[missing_percent > 0].head(10))
# Column Selection and Removal
cols_to_drop = [col for col in df.columns
               if (missing_percent[col] > 90) or (df[col].nunique() == 1)]
df = df.drop(columns=cols_to_drop)



Missing Values Summary:
Series([], dtype: float64)


In [5]:
print(f"\nShape after dropping columns: {df.shape}")



Shape after dropping columns: (12685, 137)


In [6]:
cat_cols = df.select_dtypes(include=['object']).columns
for col in cat_cols:
    if df[col].isnull().sum() > 0:
        df[col] = df[col].fillna('Not Specified')

In [7]:
# Fill numeric columns
num_cols = df.select_dtypes(include=['int64','float64']).columns
for col in num_cols:
    if df[col].isnull().sum() > 0:
        df[col] = df[col].fillna(df[col].median())

In [8]:
df['Price'] = df['Price'].replace('[^0-9.]', '', regex=True).astype(float)

In [9]:
price_q1, price_q3 = df['Price'].quantile([0.25, 0.75])
price_iqr = price_q3 - price_q1
df = df[(df['Price'] > (price_q1 - 1.5*price_iqr)) &
        (df['Price'] < (price_q3 + 1.5*price_iqr))]

In [11]:
status_map = {'Ready to Move': 'Ready', 'Under Construction': 'Under Construction',
              'Ready To Move': 'Ready', 'Rtm': 'Ready'}
df['Possession Status'] = df['Possession Status'].map(status_map).fillna('Not Specified')

In [12]:
binary_cols = ['Commercial', 'Retail Boulevard (Retail Shops)',
               'Cycling & Jogging Track', 'Fire Fighting Equipment']
for col in binary_cols:
    if col in df.columns:
        df[col] = df[col].map({'Y': 1, 'N': 0, 'Yes': 1, 'No': 0}).fillna(0)


In [13]:
df['Year Available'] = pd.to_datetime(df['Availability Starts From'],
                                     format="%b '%y", errors='coerce').dt.year

In [14]:
if 'Units Available' in df.columns and 'Price' in df.columns:
    df['Price Per Unit'] = df['Price'] / df['Units Available'].replace(0, 1)

In [15]:
print("\nMissing Values After Cleaning:")
print(df.isnull().sum().sum())

print("\nFinal Data Types:")
print(df.dtypes.value_counts())


Missing Values After Cleaning:
8644

Final Data Types:
int64      90
object     34
float64    15
Name: count, dtype: int64


In [16]:
# Save Cleaned Data
clean_file = 'cleaned_real_estate_data.csv'
df.to_csv(clean_file, index=False)
files.download(clean_file)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>