In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from numpy.linalg import norm

# 1. LOAD THE LISTING-LEVEL DATA
#
# The input CSV holds the individual property listings. The location is kept
# in one named constant so it is easy to find and change on another machine.

LISTINGS_CSV = "C:/Users/cw/Downloads/Real Estate/clean_real_estate_forecast.csv"

df = pd.read_csv(LISTINGS_CSV)

# Show the first rows to confirm the file parsed as expected.
df.head()


Unnamed: 0,Id,City,Locality,Total_Area,Price_per_SQFT,Price_in_Cr,Property_Type,BHK_Type,Baths,Balcony,City_Level_Annual_CAGR,Locality_Level_Annual_CAGR,CAGR_source,1yr_Forecast_PriceSQFT,2yr_Forecast_PriceSQFT,3yr_Forecast_PriceSQFT
0,1,Chennai,Kanathur Reddikuppam,2583,7700,1.99,Flat,4 BHK,4,Yes,0.0619,,city,8176.63,8682.76,9220.23
1,2,Chennai,Pozhichalur,7000,3210,2.25,Independent House,10 BHK,6,Yes,0.0619,,city,3408.7,3619.7,3843.76
2,3,Chennai,West Tambaram,1320,7580,1.0,Flat,3 BHK,3,No,0.0619,,city,8049.2,8547.45,9076.53
3,4,Chennai,Triplicane,4250,7840,3.33,Independent House,7 BHK,5,Yes,0.0619,,city,8325.3,8840.63,9387.87
4,5,Chennai,Avadi,960,5000,0.48,Flat,2 BHK,3,Yes,0.0619,,city,5309.5,5638.16,5987.16


In [2]:
# 2. ROLL LISTINGS UP TO ONE ROW PER MARKET (CITY + LOCALITY)
#
# Price and area are summarised with the MEDIAN so a handful of extreme
# listings cannot skew a locality. The number of listing Ids is kept as a
# rough liquidity / demand proxy.

# output column -> (source column, aggregation)
market_aggregations = {
    "PPSF": ("Price_per_SQFT", "median"),   # pricing power of the locality
    "Price_Cr": ("Price_in_Cr", "median"),  # typical ticket size
    "Area": ("Total_Area", "median"),       # typical home size
    "Listings": ("Id", "count"),            # how active the market is
}

# as_index=False keeps City / Locality as ordinary columns.
market = df.groupby(["City", "Locality"], as_index=False).agg(**market_aggregations)



In [5]:
# 3. SEPARATE THIN MARKETS FROM STABLE ONES
#
# A locality with fewer than MIN_LISTINGS listings is treated as too thin to
# be a real market; it is set aside rather than clustered.

MIN_LISTINGS = 5

is_thin = market["Listings"] < MIN_LISTINGS
market["Is_Thin"] = is_thin

# Independent copies, so later column assignments do not touch `market`.
thin = market.loc[is_thin].copy()      # unreliable data
stable = market.loc[~is_thin].copy()   # real markets



In [6]:
# 4. FEATURE MATRIX FOR SEGMENTATION
#
# The four market-level variables the clustering is run on.

FEATURE_COLUMNS = ["PPSF", "Price_Cr", "Area", "Listings"]

X = stable[FEATURE_COLUMNS]
X.head()

Unnamed: 0,PPSF,Price_Cr,Area,Listings
1,3810.0,0.5,1500.0,7
2,4835.0,0.78,1475.0,14
5,4850.0,0.95,1970.0,5
6,6950.0,0.9,918.0,6
7,5050.0,0.55,1015.0,26


In [7]:
# 5. STANDARDISE THE FEATURES
#
# The raw columns live on very different numeric scales (PPSF in the
# thousands, Price_Cr around 1), so the largest one would dominate any
# distance computation. StandardScaler puts them on an equal footing.

scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)


In [8]:
# 6. K-MEANS: SPLIT THE STABLE MARKETS INTO 4 CLUSTERS
#
# Fixed random_state plus 30 restarts (n_init) for a repeatable result.

N_SEGMENTS = 4

kmeans = KMeans(n_clusters=N_SEGMENTS, n_init=30, random_state=42)
kmeans.fit(X_scaled)

# labels_ holds the cluster assigned to each fitted row, in row order.
stable["Cluster"] = kmeans.labels_





In [9]:
# 7. PROFILE EACH CLUSTER IN REAL UNITS
#
# The fitted centres are in standardised space; undoing the scaling expresses
# them in the original units so each cluster can be read directly.

profile_columns = ["Avg_PPSF", "Avg_Price_Cr", "Avg_Area", "Avg_Listings"]

centers = scaler.inverse_transform(kmeans.cluster_centers_)

# Row position i corresponds to k-means cluster id i; keep that id as a
# column so it is still available if the rows are reordered later.
cluster_profile = pd.DataFrame(centers, columns=profile_columns).assign(
    Cluster=lambda profile: profile.index
)



In [16]:
# 8. NAME THE SEGMENTS
#
# Clusters are ordered ascending by average PPSF (remaining columns only
# break ties) and the labels below are attached by position, so the first
# label goes to the lowest-PPSF cluster and the last to the highest.

sort_keys = ["Avg_PPSF", "Avg_Price_Cr", "Avg_Area", "Avg_Listings"]
cluster_profile = cluster_profile.sort_values(by=sort_keys, ignore_index=True)

segment_names = ["Matured", "Standard", "Luxury", "Opportunity"]
cluster_profile["Market_Segment"] = segment_names

# k-means cluster id -> segment name, applied to every stable market.
cluster_map = cluster_profile.set_index("Cluster")["Market_Segment"].to_dict()
stable["Market_Segment"] = stable["Cluster"].map(cluster_map)

In [17]:
# 9. DETECT BUBBLES / ABNORMAL MARKETS
#
# Some markets don't truly behave like their segment. For every stable market
# we measure the Euclidean distance (in standardised feature space) between
# the market and the centre of the cluster it was assigned to.
# Large distance = mispriced or unstable market.

# Markets whose distance lies above this quantile are flagged as outliers.
OUTLIER_QUANTILE = 0.97

# X_scaled and `stable` are matched purely by row position, so they must have
# the same length (and `stable` must not have been reordered since scaling).
assert len(X_scaled) == len(stable), "X_scaled and stable are out of sync"

# Look up each market's own centre in one step instead of a Python loop over
# stable.iloc[i]: that built a full row Series per market and pushed the
# integer cluster label through the row's mixed dtype.
assigned_centers = kmeans.cluster_centers_[stable["Cluster"].to_numpy()]
distances = norm(X_scaled - assigned_centers, axis=1)

stable["Fit_Distance"] = distances

# Data-driven threshold for abnormality
threshold = stable["Fit_Distance"].quantile(OUTLIER_QUANTILE)

stable["Market_Fit"] = np.where(
    stable["Fit_Distance"] > threshold,
    "Outlier",     # bubble, distortion, risky
    "Core"         # true market
)



In [18]:
# 10. LABEL THE THIN MARKETS
#
# Thin markets were never clustered, so they get a placeholder segment
# instead of one of the cluster-derived names.

THIN_SEGMENT_LABEL = "Data Insufficient"

thin = thin.assign(Market_Segment=THIN_SEGMENT_LABEL)



In [19]:
# 11. COMBINE STABLE AND THIN MARKETS INTO ONE TABLE
#
# Stable rows come first, then thin rows, with a fresh 0..n-1 index.
# Columns that exist only on `stable` are left empty (NaN) for thin rows.

final_market = pd.concat((stable, thin), axis=0).reset_index(drop=True)



In [23]:
# SUMMARY: cluster profiles, segment sizes, and the flagged outlier markets.

segment_counts = final_market["Market_Segment"].value_counts()

is_outlier = stable["Market_Fit"] == "Outlier"
outlier_markets = stable.loc[is_outlier, ["City", "Locality", "Market_Segment"]]

print(cluster_profile)

print(segment_counts)

print(outlier_markets)

       Avg_PPSF  Avg_Price_Cr     Avg_Area  Avg_Listings  Cluster  \
0   5608.400000      0.532550   914.250000     38.490000        3   
1   6285.140728      0.617823   977.515728     10.735099        0   
2   9185.208333      1.714000  1802.495833      9.791667        1   
3  22740.862069      1.497586   675.741379     16.568966        2   

  Market_Segment  
0        Matured  
1       Standard  
2         Luxury  
3    Opportunity  
Market_Segment
Data Insufficient    914
Standard             604
Luxury               120
Matured              100
Opportunity           58
Name: count, dtype: int64
           City                  Locality Market_Segment
33    Bangalore              Banashankari        Matured
68    Bangalore                 Binnipete         Luxury
94    Bangalore  Chandra Layout Extension         Luxury
123   Bangalore                 Cottonpet         Luxury
148   Bangalore           Electronic City        Matured
305   Bangalore                   Laggere         L

In [24]:
# 12. WRITE THE FINAL FILE
#
# Only the market identifiers and their segment label are exported; the
# DataFrame index is not written.

SEGMENTS_CSV = "C:/Users/cw/Downloads/Real Estate/market_segments.csv"
output_columns = ["City", "Locality", "Market_Segment"]

final_output = final_market[output_columns]
final_output.to_csv(SEGMENTS_CSV, index=False)