### Imports and Data Loading

In [9]:
import json
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans


# Load the feature-engineered dataset. `df_before_cleaning` is kept as an
# untouched reference copy of what was read from disk.
df_before_cleaning = pd.read_csv('../data/eda_FE.csv')

# Work on an independent copy: a plain assignment would only alias the same
# object, so columns added to `df` later (e.g. the cluster labels) would also
# appear in `df_before_cleaning`.
df = df_before_cleaning.copy()
df.head()

Unnamed: 0,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_basement,lat,long,sqft_living15,price_log,basement_flag,renovated_flag,house_age
0,3,1.0,7.074117,8.639588,1.0,0,0,3,7,0.0,47.5112,-122.257,7.201171,12.309987,0,0,71
1,3,2.25,7.85205,8.887791,2.0,0,0,3,7,5.993961,47.721,-122.319,7.433075,13.195616,1,1,75
2,2,1.0,6.647688,9.21044,1.0,0,0,3,6,0.0,47.7379,-122.233,7.908755,12.100718,0,0,93
3,4,3.0,7.58121,8.517393,1.0,0,0,5,7,6.814543,47.5208,-122.393,7.215975,13.311331,1,0,61
4,3,2.0,7.427144,8.997271,1.0,0,0,3,8,0.0,47.6168,-122.045,7.496097,13.142168,0,0,39


In [10]:
# Fit a KMeans model on the (lat, long) coordinates to derive geographic
# "neighborhood" clusters, then persist the fitted model and a small JSON
# metadata file (coordinate bounds + one entry per cluster centroid).
coords = df[["lat", "long"]].astype(float)

# Choose cluster count (start with 12–25; 15 is a good default)
N_CLUSTERS = 15
kmeans = KMeans(n_clusters=N_CLUSTERS, random_state=42, n_init="auto")
kmeans.fit(coords)

# Add labels (optional, useful for analysis).
# `assign` builds a new frame instead of mutating the existing one, so any
# other name still bound to the loaded data is left untouched and re-running
# this cell is idempotent.
df = df.assign(neighborhood_cluster=kmeans.labels_)

# Build metadata: bounds + centroids + simple names
centroids = kmeans.cluster_centers_  # [ [lat, long], ... ] — same column order as `coords`

meta = {
    "bounds": {
        "min_lat": float(coords["lat"].min()),
        "max_lat": float(coords["lat"].max()),
        "min_long": float(coords["long"].min()),
        "max_long": float(coords["long"].max()),
    },
    "clusters": [
        {
            "id": int(cluster_id),
            "name": f"Neighborhood {cluster_id + 1}",
            "centroid_lat": float(centroid_lat),
            "centroid_long": float(centroid_long),
        }
        # Iterate the fitted centroids directly so the metadata always matches
        # the model, whatever N_CLUSTERS is set to.
        for cluster_id, (centroid_lat, centroid_long) in enumerate(centroids)
    ],
}

# Write the artifacts to the locations the map cell of this notebook loads
# them from ('../model/geo_clusterer.joblib' and '../json/geo_meta.json').
# Writing to the working directory instead left that cell reading files that
# this notebook never produced.
MODEL_PATH = Path("../model/geo_clusterer.joblib")
META_PATH = Path("../json/geo_meta.json")
for artifact_path in (MODEL_PATH, META_PATH):
    artifact_path.parent.mkdir(parents=True, exist_ok=True)

joblib.dump(kmeans, MODEL_PATH)

with open(META_PATH, "w") as f:
    json.dump(meta, f, indent=2)

In [12]:
import folium
from folium.plugins import MarkerCluster, HeatMap
import branca.colormap as cm

# Build an interactive folium map of the geographic clusters: one marker per
# centroid, a toggleable layer of sampled property points per cluster, and a
# density heatmap. The map is saved to HTML and displayed inline.

# Load data and model
# NOTE(review): joblib.load unpickles the file — only load artifacts produced
# by this project.
df = pd.read_csv('../data/eda_FE.csv')
kmeans = joblib.load('../model/geo_clusterer.joblib')

# Label rows with predict() instead of copying kmeans.labels_: labels_ only
# lines up when this CSV has exactly the same rows, in the same order, as the
# data the model was fitted on.
df['neighborhood_cluster'] = kmeans.predict(df[['lat', 'long']].astype(float))

# Load metadata
with open('../json/geo_meta.json', 'r') as f:
    meta = json.load(f)

# Create base map centered on mean coordinates
center_lat = df['lat'].mean()
center_lon = df['long'].mean()
m = folium.Map(location=[center_lat, center_lon], zoom_start=10)

# Create color map for clusters (these are the names folium.Icon accepts)
colors = ['red', 'blue', 'green', 'purple', 'orange', 'darkred', 
          'lightred', 'beige', 'darkblue', 'darkgreen', 'cadetblue', 
          'darkpurple', 'white', 'pink', 'lightblue', 'lightgreen', 
          'gray', 'black', 'lightgray']

# CircleMarker colors are passed through as CSS colors, and two of the icon
# names above are not CSS color names. Approximate hex equivalents are used
# for the point layers so those clusters are not drawn with an invalid color.
css_color_overrides = {'lightred': '#ff8e7f', 'darkpurple': '#5b396b'}

n_clusters = len(kmeans.cluster_centers_)

# Count properties per cluster once instead of re-scanning for each cluster.
cluster_sizes = df['neighborhood_cluster'].value_counts()

# Add cluster centroids with popups
for i, centroid in enumerate(kmeans.cluster_centers_):
    cluster_size = int(cluster_sizes.get(i, 0))
    cluster_name = meta['clusters'][i]['name']
    
    folium.Marker(
        [centroid[0], centroid[1]],
        popup=f"""
        <b>{cluster_name}</b><br>
        Cluster ID: {i}<br>
        Properties: {cluster_size}<br>
        Center: ({centroid[0]:.4f}, {centroid[1]:.4f})
        """,
        tooltip=f"Click for cluster {i} details",
        icon=folium.Icon(
            color=colors[i % len(colors)], 
            icon='info-sign'
        )
    ).add_to(m)

# Add property points (sampled for performance)
sample_size = min(5000, len(df))  # Limit to 5000 points for performance
df_sample = df.sample(n=sample_size, random_state=42)

# Popup price text. The dataset shown in this notebook has `price_log` rather
# than `price`, so a plain lookup of 'price' always fell back to "N/A".
# NOTE(review): price_log is assumed to be the natural log of the sale price —
# confirm against the feature-engineering step.
if 'price' in df_sample.columns:
    price_labels = df_sample['price'].map('{:,.0f}'.format)
elif 'price_log' in df_sample.columns:
    price_labels = np.exp(df_sample['price_log']).map('{:,.0f}'.format)
else:
    price_labels = pd.Series('N/A', index=df_sample.index)

# Create feature groups for each cluster
for i in range(n_clusters):
    cluster_data = df_sample[df_sample['neighborhood_cluster'] == i]
    if len(cluster_data) > 0:
        fg = folium.FeatureGroup(name=f"Cluster {i} ({len(cluster_data)} properties)")
        icon_color = colors[i % len(colors)]
        point_color = css_color_overrides.get(icon_color, icon_color)
        
        # zip over the columns avoids the per-row Series that iterrows() builds.
        for lat, lon, price_label in zip(
            cluster_data['lat'],
            cluster_data['long'],
            price_labels.loc[cluster_data.index],
        ):
            folium.CircleMarker(
                [lat, lon],
                radius=2,
                color=point_color,
                fill=True,
                fill_opacity=0.5,  # folium's documented keyword (snake_case)
                popup=f"Price: ${price_label}<br>Cluster: {i}"
            ).add_to(fg)
        
        fg.add_to(m)

# Add heatmap layer (named so it gets a readable entry in the layer control)
HeatMap(df_sample[['lat', 'long']].values, name='Property density', radius=15).add_to(m)

# Add layer control last, after every layer it should list (including the heatmap)
folium.LayerControl().add_to(m)

# Save map
m.save('property_clusters_map.html')
print("Map saved to 'property_clusters_map.html'")

# Display in Jupyter (if using notebook)
m

Map saved to 'property_clusters_map.html'
