In [1]:
import pandas as pd
import numpy as np
from sklearn.cluster import DBSCAN

import folium
from folium.plugins import MarkerCluster
from IPython.display import IFrame
import os


In [2]:
# %%

import hdbscan
import time


# %%
# Load cleaned data
df = pd.read_csv("../datasets/cleaned_crime_data3.csv", parse_dates=['datetime'])

# Check initial shape
print("🔹 Initial data shape:", df.shape)

# Drop rows with missing coordinates
df.dropna(subset=['Latitude', 'Longitude'], inplace=True)
print("🔹 After dropping NaNs in coordinates:", df.shape)

# %%
# 🔧 Optional Sampling for Faster Testing (disable this line to use all data)
sample_size = 100_000  # Try 50_000 if it's still slow
df_sampled = df.sample(sample_size, random_state=42) if df.shape[0] > sample_size else df.copy()
print(f"🔹 Using {df_sampled.shape[0]} points for clustering")

# Prepare coordinates in radians (lat/lon)
coords = df_sampled[['Latitude', 'Longitude']].to_numpy()
print("🔹 Coordinate array shape:", coords.shape)

# %%
# Run HDBSCAN clustering
print("⏳ Running HDBSCAN...")
start = time.time()

clusterer = hdbscan.HDBSCAN(
    min_cluster_size=30,  # play with this (e.g., 20–50) for more/fewer clusters
    min_samples=10,
    metric='haversine',
    cluster_selection_epsilon=0.001,  # optional fine-tuning
    algorithm='best'
)
df_sampled['cluster'] = clusterer.fit_predict(np.radians(coords))

duration = round(time.time() - start, 2)
print(f"✅ HDBSCAN completed in {duration} seconds")

# %%
# Cluster summary
unique_clusters = np.unique(df_sampled['cluster'])
print("🔹 Unique cluster labels:", unique_clusters)
print("🔹 Total clusters (excluding noise):", len(unique_clusters) - (1 if -1 in unique_clusters else 0))

# Filter real clusters (exclude noise = -1)
hotspots = df_sampled[df_sampled['cluster'] != -1].copy()
print("🔹 Hotspot records count:", hotspots.shape[0])

# %%
# Save hotspots with cluster labels
hotspot_file = "hotspots_with_labels_hdbscan.csv"
hotspots.to_csv(hotspot_file, index=False)

if os.path.exists(hotspot_file):
    print(f"✅ Saved hotspot points to {hotspot_file} ({os.path.getsize(hotspot_file)} bytes)")
else:
    print("❌ Failed to save hotspot CSV file.")

# %%
# Display top 5 largest clusters
print("🔹 Top hotspot clusters:")
print(hotspots['cluster'].value_counts().head())


🔹 Initial data shape: (751364, 6)
🔹 After dropping NaNs in coordinates: (751364, 6)
🔹 Using 100000 points for clustering
🔹 Coordinate array shape: (100000, 2)
⏳ Running HDBSCAN...




✅ HDBSCAN completed in 9.62 seconds
🔹 Unique cluster labels: [0 1 2 3]
🔹 Total clusters (excluding noise): 4
🔹 Hotspot records count: 100000
✅ Saved hotspot points to hotspots_with_labels_hdbscan.csv (8072703 bytes)
🔹 Top hotspot clusters:
cluster
3    43944
1    39604
2    16364
0       88
Name: count, dtype: int64


In [3]:



center = [hotspots['Latitude'].mean(), hotspots['Longitude'].mean()]
m = folium.Map(location=center, zoom_start=11, tiles='CartoDB positron')

mc = MarkerCluster().add_to(m)
for _, row in hotspots.iterrows():
    folium.CircleMarker(
        location=[row['Latitude'], row['Longitude']],
        radius=3,
        color='crimson',
        fill=True,
        fill_opacity=0.6,
        popup=f"{row['city']} | {row['crime_category']}"
    ).add_to(mc)

# Save and display
filepath = "crime_hotspots_map.html"
m.save(filepath)

# Only if file exists
if os.path.exists(filepath):
    display(IFrame(filepath, width=700, height=500))
else:
    print("Map file was not created.")
