# Hotspot Storytelling Notebook

This notebook packages the existing medallion outputs (L2 → L3) into a set of quick hotspot visuals.

**What you get**
- Daily incident trends for a selected month
- H3 hex hotspot map (resolutions 8/9) using deterministic L3 aggregates
- Optional UMAP + HDBSCAN cluster labels from the clustering prototype, merged onto the hex summary

> Tip: launch Jupyter from the repository root (e.g. run `jupyter lab` there) so that the relative `data/` and `src/` paths resolve.

In [None]:
from pathlib import Path
import pandas as pd
import numpy as np
import plotly.express as px
import h3
import sys

# Treat the current working directory as the repository root; every data and
# source path in this notebook is built relative to it.
ROOT = Path.cwd().resolve()
SRC_DIR = ROOT.joinpath('src')

# Put src/ at the front of the import path (only once) so modules that live
# there can be imported by bare name.
if not any(entry == str(SRC_DIR) for entry in sys.path):
    sys.path.insert(0, str(SRC_DIR))


In [None]:
# Parameters
# YEAR / MONTH select which monthly L3 partition is loaded; H3_RES selects the
# H3 resolution (both the `res=` partition directory and the `h3_r{H3_RES}` column).
YEAR = 2024
MONTH = 9  # rendered zero-padded (`09`) wherever it appears in a path or title
H3_RES = 9  # default to street-level view


In [None]:
def load_l3_partition(year: int, month: int, res: int, root: 'Path | None' = None) -> pd.DataFrame:
    """Load one monthly L3 aggregate partition as a DataFrame.

    The file is expected at
    ``<root>/data/l3/res=<res>/year=<year>/month=<MM>/l3-aggregates-<res>-<year>-<MM>.parquet``.

    Args:
        year: Partition year.
        month: Partition month; zero-padded to two digits in the path.
        res: H3 resolution of the partition.
        root: Directory that contains ``data/``. When omitted, the notebook-level
            ``ROOT`` is used, so existing three-argument calls behave as before.

    Returns:
        The partition's rows with the ``date`` column parsed to datetime.

    Raises:
        FileNotFoundError: If the parquet file does not exist.
    """
    # Resolve the base lazily so the global ROOT is only needed for default calls.
    base = ROOT if root is None else Path(root)
    partition_dir = base / 'data' / 'l3' / f'res={res}' / f'year={year}' / f'month={month:02d}'
    path = partition_dir / f'l3-aggregates-{res}-{year}-{month:02d}.parquet'
    if not path.exists():
        raise FileNotFoundError(f'Missing L3 partition: {path}')
    df = pd.read_parquet(path)
    df['date'] = pd.to_datetime(df['date'])
    return df


# Load the configured month once; the cells below all read from `l3_df`.
l3_df = load_l3_partition(year=YEAR, month=MONTH, res=H3_RES)
# Quick sanity check of what was loaded: row count and available columns.
print(f'L3 rows: {len(l3_df):,}, columns: {list(l3_df.columns)}')


In [None]:
# Daily trend (incident counts)
# Sum incidents over every row sharing a date, then plot one point per day.
daily = l3_df.groupby('date')['n_crimes'].sum().reset_index()
fig = px.line(
    daily,
    x='date',
    y='n_crimes',
    markers=True,
    title=f'Daily incidents – {YEAR}-{MONTH:02d}',
)
fig.update_layout(xaxis_title='Date', yaxis_title='Incidents')
fig.show()


In [None]:
# Aggregate to monthly hotspot summary per H3 cell
summary_cols = ['n_crimes', 'n_arrests', 'low_conf', 'smoothed_rate', 'pooled_smoothed']
cell_col = f'h3_r{H3_RES}'

# Counts are totalled over the month; the flag and the rate columns are averaged
# across the cell's rows (the mean of the boolean flag is the share of flagged rows).
aggregations = {
    'n_crimes': 'sum',
    'n_arrests': 'sum',
    'low_conf': 'mean',
    'smoothed_rate': 'mean',
    'pooled_smoothed': 'mean',
}

month_summary = (
    l3_df.assign(low_conf=l3_df['low_conf'].astype(bool))
    .groupby(cell_col, dropna=True)[summary_cols]
    .agg(aggregations)
    .reset_index()
    .rename(columns={cell_col: 'h3_id', 'low_conf': 'low_conf_share'})
)
month_summary['low_conf_share'] = month_summary['low_conf_share'].round(3)

# Preview: the ten cells with the most incidents.
month_summary.sort_values('n_crimes', ascending=False).head(10)


In [None]:
# Convert H3 hexes to lat/lon centers for plotting
# Resolve each cell's centroid once and split the (lat, lon) pairs afterwards,
# instead of calling h3.cell_to_latlng twice per cell.
hex_centers = [h3.cell_to_latlng(cell) for cell in month_summary['h3_id']]
month_summary['lat'] = [lat for lat, _ in hex_centers]
month_summary['lon'] = [lon for _, lon in hex_centers]


In [None]:
# Scatter map of hotspots (size by incidents, color by smoothed arrest rate)
fig = px.scatter_geo(
    month_summary,
    lat='lat',
    lon='lon',
    size='n_crimes',
    color='smoothed_rate',
    hover_data=['h3_id', 'n_crimes', 'n_arrests', 'low_conf_share', 'pooled_smoothed'],
    title=f'H3 hotspots (res={H3_RES}) – size=count, color=smoothed arrest rate',
    projection='natural earth'
)
# Zoom the geo axes to the plotted cells; without this the figure opens on the
# whole globe and the hexes collapse into a single dot.
fig.update_geos(fitbounds='locations')
fig.update_layout(height=600)
fig.show()


In [None]:
# Optional: run clustering prototype and compare
from importlib import import_module

clusters_dir = ROOT / 'data' / 'l3' / 'clusters' / f'res={H3_RES}' / f'year={YEAR}' / f'month={MONTH:02d}'
clusters_dir.mkdir(parents=True, exist_ok=True)
cluster_path = clusters_dir / f'clusters-{H3_RES}-{YEAR}-{MONTH:02d}.parquet'

# This step is optional, so a missing prototype module (or one of its
# dependencies failing to import) is reported instead of aborting "Run All".
try:
    clustering = import_module('l3_clustering_prototype')
except ImportError as exc:
    clustering = None
    print(f'Clustering prototype unavailable ({exc}) – skipping cluster generation.')

if cluster_path.exists():
    print('Cluster parquet already exists – reusing cached results.')
elif clustering is not None:
    print('Running clustering prototype (UMAP + HDBSCAN)...')
    clustering.run_clustering(YEAR, MONTH, res=H3_RES)

if cluster_path.exists():
    cluster_df = pd.read_parquet(cluster_path)
    # `display` is provided by the IPython kernel; it renders the frame as a table.
    display(cluster_df.head())
    cell_col = f'h3_r{H3_RES}'
    cluster_labels = cluster_df[[cell_col, 'cluster']].rename(columns={cell_col: 'h3_id'})
    # Drop any 'cluster' column left by a previous run of this cell; otherwise the
    # merge would emit cluster_x / cluster_y and the plain 'cluster' column would vanish.
    month_summary = (
        month_summary.drop(columns='cluster', errors='ignore')
        .merge(cluster_labels, on='h3_id', how='left')
    )


In [None]:
# Total incidents per cluster label (NaN labels are kept as their own group).
if 'cluster' in month_summary.columns:
    cluster_counts = (
        month_summary.groupby('cluster', dropna=False)['n_crimes']
        .sum()
        .reset_index()
        .sort_values('n_crimes', ascending=False)
    )
    # Jupyter only auto-renders the last top-level expression of a cell; an
    # expression nested inside `if` is discarded, so display it explicitly.
    display(cluster_counts.head(10))
else:
    print('No clusters available (prototype may have failed to install dependencies).')


## Next Steps Checklist
- Adjust `YEAR`, `MONTH`, `H3_RES` at the top to explore other periods or H3 resolutions.
- Capture insights (stable hotspots, emerging hexes, low-confidence pockets).
- Feed these visuals into the Streamlit app for the two-day demo.

If clustering fails, ensure `umap-learn` and `hdbscan` are installed (`pip install umap-learn hdbscan`).