# Clustering & Region Segmentation — Indian Accident Dataset (2021)

**Goal:** Group regions with **similar accident profiles** (age + gender mix) without using total count, so we can identify region types for targeted policy (e.g. same cluster → similar interventions).

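**Approach:** Normalize each region to **proportions** by age group and add the overall male share as the gender feature; standardize the features; apply **K-means** (with an optional **hierarchical** dendrogram for comparison); name clusters by their peak age group (e.g. youth-heavy, working-age dominant, senior-heavy); visualize profiles per cluster.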

## 1. Load data and build proportion features

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Read the raw per-region counts.
df = pd.read_csv("Dataset/Indian-Accident-Dataset.csv")
age_groups = ['0-14', '14-18', '18-30', '30-45', '45-60', '60+']

# Every column other than 'Type' and 'Region' is treated as a count column.
count_cols = [col for col in df.columns if col not in ('Type', 'Region')]
male_cols = [col for col in count_cols if col.startswith('Male')]
age_cols = ['Age_' + group for group in age_groups]

# Row-wise total over all count columns; reused below as the male-share denominator.
region_totals = df[count_cols].sum(axis=1)
df['Total_Accidents'] = region_totals

# Combine the male and female counts of each age group into one column.
for group, combined_col in zip(age_groups, age_cols):
    df[combined_col] = df[f'Male({group})'] + df[f'Female({group})']

# Replace the combined counts with proportions so each row sums to 1.
age_counts = df[age_cols]
df[age_cols] = age_counts.div(age_counts.sum(axis=1), axis=0)

# Share of all counted accidents that fall in the 'Male...' columns (0-1).
df['Male_Share'] = df[male_cols].sum(axis=1) / region_totals

# Feature matrix: six age proportions plus the male share, indexed by region.
feature_cols = age_cols + ['Male_Share']
X = df.set_index('Region')[feature_cols]
print("Features (proportions):")
print(X.head(8).round(3))

## 2. Scale and choose number of clusters (K-means)

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

# Standardise every feature (zero mean, unit variance) so that no single
# proportion dominates the Euclidean distances used by K-means.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Elbow and silhouette for k=2..8
from sklearn.metrics import silhouette_score

# silhouette_score is only defined for 2 <= k <= n_samples - 1, so cap the
# sweep on small datasets instead of failing part-way through the loop.
K_range = range(2, min(9, X_scaled.shape[0]))
if not K_range:
    raise ValueError("Need at least 3 regions to compare cluster counts.")

inertias, silhouettes = [], []
for k in K_range:
    km = KMeans(n_clusters=k, random_state=42, n_init=10)
    labels = km.fit_predict(X_scaled)
    inertias.append(km.inertia_)
    silhouettes.append(silhouette_score(X_scaled, labels))

# Left panel: inertia (elbow); right panel: silhouette score.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))
ax1.plot(K_range, inertias, 'o-')
ax1.set_xlabel('Number of clusters (k)')
ax1.set_ylabel('Inertia')
ax1.set_title('Elbow method')
ax2.plot(K_range, silhouettes, 'o-')
ax2.set_xlabel('Number of clusters (k)')
ax2.set_ylabel('Silhouette score')
ax2.set_title('Silhouette score')
plt.tight_layout()
plt.show()

# Look the winner up in K_range itself rather than adding a hard-coded offset,
# so the suggestion stays correct if the sweep bounds are ever changed.
k_best = int(K_range[int(np.argmax(silhouettes))])
print(f"Suggested k (best silhouette): {k_best}")

## 3. Fit K-means and assign cluster labels

In [None]:
# Final K-means fit with the chosen number of clusters.
n_clusters = k_best  # or set manually, e.g. 4
km = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
cluster_labels = km.fit_predict(X_scaled)
df['Cluster'] = cluster_labels

# One row per cluster: how many regions it holds and which ones.
region_summary = df.groupby('Cluster').agg(
    Region_count=('Region', 'count'),
    Regions=('Region', list),
)
print("Regions per cluster:")
print(region_summary.to_string())

## 4. Name clusters by dominant age profile

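Label each cluster by the age group with the highest mean proportion: youth-heavy (0–14, 14–18), working-age dominant (18–30, 30–45), or senior-heavy (45–60, 60+). `Balanced` is only a fallback label and is not assigned by the default peak-based rule.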

In [None]:
# Mean age-group proportions of the regions in each cluster (one row per cluster).
cluster_profiles = df[age_cols].groupby(df['Cluster']).mean()

def name_cluster(row, balanced_margin=0.0):
    """Name a cluster after the age band that holds its peak proportion.

    Parameters
    ----------
    row : pandas.Series
        Mean age-group proportions of one cluster. Entries are labelled with
        the age group, optionally prefixed with ``Age_`` (e.g. ``Age_18-30``).
    balanced_margin : float, default 0.0
        If the gap between the largest and the second-largest proportion is
        strictly smaller than this value, the profile is reported as
        ``'Balanced'``. The default of 0.0 never triggers, so the name is
        always decided by the peak age group alone.

    Returns
    -------
    str
        ``'Youth-heavy'``, ``'Working-age dominant'``, ``'Senior-heavy'`` or
        ``'Balanced'``. ``'Balanced'`` is also returned when the peak label is
        not one of the known age groups.
    """
    band_by_group = {
        '0-14': 'Youth-heavy',
        '14-18': 'Youth-heavy',
        '18-30': 'Working-age dominant',
        '30-45': 'Working-age dominant',
        '45-60': 'Senior-heavy',
        '60+': 'Senior-heavy',
    }
    shares = np.asarray(row.values, dtype=float)
    peak = int(np.argmax(shares))

    # A peak that barely beats the runner-up does not really dominate.
    if shares.size > 1:
        runner_up = np.partition(shares, -2)[-2]
        if shares[peak] - runner_up < balanced_margin:
            return 'Balanced'

    # Read the age group from the row's own label instead of relying on a
    # module-level list staying in the same order as the columns.
    group = str(row.index[peak])
    if group.startswith('Age_'):
        group = group[len('Age_'):]
    return band_by_group.get(group, 'Balanced')

def _name_with_peak(profile):
    """Return the cluster's band name followed by its single peak age group."""
    peak_group = age_groups[np.argmax(profile.values)]
    return f"{name_cluster(profile)} (peak: {peak_group})"


# One descriptive name per cluster, then copied onto every region row.
cluster_names = cluster_profiles.apply(_name_with_peak, axis=1)
df['Cluster_Name'] = df['Cluster'].map(cluster_names)
print(cluster_names.to_string())

## 5. Cluster profile plots (mean proportions by age)

In [None]:
# Grouped bar chart: one group per age band, one bar per cluster.
fig, ax = plt.subplots(figsize=(10, 5))
x = np.arange(len(age_groups))
width = 0.8 / n_clusters  # the bars of one group share 80% of the slot

cluster_ids = sorted(df['Cluster'].unique())
for slot, cluster_id in enumerate(cluster_ids):
    # Shift each cluster's bars so the whole group is centred on the tick.
    shift = (slot - n_clusters/2 + 0.5) * width
    ax.bar(
        x + shift,
        cluster_profiles.loc[cluster_id].values,
        width,
        label=cluster_names[cluster_id],
    )

ax.set_xticks(x)
ax.set_xticklabels(age_groups)
ax.set(
    ylabel='Mean proportion',
    xlabel='Age group',
    title='Accident age profile by cluster (mean proportions)',
)
ax.legend(loc='upper right', fontsize=8)
plt.tight_layout()
plt.show()

## 6. Heatmap: clusters × age proportions

In [None]:
# Same cluster means as a heatmap, with rows labelled by cluster name.
row_labels = [cluster_names[cluster_id] for cluster_id in cluster_profiles.index]
heat_df = cluster_profiles.copy()
heat_df.index = row_labels

# Grow the figure with the number of clusters, but never below 4 inches tall.
fig_height = max(4, n_clusters * 0.8)
fig, ax = plt.subplots(figsize=(10, fig_height))
sns.heatmap(
    heat_df,
    annot=True,
    fmt='.2f',
    cmap='YlOrRd',
    ax=ax,
    cbar_kws={'label': 'Mean proportion'},
)
ax.set_title('Cluster × Age group (mean proportion)')
plt.tight_layout()
plt.show()

## 7. Regions per cluster (table)

In [None]:
# List the member regions of every cluster, in ascending cluster order.
for cluster_id, members in df.groupby('Cluster')['Region']:
    region_list = members.tolist()
    header = f"\n--- Cluster {cluster_id}: {cluster_names[cluster_id]} ({len(region_list)} regions) ---"
    print(header)
    print(region_list)

## 8. Optional: 2D visualization (PCA)

In [None]:
from sklearn.decomposition import PCA

# Project the standardised features onto their first two principal components.
pca = PCA(n_components=2, random_state=42)
X2 = pca.fit_transform(X_scaled)

fig, ax = plt.subplots(figsize=(9, 6))
# One scatter call per cluster so each gets its own colour and legend entry.
for cluster_id in sorted(df['Cluster'].unique()):
    in_cluster = (df['Cluster'] == cluster_id).values
    points = X2[in_cluster]
    ax.scatter(points[:, 0], points[:, 1], label=cluster_names[cluster_id], alpha=0.8)

ax.set(
    xlabel='PC1',
    ylabel='PC2',
    title='Regions in 2D (PCA of proportion features), colored by cluster',
)
ax.legend(loc='best', fontsize=8)
plt.tight_layout()
plt.show()

## 9. Optional: Hierarchical clustering dendrogram

In [None]:
from scipy.cluster.hierarchy import linkage, dendrogram

# Ward linkage on the same standardised features used for K-means.
Z = linkage(X_scaled, method='ward')

fig, ax = plt.subplots(figsize=(14, 6))
# Leaves are labelled with region names, rotated so they stay readable.
leaf_options = dict(labels=df['Region'].values, leaf_rotation=90, leaf_font_size=8)
dendrogram(Z, ax=ax, **leaf_options)
ax.set(
    title='Hierarchical clustering (Ward) — region labels',
    xlabel='Region',
)
plt.tight_layout()
plt.show()

---
**Takeaway:** Clusters group regions with similar age (and gender) accident profiles. Use these segments to target interventions (e.g. youth safety in youth-heavy clusters, senior mobility in senior-heavy ones).