In [None]:
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
warnings.filterwarnings('ignore')

from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.metrics import mean_squared_error
from sklearn.metrics import silhouette_score, davies_bouldin_score
from sklearn.preprocessing import StandardScaler, LabelEncoder

import mlflow
import mlflow.sklearn
from mlflow.tracking import MlflowClient

In [None]:
# Read the crime records CSV and report how many rows and columns were loaded.
csv_path = '../Data/clean_crime_data.csv'
crime_df = pd.read_csv(csv_path)
for load_message in (
    f"‚úì Loaded {len(crime_df):,} crime records",
    f"‚úì Original columns: {crime_df.shape[1]}",
):
    print(load_message)

FEATURE ENGINEERING

In [None]:


# Parse the Date column; unparseable values become NaT because of errors='coerce'.
crime_df['Date'] = pd.to_datetime(crime_df['Date'], errors='coerce')

# Derive one column per calendar component from the parsed timestamps.
date_parts = crime_df['Date'].dt
temporal_columns = {
    'Hour': date_parts.hour,
    'Day_of_Week': date_parts.dayofweek,  # 0=Monday
    'Month': date_parts.month,
    'Year': date_parts.year,
}
for column_name, column_values in temporal_columns.items():
    crime_df[column_name] = column_values
print("‚úì Temporal features: Hour, Day_of_Week, Month, Year")

# Season classification
def get_season(month):
    """Return the season name for a month number.

    12, 1, 2 -> 'Winter'; 3, 4, 5 -> 'Spring'; 6, 7, 8 -> 'Summer'.
    Every other value falls through to 'Fall' -- that covers 9-11, but also
    any value that matches none of the listed months (e.g. NaN).
    """
    season_table = (
        ('Winter', (12, 1, 2)),
        ('Spring', (3, 4, 5)),
        ('Summer', (6, 7, 8)),
    )
    for season_name, season_months in season_table:
        if month in season_months:
            return season_name
    return 'Fall'

# Label every record with the season of its month.
season_labels = crime_df['Month'].apply(get_season)
crime_df['Season'] = season_labels
print("‚úì Season feature created")

# Weekend flag: Day_of_Week uses 0=Monday, so 5 and 6 are Saturday and Sunday.
weekend_days = [5, 6]
on_weekend = crime_df['Day_of_Week'].isin(weekend_days)
crime_df['Is_Weekend'] = on_weekend.astype(int)
print("‚úì Weekend flag created")

# Crime severity score: fixed score per primary crime type.
severity_map = {
    'HOMICIDE': 10,
    'CRIM SEXUAL ASSAULT': 9,
    'ROBBERY': 8,
    'ASSAULT': 7,
    'BATTERY': 7,
    'BURGLARY': 6,
    'MOTOR VEHICLE THEFT': 6,
    'THEFT': 5,
    'NARCOTICS': 5,
    'CRIMINAL DAMAGE': 4,
    'DECEPTIVE PRACTICE': 4,
    'WEAPONS VIOLATION': 8,
}
# Crime types absent from the map come back as NaN and are given the score 3.
mapped_severity = crime_df['Primary Type'].map(severity_map)
crime_df['Crime_Severity_Score'] = mapped_severity.fillna(3)
print("‚úì Crime Severity Score created")

# Cast the Arrest column to int.
crime_df['Arrest'] = crime_df['Arrest'].astype(int)

# Integer-encode the categorical columns, keeping each fitted encoder by column name.
label_encoders = {}
categorical_cols = ['Primary Type', 'Location Description', 'District', 'Season']

for col in categorical_cols:
    if col not in crime_df.columns:
        continue  # column not present in this dataset: nothing to encode
    encoder = LabelEncoder()
    values_as_text = crime_df[col].astype(str)  # values are cast to str before fitting
    crime_df[f'{col}_Encoded'] = encoder.fit_transform(values_as_text)
    label_encoders[col] = encoder

print(f"‚úì Encoded: {categorical_cols}")

# Drop rows that lack a coordinate or a time component.
required_columns = ['Latitude', 'Longitude', 'Hour', 'Month']
crime_df = crime_df.dropna(subset=required_columns)
print(f"\n‚úì Final dataset: {crime_df.shape[0]:,} rows √ó {crime_df.shape[1]} columns")

In [None]:
# Tracking server: honour MLFLOW_TRACKING_URI when it is set (and non-empty) so the
# notebook can be pointed at another server without editing code; otherwise fall
# back to the project's default endpoint.
DEFAULT_TRACKING_URI = 'http://ec2-65-2-75-98.ap-south-1.compute.amazonaws.com:5000/'
mlflow.set_tracking_uri(os.environ.get('MLFLOW_TRACKING_URI') or DEFAULT_TRACKING_URI)
mlflow.set_experiment("PatrolIQ_Geographic_Clustering_Experiment")

MLFLOW SETUP

GEOGRAPHIC CLUSTERING

In [None]:


# Standardise latitude/longitude before distance-based clustering.
X_geo = crime_df[['Latitude', 'Longitude']].copy()
scaler_geo = StandardScaler()
X_geo_scaled = scaler_geo.fit_transform(X_geo)

# ---- ELBOW METHOD ----
print("\nüîç Finding optimal K...")
# The silhouette score is evaluated on a subsample of at most 10,000 points.
# The subsample is drawn from a seeded generator so that the scores -- and hence
# the selected k -- are the same on every Restart & Run All.
sample_size = min(10000, len(X_geo_scaled))
rng_geo = np.random.default_rng(42)
sample_idx = rng_geo.choice(len(X_geo_scaled), size=sample_size, replace=False)
X_sample = X_geo_scaled[sample_idx]

k_range = range(5, 11)
inertias = []
silhouette_scores = []

for k in k_range:
    # Fit on all points; score inertia on the full fit and silhouette on the subsample.
    kmeans_temp = KMeans(n_clusters=k, random_state=42, n_init=10)
    labels_temp = kmeans_temp.fit_predict(X_geo_scaled)
    inertias.append(kmeans_temp.inertia_)
    sil_score = silhouette_score(X_sample, labels_temp[sample_idx])
    silhouette_scores.append(sil_score)
    print(f"  k={k} ‚Üí Silhouette={sil_score:.3f}")

# The k with the highest subsample silhouette wins (first one on ties).
optimal_k = k_range[int(np.argmax(silhouette_scores))]
print(f"\n‚úì Optimal K: {optimal_k}")

# Side-by-side diagnostics for the k search: inertia (left) and silhouette (right).
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
inertia_ax, silhouette_ax = axes

# Left panel: inertia per k.
inertia_ax.plot(k_range, inertias, 'bo-')
inertia_ax.set_xlabel('Number of Clusters')
inertia_ax.set_ylabel('Inertia')
inertia_ax.set_title('Elbow Method')
inertia_ax.grid(True)

# Right panel: silhouette per k, with a dashed reference line at 0.5.
silhouette_ax.plot(k_range, silhouette_scores, 'ro-')
silhouette_ax.axhline(y=0.5, color='green', linestyle='--', label='Target 0.5')
silhouette_ax.set_xlabel('Number of Clusters')
silhouette_ax.set_ylabel('Silhouette Score')
silhouette_ax.set_title('Silhouette Score vs K')
silhouette_ax.legend()
silhouette_ax.grid(True)

plt.tight_layout()
# Saved to disk because the KMeans run logs this file as an MLflow artifact.
plt.savefig('elbow_method.png', dpi=300)
plt.show()

In [None]:

with mlflow.start_run(run_name='KMeans'):
    # Final K-Means fit on every scaled point, using the k selected by the search above.
    kmeans = KMeans(n_clusters=optimal_k, random_state=42, n_init=10)
    kmeans_labels = kmeans.fit_predict(X_geo_scaled)

    # Silhouette on the subsample drawn during the k search; Davies-Bouldin on all points.
    kmeans_sil = silhouette_score(X_sample, kmeans_labels[sample_idx])
    kmeans_db = davies_bouldin_score(X_geo_scaled, kmeans_labels)

    kmeans_params = (('algorithm', 'KMeans'), ('n_clusters', optimal_k))
    kmeans_metrics = (('silhouette_score', kmeans_sil), ('davies_bouldin_score', kmeans_db))
    for param_name, param_value in kmeans_params:
        mlflow.log_param(param_name, param_value)
    for metric_name, metric_value in kmeans_metrics:
        mlflow.log_metric(metric_name, metric_value)
    mlflow.log_artifact('elbow_method.png')
    mlflow.sklearn.log_model(kmeans, 'model')

    print(f"  Silhouette: {kmeans_sil:.3f} | Davies-Bouldin: {kmeans_db:.3f}")

In [None]:

with mlflow.start_run(run_name='DBSCAN'):
    EPS_METERS = 300
    MIN_SAMPLES = 10
    EARTH_RADIUS = 6371000  # metres, same unit as EPS_METERS

    # The haversine metric expects [lat, lon] in radians, and eps as an angle
    # (arc length divided by the sphere radius).
    coords_rad = np.deg2rad(X_geo.values)
    eps_rad = EPS_METERS / EARTH_RADIUS

    dbscan = DBSCAN(eps=eps_rad, min_samples=MIN_SAMPLES, metric='haversine', n_jobs=-1)
    dbscan_labels = dbscan.fit_predict(coords_rad)

    # Label -1 marks noise and is not counted as a cluster.
    n_clusters = len(set(dbscan_labels)) - (1 if -1 in dbscan_labels else 0)
    n_noise = int(np.sum(dbscan_labels == -1))

    # Sentinel scores, kept when the metrics cannot be computed.
    dbscan_sil = -1.0
    dbscan_db = 999.0

    if n_clusters >= 2:
        # Both scores are computed on clustered points only (noise excluded).
        valid_idx = np.where(dbscan_labels != -1)[0]
        # Seeded subsample so the reported silhouette is reproducible.
        rng_dbscan = np.random.default_rng(42)
        sample_idx_db = rng_dbscan.choice(
            valid_idx, size=min(10000, len(valid_idx)), replace=False
        )
        sampled_labels = dbscan_labels[sample_idx_db]
        n_sampled_clusters = len(np.unique(sampled_labels))
        # silhouette_score needs 2 <= n_labels <= n_samples - 1; the subsample may
        # not satisfy that even when the full labelling does.
        if 2 <= n_sampled_clusters <= len(sampled_labels) - 1:
            # Score with the same distance the clustering used, not the default
            # Euclidean distance on radian coordinates.
            dbscan_sil = silhouette_score(
                coords_rad[sample_idx_db], sampled_labels, metric='haversine'
            )
        # NOTE(review): davies_bouldin_score has no metric parameter, so this value
        # is still Euclidean on radian coordinates.
        dbscan_db = davies_bouldin_score(coords_rad[valid_idx], dbscan_labels[valid_idx])

    mlflow.log_param('algorithm', 'DBSCAN')
    mlflow.log_param('eps_meters', EPS_METERS)
    mlflow.log_param('min_samples', MIN_SAMPLES)
    mlflow.log_metric('n_clusters', n_clusters)
    mlflow.log_metric('n_noise_points', n_noise)
    mlflow.log_metric('silhouette_score', dbscan_sil)
    mlflow.log_metric('davies_bouldin_score', dbscan_db)
    mlflow.sklearn.log_model(dbscan, 'model')

    print(f"  Clusters: {n_clusters} | Noise: {n_noise} | Silhouette: {dbscan_sil:.3f}")

In [None]:

with mlflow.start_run(run_name='Hierarchical'):
    # Ward linkage is fitted on a subsample of at most 5,000 points. The generator
    # is seeded so both the clustering sample and the dendrogram sample are
    # reproducible across runs.
    rng_hier = np.random.default_rng(42)
    sample_size_hier = min(5000, len(X_geo_scaled))
    idx_hier = rng_hier.choice(len(X_geo_scaled), size=sample_size_hier, replace=False)
    X_sample_hier = X_geo_scaled[idx_hier]

    hierarchical = AgglomerativeClustering(n_clusters=optimal_k, linkage='ward')
    hier_labels_sample = hierarchical.fit_predict(X_sample_hier)

    # Both metrics are computed on the same subsample the model was fitted on.
    hier_sil = silhouette_score(X_sample_hier, hier_labels_sample)
    hier_db = davies_bouldin_score(X_sample_hier, hier_labels_sample)

    mlflow.log_param('algorithm', 'Hierarchical')
    mlflow.log_param('n_clusters', optimal_k)
    mlflow.log_param('sample_size', sample_size_hier)
    mlflow.log_metric('silhouette_score', hier_sil)
    mlflow.log_metric('davies_bouldin_score', hier_db)
    # Log the fitted estimator under 'model' like the KMeans and DBSCAN runs do;
    # the registry step builds a runs:/<run_id>/model URI for whichever run in this
    # experiment has the best silhouette, so every run needs that artifact.
    mlflow.sklearn.log_model(hierarchical, 'model')

    print(f"  Silhouette: {hier_sil:.3f} | Davies-Bouldin: {hier_db:.3f}")

    # Dendrogram on an even smaller subsample (at most 1,000 points) of the sample.
    dendro_size = min(1000, sample_size_hier)
    d_idx = rng_hier.choice(sample_size_hier, size=dendro_size, replace=False)
    X_dendro = X_sample_hier[d_idx]

    Z = linkage(X_dendro, method='ward')
    plt.figure(figsize=(15, 8))
    # truncate_mode='lastp' with p=30 shows only the last 30 merged clusters.
    dendrogram(Z, truncate_mode='lastp', p=30)
    plt.title("Hierarchical Clustering Dendrogram")
    plt.xlabel("Cluster Size")
    plt.ylabel("Distance")
    plt.savefig("dendrogram.png", dpi=300)
    plt.show()
    mlflow.log_artifact("dendrogram.png")

In [None]:


# Side-by-side summary of the three geographic clustering runs.
comparison_df = pd.DataFrame({
    'Algorithm': ['KMeans', 'DBSCAN', 'Hierarchical'],
    'Silhouette': [kmeans_sil, dbscan_sil, hier_sil],
    'Davies-Bouldin': [kmeans_db, dbscan_db, hier_db],
    'Clusters': [optimal_k, n_clusters, optimal_k]
})
print(comparison_df)

# Pick the algorithm with the highest silhouette; negative scores are pushed to
# -999 so they can only win when every score is negative.
valid_sil = comparison_df['Silhouette'].copy()
valid_sil[valid_sil < 0] = -999
best_algo = comparison_df.loc[valid_sil.idxmax(), 'Algorithm']
print(f"\nüèÜ Best Algorithm: {best_algo}")

# Add best labels to dataframe
if best_algo == 'KMeans':
    best_labels = kmeans_labels
elif best_algo == 'DBSCAN':
    best_labels = dbscan_labels
else:
    # The hierarchical score above belongs to the clustering of the subsample
    # (X_sample_hier / hier_labels_sample). Re-fitting Ward linkage on every row
    # would scale quadratically in memory with the row count and would produce
    # different clusters from the ones that were scored. Instead, propagate the
    # scored clusters: each point takes the label of the nearest sample-cluster
    # centroid in the scaled coordinate space.
    hier_cluster_ids = np.unique(hier_labels_sample)
    hier_centroids = np.vstack([
        X_sample_hier[hier_labels_sample == cluster_id].mean(axis=0)
        for cluster_id in hier_cluster_ids
    ])
    # One squared-distance column per centroid keeps the temporary at n x k.
    sq_dists = np.stack(
        [((X_geo_scaled - centroid) ** 2).sum(axis=1) for centroid in hier_centroids],
        axis=1,
    )
    best_labels = hier_cluster_ids[sq_dists.argmin(axis=1)]

crime_df['GeoCluster'] = best_labels

# Per-cluster summary: record count, most frequent crime type, arrest rate and
# the mean coordinates of the cluster's records.
cluster_stats = crime_df.groupby('GeoCluster').agg(
    Total_Crimes=('ID', 'count'),
    Dominant_Crime=('Primary Type', lambda x: x.value_counts().idxmax()),
    Arrest_Rate=('Arrest', 'mean'),
    Latitude=('Latitude', 'mean'),
    Longitude=('Longitude', 'mean'),
)

print("\nüìç Cluster Statistics:")
print(cluster_stats)

TEMPORAL CLUSTERING

In [None]:
mlflow.set_experiment("PatrolIQ_Temporal_Clustering_Experiment")

# Standardise the three time components before clustering.
X_temporal = crime_df[['Hour', 'Day_of_Week', 'Month']].copy()
scaler_temp = StandardScaler()
X_temporal_scaled = scaler_temp.fit_transform(X_temporal)

# Single source of truth for the cluster count (used by the model and the logged param).
N_TEMPORAL_CLUSTERS = 4

with mlflow.start_run(run_name='KMeans_Temporal'):
    kmeans_temp = KMeans(n_clusters=N_TEMPORAL_CLUSTERS, random_state=42, n_init=10)
    temporal_labels = kmeans_temp.fit_predict(X_temporal_scaled)

    # Silhouette on a subsample of at most 5,000 points, drawn from a seeded
    # generator so the logged score is reproducible.
    rng_temporal = np.random.default_rng(42)
    sample_idx_temp = rng_temporal.choice(
        len(X_temporal_scaled), size=min(5000, len(X_temporal_scaled)), replace=False
    )
    temp_sil = silhouette_score(X_temporal_scaled[sample_idx_temp], temporal_labels[sample_idx_temp])

    mlflow.log_param('algorithm', 'KMeans_Temporal')
    mlflow.log_param('n_clusters', N_TEMPORAL_CLUSTERS)
    mlflow.log_metric('silhouette_score', temp_sil)
    mlflow.sklearn.log_model(kmeans_temp, 'model')

    print(f"‚úì Silhouette Score: {temp_sil:.3f}")

crime_df['TemporalCluster'] = temporal_labels

# Mean hour / weekday / month of the records in each temporal cluster.
temporal_profiles = crime_df.groupby('TemporalCluster')[['Hour', 'Day_of_Week', 'Month']].mean()
print("\n‚è±Ô∏è  Temporal Cluster Profiles:")
print(temporal_profiles)

# Heatmap of record counts with weekdays as rows and hours as columns.
print("\nüìà Creating hourly heatmap...")
counts_by_day_hour = crime_df.groupby(['Day_of_Week', 'Hour']).size()
# Day/hour combinations with no records are filled with 0.
hourly_daily = counts_by_day_hour.unstack(fill_value=0)

plt.figure(figsize=(16, 8))
colorbar_options = {'label': 'Crime Count'}
sns.heatmap(hourly_daily, cmap='YlOrRd', cbar_kws=colorbar_options)
plt.title('Crime Heatmap: Day vs Hour')
plt.xlabel('Hour of Day')
plt.ylabel('Day of Week (0=Monday)')
plt.savefig('hourly_heatmap.png', dpi=300)
plt.show()

DIMENSIONALITY REDUCTION - PCA

In [None]:
mlflow.set_experiment("PatrolIQ_Dimensionality_Reduction_Experiment")

# Feature matrix: the engineered columns that actually exist in crime_df.
candidate_features = [
    'Hour', 'Day_of_Week', 'Month', 'Is_Weekend',
    'Crime_Severity_Score', 'Arrest',
    'Primary Type_Encoded', 'Location Description_Encoded',
    'District_Encoded', 'Season_Encoded',
]
numeric_features = [name for name in candidate_features if name in crime_df.columns]

X_all_features = crime_df[numeric_features].copy()
scaler_all = StandardScaler()
X_all_scaled = scaler_all.fit_transform(X_all_features)

with mlflow.start_run(run_name='PCA'):
    # Full decomposition first, to see how variance accumulates over components.
    pca_full = PCA()
    pca_full.fit(X_all_scaled)

    # Smallest component count whose cumulative explained variance reaches 70%.
    cumsum_var = np.cumsum(pca_full.explained_variance_ratio_)
    reaches_target = cumsum_var >= 0.70
    n_components_70 = np.argmax(reaches_target) + 1

    # Final PCA keeps at least 3 components even when fewer reach the target.
    n_components_final = max(3, n_components_70)
    pca = PCA(n_components=n_components_final)
    X_pca_transformed = pca.fit_transform(X_all_scaled)

    explained_var = pca.explained_variance_ratio_.sum()

    pca_params = (('technique', 'PCA'), ('n_components', pca.n_components_))
    pca_metrics = (('explained_variance', explained_var), ('n_components_70pct', n_components_70))
    for param_name, param_value in pca_params:
        mlflow.log_param(param_name, param_value)
    for metric_name, metric_value in pca_metrics:
        mlflow.log_metric(metric_name, metric_value)
    mlflow.sklearn.log_model(pca, 'model')

    print(f"‚úì Components: {pca.n_components_} | Variance: {explained_var:.2%}")

# Scree plot: per-component variance (left) and cumulative variance (right).
component_numbers = range(1, len(pca_full.explained_variance_ratio_) + 1)
cumulative_numbers = range(1, len(cumsum_var) + 1)

plt.figure(figsize=(12, 5))

# Left panel: explained variance ratio of each component of the full PCA.
plt.subplot(1, 2, 1)
plt.plot(component_numbers, pca_full.explained_variance_ratio_, 'bo-')
plt.title('PCA Scree Plot')
plt.xlabel('Principal Component')
plt.ylabel('Explained Variance Ratio')
plt.grid(True)

# Right panel: cumulative variance with a dashed line at the 70% target.
plt.subplot(1, 2, 2)
plt.plot(cumulative_numbers, cumsum_var, 'ro-')
plt.axhline(y=0.70, color='green', linestyle='--', label='70% Target')
plt.title('Cumulative Explained Variance')
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Variance')
plt.legend()
plt.grid(True)

plt.tight_layout()
plt.savefig('pca_scree_plot.png', dpi=300)
plt.show()

# Rank features by the sum of their absolute loadings across the retained components.
abs_loadings = np.abs(pca.components_)
feature_importance = abs_loadings.sum(axis=0)
importance_table = pd.DataFrame({
    'Feature': numeric_features,
    'Importance': feature_importance,
})
top_features = importance_table.sort_values('Importance', ascending=False)
print("\nüîù Top 5 Important Features:")
print(top_features.head(5))

# Scatter of the first two principal components, coloured by temporal cluster.
pc1_values = X_pca_transformed[:, 0]
pc2_values = X_pca_transformed[:, 1]

plt.figure(figsize=(10, 8))
plt.scatter(pc1_values, pc2_values, c=temporal_labels, cmap='viridis', s=10, alpha=0.5)
plt.title('PCA 2D Visualization')
# Axis labels carry each component's share of explained variance.
plt.xlabel(f'PC1 ({pca.explained_variance_ratio_[0]:.1%})')
plt.ylabel(f'PC2 ({pca.explained_variance_ratio_[1]:.1%})')
plt.colorbar(label='Temporal Cluster')
plt.savefig('pca_2d.png', dpi=300)
plt.show()

# Reconstruction error: project the reduced data back to the scaled feature space
# and measure the mean squared difference from the original scaled features.
# (mean_squared_error is imported with the other dependencies in the import cell
# rather than mid-notebook.)
X_pca_reconstructed = pca.inverse_transform(X_pca_transformed)
reconstruction_error = mean_squared_error(X_all_scaled, X_pca_reconstructed)
print(f"‚úì Reconstruction Error: {reconstruction_error:.4f}")

t-SNE VISUALIZATION

In [None]:

with mlflow.start_run(run_name='t-SNE'):
    # t-SNE is run on a subsample of at most 5,000 rows. TSNE itself is seeded
    # (random_state=42), so the subsample is drawn from a seeded generator too;
    # otherwise the embedding would still change on every run.
    TSNE_PERPLEXITY = 30
    rng_tsne = np.random.default_rng(42)
    sample_size_tsne = min(5000, len(X_all_scaled))
    sample_indices = rng_tsne.choice(len(X_all_scaled), size=sample_size_tsne, replace=False)
    X_sample_tsne = X_all_scaled[sample_indices]

    tsne = TSNE(n_components=2, random_state=42, perplexity=TSNE_PERPLEXITY)
    X_tsne = tsne.fit_transform(X_sample_tsne)

    mlflow.log_param('technique', 't-SNE')
    mlflow.log_param('n_components', 2)
    mlflow.log_param('perplexity', TSNE_PERPLEXITY)
    mlflow.log_param('sample_size', sample_size_tsne)

    print(f"‚úì t-SNE completed on {sample_size_tsne:,} samples")

# Two views of the same t-SNE embedding: by temporal cluster and by hour of day.
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
cluster_ax, hour_ax = axes
tsne_x = X_tsne[:, 0]
tsne_y = X_tsne[:, 1]

# Left panel: colour by the temporal cluster label of each sampled row.
temp_labels_sample = temporal_labels[sample_indices]
cluster_ax.scatter(tsne_x, tsne_y, c=temp_labels_sample,
                   cmap='viridis', s=20, alpha=0.6)
cluster_ax.set_xlabel('t-SNE 1')
cluster_ax.set_ylabel('t-SNE 2')
cluster_ax.set_title('t-SNE: Colored by Temporal Cluster')

# Right panel: colour by hour; sample_indices are positional, hence .iloc.
hour_sample = crime_df.iloc[sample_indices]['Hour'].values
hour_points = hour_ax.scatter(tsne_x, tsne_y, c=hour_sample,
                              cmap='coolwarm', s=20, alpha=0.6)
hour_ax.set_xlabel('t-SNE 1')
hour_ax.set_ylabel('t-SNE 2')
hour_ax.set_title('t-SNE: Colored by Hour of Day')
# The scatter just drawn is the only collection on this axes.
plt.colorbar(hour_points, ax=hour_ax, label='Hour')

plt.tight_layout()
plt.savefig('tsne_visualization.png', dpi=300)
plt.show()

MODEL REGISTRY

In [None]:
client = MlflowClient()

GEO_EXPERIMENT_NAME = "PatrolIQ_Geographic_Clustering_Experiment"
experiment = client.get_experiment_by_name(GEO_EXPERIMENT_NAME)
if experiment is None:
    raise RuntimeError(f"MLflow experiment not found: {GEO_EXPERIMENT_NAME}")
runs = client.search_runs(experiment_ids=[experiment.experiment_id])

# Only finished runs that recorded a silhouette score are candidates. Defaulting a
# missing metric to 0 would let an unscored or failed run beat any run with a
# negative silhouette, and formatting its missing score below would then fail.
scored_runs = [
    run for run in runs
    if run.info.status == 'FINISHED' and 'silhouette_score' in run.data.metrics
]
if not scored_runs:
    raise RuntimeError(f"No finished run with a silhouette_score in {GEO_EXPERIMENT_NAME}")
best_run = max(scored_runs, key=lambda r: r.data.metrics['silhouette_score'])

print(f"Best Run ID: {best_run.info.run_id}")
print(f"Algorithm: {best_run.data.params.get('algorithm')}")
print(f"Silhouette Score: {best_run.data.metrics['silhouette_score']:.3f}")

# NOTE(review): registration assumes the winning run logged an artifact named
# 'model' -- confirm every run in this experiment does.
model_uri = f"runs:/{best_run.info.run_id}/model"
result = mlflow.register_model(model_uri, "PatrolIQ_Best_Model")
print(f"\n‚úÖ Model Registered: {result.name} (Version {result.version})")

In [None]:
TEMPORAL_EXPERIMENT_NAME = "PatrolIQ_Temporal_Clustering_Experiment"
exp = client.get_experiment_by_name(TEMPORAL_EXPERIMENT_NAME)
if exp is None:
    raise RuntimeError(f"MLflow experiment not found: {TEMPORAL_EXPERIMENT_NAME}")
runs = client.search_runs(experiment_ids=[exp.experiment_id])

# Consider only finished runs that recorded a silhouette score; a missing metric
# must not be treated as a score of 0 (it could outrank a negative silhouette).
scored_temporal_runs = [
    run for run in runs
    if run.info.status == 'FINISHED' and 'silhouette_score' in run.data.metrics
]
if not scored_temporal_runs:
    raise RuntimeError(f"No finished run with a silhouette_score in {TEMPORAL_EXPERIMENT_NAME}")

best_temporal = max(scored_temporal_runs, key=lambda r: r.data.metrics['silhouette_score'])
temp_uri = f"runs:/{best_temporal.info.run_id}/model"

temp_reg = mlflow.register_model(temp_uri, "PatrolIQ_Temporal_Clustering_Model")
print("Temporal Registered Version =", temp_reg.version)


In [None]:
DIMRED_EXPERIMENT_NAME = "PatrolIQ_Dimensionality_Reduction_Experiment"
exp = client.get_experiment_by_name(DIMRED_EXPERIMENT_NAME)
if exp is None:
    raise RuntimeError(f"MLflow experiment not found: {DIMRED_EXPERIMENT_NAME}")
runs = client.search_runs(experiment_ids=[exp.experiment_id])

# Only finished runs that logged explained_variance are candidates; runs without
# that metric are skipped rather than being scored as 0.
scored_pca_runs = [
    run for run in runs
    if run.info.status == 'FINISHED' and 'explained_variance' in run.data.metrics
]
if not scored_pca_runs:
    raise RuntimeError(f"No finished run with explained_variance in {DIMRED_EXPERIMENT_NAME}")

best_pca = max(scored_pca_runs, key=lambda r: r.data.metrics['explained_variance'])
pca_uri = f"runs:/{best_pca.info.run_id}/model"

pca_reg = mlflow.register_model(pca_uri, "PatrolIQ_Dimensionality_Reduction_Model")
print("PCA Registered Version =", pca_reg.version)


FINAL SUMMARY

In [None]:

# Closing checklist, printed line by line between two 80-character rules.
summary_lines = [
    "\n" + "="*80,
    "‚úÖ PROJECT COMPLETED - ALL REQUIREMENTS MET",
    "="*80,
    "‚úì Feature Engineering (Hour, Day, Month, Season, Weekend, Severity)",
    "‚úì 3 Clustering Algorithms (K-Means, DBSCAN, Hierarchical)",
    "‚úì Elbow Method for optimal K",
    "‚úì Dendrogram for Hierarchical Clustering",
    "‚úì Temporal Clustering with 4 patterns",
    "‚úì Hourly Crime Heatmap",
    f"‚úì PCA with {pca.n_components_} components ({explained_var:.1%} variance)",
    "‚úì PCA Scree Plot",
    "‚úì t-SNE 2D Visualization",
    "‚úì MLflow Experiment Tracking",
    "‚úì Model Registry (Best Model Registered)",
    "="*80,
]
for summary_line in summary_lines:
    print(summary_line)