# In [1]:  (notebook cell marker, not code)
# pandas: Needed for data manipulation, loading CSV/GeoJSON, grouping, and creating DataFrames.
import pandas as pd

# geopandas: Needed for handling geospatial data from GeoJSON, merging with crime data for mapping.
import geopandas as gpd

# numpy: Needed for numerical operations, like rounding predictions.
import numpy as np

# matplotlib.pyplot: Needed for creating basic plots like bar charts.
import matplotlib.pyplot as plt

# seaborn: Needed for advanced visualizations like countplots and heatmaps.
import seaborn as sns

# sklearn.linear_model.LinearRegression: Needed for the linear regression model.
from sklearn.linear_model import LinearRegression

# sklearn.ensemble.RandomForestRegressor: Needed for the random forest regression model.
from sklearn.ensemble import RandomForestRegressor

# sklearn.neighbors.KNeighborsRegressor: Needed for the KNN regression model.
from sklearn.neighbors import KNeighborsRegressor

# sklearn.svm.SVR: Needed for the support vector regression model.
from sklearn.svm import SVR

# sklearn.neural_network.MLPRegressor: Needed for the multi-layer perceptron (neural network) regression model.
from sklearn.neural_network import MLPRegressor

# xgboost.XGBRegressor: Needed for the XGBoost regression model.
from xgboost import XGBRegressor

# Read the main crime dataset from CSV (why: every later step works from this table).
crime_data = pd.read_csv('ODC_CRIME_OFFENSES_P_-3254178225590307312.csv')

# Read the companion GeoJSON export (why: kept for geospatial previews and the spatial merge further down).
geo_crime = gpd.read_file('ODC_CRIME_OFFENSES_P_-5582264493798559810.geojson')

# Print each dataset's first rows under its heading (why: quick check that both files
# loaded and which columns they expose). sep="\n" puts the heading and the table on
# separate lines, exactly like two consecutive print calls.
print("CSV Data Preview:", crime_data.head(), sep="\n")
print("\nGeoJSON Data Preview:", geo_crime.head(), sep="\n")

# Convert the occurrence timestamp once and reuse it (why: all temporal features below
# derive from it; values that cannot be parsed become NaT because of errors='coerce').
occurrence = pd.to_datetime(crime_data['FIRST_OCCURRENCE_DATE'], errors='coerce')
crime_data['FIRST_OCCURRENCE_DATE'] = occurrence

# Calendar year (why: drives the train/test split by year later on).
crime_data['Year'] = occurrence.dt.year

# Weekday name and hour of day (why: the time slots crimes are grouped and predicted by).
crime_data['DayOfWeek'] = occurrence.dt.day_name()
crime_data['Hour'] = occurrence.dt.hour

# Keep rows flagged IS_CRIME == 1, then discard any row missing a neighborhood or a
# time feature (why: the models cannot be fitted on missing keys).
is_crime = crime_data['IS_CRIME'] == 1
crime_data = crime_data[is_crime].dropna(subset=['NEIGHBORHOOD_ID', 'Hour', 'DayOfWeek'])

# Show the derived columns next to their source timestamp (why: sanity check of the transformations).
preview_columns = ['FIRST_OCCURRENCE_DATE', 'DayOfWeek', 'Hour', 'NEIGHBORHOOD_ID']
print("\nEngineered Features Preview:", crime_data[preview_columns].head(), sep="\n")

# Bar chart of how many crime rows fall in each hour of the day
# (why: exposes the daily rhythm of incidents at a glance).
plt.figure(figsize=(12, 6))
hour_axes = sns.countplot(data=crime_data, x='Hour')
hour_axes.set_title('Crime Distribution by Hour in Denver')
plt.tight_layout()
plt.show()

# Give every row a unit count (why: each row is one crime, so summing this column counts crimes).
crime_data['Crimes'] = 1

# Total crimes per (year, weekday, neighborhood, hour) slot
# (why: this is the unit the models predict).
slot_keys = ['Year', 'DayOfWeek', 'NEIGHBORHOOD_ID', 'Hour']
crimes_per_slot = crime_data.groupby(slot_keys)['Crimes'].sum()
hour_totals = crimes_per_slot.reset_index()

# Years before 2025 form the training set; 2025 itself is held out for evaluation
# (why: mimics predicting the current year from history).
train = hour_totals.loc[hour_totals['Year'] < 2025]
test = hour_totals.loc[hour_totals['Year'] == 2025]

# One-hot encode the categorical columns of both sets (why: the regressors need numeric
# inputs). The test frame is then reindexed to the training columns so both sets share
# one feature layout; dummy columns missing from the test year are filled with 0.
model_columns = ['Crimes', 'Hour', 'DayOfWeek', 'NEIGHBORHOOD_ID']
train_dummies = pd.get_dummies(train[model_columns])
test_dummies = pd.get_dummies(test[model_columns]).reindex(
    columns=train_dummies.columns, fill_value=0
)

# Separate the target (Crimes) from the feature matrix for each set.
y_train = train_dummies['Crimes']
X_train = train_dummies.drop(columns='Crimes')
y_test = test_dummies['Crimes']
X_test = test_dummies.drop(columns='Crimes')

# Visualization: Heatmap of average crimes by day and hour (why: Visualizes patterns across the week, aiding explanation).
plt.figure(figsize=(14, 8))
pivot = hour_totals.pivot_table(index='DayOfWeek', columns='Hour', values='Crimes', aggfunc='mean')

# pivot_table sorts its index, which leaves the day names in alphabetical order
# (Friday, Monday, Saturday, ...). Reorder the rows Monday -> Sunday so the heatmap
# reads chronologically. Days absent from the data are skipped rather than added
# as empty rows.
weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
pivot = pivot.reindex([day for day in weekday_order if day in pivot.index])

sns.heatmap(pivot, cmap='YlOrRd')
plt.title('Average Crimes by Day and Hour in Denver')
plt.tight_layout()
plt.show()

# Seed shared by every estimator that has a stochastic component (why: without it the
# random forest and XGBoost scores, and therefore the "best model" choice and the
# exported predictions, can change from run to run; the MLP was already seeded with
# this value).
RANDOM_STATE = 444

# Define models dictionary (why: Allows easy iteration to train/compare multiple regressors for best performance).
models = {
    'LinearRegression': LinearRegression(),  # Simple linear relationship model
    'RandomForestRegressor': RandomForestRegressor(random_state=RANDOM_STATE),  # Ensemble of decision trees, good for complex patterns
    'KNeighborsRegressor': KNeighborsRegressor(),  # Distance-based, predicts based on similar instances
    'SVR': SVR(),  # Support Vector Regression; NOTE(review): kernel SVR can be very slow on large training sets
    'XGBRegressor': XGBRegressor(random_state=RANDOM_STATE),  # Gradient boosting, sequential tree building for high accuracy
    'MLPRegressor': MLPRegressor(hidden_layer_sizes=(100, 100, 100, 100), random_state=RANDOM_STATE),  # Neural network model
}

# Report header for the model comparison (why: reminds the reader how to interpret R2).
print(
    "Model Performance (R2 Scores):",
    "R2 Score Range: -∞ to 1.0 (higher is better)",
    "0.0 = model predicts mean, <0.0 = worse than mean prediction",
    "=" * 60,
    sep="\n",
)

# Fit each candidate on the training set and record its R2 on the held-out set
# (why: the scores dictionary is what the later model selection reads).
scores = {}
for name in models:
    score = models[name].fit(X_train, y_train).score(X_test, y_test)
    scores[name] = score
    print(f'{name}: {score:.4f}')

# Select the winner: the model name with the highest recorded R2 score.
best_model_name = max(scores, key=scores.get)
best_score = scores[best_model_name]
best_model = models[best_model_name]

# Rank all (name, score) pairs from best to worst for the report below.
sorted_models = sorted(scores.items(), key=lambda pair: pair[1], reverse=True)

print(f"\nBest Model: {best_model_name} with R2 Score: {best_score:.4f}")

# Describe the top entry of the ranking, and the runner-up when there is one.
print(f"\nR2 Score Interpretation:")
top_name, top_score = sorted_models[0]
print(f"- {top_name} ({top_score:.4f}): Best performer, explains {top_score*100:.1f}% of variance in crime data")

if len(sorted_models) > 1:
    second_name, second_score = sorted_models[1]
    print(f"- {second_name} ({second_score:.4f}): Second best, explains {second_score*100:.1f}% of variance")

# Label every model by the highest threshold its score reaches; anything below the
# lowest threshold is "Poor".
print(f"\nPerformance Categories:")
category_floors = ((0.3, "Good"), (0.2, "Moderate"), (0.1, "Weak"))
for name, score in sorted_models:
    category = next((label for floor, label in category_floors if score >= floor), "Poor")
    print(f"- {name}: {category} (R2 = {score:.4f})")

# Horizontal bar chart of the ten largest random-forest feature importances
# (why: shows which encoded features the forest relies on most).
if 'RandomForestRegressor' in models:
    plt.figure(figsize=(10, 6))
    forest = models['RandomForestRegressor']
    importances = pd.Series(forest.feature_importances_, index=X_train.columns)
    top_features = importances.nlargest(10)
    top_features.plot.barh()
    plt.title('Top 10 Feature Importances for Crime Prediction (Random Forest)')
    plt.xlabel('Importance Score (higher = more influential)')
    plt.tight_layout()
    plt.show()

# Attach the best model's predictions to the held-out rows (why: lets predicted and
# actual counts be compared side by side). assign() returns a new frame, so the slice
# taken from hour_totals is never written to in place.
test = test.assign(Predicted=best_model.predict(X_test))

# Round numeric columns to two decimals (why: keeps printed and exported values readable).
test = test.round(2)

print(f"\nUsing {best_model_name} for final predictions (best R2 score: {best_score:.4f})")

# Write one JSON record per row (why: gives downstream interactive visualizations a simple input file).
test.to_json('denver_crime_predictions.json', orient='records', double_precision=2)
print("Predictions exported to 'denver_crime_predictions.json'")

# Closing recap of the comparison: how many models ran, the spread of their R2 scores, and the mean.
print(f"\n=== Performance Summary ===")
print(f"Models tested: {len(scores)}")
all_scores = list(scores.values())
print(f"Score range: {min(all_scores):.4f} - {max(all_scores):.4f}")
print(f"Average R2 score: {np.mean(all_scores):.4f}")

# Grouped bars of actual vs predicted crimes per neighborhood (why: visual check of where the model is close or off).
plt.figure(figsize=(14, 8))

# Reshape to long form: one row per (neighborhood, Type) with Type in {Crimes, Predicted},
# which is the layout seaborn needs to draw the two bars side by side.
melted = test.melt(
    id_vars=['NEIGHBORHOOD_ID'],
    value_vars=['Crimes', 'Predicted'],
    var_name='Type',
    value_name='Count',
)
comparison_axes = sns.barplot(data=melted, x='NEIGHBORHOOD_ID', y='Count', hue='Type')
comparison_axes.set_title(f'Actual vs Predicted Crimes by Neighborhood (2025 Test Data)\nUsing {best_model_name} Model (R² = {best_score:.3f})')
plt.xticks(rotation=90)
plt.tight_layout()
plt.show()

# Merge predictions onto the GeoJSON records for a choropleth-style map (why: spatial
# view of the predictions; the geometries in geo_crime are used as a proxy because
# neighborhood shapes are not loaded here).
print("\nAttempting spatial merge for chloropleth visualization...")
print("geo_crime columns:", list(geo_crime.columns))
print("test columns:", list(test.columns))

# Candidate timestamp columns in geo_crime, in order of preference.
datetime_cols = ['FIRST_OCCURRENCE_DATE', 'LAST_OCCURRENCE_DATE', 'REPORTED_DATE']
available_datetime_cols = [col for col in datetime_cols if col in geo_crime.columns]

if available_datetime_cols:
    # Use the first available datetime column to derive the time-slot keys.
    datetime_col = available_datetime_cols[0]
    print(f"Using '{datetime_col}' to extract hour information")

    # Work on a copy so geo_crime itself is left untouched. errors='coerce' matches how
    # the CSV timestamps are parsed: unparseable values become NaT instead of raising.
    geo_crime_copy = geo_crime.copy()
    geo_timestamps = pd.to_datetime(geo_crime_copy[datetime_col], errors='coerce')
    geo_crime_copy['Hour'] = geo_timestamps.dt.hour
    geo_crime_copy['DayOfWeek'] = geo_timestamps.dt.day_name()
    print(f"Extracted hours range: {geo_crime_copy['Hour'].min()} to {geo_crime_copy['Hour'].max()}")

    # Predictions exist per (neighborhood, weekday, hour). Merging on neighborhood and
    # hour alone would pair every geo row with one prediction per weekday and multiply
    # the rows, so the weekday is part of the key as well.
    merge_on = ['NEIGHBORHOOD_ID', 'DayOfWeek', 'Hour']
    print(f"Merging on columns: {merge_on}")

    # Collapse the predictions to exactly one row per key before merging. This guarantees
    # the left merge keeps one output row per geo row, and it brings over only the
    # 'Predicted' column.
    prediction_lookup = test.groupby(merge_on, as_index=False)['Predicted'].mean()
    merged_geo = geo_crime_copy.merge(prediction_lookup, on=merge_on, how='left')

    # The merge always yields a 'Predicted' column; what matters is whether any geo row
    # actually matched a prediction.
    if merged_geo['Predicted'].notna().any():
        # Create one figure and draw the map on its axes. Passing ax= stops geopandas
        # from opening a second figure, which would leave this one blank.
        fig, ax = plt.subplots(figsize=(14, 10))

        # Color each geometry by its predicted crime count.
        merged_geo.plot(column='Predicted',
                        cmap='OrRd',
                        legend=True,
                        alpha=0.8,
                        edgecolor='black',
                        linewidth=0.5,
                        ax=ax,
                        legend_kwds={'label': 'Predicted Crime Count',
                                     'orientation': 'horizontal',
                                     'shrink': 0.8})

        ax.set_title(f'Predicted Crimes Distribution by Neighborhood and Hour\nUsing {best_model_name} Model (R² = {best_score:.3f})',
                     fontsize=16, fontweight='bold', pad=20)
        ax.set_axis_off()

        # Footer with the number of matched rows and the prediction range.
        plt.figtext(0.5, 0.01,
                    f"Data points: {merged_geo['Predicted'].notna().sum()} | "
                    f"Prediction range: {merged_geo['Predicted'].min():.1f} to {merged_geo['Predicted'].max():.1f}",
                    ha='center', fontsize=10, style='italic')

        plt.tight_layout()
        plt.subplots_adjust(bottom=0.15)  # leave room for the footer text
        plt.show()

        # Print detailed statistics about the predictions that matched a geo row.
        predicted_data = merged_geo['Predicted'].dropna()
        print(f"\n=== Spatial Prediction Statistics ===")
        print(f"Total spatial points with predictions: {len(predicted_data)}")
        print(f"Predicted crimes - Min: {predicted_data.min():.2f}")
        print(f"Predicted crimes - Max: {predicted_data.max():.2f}")
        print(f"Predicted crimes - Mean: {predicted_data.mean():.2f}")
        print(f"Predicted crimes - Std: {predicted_data.std():.2f}")

    else:
        print("Warning: no spatial points matched a prediction after merge.")
else:
    print("Note: Could not create spatial visualization - hour extraction from geo_crime failed.")

print("\n=== Analysis Complete ===")

# KeyboardInterrupt:  (captured notebook output, not code)