In [None]:
from prophet import Prophet
import pandas as pd
import numpy as np
import os
import pickle
import tqdm
from PIL import Image
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from math import sqrt
import seaborn as sns
from collections import defaultdict
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
TOP_X_CRIMES = 10  # how many of the highest-volume crime clusters are kept (passed as X to get_top_x_crimes and used in the output file name)

In [None]:
# Load the merged crime records.
# The path is assembled with os.path.join so the notebook also runs on
# macOS/Linux, where a backslash is not a path separator; on Windows the
# resulting path is the same as before.
path = os.path.join('..', 'data', 'mergedData', 'ts_df.csv')
ts_df = pd.read_csv(path)
# Drop every column whose name starts with "Unnamed" (presumably index columns
# left over from an earlier to_csv call - confirm against the file).
ts_df = ts_df.loc[:, ~ts_df.columns.str.contains('^Unnamed')]
ts_df.head()

In [None]:
# Distinct district identifiers present in the data, in ascending order.
dist_ids = sorted(ts_df["dist_id"].unique())
num_dists = len(dist_ids)
print("dist_ids", dist_ids)
print("num_dists", num_dists)

In [None]:
# Distinct crime-cluster labels present in the data, in ascending order.
cluster_names = sorted(ts_df["Cluster_Name"].unique())
num_cluster_names = len(cluster_names)
print("cluster_names", cluster_names)
print("num_cluster_names", num_cluster_names)

In [None]:
# Build one daily crime-count series per (district, crime cluster) and split it
# into a training window (before SPLIT_DATE) and a test window (SPLIT_DATE on).
#
# Every series is laid out on the SAME calendar: training always ends on the day
# before SPLIT_DATE and testing always starts on SPLIT_DATE. Deriving the range
# from each series' own first/last report (which is what asfreq does) leaves
# holes around the split for sparse crimes; a forecast that continues from the
# last training day then no longer lines up with the test dates.
# Days without any report are counted as 0 crimes (the same assumption the
# previous fill_value=0 made for gaps inside a series).
SPLIT_DATE = pd.Timestamp("2024-01-01")

# Calendar day of every report; normalize() strips a time-of-day part if present.
reports = ts_df[["dist_id", "Cluster_Name"]].assign(
    day=pd.to_datetime(ts_df["Reported_Date"]).dt.normalize()
)
train_days = pd.date_range(reports["day"].min(), SPLIT_DATE - pd.Timedelta(days=1), freq="D")
test_days = pd.date_range(SPLIT_DATE, reports["day"].max(), freq="D")

crime_dist_dfs = {}
for dist_id in dist_ids:
    # Restrict to this district once, then slice per crime cluster.
    dist_reports = reports[reports["dist_id"] == dist_id]
    crime_dist_dfs[dist_id] = {}
    for cluster_name in cluster_names:
        # Number of reports per calendar day for this district/crime pair.
        daily_counts = dist_reports.loc[
            dist_reports["Cluster_Name"] == cluster_name, "day"
        ].value_counts()

        splits = {}
        for split_name, days in (("train", train_days), ("test", test_days)):
            # Reindex onto the full calendar; missing days become 0.
            # The result keeps the layout used downstream: a DatetimeIndex
            # named 'Reported_Date' and a single 'Crime_Count' column.
            splits[split_name] = (
                daily_counts.reindex(days, fill_value=0)
                .rename("Crime_Count")
                .rename_axis("Reported_Date")
                .to_frame()
            )
        crime_dist_dfs[dist_id][cluster_name] = splits

In [None]:
# Sanity-check the nested layout: district -> crime cluster -> train/test.
expected_datasets = num_dists * num_cluster_names
print(f"number of datasets in dict: {expected_datasets}")
print(f"districts in dict: {crime_dist_dfs.keys()}")
district_one = crime_dist_dfs[1.0]
print(f"crimes in dict: {district_one.keys()}")
print(f"test/train in dict: {district_one['Alcohol Influence'].keys()}")

In [None]:
# Preview the first rows of one training series (district 1.0, 'Alcohol Influence').
crime_dist_dfs[1.0]['Alcohol Influence']["train"].head()

In [None]:
def get_top_x_crimes(crime_dist_dfs, X = 10):
    """Keep only the X crime categories with the most training-set reports.

    The 'Crime_Count' column of every "train" frame is summed per crime over
    all districts, the X largest totals are printed (largest first), and a new
    nested dict containing only those crimes is returned.

    Args:
        crime_dist_dfs: mapping district -> crime -> dict whose "train" entry
            has a 'Crime_Count' column.
        X: number of top crimes to keep.

    Returns:
        A new district -> crime -> data mapping restricted to the top X
        crimes. The inner data dicts are the same objects as in the input.
    """
    # Total training count per crime, accumulated over every district.
    totals = {}
    for per_district in crime_dist_dfs.values():
        for name, splits in per_district.items():
            totals[name] = totals.get(name, 0) + splits["train"]["Crime_Count"].sum()

    # Rank crimes by total, largest first; ties keep first-seen order because
    # the sort is stable.
    ranked = sorted(totals, key=totals.get, reverse=True)[:X]
    for name in ranked:
        print(f"{name}: {totals[name]}")

    # Rebuild the nested mapping with only the selected crimes.
    filtered_crime_dist_dfs = {}
    for district, per_district in crime_dist_dfs.items():
        kept = {}
        for name, splits in per_district.items():
            if name in ranked:
                kept[name] = splits
        filtered_crime_dist_dfs[district] = kept

    return filtered_crime_dist_dfs

In [None]:
# Narrow the dataset to the TOP_X_CRIMES clusters with the highest training counts.
# Note: this rebinds crime_dist_dfs, so the unfiltered dict is no longer
# reachable under that name afterwards.
crime_dist_dfs = get_top_x_crimes(crime_dist_dfs, X = TOP_X_CRIMES)

In [None]:
# Fit one Prophet model per (district, crime) series, forecast across the test
# window and store, next to the train/test frames of each series:
#   "model_config": {"rmse": <test RMSE or None>}   (None if the fit failed)
#   "forecast":     full Prophet forecast frame (history + future)
#   "residuals":    training 'y' minus the in-sample 'yhat'

# Forecast horizon (in days) used when a series has no test observations.
DEFAULT_HORIZON_DAYS = 265
# Prophet expects the date column to be named 'ds' and the target 'y'.
PROPHET_COLUMNS = {'Reported_Date': 'ds', 'Crime_Count': 'y'}

for dist_id, crimes in crime_dist_dfs.items():
    for cluster_name, data_splits in tqdm.tqdm(crimes.items(), desc=f"Processing District {dist_id}", leave=False):
        train = data_splits["train"]
        test = data_splits["test"]

        try:
            # Prepare data for Prophet
            train_data_prepared = train.reset_index().rename(columns=PROPHET_COLUMNS)
            test_data_prepared = test.reset_index().rename(columns=PROPHET_COLUMNS)

            # Ensure 'ds' is datetime
            train_data_prepared['ds'] = pd.to_datetime(train_data_prepared['ds'])
            test_data_prepared['ds'] = pd.to_datetime(test_data_prepared['ds'])

            # Warn (but keep going) if the training frame contains NaN values
            if train_data_prepared.isnull().values.any():
                print(f"NaN values in train_data_prepared for district {dist_id}, crime {cluster_name}")

            # Fit Prophet model
            model = Prophet(yearly_seasonality=True, weekly_seasonality=True, daily_seasonality=False)
            model.fit(train_data_prepared)

            # The future frame starts the day after the last TRAINING date, so
            # the horizon must be measured from there to the last TEST date.
            # Using len(test) falls short whenever the training series ends
            # before the split or the test series starts after it; the missing
            # dates then surface as NaN 'yhat' values in the merge below.
            has_test_data = len(test_data_prepared) > 0
            if has_test_data:
                n_predictions = (test_data_prepared['ds'].max() - train_data_prepared['ds'].max()).days
            else:
                n_predictions = DEFAULT_HORIZON_DAYS

            # Create future dataframe (history + n_predictions future days)
            future_dates = model.make_future_dataframe(periods=n_predictions, freq='D')
            forecast = model.predict(future_dates)

            # Extract the predicted values
            predicted_values = forecast[['ds', 'yhat']]

            # If test data is available, calculate RMSE on the matching dates
            if has_test_data:
                merged = test_data_prepared.merge(predicted_values, on='ds', how='left')
                rmse = sqrt(mean_squared_error(merged['y'], merged['yhat']))
            else:
                rmse = None
            data_splits["model_config"] = {"rmse": rmse}

            # Save forecast and residuals. The forecast already contains the
            # in-sample predictions for the training dates, so they are looked
            # up by date instead of running a second model.predict().
            data_splits["forecast"] = forecast
            fitted = train_data_prepared[['ds']].merge(predicted_values, on='ds', how='left')
            residuals = train_data_prepared['y'] - fitted['yhat']
            data_splits["residuals"] = residuals

        except Exception as e:
            # Best effort: report the failure and continue with the next series.
            print(f"Prophet model failed for district {dist_id}, crime {cluster_name} with error: {e}")
            data_splits["model_config"] = None


Processing District 1.0:   0%|          | 0/10 [00:00<?, ?it/s]09:21:23 - cmdstanpy - INFO - Chain [1] start processing
09:21:23 - cmdstanpy - INFO - Chain [1] done processing
Processing District 1.0:  10%|█         | 1/10 [00:02<00:20,  2.24s/it]

Prophet model failed for district 1.0, crime Aggravated Assault with error: Input contains NaN.


09:21:25 - cmdstanpy - INFO - Chain [1] start processing
09:21:25 - cmdstanpy - INFO - Chain [1] done processing
Processing District 1.0:  20%|██        | 2/10 [00:04<00:18,  2.31s/it]

Prophet model failed for district 1.0, crime Assault with error: Input contains NaN.


09:21:27 - cmdstanpy - INFO - Chain [1] start processing
09:21:28 - cmdstanpy - INFO - Chain [1] done processing
Processing District 1.0:  30%|███       | 3/10 [00:07<00:16,  2.39s/it]

Prophet model failed for district 1.0, crime Auto Theft with error: Input contains NaN.


09:21:30 - cmdstanpy - INFO - Chain [1] start processing
09:21:30 - cmdstanpy - INFO - Chain [1] done processing
Processing District 1.0:  40%|████      | 4/10 [00:09<00:14,  2.34s/it]

Prophet model failed for district 1.0, crime Burglary with error: Input contains NaN.


09:21:32 - cmdstanpy - INFO - Chain [1] start processing
09:21:32 - cmdstanpy - INFO - Chain [1] done processing
Processing District 1.0:  50%|█████     | 5/10 [00:11<00:11,  2.36s/it]

Prophet model failed for district 1.0, crime Domestic Assault with error: Input contains NaN.


09:21:34 - cmdstanpy - INFO - Chain [1] start processing
09:21:34 - cmdstanpy - INFO - Chain [1] done processing
Processing District 1.0:  60%|██████    | 6/10 [00:12<00:07,  1.96s/it]

Prophet model failed for district 1.0, crime Domestic Violence with error: Input contains NaN.


09:21:36 - cmdstanpy - INFO - Chain [1] start processing
09:21:36 - cmdstanpy - INFO - Chain [1] done processing
Processing District 1.0:  70%|███████   | 7/10 [00:15<00:06,  2.13s/it]

Prophet model failed for district 1.0, crime Property Damage with error: Input contains NaN.


09:21:38 - cmdstanpy - INFO - Chain [1] start processing
09:21:38 - cmdstanpy - INFO - Chain [1] done processing
Processing District 1.0:  80%|████████  | 8/10 [00:17<00:04,  2.17s/it]

Prophet model failed for district 1.0, crime Sexual Assault with error: Input contains NaN.


09:21:40 - cmdstanpy - INFO - Chain [1] start processing
09:21:41 - cmdstanpy - INFO - Chain [1] done processing
Processing District 1.0:  90%|█████████ | 9/10 [00:21<00:02,  2.69s/it]09:21:44 - cmdstanpy - INFO - Chain [1] start processing
09:21:44 - cmdstanpy - INFO - Chain [1] done processing
Processing District 2.0:   0%|          | 0/10 [00:00<?, ?it/s]         09:21:46 - cmdstanpy - INFO - Chain [1] start processing
09:21:47 - cmdstanpy - INFO - Chain [1] done processing
Processing District 2.0:  10%|█         | 1/10 [00:02<00:21,  2.34s/it]

Prophet model failed for district 2.0, crime Aggravated Assault with error: Input contains NaN.


09:21:48 - cmdstanpy - INFO - Chain [1] start processing
09:21:49 - cmdstanpy - INFO - Chain [1] done processing
Processing District 2.0:  20%|██        | 2/10 [00:04<00:19,  2.40s/it]

Prophet model failed for district 2.0, crime Assault with error: Input contains NaN.


09:21:51 - cmdstanpy - INFO - Chain [1] start processing
09:21:51 - cmdstanpy - INFO - Chain [1] done processing
Processing District 2.0:  30%|███       | 3/10 [00:07<00:17,  2.43s/it]

Prophet model failed for district 2.0, crime Auto Theft with error: Input contains NaN.


09:21:53 - cmdstanpy - INFO - Chain [1] start processing
09:21:54 - cmdstanpy - INFO - Chain [1] done processing
Processing District 2.0:  40%|████      | 4/10 [00:09<00:14,  2.35s/it]

Prophet model failed for district 2.0, crime Burglary with error: Input contains NaN.


09:21:56 - cmdstanpy - INFO - Chain [1] start processing
09:21:56 - cmdstanpy - INFO - Chain [1] done processing
Processing District 2.0:  50%|█████     | 5/10 [00:11<00:12,  2.40s/it]

Prophet model failed for district 2.0, crime Domestic Assault with error: Input contains NaN.


09:21:58 - cmdstanpy - INFO - Chain [1] start processing
09:21:58 - cmdstanpy - INFO - Chain [1] done processing
Processing District 2.0:  60%|██████    | 6/10 [00:13<00:08,  2.23s/it]09:22:00 - cmdstanpy - INFO - Chain [1] start processing
09:22:01 - cmdstanpy - INFO - Chain [1] done processing
Processing District 2.0:  70%|███████   | 7/10 [00:16<00:06,  2.32s/it]

Prophet model failed for district 2.0, crime Property Damage with error: Input contains NaN.


09:22:03 - cmdstanpy - INFO - Chain [1] start processing
09:22:03 - cmdstanpy - INFO - Chain [1] done processing
Processing District 2.0:  80%|████████  | 8/10 [00:18<00:04,  2.35s/it]

Prophet model failed for district 2.0, crime Sexual Assault with error: Input contains NaN.


09:22:05 - cmdstanpy - INFO - Chain [1] start processing
09:22:05 - cmdstanpy - INFO - Chain [1] done processing
Processing District 2.0:  90%|█████████ | 9/10 [00:21<00:02,  2.32s/it]

Prophet model failed for district 2.0, crime Shoplifting with error: Input contains NaN.


09:22:07 - cmdstanpy - INFO - Chain [1] start processing
09:22:07 - cmdstanpy - INFO - Chain [1] done processing
                                                                        

Prophet model failed for district 2.0, crime Stolen Auto with error: Input contains NaN.


Processing District 3.0:   0%|          | 0/10 [00:00<?, ?it/s]09:22:09 - cmdstanpy - INFO - Chain [1] start processing
09:22:09 - cmdstanpy - INFO - Chain [1] done processing
Processing District 3.0:  10%|█         | 1/10 [00:03<00:34,  3.78s/it]09:22:12 - cmdstanpy - INFO - Chain [1] start processing
09:22:13 - cmdstanpy - INFO - Chain [1] done processing
Processing District 3.0:  20%|██        | 2/10 [00:07<00:30,  3.80s/it]09:22:16 - cmdstanpy - INFO - Chain [1] start processing
09:22:17 - cmdstanpy - INFO - Chain [1] done processing
Processing District 3.0:  30%|███       | 3/10 [00:11<00:27,  3.97s/it]09:22:20 - cmdstanpy - INFO - Chain [1] start processing
09:22:21 - cmdstanpy - INFO - Chain [1] done processing
Processing District 3.0:  40%|████      | 4/10 [00:14<00:20,  3.39s/it]

Prophet model failed for district 3.0, crime Burglary with error: Input contains NaN.


09:22:23 - cmdstanpy - INFO - Chain [1] start processing
09:22:23 - cmdstanpy - INFO - Chain [1] done processing
Processing District 3.0:  50%|█████     | 5/10 [00:16<00:15,  3.04s/it]

Prophet model failed for district 3.0, crime Domestic Assault with error: Input contains NaN.


09:22:25 - cmdstanpy - INFO - Chain [1] start processing
09:22:25 - cmdstanpy - INFO - Chain [1] done processing
Processing District 3.0:  60%|██████    | 6/10 [00:18<00:10,  2.64s/it]09:22:27 - cmdstanpy - INFO - Chain [1] start processing
09:22:28 - cmdstanpy - INFO - Chain [1] done processing
Processing District 3.0:  70%|███████   | 7/10 [00:21<00:07,  2.60s/it]

Prophet model failed for district 3.0, crime Property Damage with error: Input contains NaN.


09:22:30 - cmdstanpy - INFO - Chain [1] start processing
09:22:30 - cmdstanpy - INFO - Chain [1] done processing
Processing District 3.0:  80%|████████  | 8/10 [00:24<00:06,  3.02s/it]09:22:33 - cmdstanpy - INFO - Chain [1] start processing
09:22:34 - cmdstanpy - INFO - Chain [1] done processing
Processing District 3.0:  90%|█████████ | 9/10 [00:27<00:02,  2.89s/it]

Prophet model failed for district 3.0, crime Shoplifting with error: Input contains NaN.


09:22:36 - cmdstanpy - INFO - Chain [1] start processing


In [None]:
def plot_crime_forecast(train_data, test_data, forecast_data, crime_type="Unknown Crime", district_id="Unknown District"):
    """Draw training history, actual test values and the forecast on one chart.

    Args:
        train_data: frame with 'ds' and 'y' columns, drawn as a blue line.
        test_data: frame with 'ds' and 'y' columns, drawn as a green line;
            skipped when it has no rows.
        forecast_data: frame with 'ds' and 'yhat' columns, drawn as a red
            dashed line.
        crime_type: crime label used in the chart title.
        district_id: district label used in the chart title.
    """
    _, ax = plt.subplots(figsize=(20, 6))

    # Observed history.
    ax.plot(train_data['ds'], train_data['y'], label="Training Data", color="blue")

    # Held-out observations, only when there are any.
    if len(test_data) > 0:
        ax.plot(test_data['ds'], test_data['y'], label="Test Data (Actual)", color="green")

    # Model output.
    ax.plot(forecast_data['ds'], forecast_data['yhat'], label="Predictions", color="red", linestyle="--")

    # Labels, legend and grid.
    ax.set_title(f"Crime Forecast for {crime_type} in District {district_id}")
    ax.set_xlabel("Date")
    ax.set_ylabel("Number of Crimes")
    ax.legend(loc="upper left")
    ax.grid(True)
    plt.show()

In [None]:
# Pick one district/crime pair and plot its history, held-out data and forecast.
district = 5.0
crime_name = 'Auto Theft'

selected = crime_dist_dfs[district][crime_name]
prophet_columns = {'Reported_Date': 'ds', 'Crime_Count': 'y'}

# Bring the stored frames into the 'ds'/'y' layout the plotting helper reads.
train_data = selected["train"].reset_index().rename(columns=prophet_columns)
test_data = selected["test"].reset_index().rename(columns=prophet_columns)
forecast_data = selected["forecast"]

plot_crime_forecast(train_data, test_data, forecast_data, crime_type=crime_name, district_id=district)


In [None]:
def plot_rmse_heatmap(crime_dist_dfs):
    """Plot a heatmap of RMSE values with crimes as rows and districts as columns.

    Only entries whose "model_config" is truthy and holds a non-None "rmse"
    are included. When no entry qualifies, a message is printed and nothing
    is drawn, instead of failing inside seaborn on an empty table.

    Args:
        crime_dist_dfs: mapping district -> crime -> dict that may contain a
            "model_config" entry of the form {"rmse": value} or None.

    Returns:
        None. The figure is shown with plt.show().
    """
    # One record per district/crime pair that has a usable RMSE.
    records = [
        {"District": dist_id, "Crime": crime_name, "RMSE": crime_info["model_config"]["rmse"]}
        for dist_id, crimes in crime_dist_dfs.items()
        for crime_name, crime_info in crimes.items()
        if crime_info.get("model_config") and crime_info["model_config"].get("rmse") is not None
    ]

    # Nothing to plot: every model failed or had no test data.
    if not records:
        print("No RMSE values available - skipping heatmap.")
        return

    # Reshape to a Crime x District table; pairs without an RMSE stay NaN.
    rmse_df = pd.DataFrame(records)
    rmse_pivot = rmse_df.pivot(index="Crime", columns="District", values="RMSE")

    # Plot the heatmap
    plt.figure(figsize=(10, 8))
    sns.heatmap(rmse_pivot, annot=True, fmt=".2f", cmap="coolwarm", cbar_kws={'label': 'RMSE'})
    plt.title("RMSE of Crime Predictions by District and Crime Category")
    plt.xlabel("District")
    plt.ylabel("Crime Category")
    plt.tight_layout()
    plt.show()



In [None]:
# Draw the RMSE heatmap for every district/crime pair that has a stored RMSE.
plot_rmse_heatmap(crime_dist_dfs)

In [None]:
# Persist the nested results dict (frames, forecasts, RMSE, residuals).
# The path is assembled with os.path.join so the notebook also runs on
# macOS/Linux; on Windows the resulting path is the same as before.
output_path = os.path.join('..', 'data', 'mergedData', f"crime_dist_dfs_prophet_top_{TOP_X_CRIMES}.pkl")
with open(output_path, 'wb') as file:
    pickle.dump(crime_dist_dfs, file)
