In [None]:
import warnings
# Silence every warning category for the rest of the kernel session.
# NOTE(review): this is a blanket filter, so pandas chained-assignment warnings and
# library deprecation warnings raised by later cells are hidden too — consider
# narrowing it to specific categories.
warnings.filterwarnings('ignore')

In [None]:
import pandas as pd
import numpy as np
import geopandas as gpd
import matplotlib.pyplot as plt
import seaborn as sns
import folium
from folium.plugins import FastMarkerCluster
from sklearn.cluster import DBSCAN
import hdbscan
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import euclidean_distances

## Download Required Datasets

In [None]:
!wget https://raw.githubusercontent.com/SpatialTurn/DataCollection-Notebooks/main/Census/CrimesChicago20220225.csv

In [None]:
!wget https://raw.githubusercontent.com/SpatialTurn/DataCollection-Notebooks/main/Census/Illinois.zip

### make sure to unzip the file

In [None]:
!unzip Illinois.zip

## Task 1 : Data Exploration

### Chicago Crime Data Obtained from

`https://data.cityofchicago.org/`

In [None]:
# Read the Chicago crime extract and keep only the records that can be placed on a
# map, i.e. rows where both coordinate columns are populated.
crime_df = (
    pd.read_csv('CrimesChicago20220225.csv')
    .dropna(subset=['LATITUDE', 'LONGITUDE'])
)

In [None]:
# List the column names. The ones used by later cells contain irregular whitespace:
# ' PRIMARY DESCRIPTION' starts with a space and 'DATE  OF OCCURRENCE' has a double space.
crime_df.columns

In [None]:
# Preview the first rows of the loaded crime table.
crime_df.head()

In [None]:
# Frequency table of offences per primary category
print("Crime Categories and their Frequencies:")
category_counts = crime_df[' PRIMARY DESCRIPTION'].value_counts()
display(category_counts)

# Parse the occurrence timestamp; values that do not match the format become NaT
occurred_at = pd.to_datetime(
    crime_df['DATE  OF OCCURRENCE'],
    format='%m/%d/%Y %I:%M:%S %p',
    errors='coerce',
)
crime_df['Date'] = occurred_at

# Calendar features derived from the parsed timestamp
crime_df['Month'] = occurred_at.dt.month
crime_df['DayOfWeek'] = occurred_at.dt.dayofweek  # Monday=0, Sunday=6
crime_df['Hour'] = occurred_at.dt.hour

# Monthly totals in calendar order
print("\nCrime Frequency by Month:")
month_counts = crime_df['Month'].value_counts().sort_index()
display(month_counts)

### Visualizing the categories using Histograms

In [None]:
# Horizontal bar chart of incident counts per category, most frequent first
category_order = crime_df[' PRIMARY DESCRIPTION'].value_counts().index
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(
    data=crime_df,
    y=' PRIMARY DESCRIPTION',
    order=category_order,
    color='skyblue',
    ax=ax,
)
ax.set_title('Frequency of Crimes by Category')
ax.set_xlabel('Frequency')
ax.set_ylabel('Crime Category')
plt.show()

In [None]:
# Bar chart of incident counts per calendar month.
# The tick labels are derived from the months that actually occur in the data, so the
# bars stay correctly labelled even when the extract does not cover all twelve months.
MONTH_ABBR = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
months_present = crime_df['Month'].value_counts().sort_index().index  # NaN months are dropped by value_counts

plt.figure(figsize=(10, 6))
sns.countplot(data=crime_df, x='Month', order=months_present, color='skyblue')
plt.title('Crime Frequency by Month')
plt.xlabel('Month')
plt.ylabel('Frequency')
plt.xticks(
    ticks=range(len(months_present)),
    labels=[MONTH_ABBR[int(month) - 1] for month in months_present],  # months are 1-based
)
plt.show()

In [None]:
# Bar chart of incident counts per day of the week.
# The tick labels are derived from the weekdays that actually occur in the data rather
# than assuming all seven are present.
DAY_ABBR = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
days_present = crime_df['DayOfWeek'].value_counts().sort_index().index  # NaN days are dropped by value_counts

plt.figure(figsize=(10, 6))
sns.countplot(data=crime_df, x='DayOfWeek', order=days_present, color='skyblue')
plt.title('Crime Frequency by Day of the Week')
plt.xlabel('Day of the Week')
plt.ylabel('Frequency')
plt.xticks(
    ticks=range(len(days_present)),
    labels=[DAY_ABBR[int(day)] for day in days_present],  # dayofweek is 0-based, Monday=0
)
plt.show()

In [None]:
# Bar chart of incident counts per hour of the day, in ascending hour order
hours_present = crime_df['Hour'].value_counts().sort_index().index
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(data=crime_df, x='Hour', order=hours_present, color='skyblue', ax=ax)
ax.set_title('Crime Frequency by Hour of the Day')
ax.set_xlabel('Hour of the Day')
ax.set_ylabel('Frequency')
plt.show()

### Chose a subset of categories for simplified analysis:

`Weapons Violation`

`Robbery`

`Narcotics`

In [None]:
# The three categories analysed in the rest of the notebook
crime_categories = ['WEAPONS VIOLATION', 'ROBBERY', 'NARCOTICS']

# Keep only the rows whose primary description is one of those categories
is_selected_category = crime_df[' PRIMARY DESCRIPTION'].isin(crime_categories)
filtered_crimes_df = crime_df[is_selected_category]
filtered_crimes_df.head()

#### Subsetting the Chicago Shape File

##### Cook County, which contains Chicago, has the county FIPS code 031.


#### Illinois Shape file will be provided.

`Downloaded from TIGER/Line Shape Files Website`

In [None]:
# Statewide shapefile
illinois_gdf = gpd.read_file("Illinois.shp")

# Keep the features of county FIPS 031.
# NOTE(review): 031 is the Cook County code, so the resulting outline is the county
# that contains Chicago rather than the city limits; the variable names say "chicago".
in_county_031 = illinois_gdf['COUNTYFP'] == '031'
chicago_gdf = illinois_gdf[in_county_031]

# Merge all selected features into one outline and project it to
# UTM Zone 16N (EPSG:32616), the CRS used for the crime points
chicago_boundary = chicago_gdf.dissolve().to_crs(epsg=32616)

### Convert crime data to geodataframe


In [None]:
# Only rows with both coordinates can be turned into points
filtered_crimes_df = filtered_crimes_df.dropna(subset=['LATITUDE', 'LONGITUDE'])

# Point geometries: x = longitude, y = latitude
geometry = gpd.points_from_xy(
    filtered_crimes_df['LONGITUDE'],
    filtered_crimes_df['LATITUDE'],
)

# Wrap as a GeoDataFrame in WGS84, then project to UTM Zone 16N (EPSG:32616)
# so that distances between points are expressed in metres
crime_gdf = gpd.GeoDataFrame(
    filtered_crimes_df,
    geometry=geometry,
    crs='EPSG:4326',
).to_crs(epsg=32616)

# Schema summary followed by a preview of the first rows
crime_gdf.info()
crime_gdf.head()

### Using `folium` in python for interactive mapping.



In [None]:
# Centre the view on the average incident location
center_lat, center_lon = (crime_gdf[column].mean() for column in ('LATITUDE', 'LONGITUDE'))

# Empty Folium base map at that centre
m = folium.Map(location=[center_lat, center_lon], zoom_start=10)

#### Weapons Mapped


In [None]:
# Subset to weapons-violation incidents
is_weapons = crime_gdf[' PRIMARY DESCRIPTION'] == 'WEAPONS VIOLATION'
weapons_crimes = crime_gdf[is_weapons]

# Base map centred on the average location of all selected incidents
center_lat = crime_gdf['LATITUDE'].mean()
center_lon = crime_gdf['LONGITUDE'].mean()
m = folium.Map(location=[center_lat, center_lon], zoom_start=10)

# Boundary outline layer
folium.GeoJson(chicago_boundary).add_to(m)

# One (lat, lon) pair per incident, rendered as client-side marker clusters
locations = list(zip(weapons_crimes['LATITUDE'], weapons_crimes['LONGITUDE']))
FastMarkerCluster(locations).add_to(m)

# Render the map as the cell output
m

#### Robbery Mapped

In [None]:
# Base map with the boundary outline (centre coordinates come from an earlier cell)
m_robbery = folium.Map(location=[center_lat, center_lon], zoom_start=10)
folium.GeoJson(chicago_boundary).add_to(m_robbery)

# Subset to robbery incidents
is_robbery = crime_gdf[' PRIMARY DESCRIPTION'] == 'ROBBERY'
robbery_crimes = crime_gdf[is_robbery]

# One (lat, lon) pair per incident, rendered as client-side marker clusters
locations = list(zip(robbery_crimes['LATITUDE'], robbery_crimes['LONGITUDE']))
FastMarkerCluster(locations).add_to(m_robbery)

# Render the map as the cell output
m_robbery

#### Narcotics Mapped


In [None]:
# Base map with the boundary outline (centre coordinates come from an earlier cell)
m_narcotics = folium.Map(location=[center_lat, center_lon], zoom_start=10)
folium.GeoJson(chicago_boundary).add_to(m_narcotics)

# Subset to narcotics incidents
is_narcotics = crime_gdf[' PRIMARY DESCRIPTION'] == 'NARCOTICS'
narcotics_crimes = crime_gdf[is_narcotics]

# One (lat, lon) pair per incident, rendered as client-side marker clusters
locations = list(zip(narcotics_crimes['LATITUDE'], narcotics_crimes['LONGITUDE']))
FastMarkerCluster(locations).add_to(m_narcotics)

# Render the map as the cell output
m_narcotics

## Task 2 : Spatial Clustering

### DBSCAN

In [None]:
# One independent GeoDataFrame per crime category.
# .copy() matters: later cells add cluster-label columns to these frames, and writing
# to a plain boolean-mask slice of crime_gdf triggers pandas' chained-assignment
# (SettingWithCopy) warning.
weapons_violation_gdf = crime_gdf[crime_gdf[' PRIMARY DESCRIPTION'] == 'WEAPONS VIOLATION'].copy()
robbery_gdf = crime_gdf[crime_gdf[' PRIMARY DESCRIPTION'] == 'ROBBERY'].copy()
narcotics_gdf = crime_gdf[crime_gdf[' PRIMARY DESCRIPTION'] == 'NARCOTICS'].copy()

display(weapons_violation_gdf.head())

In [None]:
from sklearn.neighbors import NearestNeighbors


def plot_k_distance(coords, k, crime_type):
    """Plot the sorted k-distance curve used to eyeball a DBSCAN ``eps``.

    Parameters
    ----------
    coords : array-like of shape (n_points, n_features)
        Point coordinates; distances are in the units of these coordinates.
    k : int
        Number of neighbours requested from the nearest-neighbour model.
    crime_type : str
        Label inserted into the plot title.
    """
    model = NearestNeighbors(n_neighbors=k).fit(coords)
    # The query set is the fitted set, so the last of the k returned distances per
    # point is taken from column k-1.
    distances, _ = model.kneighbors(coords)
    k_distance = np.sort(distances[:, k - 1])

    fig, ax = plt.subplots(figsize=(10, 6))
    ax.plot(k_distance)
    ax.set_xlabel('Points sorted by distance')
    ax.set_ylabel(f'{k}-th Nearest Neighbor Distance')
    ax.set_title(f'{k}-Distance Plot for {crime_type} Crimes (UTM Zone 16N)')
    ax.grid(True)
    plt.show()

# Projected (x, y) coordinates, one row per incident.
# GeoSeries.x / .y are vectorised accessors, so this avoids a Python-level lambda call
# per geometry and always yields a 2-D (n, 2) array.
weapons_violation_coords = np.column_stack([weapons_violation_gdf.geometry.x, weapons_violation_gdf.geometry.y])
robbery_coords = np.column_stack([robbery_gdf.geometry.x, robbery_gdf.geometry.y])
narcotics_coords = np.column_stack([narcotics_gdf.geometry.x, narcotics_gdf.geometry.y])

# Neighbour rank whose distance is plotted
k_value = 25

plot_k_distance(weapons_violation_coords, k_value, 'WEAPONS VIOLATION')
plot_k_distance(robbery_coords, k_value, 'ROBBERY')
plot_k_distance(narcotics_coords, k_value, 'NARCOTICS')

## You will need to play around with the numbers to determine the perfect values for clustering.

`eps` : epsilon is the neighborhood radius, currently set to 1500 meters. It defines the region around each point in which neighbors are counted; the larger the value, the more neighbors fall inside each point's region.

`min_samples` : The minimum number of points (the point itself included) that must lie within the `eps` region of a point for that point to be treated as a core point of a cluster.

In [None]:
# DBSCAN settings shared by all three categories; tune them here in one place.
DBSCAN_EPS_M = 1500       # neighbourhood radius, in metres (coordinates are UTM Zone 16N)
DBSCAN_MIN_SAMPLES = 50   # points required inside the radius for a core point

# Apply DBSCAN for Weapons Violation crimes
weapons_violation_coords = np.column_stack([weapons_violation_gdf.geometry.x, weapons_violation_gdf.geometry.y])
dbscan_weapons_violation = DBSCAN(eps=DBSCAN_EPS_M, min_samples=DBSCAN_MIN_SAMPLES)
weapons_violation_clusters = dbscan_weapons_violation.fit_predict(weapons_violation_coords)

# Apply DBSCAN for Robbery crimes
robbery_coords = np.column_stack([robbery_gdf.geometry.x, robbery_gdf.geometry.y])
dbscan_robbery = DBSCAN(eps=DBSCAN_EPS_M, min_samples=DBSCAN_MIN_SAMPLES)
robbery_clusters = dbscan_robbery.fit_predict(robbery_coords)

# Apply DBSCAN for Narcotics crimes
narcotics_coords = np.column_stack([narcotics_gdf.geometry.x, narcotics_gdf.geometry.y])
dbscan_narcotics = DBSCAN(eps=DBSCAN_EPS_M, min_samples=DBSCAN_MIN_SAMPLES)
narcotics_clusters = dbscan_narcotics.fit_predict(narcotics_coords)

# Store the labels next to the incidents (-1 marks noise points)
weapons_violation_gdf['dbscan_cluster'] = weapons_violation_clusters
robbery_gdf['dbscan_cluster'] = robbery_clusters
narcotics_gdf['dbscan_cluster'] = narcotics_clusters

In [None]:
def create_clustered_crime_map(gdf, clustering_method, crime_type, color_palette='jet'):
    """Build a Folium map of one crime category with markers coloured by cluster label.

    Parameters
    ----------
    gdf : GeoDataFrame
        Incidents to draw. Must contain 'LATITUDE', 'LONGITUDE' and a
        '<clustering_method>_cluster' column holding the cluster labels.
    clustering_method : str
        Prefix of the label column, e.g. 'dbscan' reads 'dbscan_cluster'.
    crime_type : str
        Text shown at the start of every marker tooltip.
    color_palette : str, default 'jet'
        Name of the matplotlib colormap sampled once per distinct label.

    Returns
    -------
    folium.Map
        Map holding the boundary layer, an HTML legend and one circle marker per row.
    """
    cluster_col = f'{clustering_method}_cluster'

    center_lat = gdf['LATITUDE'].mean()
    center_lon = gdf['LONGITUDE'].mean()
    m = folium.Map(location=[center_lat, center_lon], zoom_start=10)

    # Add the Chicago boundary to the map (module-level variable from an earlier cell)
    folium.GeoJson(chicago_boundary).add_to(m)

    unique_clusters = sorted(gdf[cluster_col].unique())
    # plt.get_cmap is used instead of plt.cm.get_cmap, which was deprecated in
    # matplotlib 3.7 and removed in 3.9. max(..., 1) keeps an empty frame from
    # requesting a zero-entry colormap.
    n_colors = max(len(unique_clusters), 1)
    colors = plt.get_cmap(color_palette, n_colors)(np.arange(len(unique_clusters)))
    color_map = {
        cluster: f'#{int(c[0]*255):02x}{int(c[1]*255):02x}{int(c[2]*255):02x}'
        for cluster, c in zip(unique_clusters, colors)
    }
    color_map[-1] = '#000000'  # noise points are always drawn in black

    # Fixed-position HTML legend with one entry per label
    legend_html = '<div style="position: fixed; bottom: 50px; left: 50px; width: 150px; z-index:9999; font-size:14px; background-color:white; border:2px solid grey; opacity:0.85;">&nbsp;<b>Clusters</b><br>'
    for cluster_label in unique_clusters:
        legend_label = 'Outliers' if cluster_label == -1 else f'Cluster {cluster_label}'
        legend_html += f'&nbsp;<i class="fa fa-circle fa-lg" style="color:{color_map[cluster_label]}"></i>&nbsp;{legend_label}<br>'
    legend_html += '</div>'
    m.get_root().html.add_child(folium.Element(legend_html))

    # Iterate the three needed columns directly; iterrows() would build a full Series
    # per incident just to read three values.
    for lat, lon, cluster_label in zip(gdf['LATITUDE'], gdf['LONGITUDE'], gdf[cluster_col]):
        color = color_map.get(cluster_label, '#808080')  # grey if the label is unknown
        tooltip_text = f"{crime_type} - Cluster: {cluster_label if cluster_label != -1 else 'Outlier'}"
        folium.CircleMarker(
            location=[lat, lon],
            radius=3,
            color=color,
            fill=True,
            fill_color=color,
            fill_opacity=0.6,
            tooltip=tooltip_text
        ).add_to(m)
    return m

# Build one DBSCAN-coloured map per crime category
map_weapons_violation_dbscan = create_clustered_crime_map(
    weapons_violation_gdf, 'dbscan', 'WEAPONS VIOLATION', color_palette='jet'
)
map_robbery_dbscan = create_clustered_crime_map(
    robbery_gdf, 'dbscan', 'ROBBERY', color_palette='jet'
)
map_narcotics_dbscan = create_clustered_crime_map(
    narcotics_gdf, 'dbscan', 'NARCOTICS', color_palette='jet'
)

# Show each map under its heading
print("WEAPONS VIOLATION - DBSCAN Clustering:")
display(map_weapons_violation_dbscan)

print("\nROBBERY - DBSCAN Clustering:")
display(map_robbery_dbscan)

print("\nNarcotics - DBSCAN Clustering:")
display(map_narcotics_dbscan)

## HDBSCAN


`min_cluster_size` is the minimum number of points a group must contain to be considered a cluster.

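`min_samples` is the number of neighbors used to estimate the density around a point, i.e. how many neighbors it needs to count as a *core* point; larger values make the clustering more conservative and label more points as noise.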

### HDBSCAN is independent of the distance i.e., it automatically determines the distance and the user does not need to put in that value manually. It is like an extension to DBSCAN.

In [None]:
# HDBSCAN settings read by the clustering cell that follows
size = 5  # passed as min_cluster_size
samples = 50  # passed as min_samples

In [None]:
# Every model takes min_cluster_size and min_samples from the `size` / `samples`
# variables, so changing those two values re-tunes all three categories at once.
# Note that fit() returns the fitted estimator itself; the labels are read from its
# .labels_ attribute in the next cell.

# Apply HDBSCAN for Weapons Violation crimes
weapons_violation_coords = np.column_stack([weapons_violation_gdf.geometry.x, weapons_violation_gdf.geometry.y])
hdbscan_weapons_violation = hdbscan.HDBSCAN(min_cluster_size=size, min_samples=samples)
weapons_violation_hdbscan_clusters = hdbscan_weapons_violation.fit(weapons_violation_coords)

# Apply HDBSCAN for Robbery crimes
robbery_coords = np.column_stack([robbery_gdf.geometry.x, robbery_gdf.geometry.y])
hdbscan_robbery = hdbscan.HDBSCAN(min_cluster_size=size, min_samples=samples)
robbery_hdbscan_clusters = hdbscan_robbery.fit(robbery_coords)

# Apply HDBSCAN for Narcotics crimes
narcotics_coords = np.column_stack([narcotics_gdf.geometry.x, narcotics_gdf.geometry.y])
hdbscan_narcotics = hdbscan.HDBSCAN(min_cluster_size=size, min_samples=samples)
narcotics_hdbscan_clusters = hdbscan_narcotics.fit(narcotics_coords)

In [None]:
# Attach each fitted model's labels to the frame it was fitted on
for category_gdf, fitted_model in (
    (weapons_violation_gdf, weapons_violation_hdbscan_clusters),
    (robbery_gdf, robbery_hdbscan_clusters),
    (narcotics_gdf, narcotics_hdbscan_clusters),
):
    category_gdf['hdbscan_cluster'] = fitted_model.labels_

### Visualize HDBSCAN results



In [None]:
def create_clustered_crime_map(gdf, clustering_method, crime_type, color_palette='viridis'):
    """Build a Folium map of one crime category with markers coloured by cluster label.

    Parameters
    ----------
    gdf : GeoDataFrame
        Incidents to draw. Must contain 'LATITUDE', 'LONGITUDE' and a
        '<clustering_method>_cluster' column holding the cluster labels.
    clustering_method : str
        Prefix of the label column, e.g. 'hdbscan' reads 'hdbscan_cluster'.
    crime_type : str
        Text shown at the start of every marker tooltip.
    color_palette : str, default 'viridis'
        Name of the matplotlib colormap sampled once per distinct label.

    Returns
    -------
    folium.Map
        Map holding the boundary layer, an HTML legend and one circle marker per row.
    """
    cluster_col = f'{clustering_method}_cluster'

    center_lat = gdf['LATITUDE'].mean()
    center_lon = gdf['LONGITUDE'].mean()
    m = folium.Map(location=[center_lat, center_lon], zoom_start=10)

    # Add the Chicago boundary to the map (module-level variable from an earlier cell)
    folium.GeoJson(chicago_boundary).add_to(m)

    unique_clusters = sorted(gdf[cluster_col].unique())
    # plt.get_cmap is used instead of plt.cm.get_cmap, which was deprecated in
    # matplotlib 3.7 and removed in 3.9. max(..., 1) keeps an empty frame from
    # requesting a zero-entry colormap.
    n_colors = max(len(unique_clusters), 1)
    colors = plt.get_cmap(color_palette, n_colors)(np.arange(len(unique_clusters)))
    color_map = {
        cluster: f'#{int(c[0]*255):02x}{int(c[1]*255):02x}{int(c[2]*255):02x}'
        for cluster, c in zip(unique_clusters, colors)
    }
    color_map[-1] = '#000000'  # noise points are always drawn in black

    # Fixed-position HTML legend with one entry per label
    legend_html = '<div style="position: fixed; bottom: 50px; left: 50px; width: 150px; z-index:9999; font-size:14px; background-color:white; border:2px solid grey; opacity:0.85;">&nbsp;<b>Clusters</b><br>'
    for cluster_label in unique_clusters:
        legend_label = 'Outliers' if cluster_label == -1 else f'Cluster {cluster_label}'
        legend_html += f'&nbsp;<i class="fa fa-circle fa-lg" style="color:{color_map[cluster_label]}"></i>&nbsp;{legend_label}<br>'
    legend_html += '</div>'
    m.get_root().html.add_child(folium.Element(legend_html))

    # Iterate the three needed columns directly; iterrows() would build a full Series
    # per incident just to read three values.
    for lat, lon, cluster_label in zip(gdf['LATITUDE'], gdf['LONGITUDE'], gdf[cluster_col]):
        color = color_map.get(cluster_label, '#808080')  # grey if the label is unknown
        tooltip_text = f"{crime_type} - Cluster: {cluster_label if cluster_label != -1 else 'Outlier'}"
        folium.CircleMarker(
            location=[lat, lon],
            radius=3,
            color=color,
            fill=True,
            fill_color=color,
            fill_opacity=0.6,
            tooltip=tooltip_text
        ).add_to(m)
    return m

# Build one HDBSCAN-coloured map per crime category
map_weapons_violation_hdbscan = create_clustered_crime_map(
    weapons_violation_gdf, 'hdbscan', 'WEAPONS VIOLATION', color_palette='jet'
)
map_robbery_hdbscan = create_clustered_crime_map(
    robbery_gdf, 'hdbscan', 'ROBBERY', color_palette='jet'
)
map_narcotics_hdbscan = create_clustered_crime_map(
    narcotics_gdf, 'hdbscan', 'NARCOTICS', color_palette='jet'
)

# Show each map under its heading
print("WEAPONS VIOLATION - HDBSCAN Clustering:")
display(map_weapons_violation_hdbscan)

print("\nROBBERY - HDBSCAN Clustering:")
display(map_robbery_hdbscan)

print("\nNarcotics - HDBSCAN Clustering:")
display(map_narcotics_hdbscan)

### Discussion of Clustering Methods Parameters

The effectiveness of both DBSCAN and HDBSCAN clustering algorithms is highly dependent on their parameters.

> For DBSCAN, the key parameters are:

>> `eps`: This is the maximum distance between two samples for one to be considered as in the neighborhood of the other. A smaller eps will result in more clusters and potentially more noise points, while a larger eps will lead to fewer, larger clusters.

>> `min_samples`: This is the number of samples in a neighborhood for a point to be considered as a core point. A higher min_samples requires more points to form a dense region, leading to fewer and potentially larger clusters, and more points being classified as noise.

> For HDBSCAN, the key parameters are:

>> `min_cluster_size`: This is the minimum number of samples in a group for that group to be considered a cluster. Smaller values will allow for the detection of smaller clusters.

>> `min_samples`: This parameter is similar to DBSCAN's min_samples but influences the density estimate. Higher values result in a more conservative clustering, with more points classified as noise.


Adjusting these parameters can significantly impact the clustering results. For example, in spatial clustering of crime data, a smaller `eps` in DBSCAN or `min_cluster_size` in HDBSCAN might identify very localized crime hotspots, while larger values could reveal broader patterns of crime concentration. The choice of parameters often requires experimentation and domain knowledge to find meaningful clusters.

## K-Means

`n_clusters` is the number of clusters the user asks K-Means to create. A suitable value can be read off the elbow-method plot.

In [None]:
# Number of clusters requested from K-Means for every category
n_clusters = 5

# Coordinates are built with the vectorised GeoSeries.x / .y accessors, which always
# give an (n, 2) array. The former "reshape if 1-D" fallback is no longer needed; it
# would only have produced single-feature input that does not represent x/y points.

# Apply K-Means for Weapons Violation crimes
weapons_violation_coords = np.column_stack([weapons_violation_gdf.geometry.x, weapons_violation_gdf.geometry.y])
kmeans_weapons_violation = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
weapons_violation_kmeans_clusters = kmeans_weapons_violation.fit_predict(weapons_violation_coords)

# Apply K-Means for Robbery crimes
robbery_coords = np.column_stack([robbery_gdf.geometry.x, robbery_gdf.geometry.y])
kmeans_robbery = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
robbery_kmeans_clusters = kmeans_robbery.fit_predict(robbery_coords)

# Apply K-Means for Narcotics crimes
narcotics_coords = np.column_stack([narcotics_gdf.geometry.x, narcotics_gdf.geometry.y])
kmeans_narcotics = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
narcotics_kmeans_clusters = kmeans_narcotics.fit_predict(narcotics_coords)

# Store the K-Means labels next to the incidents
weapons_violation_gdf['kmeans_cluster'] = weapons_violation_kmeans_clusters
robbery_gdf['kmeans_cluster'] = robbery_kmeans_clusters
narcotics_gdf['kmeans_cluster'] = narcotics_kmeans_clusters

In [None]:
# Elbow-method helper: inertia of K-Means fits for k = 1..10
def plot_elbow_method(coords, crime_type):
    """Fit K-Means for k = 1..10 on ``coords`` and plot inertia against k.

    Parameters
    ----------
    coords : array-like of shape (n_points, n_features)
        Coordinates to cluster.
    crime_type : str
        Label inserted into the plot title.
    """
    k_range = range(1, 11)  # Test k from 1 to 10

    # One fit per candidate k; inertia_ is the within-cluster sum of squared distances
    inertia = [
        KMeans(n_clusters=k, random_state=42, n_init=10).fit(coords).inertia_
        for k in k_range
    ]

    fig, ax = plt.subplots(figsize=(8, 5))
    ax.plot(k_range, inertia, marker='o')
    ax.set_title(f'Elbow Method for {crime_type} Crimes (UTM Zone 16N)')
    ax.set_xlabel('Number of Clusters (k)')
    ax.set_ylabel('Inertia (Sum of Squared Distances)')
    ax.set_xticks(k_range)
    ax.grid(True)
    plt.show()

# Apply the Elbow method for each crime category.
# The vectorised GeoSeries.x / .y accessors always give an (n, 2) array, so the former
# "reshape if 1-D" fallback is no longer needed.
weapons_violation_coords = np.column_stack([weapons_violation_gdf.geometry.x, weapons_violation_gdf.geometry.y])
robbery_coords = np.column_stack([robbery_gdf.geometry.x, robbery_gdf.geometry.y])
narcotics_coords = np.column_stack([narcotics_gdf.geometry.x, narcotics_gdf.geometry.y])

plot_elbow_method(weapons_violation_coords, 'WEAPONS VIOLATION')
plot_elbow_method(robbery_coords, 'ROBBERY')
plot_elbow_method(narcotics_coords, 'NARCOTICS')

In [None]:
# Create and display maps for each crime category and K-Means clustering
def create_clustered_crime_map(gdf, clustering_method, crime_type, color_palette='viridis'):
    """Build a Folium map of one crime category with markers coloured by cluster label.

    Parameters
    ----------
    gdf : GeoDataFrame
        Incidents to draw. Must contain 'LATITUDE', 'LONGITUDE' and a
        '<clustering_method>_cluster' column holding the cluster labels.
    clustering_method : str
        Prefix of the label column, e.g. 'kmeans' reads 'kmeans_cluster'.
    crime_type : str
        Text shown at the start of every marker tooltip.
    color_palette : str, default 'viridis'
        Name of the matplotlib colormap sampled once per distinct label.

    Returns
    -------
    folium.Map
        Map holding the boundary layer, an HTML legend and one circle marker per row.
    """
    cluster_col = f'{clustering_method}_cluster'

    center_lat = gdf['LATITUDE'].mean()
    center_lon = gdf['LONGITUDE'].mean()
    m = folium.Map(location=[center_lat, center_lon], zoom_start=10)

    # Add the Chicago boundary to the map (module-level variable from an earlier cell)
    folium.GeoJson(chicago_boundary).add_to(m)

    unique_clusters = sorted(gdf[cluster_col].unique())
    # plt.get_cmap is used instead of plt.cm.get_cmap, which was deprecated in
    # matplotlib 3.7 and removed in 3.9. max(..., 1) keeps an empty frame from
    # requesting a zero-entry colormap.
    n_colors = max(len(unique_clusters), 1)
    colors = plt.get_cmap(color_palette, n_colors)(np.arange(len(unique_clusters)))
    color_map = {
        cluster: f'#{int(c[0]*255):02x}{int(c[1]*255):02x}{int(c[2]*255):02x}'
        for cluster, c in zip(unique_clusters, colors)
    }
    color_map[-1] = '#000000'  # noise points are always drawn in black

    # Fixed-position HTML legend with one entry per label
    legend_html = '<div style="position: fixed; bottom: 50px; left: 50px; width: 150px; z-index:9999; font-size:14px; background-color:white; border:2px solid grey; opacity:0.85;">&nbsp;<b>Clusters</b><br>'
    for cluster_label in unique_clusters:
        legend_label = 'Outliers' if cluster_label == -1 else f'Cluster {cluster_label}'
        legend_html += f'&nbsp;<i class="fa fa-circle fa-lg" style="color:{color_map[cluster_label]}"></i>&nbsp;{legend_label}<br>'
    legend_html += '</div>'
    m.get_root().html.add_child(folium.Element(legend_html))

    # Iterate the three needed columns directly; iterrows() would build a full Series
    # per incident just to read three values.
    for lat, lon, cluster_label in zip(gdf['LATITUDE'], gdf['LONGITUDE'], gdf[cluster_col]):
        color = color_map.get(cluster_label, '#808080')  # grey if the label is unknown
        tooltip_text = f"{crime_type} - Cluster: {cluster_label if cluster_label != -1 else 'Outlier'}"
        folium.CircleMarker(
            location=[lat, lon],
            radius=3,
            color=color,
            fill=True,
            fill_color=color,
            fill_opacity=0.6,
            tooltip=tooltip_text
        ).add_to(m)
    return m


# Build one K-Means-coloured map per crime category
map_weapons_violation_kmeans = create_clustered_crime_map(
    weapons_violation_gdf, 'kmeans', 'WEAPONS VIOLATION', color_palette='viridis'
)
map_robbery_kmeans = create_clustered_crime_map(
    robbery_gdf, 'kmeans', 'ROBBERY', color_palette='viridis'
)
map_narcotics_kmeans = create_clustered_crime_map(
    narcotics_gdf, 'kmeans', 'NARCOTICS', color_palette='viridis'
)

# Show each map under its heading
print("WEAPONS VIOLATION - K-Means Clustering:")
display(map_weapons_violation_kmeans)

print("\nROBBERY - K-Means Clustering:")
display(map_robbery_kmeans)

print("\nNarcotics - K-Means Clustering:")
display(map_narcotics_kmeans)

### Discussion of Parameter Settings for Different Crime Categories

In this analysis, the same parameter settings (`eps=1500` and `min_samples=50` for DBSCAN, `min_cluster_size=5` and `min_samples=50` for HDBSCAN, and `n_clusters=5` for K-Means) were used across all three crime categories ('WEAPONS VIOLATION', 'ROBBERY', and 'NARCOTICS'). This was done to provide a consistent basis for comparison of the spatial clustering patterns between these crime types. However, it's important to note that the optimal parameter settings might differ for each crime category, as their spatial distributions can vary.

Choosing the most appropriate parameters often requires domain knowledge about the specific crime type and the local context. For example, a crime that tends to be very localized might require a smaller `eps` or `min_cluster_size` to identify meaningful clusters, while a more widespread crime might benefit from larger values. The k-distance plots and the Elbow method plots were used as visual aids to help inform the choice of parameters in the previous steps. Future analysis could involve tuning these parameters specifically for each crime type to potentially reveal more nuanced spatial patterns.

In [None]:
def _count_clusters(labels):
    """Return the number of distinct cluster labels, not counting the noise label -1."""
    return len(set(labels) - {-1})


def _cluster_sizes(labels, drop_noise=False):
    """Return points per cluster label (sorted by label), optionally without noise (-1)."""
    sizes = pd.Series(labels).value_counts().sort_index()
    return sizes.drop(-1, errors='ignore') if drop_noise else sizes


# Number of clusters (excluding noise, labeled as -1)
weapons_violation_hdbscan_n_clusters = _count_clusters(weapons_violation_hdbscan_clusters.labels_)
robbery_hdbscan_n_clusters = _count_clusters(robbery_hdbscan_clusters.labels_)
narcotics_hdbscan_n_clusters = _count_clusters(narcotics_hdbscan_clusters.labels_)

print(f"Number of HDBSCAN clusters (WEAPONS VIOLATION): {weapons_violation_hdbscan_n_clusters}")
print(f"Number of HDBSCAN clusters (ROBBERY): {robbery_hdbscan_n_clusters}")
print(f"Number of HDBSCAN clusters (Narcotics): {narcotics_hdbscan_n_clusters}")

# Cluster sizes (excluding noise)
weapons_violation_hdbscan_cluster_sizes = _cluster_sizes(weapons_violation_hdbscan_clusters.labels_, drop_noise=True)
robbery_hdbscan_cluster_sizes = _cluster_sizes(robbery_hdbscan_clusters.labels_, drop_noise=True)
narcotics_hdbscan_cluster_sizes = _cluster_sizes(narcotics_hdbscan_clusters.labels_, drop_noise=True)

print("\nHDBSCAN Cluster Sizes (WEAPONS VIOLATION):")
display(weapons_violation_hdbscan_cluster_sizes)

print("\nHDBSCAN Cluster Sizes (ROBBERY):")
display(robbery_hdbscan_cluster_sizes)

print("\nHDBSCAN Cluster Sizes (Narcotics):")
display(narcotics_hdbscan_cluster_sizes)

# K-Means cluster sizes (K-Means has no noise label, so nothing is dropped)
weapons_violation_kmeans_cluster_sizes = _cluster_sizes(weapons_violation_kmeans_clusters)
robbery_kmeans_cluster_sizes = _cluster_sizes(robbery_kmeans_clusters)
narcotics_kmeans_cluster_sizes = _cluster_sizes(narcotics_kmeans_clusters)

print("\nK-Means Cluster Sizes (WEAPONS VIOLATION):")
display(weapons_violation_kmeans_cluster_sizes)

print("\nK-Means Cluster Sizes (ROBBERY):")
display(robbery_kmeans_cluster_sizes)

print("\nK-Means Cluster Sizes (Narcotics):")
display(narcotics_kmeans_cluster_sizes)

# K-Means cluster centers
weapons_violation_kmeans_centers = kmeans_weapons_violation.cluster_centers_
robbery_kmeans_centers = kmeans_robbery.cluster_centers_
narcotics_kmeans_centers = kmeans_narcotics.cluster_centers_

# The models were fitted on projected (x, y) coordinates in UTM Zone 16N, so the
# centres are eastings/northings in metres, not latitude/longitude degrees.
CENTER_COLUMNS = ['EASTING_M', 'NORTHING_M']

print("\nK-Means Cluster Centers (WEAPONS VIOLATION):")
display(pd.DataFrame(weapons_violation_kmeans_centers, columns=CENTER_COLUMNS))

print("\nK-Means Cluster Centers (ROBBERY):")
display(pd.DataFrame(robbery_kmeans_centers, columns=CENTER_COLUMNS))

print("\nK-Means Cluster Centers (Narcotics):")
display(pd.DataFrame(narcotics_kmeans_centers, columns=CENTER_COLUMNS))

### Comparison of HDBSCAN and K-Means Clustering Results

Let's compare the results obtained from HDBSCAN and K-Means clustering for each of the three crime categories: 'WEAPONS VIOLATION', 'ROBBERY', and 'NARCOTICS'.

**WEAPONS VIOLATION:**

*   **Number of Clusters:** HDBSCAN identified 18 clusters (excluding noise), while K-Means found 5 clusters (as specified by the `n_clusters` parameter). HDBSCAN's ability to find variable-density clusters resulted in a higher number of smaller, more localized hotspots.
*   **Cluster Sizes:** HDBSCAN cluster sizes vary significantly (e.g., from 5 to 2073 points), reflecting the varying densities of weapons violation crime. K-Means cluster sizes are more evenly distributed due to the algorithm's nature of partitioning the data into a fixed number of groups of roughly equal variance (e.g., from 1074 to 2484 points).
*   **Cluster Centers:** While not directly comparable in number, the spatial distribution of the cluster centers from both methods would highlight areas of high concentration. K-Means centers represent the average location for larger, pre-defined clusters, while HDBSCAN centers would correspond to the centroids of the denser, variable-sized clusters.

**ROBBERY:**

*   **Number of Clusters:** HDBSCAN identified only 2 clusters for robbery, while K-Means found 5. This suggests that with the chosen parameters, HDBSCAN found fewer, larger dense regions for robbery compared to the other crime types, or a large portion of the robbery incidents were classified as noise.
*   **Cluster Sizes:** The HDBSCAN cluster sizes for robbery are very imbalanced, with one large cluster (7798 points) and one very small cluster (7 points), along with a substantial number of noise points. K-Means, again, provides more balanced cluster sizes (e.g., from 1017 to 1821 points).
*   **Cluster Centers:** The two HDBSCAN centers would represent the centroids of the two dense areas, while the five K-Means centers provide a different spatial partitioning based on minimizing within-cluster variance.

**NARCOTICS:**

*   **Number of Clusters:** HDBSCAN identified 22 clusters for narcotics, the highest number among the three crime types, suggesting a more fragmented or numerous set of dense hotspots. K-Means identified 5 clusters.
*   **Cluster Sizes:** Similar to weapons violation, HDBSCAN shows a wide range of cluster sizes for narcotics (e.g., from 5 to 904 points), indicating diverse sizes and densities of narcotics activity hotspots. K-Means provides more uniform cluster sizes (e.g., from 296 to 2081 points).
*   **Cluster Centers:** The numerous HDBSCAN centers would pinpoint many smaller, dense areas of narcotics crime, while the five K-Means centers would represent the average locations of larger, less granular clusters.

**Overall Comparison:**

HDBSCAN, being a density-based algorithm, is more effective at identifying arbitrarily shaped clusters and distinguishing noise. It reveals a varying number of clusters with diverse sizes and densities for each crime type, reflecting the potentially complex and localized nature of crime hotspots. K-Means, a centroid-based algorithm, partitions the data into a pre-determined number of clusters of roughly equal influence, providing a more generalized view of crime concentration areas.

The choice between HDBSCAN and K-Means depends on the specific goals of the analysis. If the aim is to identify dense, irregularly shaped hotspots and handle noise effectively, HDBSCAN is more suitable. If the objective is to partition the study area into a fixed number of regions based on crime concentration, K-Means can be a useful approach. The quantitative differences in the number and size of clusters highlight these fundamental differences between the two algorithms and their output.

## TASK 3 : Spatial Structure

### Each Category

#### DBSCAN

In [None]:
# Function to calculate and plot histogram of distances between cluster centers
def plot_cluster_distance_histogram(centers, crime_type, method='DBSCAN', bins=20):
    """Plot a histogram of the pairwise distances between cluster centers.

    Parameters
    ----------
    centers : array-like of shape (n_centers, n_dims)
        Cluster-center coordinates, one row per center. The axis label and
        title assume projected coordinates in meters (UTM Zone 16N); the
        function itself does not check or convert units.
    crime_type : str
        Crime category name interpolated into the plot title.
    method : str, default 'DBSCAN'
        Name of the clustering algorithm interpolated into the plot title.
    bins : int, default 20
        Number of histogram bins passed to ``plt.hist``.

    Returns
    -------
    None
        The figure is rendered with ``plt.show()``. If fewer than two
        centers are supplied, a message is printed and nothing is plotted.
    """
    # With zero or one center there are no pairwise distances: bail out
    # instead of failing inside euclidean_distances or drawing an empty plot.
    if len(centers) < 2:
        print(f'Fewer than two {method} cluster centers for {crime_type}; no distances to plot.')
        return

    distances = euclidean_distances(centers)

    # The distance matrix is symmetric with a zero diagonal, so the strict
    # upper triangle (k=1) contains every unordered pair exactly once.
    upper_triangle_distances = distances[np.triu_indices_from(distances, k=1)]

    plt.figure(figsize=(8, 5))
    plt.hist(upper_triangle_distances, bins=bins, edgecolor='black')
    plt.title(f'Distribution of Distances Between {method} Cluster Centers for {crime_type} Crimes (UTM Zone 16N)')
    plt.xlabel('Distance (meters)')
    plt.ylabel('Frequency')
    plt.grid(True)
    plt.show()

def _dbscan_cluster_centroids(gdf, label_col='dbscan_cluster', noise_label=-1):
    """Return an (n_clusters, 2) array of mean (x, y) coordinates per cluster.

    Rows labelled ``noise_label`` are dropped before grouping. DBSCAN and
    HDBSCAN mark points that belong to no cluster with -1; the mean of those
    scattered points is not a cluster center and would distort the
    center-to-center distance analysis if it were kept.
    """
    clustered = gdf[gdf[label_col] != noise_label]
    per_cluster_xy = clustered.groupby(label_col).geometry.apply(
        lambda pts: (pts.x.mean(), pts.y.mean())
    )
    # reshape keeps the result two-dimensional even when no cluster was found
    return np.array(list(per_cluster_xy), dtype=float).reshape(-1, 2)


# Centroids of the DBSCAN clusters (noise excluded) for each crime type
weapons_violation_dbscan_centers_geom = _dbscan_cluster_centroids(weapons_violation_gdf)
robbery_dbscan_centers_geom = _dbscan_cluster_centroids(robbery_gdf)
narcotics_dbscan_centers_geom = _dbscan_cluster_centroids(narcotics_gdf)

# Calculate and plot histograms for each crime type
plot_cluster_distance_histogram(weapons_violation_dbscan_centers_geom, 'WEAPONS VIOLATION')
plot_cluster_distance_histogram(robbery_dbscan_centers_geom, 'ROBBERY')
plot_cluster_distance_histogram(narcotics_dbscan_centers_geom, 'NARCOTICS')

#### K-means

In [None]:
# Function to calculate and plot histogram of distances between cluster centers
def plot_cluster_distance_histogram(centers, crime_type, method='K-Means', bins=20):
    """Plot a histogram of the pairwise distances between cluster centers.

    This definition replaces any earlier function of the same name in the
    kernel; pass ``method`` explicitly to title the plot for another
    clustering algorithm.

    Parameters
    ----------
    centers : array-like of shape (n_centers, n_dims)
        Cluster-center coordinates, one row per center. The axis label and
        title assume projected coordinates in meters (UTM Zone 16N); the
        function itself does not check or convert units.
    crime_type : str
        Crime category name interpolated into the plot title.
    method : str, default 'K-Means'
        Name of the clustering algorithm interpolated into the plot title.
    bins : int, default 20
        Number of histogram bins passed to ``plt.hist``.

    Returns
    -------
    None
        The figure is rendered with ``plt.show()``. If fewer than two
        centers are supplied, a message is printed and nothing is plotted.
    """
    # With zero or one center there are no pairwise distances: bail out
    # instead of failing inside euclidean_distances or drawing an empty plot.
    if len(centers) < 2:
        print(f'Fewer than two {method} cluster centers for {crime_type}; no distances to plot.')
        return

    distances = euclidean_distances(centers)

    # The distance matrix is symmetric with a zero diagonal, so the strict
    # upper triangle (k=1) contains every unordered pair exactly once.
    upper_triangle_distances = distances[np.triu_indices_from(distances, k=1)]

    plt.figure(figsize=(8, 5))
    plt.hist(upper_triangle_distances, bins=bins, edgecolor='black')
    plt.title(f'Distribution of Distances Between {method} Cluster Centers for {crime_type} Crimes (UTM Zone 16N)')
    plt.xlabel('Distance (meters)')
    plt.ylabel('Frequency')
    plt.grid(True)
    plt.show()

def _kmeans_cluster_centroids(gdf):
    """Return one (mean x, mean y) row per 'kmeans_cluster' label in ``gdf``."""
    per_cluster_xy = gdf.groupby('kmeans_cluster').geometry.apply(
        lambda pts: (pts.x.mean(), pts.y.mean())
    )
    return np.array(list(per_cluster_xy))


# Geometric centroids of the K-Means clusters for each crime type
weapons_violation_kmeans_centers_geom = _kmeans_cluster_centroids(weapons_violation_gdf)
robbery_kmeans_centers_geom = _kmeans_cluster_centroids(robbery_gdf)
narcotics_kmeans_centers_geom = _kmeans_cluster_centroids(narcotics_gdf)

# One distance histogram per crime type, drawn in a fixed order
for _crime_type, _kmeans_centers in (
    ('WEAPONS VIOLATION', weapons_violation_kmeans_centers_geom),
    ('ROBBERY', robbery_kmeans_centers_geom),
    ('NARCOTICS', narcotics_kmeans_centers_geom),
):
    plot_cluster_distance_histogram(_kmeans_centers, _crime_type)

### All Categories Combined

In [None]:
# Stack the K-Means centers of the three crime categories into a single array.
# To analyse the DBSCAN centers instead, stack the three
# *_dbscan_centers_geom arrays here.
all_centers_geom = np.vstack([
    weapons_violation_kmeans_centers_geom,
    robbery_kmeans_centers_geom,
    narcotics_kmeans_centers_geom,
])

# Symmetric matrix of distances between every pair of stacked centers
# (pairs within the same category are included).
all_distances_matrix = euclidean_distances(all_centers_geom)

# Keep each unordered pair once: the strict upper triangle (k=1) skips the
# zero diagonal and the mirrored lower half.
all_distances = all_distances_matrix[np.triu_indices_from(all_distances_matrix, k=1)]

# Histogram of the combined center-to-center distances
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(all_distances, bins=30, edgecolor='black')
ax.set_title('Distribution of Distances Between All Cluster Centers Across Crime Categories (UTM Zone 16N)')
ax.set_xlabel('Distance (meters)')
ax.set_ylabel('Frequency')
ax.grid(True)
plt.show()

### Interpretation of Spatial Structure

In [None]:
# Cross-category comparison of K-Means centers. Each matrix has one row per
# center of the first category and one column per center of the second; the
# minimum and mean are taken over all entries of the matrix.

# Weapons Violation vs. Robbery
weapons_violation_robbery_distances = euclidean_distances(
    weapons_violation_kmeans_centers_geom,
    robbery_kmeans_centers_geom,
)
min_weapons_violation_robbery_distance = weapons_violation_robbery_distances.min()
mean_weapons_violation_robbery_distance = weapons_violation_robbery_distances.mean()

# Weapons Violation vs. Narcotics
weapons_violation_narcotics_distances = euclidean_distances(
    weapons_violation_kmeans_centers_geom,
    narcotics_kmeans_centers_geom,
)
min_weapons_violation_narcotics_distance = weapons_violation_narcotics_distances.min()
mean_weapons_violation_narcotics_distance = weapons_violation_narcotics_distances.mean()

# Robbery vs. Narcotics
robbery_narcotics_distances = euclidean_distances(
    robbery_kmeans_centers_geom,
    narcotics_kmeans_centers_geom,
)
min_robbery_narcotics_distance = robbery_narcotics_distances.min()
mean_robbery_narcotics_distance = robbery_narcotics_distances.mean()

# Full distance matrices, one table per category pair
print("Pairwise Distances between Weapons Violation and Robbery K-Means Cluster Centers:")
display(pd.DataFrame(weapons_violation_robbery_distances))

print("\nPairwise Distances between Weapons Violation and Narcotics K-Means Cluster Centers:")
display(pd.DataFrame(weapons_violation_narcotics_distances))

print("\nPairwise Distances between Robbery and Narcotics K-Means Cluster Centers:")
display(pd.DataFrame(robbery_narcotics_distances))

# Closest pair of centers for each category pair
print(f"\nMinimum distance between Weapons Violation and Robbery cluster centers: {min_weapons_violation_robbery_distance} meters")
print(f"Minimum distance between Weapons Violation and Narcotics cluster centers: {min_weapons_violation_narcotics_distance} meters")
print(f"Minimum distance between Robbery and Narcotics cluster centers: {min_robbery_narcotics_distance} meters")

# Average separation over all center pairs for each category pair
print(f"\nMean distance between Weapons Violation and Robbery cluster centers: {mean_weapons_violation_robbery_distance} meters")
print(f"Mean distance between Weapons Violation and Narcotics cluster centers: {mean_weapons_violation_narcotics_distance} meters")
print(f"Mean distance between Robbery and Narcotics cluster centers: {mean_robbery_narcotics_distance} meters")