In [2]:
# libraries and modules
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import folium
from sklearn.cluster import DBSCAN, KMeans
from sklearn.metrics import silhouette_score, davies_bouldin_score, adjusted_rand_score, homogeneity_completeness_v_measure
from yellowbrick.cluster import SilhouetteVisualizer
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import MinMaxScaler
from folium.plugins import HeatMap
from kneed import KneeLocator

In [3]:
# Read the clustered dog-bite table from the processed-data folder.
# NOTE(review): the preview below shows an 'Unnamed: 0' column, which looks
# like a saved row index -- consider index_col=0 if nothing relies on it.
CLUSTERED_CSV = '../data/processed/dog_bite_clustered.csv'
clustered = pd.read_csv(CLUSTERED_CSV)

# Preview the first five rows
clustered.head(n=5)

Unnamed: 0,date_of_bite,year,month,day,day_of_week,borough,zip_code,latitude,longitude,spay_neuter,...,german_shepherd,shih_tzu,chihuahua,yorkshire_terrier,bull_dog,labrador_retriever,maltese,husky,standard_poodle,cluster
0,2018-01-01,2018,1,1,0,brooklyn,11220,40.641026,-74.016688,False,...,False,False,False,False,False,False,False,False,False,0
1,2018-01-06,2018,1,6,5,brooklyn,11224,40.577372,-73.988706,False,...,False,False,False,False,False,False,False,False,False,0
2,2018-01-08,2018,1,8,0,brooklyn,11231,40.677916,-74.005154,False,...,False,False,False,False,False,False,False,False,False,0
3,2018-01-09,2018,1,9,1,brooklyn,11224,40.577372,-73.988706,False,...,False,False,False,False,False,False,False,False,False,0
4,2018-01-03,2018,1,3,2,brooklyn,11231,40.677916,-74.005154,False,...,False,False,False,False,False,False,False,False,False,0


In [None]:
# Proportion of each spay_neuter value among the bites of every cluster.
# value_counts(normalize=True) gives within-cluster shares and unstack()
# pivots the spay_neuter values into columns.
spay_neuter_cluster = (
    clustered.groupby('cluster')['spay_neuter']
    .value_counts(normalize=True)
    .unstack()
    # Pin both outcomes, in a fixed order, so the positional legend labels
    # below cannot end up on the wrong column when one outcome is absent.
    .reindex(columns=[False, True])
    .fillna(0)
)

# Stacked bar chart: one bar per cluster, split by spay/neuter status.
# Drawing on an explicit axes keeps the figure size and the labels tied to
# the same figure that pandas plots into.
fig, ax = plt.subplots(figsize=(10, 6))
spay_neuter_cluster.plot(kind='bar', stacked=True, ax=ax)
ax.set_title('Proportion of Spay/Neuter by Cluster')
ax.set_xlabel('Cluster')
ax.set_ylabel('Proportion')
ax.legend(title='Spay/Neuter', labels=['Not Spayed/Neutered', 'Spayed/Neutered'])
plt.show()

In [None]:
# Within-cluster proportions of spay_neuter; only the index of this table
# (the cluster labels) is used to drive the loop below.
spay_neuter_cluster = clustered.groupby('cluster')['spay_neuter'].value_counts(normalize=True).unstack().fillna(0)

# One stacked bar chart per cluster, with one bar per borough
for cluster in spay_neuter_cluster.index:
    cluster_data = clustered[clustered['cluster'] == cluster]
    spay_neuter_borough = (
        cluster_data.groupby('borough')['spay_neuter']
        .value_counts(normalize=True)
        .unstack()
        # A single cluster may contain only one spay_neuter value. Pin both
        # columns in a fixed order so the positional legend labels below
        # always describe the right bars.
        .reindex(columns=[False, True])
        .fillna(0)
    )

    # DataFrame.plot opens its own figure unless it is handed an axes, so a
    # preceding plt.figure() call would only leave a blank figure behind.
    fig, ax = plt.subplots(figsize=(10, 6))
    spay_neuter_borough.plot(kind='bar', stacked=True, ax=ax)
    ax.set_title(f'Proportion of Spay/Neuter by Borough for Cluster {cluster}')
    ax.set_xlabel('Borough')
    ax.set_ylabel('Proportion of Spay/Neuter')
    ax.set_ylim(0, 1)
    ax.legend(title='Spay/Neuter', labels=['Not Spayed/Neutered', 'Spayed/Neutered'])
    plt.show()
    # Release the figure so a long loop does not accumulate open figures
    plt.close(fig)

In [None]:
# Breed indicator columns used in the breed analyses
dog_breeds = [
    'mixed/other',
    'pit_bull',
    'german_shepherd',
    'shih_tzu',
    'chihuahua',
    'yorkshire_terrier',
    'bull_dog',
    'labrador_retriever',
    'maltese',
    'husky',
    'standard_poodle',
]

# Per-cluster mean of every breed column (a proportion when the flags are boolean)
breed_cluster = clustered[['cluster'] + dog_breeds].groupby('cluster').mean()

# Stacked bars: one bar per cluster, one segment per breed
fig, ax = plt.subplots(figsize=(12, 8))
breed_cluster.plot(kind='bar', stacked=True, ax=ax)
ax.set_title('Proportion of Dog Breeds by Cluster')
ax.set_xlabel('Cluster')
ax.set_ylabel('Proportion')
# Anchor the legend outside the axes so it does not cover the bars
ax.legend(title='Dog Breed', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()

In [None]:
# Rank the breeds inside each cluster by their proportion; with
# ascending=False the largest proportion receives rank 1.
breed_ranking = breed_cluster.rank(axis='columns', ascending=False)

# Show the ranking table
breed_ranking

In [None]:
# One stacked bar chart per cluster: breed proportions for each borough
for cluster in breed_cluster.index:
    cluster_data = clustered[clustered['cluster'] == cluster]
    # Mean of each breed column per borough, restricted to this cluster
    breed_borough = cluster_data.groupby('borough')[dog_breeds].mean()

    # DataFrame.plot opens its own figure unless it is handed an axes; a bare
    # plt.figure(figsize=...) beforehand would be left blank and its size
    # would never reach the real chart. Create the axes explicitly instead.
    fig, ax = plt.subplots(figsize=(12, 8))
    breed_borough.plot(kind='bar', stacked=True, ax=ax)
    ax.set_title(f'Proportion of Dog Breeds by Borough for Cluster {cluster}')
    ax.set_xlabel('Borough')
    ax.set_ylabel('Proportion')
    # Anchor the legend outside the axes so it does not cover the bars
    ax.legend(title='Dog Breed', bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.show()
    # Release the figure so a long loop does not accumulate open figures
    plt.close(fig)


In [None]:
def _count_and_plot_bites(period_col, period_label):
    """Count bites per (cluster, period) and show them as grouped bars.

    Parameters
    ----------
    period_col : str
        Column of `clustered` that holds the time period.
    period_label : str
        Human-readable period name used in the x-axis label and the title.

    Returns
    -------
    pandas.DataFrame
        One row per (cluster, period) pair with the number of rows in
        `clustered` for that pair stored in 'bite_count'.
    """
    counts = (
        clustered.groupby(['cluster', period_col])
        .size()
        .reset_index(name='bite_count')
    )

    plt.figure(figsize=(12, 6))
    sns.barplot(data=counts, x=period_col, y='bite_count', hue='cluster')
    plt.title(f'Number of Dog Bites per Cluster per {period_label}')
    plt.xlabel(period_label)
    plt.ylabel('Bite Count')
    plt.legend(title='Cluster')
    plt.show()
    return counts


# Bite counts by year, by month and by day of the week (one chart each)
yearly_bites = _count_and_plot_bites('year', 'Year')
monthly_bites = _count_and_plot_bites('month', 'Month')
day_of_week_bites = _count_and_plot_bites('day_of_week', 'Day of the Week')

In [None]:
def _min_max_scale(counts):
    """Rescale a Series of counts to the [0, 1] range (min-max scaling).

    Parameters
    ----------
    counts : pandas.Series
        Bite counts belonging to a single cluster.

    Returns
    -------
    pandas.Series
        (counts - min) / (max - min), aligned on the index of `counts`.
        When every value is identical the range is zero and the ratio would
        be 0/0 = NaN, so a Series of 0.0 is returned instead; this follows
        the convention scikit-learn's MinMaxScaler uses for constant
        features.
    """
    lowest, highest = counts.min(), counts.max()
    if highest == lowest:
        return pd.Series(0.0, index=counts.index)
    return (counts - lowest) / (highest - lowest)


# Scale bite_count within each cluster so clusters of different sizes can be
# compared on the same axis. The column is named 'standardized', but the
# transformation is a min-max rescale, not a z-score.
for bites in (yearly_bites, monthly_bites, day_of_week_bites):
    bites['standardized_bite_count'] = bites.groupby('cluster')['bite_count'].transform(_min_max_scale)

# One grouped bar chart per time granularity
for bites, period_col, period_label in (
    (yearly_bites, 'year', 'Year'),
    (monthly_bites, 'month', 'Month'),
    (day_of_week_bites, 'day_of_week', 'Day of the Week'),
):
    plt.figure(figsize=(12, 6))
    sns.barplot(data=bites, x=period_col, y='standardized_bite_count', hue='cluster')
    plt.title(f'Standardized Number of Dog Bites per Cluster per {period_label}')
    plt.xlabel(period_label)
    plt.ylabel('Standardized Bite Count')
    plt.legend(title='Cluster')
    plt.show()

In [None]:
# For every time granularity, draw one bar chart of raw bite counts per
# cluster: all yearly charts first, then monthly, then day-of-week.
for bites, period_col, period_label in (
    (yearly_bites, 'year', 'Year'),
    (monthly_bites, 'month', 'Month'),
    (day_of_week_bites, 'day_of_week', 'Day of the Week'),
):
    # sort=False walks the clusters in order of first appearance
    for cluster, cluster_data in bites.groupby('cluster', sort=False):
        plt.figure(figsize=(12, 6))
        sns.barplot(data=cluster_data, x=period_col, y='bite_count')
        plt.title(f'Number of Dog Bites per {period_label} for Cluster {cluster}')
        plt.xlabel(period_label)
        plt.ylabel('Bite Count')
        plt.show()