In [1]:
import calendar 
from datetime import datetime
import dask.dataframe as dd
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
plt.style.use('seaborn')
sns.set_style('whitegrid')

In [None]:
# read the crimes parquet dataset as a dask DataFrame indexed by 'Date'
data_file_name = '../data/crimes-2017-7-4.parq'
crimes = dd.read_parquet(
    data_file_name,
    index='Date',
)

In [None]:
%%time
# log records count and load data partitions
print("{:,} total records in {} partitions".format(len(crimes), crimes.npartitions))
print("DataFrame size: {:,}".format(crimes.size.compute()))

In [None]:
# display the dask DataFrame object itself to check its structure
crimes

In [None]:
# preview the first rows of the crimes data (indexed by 'Date')
crimes.head()

In [None]:
# get crime types: keep only the 'PrimaryType' column
# (double brackets, so the result is still a DataFrame rather than a Series)
crime_types = crimes[['PrimaryType']]

In [None]:
# preview the first rows of the PrimaryType-only frame
crime_types.head()

In [None]:
# count reports per primary crime type; .compute() materialises the
# dask result as an in-memory Series indexed by PrimaryType
crime_type_total = (
    crime_types
    .groupby('PrimaryType')
    .size()
    .compute()
)
print(crime_type_total)

In [None]:
# sum the Arrest and Domestic flags per primary crime type
crime_type_counts = (
    crimes[['PrimaryType', 'Arrest', 'Domestic']]
    .groupby('PrimaryType')
    .sum()
    .compute()
)
print(crime_type_counts)

In [None]:
# add crime type totals column
# (both objects are indexed by PrimaryType, so the assignment aligns on it;
# note this mutates crime_type_counts in place)
crime_type_counts['Total'] = crime_type_total

In [None]:
# show the per-type counts again, now including the 'Total' column
print(crime_type_counts)

In [None]:
# plot crimes by primary type total ascending
# figsize goes to .plot() directly: DataFrame.plot opens its own figure when
# no `ax` is given, so a preceding plt.figure(figsize=...) would be left
# blank and its size would never reach the chart.
crime_type_counts.sort_values(by='Total', ascending=True).plot(kind='barh', figsize=(8, 10))
plt.title('Number of Crimes by Type')
plt.ylabel('Crime Type')
plt.xlabel('Number of Crimes')
plt.tight_layout()
plt.show()

In [None]:
# horizontal bar chart of the per-type counts, ordered by arrests (smallest first)
ax = crime_type_counts.sort_values('Arrest').plot.barh()
ax.set_title('Arrests')
ax.set_ylabel('Crime Type')
ax.set_xlabel('Number of Crimes')
plt.tight_layout()
plt.show()

In [None]:
# horizontal bar chart of the per-type counts, ordered by domestic reports (smallest first)
ax = crime_type_counts.sort_values('Domestic').plot.barh()
ax.set(title='Domestic', ylabel='Crime Type', xlabel='Number of Crimes')
plt.tight_layout()
plt.show()

In [None]:
# top crimes: types with at least 1,000 reports, smallest first
(
    crime_type_counts[crime_type_counts['Total'] >= 1000]
    .sort_values(by='Total', ascending=True)
    .plot(kind='barh')
)
# the title states '>=' to match the filter above
plt.title('Top Crimes (>= 1,000 reports)')
plt.ylabel('Crime Type')
plt.xlabel('Number of Crimes')
plt.tight_layout()
# plt.show() ends the cell so no stray Text(...) repr is displayed
plt.show()

In [None]:
# crime types with fewer than 1,000 reports, smallest first
(
    crime_type_counts[crime_type_counts['Total'] < 1000]
    .sort_values(by='Total', ascending=True)
    .plot(kind='barh')
)
plt.title('Less than 1,000 Crime reports')
# axis labels and layout match the other per-type charts in this notebook
plt.ylabel('Crime Type')
plt.xlabel('Number of Crimes')
plt.tight_layout()
# plt.show() ends the cell so no stray Text(...) repr is displayed
plt.show()

In [None]:
# crime types with fewer than 100 reports, smallest first
(
    crime_type_counts[crime_type_counts['Total'] < 100]
    .sort_values(by='Total', ascending=True)
    .plot(kind='barh')
)
plt.title('Less than 100 Crime reports')
# axis labels and layout match the other per-type charts in this notebook
plt.ylabel('Crime Type')
plt.xlabel('Number of Crimes')
plt.tight_layout()
# plt.show() ends the cell so no stray Text(...) repr is displayed
plt.show()

In [None]:
# count reports per location description, largest first, as a Series named 'Total'
crime_locations = (
    crimes
    .groupby('LocationDescription')
    .size()
    .compute()
    .sort_values(ascending=False)
    .rename('Total')
)
crime_locations.head()

In [None]:
# plot top 30 locations
# crime_locations is already sorted descending, so the first 30 entries are
# the largest; they are re-sorted ascending so the biggest bar is drawn on top
crime_locations[:30].sort_values(ascending=True).plot(kind='barh')
plt.title('Top 30 Locations')
plt.ylabel('Location')
plt.xlabel('Number of Crimes')
plt.tight_layout()
# plt.show() ends the cell so no stray Text(...) repr is displayed
plt.show()

In [None]:
# sum the Arrest and Domestic flags per location description, then attach
# the per-location report totals (aligned on the LocationDescription index)
crime_location_counts = (
    crimes[['LocationDescription', 'Arrest', 'Domestic']]
    .groupby('LocationDescription')
    .sum()
    .compute()
    .assign(Total=crime_locations)
)
crime_location_counts.head()

In [None]:
# plot top crime locations: those with at least 2,500 reports, smallest first
(
    crime_location_counts[crime_location_counts['Total'] >= 2500]
    .sort_values(by='Total', ascending=True)
    .plot(kind='barh')
)
# the title states '>=' to match the filter above
plt.title('Top Crime Locations (>= 2,500 Crime Reports)')
plt.ylabel('Location')
plt.xlabel('Number of Crimes')
plt.tight_layout()
# plt.show() ends the cell so no stray Text(...) repr is displayed
plt.show()

In [None]:
# plot next 20 top crime locations
# The filtered frame gets its own name so crime_location_counts is left
# intact; overwriting it would change the result of re-running earlier cells.
# NOTE(review): this '<= 3000' range overlaps the '>= 2500' chart, so
# locations between 2,500 and 3,000 appear in both - confirm this is intended.
next_crime_locations = (
    crime_location_counts[crime_location_counts['Total'] <= 3000]
    .sort_values(by='Total', ascending=False)
)
next_crime_locations[:20].sort_values(by='Total', ascending=True).plot(kind='barh')
# the title states '<=' to match the filter above
plt.title('Next Top 20 Crime Locations (<= 3,000 Crime Reports)')
plt.ylabel('Location')
plt.xlabel('Number of Crimes')
plt.tight_layout()
# plt.show() ends the cell so no stray Text(...) repr is displayed
plt.show()

In [None]:
# inspect the frame's index (set to 'Date' when the parquet file was read)
crimes.index

In [None]:
# keep only the 'Arrest' values of records where an arrest was made
arrests = crimes['Arrest'][crimes['Arrest'] == True]
arrests.head()

In [None]:
# total arrests per calendar month, drawn as a line chart
monthly_arrests = (
    arrests
    .resample('M')
    .sum()
    .compute()
)
ax = monthly_arrests.plot()
ax.set_title('Monthly Arrests')
plt.show()

In [None]:
# total arrests per week, drawn as a line chart
weekly_arrests = (
    arrests
    .resample('W')
    .sum()
    .compute()
)
ax = weekly_arrests.plot()
ax.set_title('Weekly Arrests')
plt.show()

In [None]:
# total arrests per day, drawn as a line chart
daily_arrests = (
    arrests
    .resample('D')
    .sum()
    .compute()
)
ax = daily_arrests.plot()
ax.set_title('Daily Arrests')
plt.show()

In [None]:
# keep only the 'Domestic' values of records flagged as domestic incidents
domestic = crimes['Domestic'][crimes['Domestic'] == True]
domestic.head()

In [None]:
# total domestic incident reports per day, drawn as a line chart
daily_domestic = (
    domestic
    .resample('D')
    .sum()
    .compute()
)
ax = daily_domestic.plot()
ax.set_title('Daily Domestic')
plt.show()