In [1]:
# Imports
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
import requests
import os

In [2]:
# Quarterly Lethbridge Police Service incident reports, oldest first.
report_paths = [
    'Data/Oct_Dec_2022_Lethbridge_Police_Service_report.csv',
    'Data/Jan_Mar_2023_Lethbridge_Police_Service_report.csv',
    'Data/Apr_Jun_2023_Lethbridge_Police_Service_report.csv',
]
df_1, df_2, df_3 = (pd.read_csv(path) for path in report_paths)

# Stack the three quarters into one frame and discard fully identical rows.
# NOTE(review): concat keeps each file's own row labels, so index values repeat
# across quarters; pass ignore_index=True if unique labels are needed later.
df = pd.concat([df_1, df_2, df_3]).drop_duplicates()
df.head()

Unnamed: 0,ccn,date,updateDate,city,state,postalCode,blocksizedAddress,incidentType,parentIncidentType,narrative
0,CA23018227,"12/25/2022, 12:00:00 AM","08/03/2023, 1:23:51 PM",LETHBRIDGE,AB,.,1600 Block ST GEORGE RD,RMS] Assault,Assault,Assault
1,CA23016523,"12/01/2022, 9:04:00 AM","08/03/2023, 8:23:45 AM",LETHBRIDGE,AB,T1J 5J4,100 Block GOLDENROD RD,RMS] Fraud - Identity Fraud,Other,Fraud - Identity Fraud
2,RM23054146,"10/21/2022, 4:00:00 PM","07/22/2023, 8:39:56 AM",LETHBRIDGE,AB,.,100 Block FAIRMONT BD,RMS] Proceeds Of Crime Criminal Cod,Other,Proceeds Of Crime Criminal Cod
3,CA23008687,"11/01/2022, 7:00:00 AM","07/06/2023, 9:00:30 AM",LETHBRIDGE,AB,.,1 Block BLACKFOOT CI,RMS] Sexual Assault,Sexual Offense,Sexual Assault
4,RM22058254,"12/16/2022, 1:00:00 PM","07/06/2023, 9:00:20 AM",LETHBRIDGE,AB,.,100 Block 1 AV,RMS] Assist Others,Other,Assist Others


In [3]:
# Summarise every column (count / unique / top / freq) to get a first feel for the data
df.describe()

Unnamed: 0,ccn,date,updateDate,city,state,postalCode,blocksizedAddress,incidentType,parentIncidentType,narrative
count,68538,68538,68538,68538,68538,68538,68526,68538,68538,68538
unique,68538,65515,14349,31,3,226,2990,536,13,289
top,CA23018227,"02/01/2023, 3:00:00 PM","11/24/2022, 1:24:20 AM",LETHBRIDGE,AB,.,UNKNOWN STREET,PUBLIC SERVICE,Other,PUBLIC SERVICE
freq,1,17,60,68451,68535,67657,3691,5772,37446,5772


In [4]:
# Check column dtypes and non-null counts to spot missing values
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 68538 entries, 0 to 24689
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   ccn                 68538 non-null  object
 1   date                68538 non-null  object
 2   updateDate          68538 non-null  object
 3   city                68538 non-null  object
 4   state               68538 non-null  object
 5   postalCode          68538 non-null  object
 6   blocksizedAddress   68526 non-null  object
 7   incidentType        68538 non-null  object
 8   parentIncidentType  68538 non-null  object
 9   narrative           68538 non-null  object
dtypes: object(10)
memory usage: 5.8+ MB


There are some null values in the 'blocksizedAddress' column (12 of 68,538 rows); let's remove those rows.

In [5]:
# Discard incidents that have no recorded block address
required_columns = ['blocksizedAddress']
df = df.dropna(axis='index', subset=required_columns)

Let's look at the values in the 'parentIncidentType' column

In [6]:
# Count incidents per parent category (value_counts sorts most frequent first)
df['parentIncidentType'].value_counts()

parentIncidentType
Other                  37435
Quality of Life         9761
Proactive Policing      9412
Traffic                 4154
Theft                   3198
Assault                 1665
Theft from Vehicle      1437
Property Crime           681
Sexual Offense           447
Theft of Vehicle         244
Robbery                   80
Breaking & Entering       10
Homicide                   2
Name: count, dtype: int64

Let's remove all 'Proactive Policing' and 'Traffic' incidents, as we only want crimes reported by the public. 'Proactive Policing' and 'Traffic' incidents are usually only registered when witnessed by police. There may be 'Traffic' incidents that are reported by the public, but we'll assume that this is a very small number.

In [7]:
# Categories that are normally logged by police themselves rather than reported by the public
police_initiated_types = ['Proactive Policing', 'Traffic']
is_police_initiated = df['parentIncidentType'].isin(police_initiated_types)

# Keep everything that is not police-initiated, then re-check the summary
df = df[~is_police_initiated]
df.describe()

Unnamed: 0,ccn,date,updateDate,city,state,postalCode,blocksizedAddress,incidentType,parentIncidentType,narrative
count,54960,54960,54960,54960,54960,54960,54960,54960,54960,54960
unique,54960,52355,13406,30,3,109,2518,502,11,277
top,CA23018227,"04/02/2023, 8:30:00 AM","11/24/2022, 1:24:18 AM",LETHBRIDGE,AB,.,100 Block 1 AV,PUBLIC SERVICE,Other,PUBLIC SERVICE
freq,1,17,54,54894,54957,54615,2193,5772,37435,5772


We don't need the 'updateDate', 'incidentType' or 'postalCode' columns, so let's drop them.

In [8]:
# Remove the columns that play no part in the rest of the analysis
unused_columns = ['updateDate', 'incidentType', 'postalCode']
df = df.drop(columns=unused_columns)

Most of the values in the 'blocksizedAddress' column refer to a city block, which is what we want. However, some refer to a street corner and others are 'UNKNOWN STREET'. Let's drop those for the sake of simplicity.

In [10]:
# Keep only rows whose address contains the word 'Block' (i.e. block-level addresses)
is_block_address = df['blocksizedAddress'].str.contains('Block')
df = df.loc[is_block_address]

Now we'll make a new column for the full address

In [11]:
# Build a geocodable address string for every incident, e.g.
# '1600 Block ST GEORGE RD' -> '1600 ST GEORGE RD, LETHBRIDGE, AB, Canada'.
# regex=False makes it explicit that 'Block ' is removed as a literal substring.
street_address = df['blocksizedAddress'].str.replace('Block ', '', regex=False)

# assign() returns a new frame instead of writing a column into the frame that
# came out of the boolean filter above, which avoids pandas' SettingWithCopyWarning.
df = df.assign(address=street_address + ', ' + df['city'] + ', ' + df['state'] + ', Canada')
df[['address']]

Unnamed: 0,address
0,"1600 ST GEORGE RD, LETHBRIDGE, AB, Canada"
1,"100 GOLDENROD RD, LETHBRIDGE, AB, Canada"
2,"100 FAIRMONT BD, LETHBRIDGE, AB, Canada"
3,"1 BLACKFOOT CI, LETHBRIDGE, AB, Canada"
4,"100 1 AV, LETHBRIDGE, AB, Canada"
...,...
24682,"300 8 ST, LETHBRIDGE, AB, Canada"
24683,"1 LAFAYETTE BD, LETHBRIDGE, AB, Canada"
24687,"800 2A AV, LETHBRIDGE, AB, Canada"
24688,"100 LAVAL BD, LETHBRIDGE, AB, Canada"


The longitude and latitude of each address were found in the 'Google Geocoding API' notebook. We'll import a dataframe of all unique addresses with their coordinates and incident count here.

In [98]:
# Load the pre-geocoded address table (path is relative to the working directory)
df_address = pd.read_csv('df_address.csv')
# Display it to confirm the address, lat, lng and incidentCount columns loaded
df_address

Unnamed: 0,address,lat,lng,incidentCount
0,"100 1 AV, LETHBRIDGE, AB, Canada",49.697816,-112.840058,2193
1,"800 2A AV, LETHBRIDGE, AB, Canada",49.701727,-112.831368,1361
2,"800 5 AV, LETHBRIDGE, AB, Canada",49.692594,-112.834855,925
3,"500 1 AV, LETHBRIDGE, AB, Canada",49.697940,-112.840158,864
4,"600 8 ST, LETHBRIDGE, AB, Canada",49.690751,-112.835657,832
...,...,...,...,...
2275,"2400 11 AV, LETHBRIDGE, AB, Canada",49.681863,-112.807164,1
2276,"2400 17 AV, LETHBRIDGE, AB, Canada",49.675228,-112.806886,1
2277,"2400 2A AV, LETHBRIDGE, AB, Canada",49.701727,-112.831368,1
2278,"2400 5 AV, LETHBRIDGE, AB, Canada",49.693458,-112.807515,1


# Creating Block-Hour DataFrame

Now we need to make a dataframe where each row is a unique 'block-hour'