In [1]:
import pandas as pd
# Show up to 100 rows when a DataFrame/Series is displayed
pd.options.display.max_rows = 100

from datetime import datetime

# Display all outputs
# ("all" renders the value of every expression statement in a cell, not only the last one)
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
# Dimension columns
# One list of column names per dimension (date, location, crime).

date_dim_cols = [
    'day_of_week',
    'month',
    'year',
    'weekend',
    'holiday',
    'holiday_name',
]  # and much more

locations_dim_cols = [
    'name',
    'longitude',
    'latitude',
    'neighborhood',
    'city',
    'crime_rate',
]  # and neighborhood stats

crime_dim_cols = [
    'report_time',
    'start_time',
    'end_time',
    'details',
    'type',
    'category',
    'severity_index',
]

## Denver Crime data

In [3]:
# Folder (relative path) that holds the Denver CSV inputs
denver_data_path = '../data/denver_crime_data/'

# Read the Denver crime records into a DataFrame
denver_crimes_csv = denver_data_path + 'crime_denver.csv'
denver_crimes = pd.read_csv(denver_crimes_csv)

### Null

In [4]:
# Print dtypes and non-null counts per column to spot columns with missing values
denver_crimes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 462640 entries, 0 to 462639
Data columns (total 19 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   INCIDENT_ID             462640 non-null  int64  
 1   OFFENSE_ID              462640 non-null  int64  
 2   OFFENSE_CODE            462640 non-null  int64  
 3   OFFENSE_CODE_EXTENSION  462640 non-null  int64  
 4   OFFENSE_TYPE_ID         462640 non-null  object 
 5   OFFENSE_CATEGORY_ID     462640 non-null  object 
 6   FIRST_OCCURRENCE_DATE   462640 non-null  object 
 7   LAST_OCCURRENCE_DATE    145740 non-null  object 
 8   REPORTED_DATE           462640 non-null  object 
 9   INCIDENT_ADDRESS        418765 non-null  object 
 10  GEO_X                   458649 non-null  float64
 11  GEO_Y                   458649 non-null  float64
 12  GEO_LON                 458649 non-null  float64
 13  GEO_LAT                 458649 non-null  float64
 14  DISTRICT_ID         

In [5]:
# Percentage of missing values per column
denver_crimes.isna().mean().mul(100)

INCIDENT_ID                0.000000
OFFENSE_ID                 0.000000
OFFENSE_CODE               0.000000
OFFENSE_CODE_EXTENSION     0.000000
OFFENSE_TYPE_ID            0.000000
OFFENSE_CATEGORY_ID        0.000000
FIRST_OCCURRENCE_DATE      0.000000
LAST_OCCURRENCE_DATE      68.498184
REPORTED_DATE              0.000000
INCIDENT_ADDRESS           9.483616
GEO_X                      0.862658
GEO_Y                      0.862658
GEO_LON                    0.862658
GEO_LAT                    0.862658
DISTRICT_ID                0.000000
PRECINCT_ID                0.000000
NEIGHBORHOOD_ID            0.000000
IS_CRIME                   0.000000
IS_TRAFFIC                 0.000000
dtype: float64

For now, we'll leave the null values as is  

- Incident address has 9.5% missing values, which is a moderate share, so we'll keep these rows. We will leave them as null, since there is no way to impute them and it doesn't make sense to put a default value, or potentially label them as 'MISSING'
- Last occurrence date could be null either because the crime happened instantly and didn't last long OR because the last occurrence date is missing. Because there is no information to tell these cases apart and a significant 68.5% of the values are null, we will keep the column and leave it as is
- X and Y will not be included in the dimension so it will be ignored
- Missing Long and Lat only make up 0.86% of the samples. We will impute them using the means of the Long/Lat of their corresponding neighborhood

In [6]:
# Impute Long and Lat using means of corresponding neighborhood coordinates

coordinate_cols = ['GEO_LON', 'GEO_LAT']

# Mean of the known (non-null) coordinates of each neighborhood, indexed by NEIGHBORHOOD_ID
mean_coordinates = denver_crimes.groupby("NEIGHBORHOOD_ID")[coordinate_cols].mean()

# Look up the neighborhood mean for every crime. join(..., on=...) is a left join that
# keeps the index of denver_crimes, so the result lines up row-for-row with the frame.
# (merge() with both `on` and `right_index=True` is rejected by current pandas.)
neighborhood_means = denver_crimes[['NEIGHBORHOOD_ID']].join(mean_coordinates,
                                                              on = 'NEIGHBORHOOD_ID')[coordinate_cols]

# Fill only the missing coordinates; every other column and every known coordinate is left
# untouched. A neighborhood without any known coordinates stays null instead of being
# dropped, and re-running the cell changes nothing.
denver_crimes[coordinate_cols] = denver_crimes[coordinate_cols].fillna(neighborhood_means)

In [7]:
# Percentage of missing values per column
denver_crimes.isna().mean().mul(100)

INCIDENT_ID                0.000000
OFFENSE_ID                 0.000000
OFFENSE_CODE               0.000000
OFFENSE_CODE_EXTENSION     0.000000
OFFENSE_TYPE_ID            0.000000
OFFENSE_CATEGORY_ID        0.000000
FIRST_OCCURRENCE_DATE      0.000000
LAST_OCCURRENCE_DATE      68.498184
REPORTED_DATE              0.000000
INCIDENT_ADDRESS           9.483616
GEO_X                      0.862658
GEO_Y                      0.862658
GEO_LON                    0.000000
GEO_LAT                    0.000000
DISTRICT_ID                0.000000
PRECINCT_ID                0.000000
NEIGHBORHOOD_ID            0.000000
IS_CRIME                   0.000000
IS_TRAFFIC                 0.000000
dtype: float64

### Dimensions

In [29]:
# Convert to datetime
# Each of the three date columns is parsed with pd.DatetimeIndex and written back in place.

denver_date_columns = ('REPORTED_DATE', 'FIRST_OCCURRENCE_DATE', 'LAST_OCCURRENCE_DATE')

for date_column in denver_date_columns:
    denver_crimes[date_column] = pd.DatetimeIndex(denver_crimes[date_column])

For the date, we'll base it on the first occurrence date, i.e. when the crime happened

In [55]:
# Set date attributes, all derived from FIRST_OCCURRENCE_DATE
# WEEKDAY is 1-based: 1 = Monday ... 7 = Sunday (.dt.weekday is 0-based, hence the + 1)
# Months and days start at 1

first_occurrence = denver_crimes['FIRST_OCCURRENCE_DATE']

denver_crimes['DAY'] = first_occurrence.dt.day
denver_crimes['MONTH'] = first_occurrence.dt.month
denver_crimes['YEAR'] = first_occurrence.dt.year
denver_crimes['DAY_OF_YEAR'] = first_occurrence.dt.dayofyear
# .dt.weekofyear was deprecated in pandas 1.1 and removed in 2.0; isocalendar().week gives
# the same ISO week number (as a nullable UInt32 column rather than int64)
denver_crimes['WEEK_OF_YEAR'] = first_occurrence.dt.isocalendar().week
denver_crimes['WEEKDAY'] = first_occurrence.dt.weekday + 1
denver_crimes['QUARTER'] = first_occurrence.dt.quarter
denver_crimes['IS_MONTH_START'] = first_occurrence.dt.is_month_start
denver_crimes['IS_MONTH_END'] = first_occurrence.dt.is_month_end
denver_crimes['IS_YEAR_START'] = first_occurrence.dt.is_year_start
denver_crimes['IS_YEAR_END'] = first_occurrence.dt.is_year_end

In [46]:
# Add city
# Every row of this dataset gets the same constant city label.

denver_crimes.loc[:, 'City'] = 'Denver'

In [53]:
# Add offense type and category names

# The code/extension pair identifies an offense; the two name columns are what gets added
offense_key_cols = ['OFFENSE_CODE', 'OFFENSE_CODE_EXTENSION']
offense_name_cols = ['OFFENSE_TYPE_NAME', 'OFFENSE_CATEGORY_NAME']

offense_codes = pd.read_csv(denver_data_path + 'offense_codes.csv')
offense_codes = offense_codes[offense_key_cols + offense_name_cols]

# merge() defaults to an inner join: crimes without a matching code pair are dropped
denver_crimes = denver_crimes.merge(offense_codes, on = offense_key_cols)

#### Still need to add neighborhood stats and crime severity index

maybe capitalize neighborhood names as well

# Vancouver Crime data

In [22]:
# Folder (relative path) that holds the Vancouver CSV inputs and outputs
vancouver_data_path = '../data/vancouver_crime_data/'

# Read the Vancouver crime records into a DataFrame
vancouver_crimes_csv = vancouver_data_path + 'crimedata_csv_all_years.csv'
vancouver_crimes = pd.read_csv(vancouver_crimes_csv)

### Convert X Y to Longitude Latitude

Using https://webapp.geod.nrcan.gc.ca/geod/tools-outils/trx.php

In [116]:
# Set format for tool
# X/Y are renamed to the utm_e/utm_n headers and two constant columns are added.
tool_column_names = {'X': 'utm_e', 'Y': 'utm_n'}
vancouver_crimes.rename(columns = tool_column_names, inplace = True)
vancouver_crimes['utm_z'] = 'utm10'
vancouver_crimes['height'] = 0
# Keep 'height' ahead of 'utm_z' in the written file, as before
vancouver_crimes['utm_z'] = vancouver_crimes.pop('utm_z')

# Save
prep_csv = vancouver_data_path + 'crime_data_prep.csv'
vancouver_crimes.to_csv(prep_csv, index = False)

Upload the crime_data_prep file to the site with settings UTM/MTM/Stereo UTM10 to Geographic. 

In [30]:
# Add longitude and latitude and reformat
geographic_data = pd.read_csv(vancouver_data_path + 'TRX_2020-02-18_52723.csv')

# The converted coordinates are attached by row index below, so the tool output must
# contain exactly one row per crime record; otherwise coordinates land on the wrong rows.
assert len(geographic_data) == len(vancouver_crimes), (
    f"Converted file has {len(geographic_data)} rows, expected {len(vancouver_crimes)}"
)

# Undo the temporary tool formatting. errors='ignore' keeps the cell re-runnable after
# the helper columns have already been dropped (rename already ignores missing names).
vancouver_crimes.rename(columns = {'utm_e': 'X', 'utm_n': 'Y'}, inplace = True)
vancouver_crimes.drop(['height', 'utm_z'], axis = 1, inplace = True, errors = 'ignore')
vancouver_crimes[['Latitude', 'Longitude']]= geographic_data[['lat', 'lon']]

# Save 
vancouver_crimes.to_csv(vancouver_data_path + 'crime_data.csv', index = False)

### Null values

In [222]:
# Load saved data
# Re-read the CSV that already contains the Latitude/Longitude columns.
saved_crimes_csv = vancouver_data_path + 'crime_data.csv'
vancouver_crimes = pd.read_csv(saved_crimes_csv)

In [223]:
# Print dtypes and non-null counts per column to spot columns with missing values
vancouver_crimes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 634278 entries, 0 to 634277
Data columns (total 12 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   TYPE           634278 non-null  object 
 1   YEAR           634278 non-null  int64  
 2   MONTH          634278 non-null  int64  
 3   DAY            634278 non-null  int64  
 4   HOUR           634278 non-null  int64  
 5   MINUTE         634278 non-null  int64  
 6   HUNDRED_BLOCK  634265 non-null  object 
 7   NEIGHBOURHOOD  568455 non-null  object 
 8   X              634157 non-null  float64
 9   Y              634157 non-null  float64
 10  Latitude       634278 non-null  float64
 11  Longitude      634278 non-null  float64
dtypes: float64(4), int64(5), object(3)
memory usage: 58.1+ MB


Location variables for HUNDRED_BLOCK = 'OFFSET TO PROTECT PRIVACY' should be all null.  
"Coordinates data for records with “Offset to Protect Privacy” was not disclosed to provide privacy protection"

In [128]:
# Ensure privacy protected is null
# Blank out all four location columns on rows flagged as privacy-offset.
privacy_protected = vancouver_crimes['HUNDRED_BLOCK'] == 'OFFSET TO PROTECT PRIVACY'
location_columns = ['X', 'Y', 'Latitude', 'Longitude']
vancouver_crimes.loc[privacy_protected, location_columns] = [None] * len(location_columns)

In [129]:
# Show the rows flagged as privacy-offset
is_privacy_offset = vancouver_crimes['HUNDRED_BLOCK'] == 'OFFSET TO PROTECT PRIVACY'
vancouver_crimes[is_privacy_offset]

Unnamed: 0,TYPE,YEAR,MONTH,DAY,HOUR,MINUTE,HUNDRED_BLOCK,NEIGHBOURHOOD,X,Y,Latitude,Longitude
106755,Homicide,2018,12,22,0,0,OFFSET TO PROTECT PRIVACY,,,,,
106756,Homicide,2017,2,1,0,0,OFFSET TO PROTECT PRIVACY,,,,,
106757,Homicide,2017,7,9,0,0,OFFSET TO PROTECT PRIVACY,,,,,
106758,Homicide,2016,9,28,0,0,OFFSET TO PROTECT PRIVACY,,,,,
106759,Homicide,2017,10,13,0,0,OFFSET TO PROTECT PRIVACY,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
255614,Offence Against a Person,2018,4,4,0,0,OFFSET TO PROTECT PRIVACY,,,,,
255615,Offence Against a Person,2018,4,30,0,0,OFFSET TO PROTECT PRIVACY,,,,,
255616,Offence Against a Person,2018,9,21,0,0,OFFSET TO PROTECT PRIVACY,,,,,
255617,Offence Against a Person,2018,11,15,0,0,OFFSET TO PROTECT PRIVACY,,,,,


In [87]:
# Show the rows where X is missing (NaN is the only value not equal to itself)
vancouver_crimes[vancouver_crimes['X'].isna()]

Unnamed: 0,TYPE,YEAR,MONTH,DAY,HOUR,MINUTE,HUNDRED_BLOCK,NEIGHBOURHOOD,X,Y,Latitude,Longitude
608515,Vehicle Collision or Pedestrian Struck (with F...,2005,6,7,0,47,CAMBIE ST BRIDGE,,,,49.219054,-123.116669
608627,Vehicle Collision or Pedestrian Struck (with F...,2003,8,1,17,26,S E MARINE DR/ KNIGHT ST,,,,49.219054,-123.116669
608678,Vehicle Collision or Pedestrian Struck (with I...,2003,6,22,18,58,0X TERMINAL AV / QUEBEC ST,,,,49.219054,-123.116669
608681,Vehicle Collision or Pedestrian Struck (with I...,2003,2,1,0,30,10XX BALFOUR AV / 3800 OAK ST,,,,49.219054,-123.116669
609562,Vehicle Collision or Pedestrian Struck (with I...,2004,11,7,18,24,13XX PACIFIC BLVD / 198 DRAKE ST,,,,49.219054,-123.116669
...,...,...,...,...,...,...,...,...,...,...,...,...
633824,Vehicle Collision or Pedestrian Struck (with I...,2003,4,27,12,57,WEST HASTINGS ST AT SEYMOUR ST,,,,49.219054,-123.116669
633825,Vehicle Collision or Pedestrian Struck (with I...,2007,10,31,15,50,WEST KING EDWARD AVE / ONTARIO ST,,,,49.219054,-123.116669
633826,Vehicle Collision or Pedestrian Struck (with I...,2003,6,2,17,0,WESTVIEW OVERPASS AT #1 HWY EASTBOUND,,,,49.219054,-123.116669
633843,Vehicle Collision or Pedestrian Struck (with I...,2003,2,21,20,57,WILLOW ST / W 41 AV,,,,49.219054,-123.116669


In [225]:
# Summary statistics of the numeric columns for the privacy-offset rows
offset_rows = vancouver_crimes['HUNDRED_BLOCK'] == "OFFSET TO PROTECT PRIVACY"
vancouver_crimes[offset_rows].describe()

Unnamed: 0,YEAR,MONTH,DAY,HOUR,MINUTE,X,Y,Latitude,Longitude
count,63159.0,63159.0,63159.0,63159.0,63159.0,63159.0,63159.0,63159.0,63159.0
mean,2010.696734,6.496113,15.487373,0.0,0.0,0.0,0.0,0.0,-127.4887
std,4.851167,3.423779,8.957918,0.0,0.0,0.0,0.0,0.0,1.184484e-10
min,2003.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,-127.4887
25%,2007.0,4.0,8.0,0.0,0.0,0.0,0.0,0.0,-127.4887
50%,2010.0,7.0,15.0,0.0,0.0,0.0,0.0,0.0,-127.4887
75%,2015.0,9.0,23.0,0.0,0.0,0.0,0.0,0.0,-127.4887
max,2020.0,12.0,31.0,0.0,0.0,0.0,0.0,0.0,-127.4887
