In [43]:
import pandas as pd
pd.options.display.max_rows = 100
pd.options.display.max_columns = 0

from datetime import datetime
from datetime import time

# Display all outputs
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

#### Still need to add neighborhood stats and crime severity index and holiday and crime rate. Use astral to get nighttime

# Background

Currently there is crime data for Denver and Vancouver but they are in different formats. The goal is to preprocess Denver and Vancouver crime data so they are in a consistent format and then combine them together

The data can be obtained from 
- Denver -  https://www.denvergov.org/opendata/dataset/city-and-county-of-denver-crime
- Vancouver https://geodash.vpd.ca/opendata/

# Denver Crime data

In [44]:
denver_data_path = '../data/denver_crime_data/'

denver_crimes = pd.read_csv(denver_data_path + 'crime_denver.csv')

## Null

In [45]:
denver_crimes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 462640 entries, 0 to 462639
Data columns (total 19 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   INCIDENT_ID             462640 non-null  int64  
 1   OFFENSE_ID              462640 non-null  int64  
 2   OFFENSE_CODE            462640 non-null  int64  
 3   OFFENSE_CODE_EXTENSION  462640 non-null  int64  
 4   OFFENSE_TYPE_ID         462640 non-null  object 
 5   OFFENSE_CATEGORY_ID     462640 non-null  object 
 6   FIRST_OCCURRENCE_DATE   462640 non-null  object 
 7   LAST_OCCURRENCE_DATE    145740 non-null  object 
 8   REPORTED_DATE           462640 non-null  object 
 9   INCIDENT_ADDRESS        418765 non-null  object 
 10  GEO_X                   458649 non-null  float64
 11  GEO_Y                   458649 non-null  float64
 12  GEO_LON                 458649 non-null  float64
 13  GEO_LAT                 458649 non-null  float64
 14  DISTRICT_ID         

In [46]:
denver_crimes.isnull().mean() * 100

INCIDENT_ID                0.000000
OFFENSE_ID                 0.000000
OFFENSE_CODE               0.000000
OFFENSE_CODE_EXTENSION     0.000000
OFFENSE_TYPE_ID            0.000000
OFFENSE_CATEGORY_ID        0.000000
FIRST_OCCURRENCE_DATE      0.000000
LAST_OCCURRENCE_DATE      68.498184
REPORTED_DATE              0.000000
INCIDENT_ADDRESS           9.483616
GEO_X                      0.862658
GEO_Y                      0.862658
GEO_LON                    0.862658
GEO_LAT                    0.862658
DISTRICT_ID                0.000000
PRECINCT_ID                0.000000
NEIGHBORHOOD_ID            0.000000
IS_CRIME                   0.000000
IS_TRAFFIC                 0.000000
dtype: float64

For now, we'll leave the null values as is  

- Incident address has 9.5% missing values which is decent so we'll keep these rows. We will leave them as null since no way to impute and doens't make sense to put a default value or potentially label them as 'MISSING'
- Last occurence date could either be because crime happens instantly and doesn't last long OR the last occurence date is missing. Because there is no information and there's a significant 68.5% of nulls, we will keep it and leave it as is
- X and Y will not be included in the dimension so it will be ignored
- Missing Long and Lat only make up 0.86% of the samples. We will impute them using the means of the Long/Lat of their corresponding neighborhood

In [47]:
# Impute Long and Lat using means of corresponding neighborhood coordinates

mean_coordinates = denver_crimes.groupby("NEIGHBORHOOD_ID")[['GEO_LON', 'GEO_LAT']].mean()
        
joined_coordinates = denver_crimes.drop(['GEO_LON', 'GEO_LAT'], axis=1).merge(mean_coordinates, 
                                                                             how = 'inner',
                                                                             on = 'NEIGHBORHOOD_ID', 
                                                                             right_index = True)

index = denver_crimes['GEO_LON'] != denver_crimes['GEO_LON']
denver_crimes.loc[index] = joined_coordinates.loc[index]

In [48]:
denver_crimes.isnull().mean() * 100

INCIDENT_ID                0.000000
OFFENSE_ID                 0.000000
OFFENSE_CODE               0.000000
OFFENSE_CODE_EXTENSION     0.000000
OFFENSE_TYPE_ID            0.000000
OFFENSE_CATEGORY_ID        0.000000
FIRST_OCCURRENCE_DATE      0.000000
LAST_OCCURRENCE_DATE      68.498184
REPORTED_DATE              0.000000
INCIDENT_ADDRESS           9.483616
GEO_X                      0.862658
GEO_Y                      0.862658
GEO_LON                    0.000000
GEO_LAT                    0.000000
DISTRICT_ID                0.000000
PRECINCT_ID                0.000000
NEIGHBORHOOD_ID            0.000000
IS_CRIME                   0.000000
IS_TRAFFIC                 0.000000
dtype: float64

## Dimensions

Prep columns for creating dimensions later

### Date

In [49]:
pd.to_datetime(denver_crimes.REPORTED_DATE, format = "%m/%d/%Y %H:%M:%S %p").dt.day

0         15
1         29
2         26
3         30
4         23
          ..
462635     2
462636     4
462637    28
462638    17
462639    27
Name: REPORTED_DATE, Length: 462640, dtype: int64

In [50]:
# Convert to datetime

fmt = "%m/%d/%Y %H:%M:%S %p"

denver_crimes['REPORTED_DATE'] = pd.to_datetime(denver_crimes['REPORTED_DATE'], format = fmt)
denver_crimes['FIRST_OCCURRENCE_DATE'] = pd.to_datetime(denver_crimes['FIRST_OCCURRENCE_DATE'], format = fmt)
denver_crimes['LAST_OCCURRENCE_DATE'] = pd.to_datetime(denver_crimes['LAST_OCCURRENCE_DATE'], format = fmt)

# denver_crimes['REPORTED_DATE'] = pd.DatetimeIndex(denver_crimes['REPORTED_DATE'], dayfirst = False)
# denver_crimes['FIRST_OCCURRENCE_DATE'] = pd.DatetimeIndex(denver_crimes['FIRST_OCCURRENCE_DATE'], dayfirst = False)
# denver_crimes['LAST_OCCURRENCE_DATE'] = pd.DatetimeIndex(denver_crimes['LAST_OCCURRENCE_DATE'], dayfirst = False)

For date, we'll set it to be based on the First occurence date of when the crime happened

In [51]:
# Set date attributes
# Day of week starts at 0 - Monday
# Months and days start at 1

# denver_crimes['DAY'] = denver_crimes['FIRST_OCCURRENCE_DATE'].dt.day
# denver_crimes['MONTH'] = denver_crimes['FIRST_OCCURRENCE_DATE'].dt.month
# denver_crimes['YEAR'] = denver_crimes['FIRST_OCCURRENCE_DATE'].dt.year
# denver_crimes['DAY_OF_YEAR'] = denver_crimes['FIRST_OCCURRENCE_DATE'].dt.dayofyear
# denver_crimes['WEEK_OF_YEAR'] = denver_crimes['FIRST_OCCURRENCE_DATE'].dt.weekofyear
# denver_crimes['WEEKDAY'] = denver_crimes['FIRST_OCCURRENCE_DATE'].dt.weekday + 1
# denver_crimes['QUARTER'] = denver_crimes['FIRST_OCCURRENCE_DATE'].dt.quarter
# denver_crimes['IS_MONTH_START'] = denver_crimes['FIRST_OCCURRENCE_DATE'].dt.is_month_start
# denver_crimes['IS_MONTH_END'] = denver_crimes['FIRST_OCCURRENCE_DATE'].dt.is_month_end
# denver_crimes['IS_YEAR_START'] = denver_crimes['FIRST_OCCURRENCE_DATE'].dt.is_year_start
# denver_crimes['IS_YEAR_END'] = denver_crimes['FIRST_OCCURRENCE_DATE'].dt.is_year_end


# Set times
denver_crimes['FIRST_OCCURRENCE_TIME'] = denver_crimes.FIRST_OCCURRENCE_DATE.dt.time
denver_crimes['REPORTED_TIME'] = denver_crimes.REPORTED_DATE.dt.time
denver_crimes['LAST_OCCURRENCE_TIME'] = denver_crimes.LAST_OCCURRENCE_DATE.dt.time
denver_crimes['FIRST_OCCURRENCE_DATE'] = denver_crimes.FIRST_OCCURRENCE_DATE.dt.date

# Set missing Last occurence dates to None instead of NaT
denver_crimes.loc[denver_crimes.LAST_OCCURRENCE_TIME.isnull(), 'LAST_OCCURRENCE_TIME'] = None

### Location

In [52]:
# Add city

denver_crimes['CITY'] = 'Denver'

In [53]:
# Get neighborhood name from neighborhood id - Just convert kebab case to upper case

def capitalize_title(id: str) -> str:
    '''
    Capitalizes kebab case id
    '''
    words = id.split('-')
    words = [word.capitalize() for word in words]
    title = ' '.join(words)
    return title 


neighborhood_ids = list(set(denver_crimes.NEIGHBORHOOD_ID))
neighborhood_names = [capitalize_title(id) for id in neighborhood_ids]
neighborhood_mappings = pd.DataFrame({'NEIGHBORHOOD_ID': neighborhood_ids, 'NEIGHBORHOOD_NAME': neighborhood_names})

# Set Cbd to CBD and Dia to DIA
neighborhood_mappings.loc[neighborhood_mappings.NEIGHBORHOOD_ID == 'dia', 'NEIGHBORHOOD_NAME'] = 'DIA'
neighborhood_mappings.loc[neighborhood_mappings.NEIGHBORHOOD_ID == 'cbd', 'NEIGHBORHOOD_NAME'] = 'CBD'

# Join to get neighbourhood names
denver_crimes = denver_crimes.merge(neighborhood_mappings, how = 'left', on = 'NEIGHBORHOOD_ID')

### Crime

In [54]:
# Add offense type and category names 

offense_codes = pd.read_csv(denver_data_path + 'offense_codes.csv')[['OFFENSE_CODE', 'OFFENSE_CODE_EXTENSION', 'OFFENSE_TYPE_NAME', 'OFFENSE_CATEGORY_NAME']]

denver_crimes = denver_crimes.merge(offense_codes, on = ['OFFENSE_CODE', 'OFFENSE_CODE_EXTENSION'])

Add Violent/Non-Violent category based on offense category

In [55]:
# Mapping of category to violent/non-violent

violent_mappings = [
    ('Aggravated Assault', True),
    ('All Other Crimes', False),
    ('Arson', False),
    ('Auto Theft', False),
    ('Burglary', False),
    ('Drug & Alcohol', False),
    ('Larceny', False),
    ('Murder', True),
    ('Other Crimes Against Persons', False),
    ('Public Disorder', False),
    ('Robbery', False),
    ('Sexual Assault', True),
    ('Theft from Motor Vehicle', False),
    ('Traffic Accident', False),
    ('White Collar Crime', False)
]

violent_mappings = pd.DataFrame(violent_mappings, columns = ['OFFENSE_CATEGORY_NAME', 'IS_VIOLENT'])

denver_crimes = denver_crimes.merge(violent_mappings, how = 'left', on = 'OFFENSE_CATEGORY_NAME')

## Facts

Add facts/measures to each row since each row represents a crime

Simply define nighttime to be 8pm - 5am

In [56]:
# Is Traffic is already present
denver_crimes.IS_TRAFFIC = denver_crimes.IS_TRAFFIC.apply(bool)

# Is Fatal
denver_crimes['IS_FATAL'] = denver_crimes.OFFENSE_CATEGORY_NAME == 'Murder'

# Is Nighttime
night_start = time(hour = 20, minute = 0, second = 0)
night_end = time(hour = 5, minute = 0, second = 0)
denver_crimes['IS_NIGHTTIME'] = (denver_crimes.FIRST_OCCURRENCE_TIME > night_start) | (denver_crimes.FIRST_OCCURRENCE_TIME < night_end)

## Reformat dataframe

Reformat the dataframe columns to match with closer desired dimension columns

In [57]:
# Drop unused columns
cols_drop = ['INCIDENT_ID', 'OFFENSE_ID', 'OFFENSE_CODE', 'OFFENSE_CODE_EXTENSION', 'OFFENSE_TYPE_ID', 'OFFENSE_CATEGORY_ID', 'GEO_X', 'GEO_Y',
             'DISTRICT_ID', 'PRECINCT_ID', 'NEIGHBORHOOD_ID', 'IS_CRIME', 'LAST_OCCURRENCE_DATE', 'REPORTED_DATE']
denver_crimes.drop(cols_drop, axis = 1, inplace = True)

In [58]:
# Rename cols
denver_crimes = denver_crimes.rename(
    columns = {'GEO_LON': 'LONGITUDE', 
               'GEO_LAT': 'LATITUDE',
               'OFFENSE_TYPE_NAME': 'OFFENSE_TYPE',
               'OFFENSE_CATEGORY_NAME': 'OFFENSE_CATEGORY',
               'NEIGHBORHOOD_NAME': 'NEIGHBORHOOD',
               'INCIDENT_ADDRESS': 'LOCATION'}
)

In [59]:
# Check final preprocessed data

denver_crimes.head(5)
denver_crimes.isnull().mean() * 100

Unnamed: 0,FIRST_OCCURRENCE_DATE,LOCATION,LONGITUDE,LATITUDE,IS_TRAFFIC,FIRST_OCCURRENCE_TIME,REPORTED_TIME,LAST_OCCURRENCE_TIME,CITY,NEIGHBORHOOD,OFFENSE_TYPE,OFFENSE_CATEGORY,IS_VIOLENT,IS_FATAL,IS_NIGHTTIME
0,2016-06-15,,-104.809881,39.773188,False,11:31:00,11:31:00,,Denver,Montbello,Unlawful discharge of a weapon,All Other Crimes,False,False,False
1,2018-01-30,12900 BLOCK E ANDREWS DR,-104.841184,39.784667,False,10:14:00,10:14:00,,Denver,Montbello,Unlawful discharge of a weapon,All Other Crimes,False,False,False
2,2018-01-28,14000 BLK E MAXWELL PL,-104.824755,39.793246,False,01:30:00,04:26:00,01:30:00,Denver,Montbello,Unlawful discharge of a weapon,All Other Crimes,False,False,True
3,2018-01-30,1700 BLOCK E BRUCE RANDOLPH AVE,-104.96688,39.764431,False,11:08:00,08:43:00,,Denver,Cole,Unlawful discharge of a weapon,All Other Crimes,False,False,False
4,2018-04-17,900 BLOCK N NEWTON ST,-105.037222,39.731791,False,02:29:00,02:48:00,,Denver,Villa Park,Unlawful discharge of a weapon,All Other Crimes,False,False,True


FIRST_OCCURRENCE_DATE     0.000000
LOCATION                  9.483616
LONGITUDE                 0.000000
LATITUDE                  0.000000
IS_TRAFFIC                0.000000
FIRST_OCCURRENCE_TIME     0.000000
REPORTED_TIME             0.000000
LAST_OCCURRENCE_TIME     68.498184
CITY                      0.000000
NEIGHBORHOOD              0.000000
OFFENSE_TYPE              0.000000
OFFENSE_CATEGORY          0.000000
IS_VIOLENT                0.000000
IS_FATAL                  0.000000
IS_NIGHTTIME              0.000000
dtype: float64

In [60]:
# Save preprocessed crime data

denver_crimes.to_csv(denver_data_path + 'crime_denver_preprocessed.csv', index = False)

---

# Vancouver Crime data

In [61]:
vancouver_data_path = '../data/vancouver_crime_data/'

vancouver_crimes = pd.read_csv(vancouver_data_path + 'crimedata_csv_all_years.csv')

## Convert X Y to Longitude Latitude using

Using https://webapp.geod.nrcan.gc.ca/geod/tools-outils/trx.php

In [62]:
# Set format for tool
vancouver_crimes.rename(columns = {'X': 'utm_e', 'Y': 'utm_n'}, inplace = True)
vancouver_crimes['height'] = 0
vancouver_crimes['utm_z'] = 'utm10'

# Save
vancouver_crimes.to_csv(vancouver_data_path + 'crime_vancouver_prep.csv', index = False)

Upload crime_data_prep file to site with settings UTM/MTM/Sterero UTM10 to Geographic. 

In [63]:
# Add longitude and latitude and reformat
geographic_data = pd.read_csv(vancouver_data_path + 'TRX_2020-02-18_52723.csv')
vancouver_crimes.rename(columns = {'utm_e': 'X', 'utm_n': 'Y'}, inplace = True)
vancouver_crimes.drop(['height', 'utm_z'], axis = 1, inplace = True)
vancouver_crimes[['Latitude', 'Longitude']]= geographic_data[['lat', 'lon']]

# Save 
vancouver_crimes.to_csv(vancouver_data_path + 'crime_vancouver.csv', index = False)

## Null values

In [64]:
# Load saved data
vancouver_crimes = pd.read_csv(vancouver_data_path + 'crime_vancouver.csv')

In [65]:
vancouver_crimes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 634278 entries, 0 to 634277
Data columns (total 12 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   TYPE           634278 non-null  object 
 1   YEAR           634278 non-null  int64  
 2   MONTH          634278 non-null  int64  
 3   DAY            634278 non-null  int64  
 4   HOUR           634278 non-null  int64  
 5   MINUTE         634278 non-null  int64  
 6   HUNDRED_BLOCK  634265 non-null  object 
 7   NEIGHBOURHOOD  568455 non-null  object 
 8   X              634157 non-null  float64
 9   Y              634157 non-null  float64
 10  Latitude       634278 non-null  float64
 11  Longitude      634278 non-null  float64
dtypes: float64(4), int64(5), object(3)
memory usage: 58.1+ MB


In [66]:
vancouver_crimes.isnull().mean() * 100

TYPE              0.000000
YEAR              0.000000
MONTH             0.000000
DAY               0.000000
HOUR              0.000000
MINUTE            0.000000
HUNDRED_BLOCK     0.002050
NEIGHBOURHOOD    10.377626
X                 0.019077
Y                 0.019077
Latitude          0.000000
Longitude         0.000000
dtype: float64

Null X and Y from the using tool above give non null Longitude and Latitude, just set them to null

In [67]:
vancouver_crimes.loc[vancouver_crimes['X'].isnull(), ['Longitude', 'Latitude'] ] = [None, None]

There are some default values used to indicate null/missing values in the data (seems that most are for privacy reasons). We will replace them with null so it is explicit

- Location variables for HUNDRED_BLOCK = "OFFSET TO PROTECT PRIVACY" should be all null. "Coordinates data for records with “Offset to Protect Privacy” was not disclosed to provide privacy protection"

- HUNDRED_BLOCK = "X NK_LOC ST" is default location for incidents with unknown location and is geolocated to 312 Main street. Set the coordinates to None instead and replace hundredblock with "UNKNOWN" so it is clear it is unknown and not missing due to error

- For "Offences Against a Person" crime type, time data isn't given so HOUR and MINUTE should be null whereas they are currently default value 0 so we will replace with nulls

In [68]:
# Ensure default values for nulls are actually replaced with nulls
vancouver_crimes.loc[vancouver_crimes['HUNDRED_BLOCK'] == 'OFFSET TO PROTECT PRIVACY', ['X', 'Y', 'Latitude', 'Longitude'] ] = [None, None, None, None]
vancouver_crimes.loc[vancouver_crimes['TYPE'] == 'Offence Against a Person', ['HOUR', 'MINUTE'] ] = [None, None]

# Default location for X NK_LOC ST
vancouver_crimes.loc[vancouver_crimes['HUNDRED_BLOCK'] == 'X NK_LOC ST', ['HUNDRED_BLOCK', 'X', 'Y', 'Latitude', 'Longitude']] = ['UNKNOWN', None, None, None, None]


In [69]:
# View current nulls
vancouver_crimes.isnull().sum()
vancouver_crimes.isnull().mean() * 100

TYPE                 0
YEAR                 0
MONTH                0
DAY                  0
HOUR             62902
MINUTE           62902
HUNDRED_BLOCK       13
NEIGHBOURHOOD    65823
X                65799
Y                65799
Latitude         65799
Longitude        65799
dtype: int64

TYPE              0.000000
YEAR              0.000000
MONTH             0.000000
DAY               0.000000
HOUR              9.917103
MINUTE            9.917103
HUNDRED_BLOCK     0.002050
NEIGHBOURHOOD    10.377626
X                10.373842
Y                10.373842
Latitude         10.373842
Longitude        10.373842
dtype: float64

How we'll handle each of the columns with null values

- Hour and Minute correspond to crime types of Offences against a Person. Since we need this data for is_night in the fact table, our only options are to either make the is_night measure null for these rows OR to remove all crimes of Offences against a Person. https://www.kimballgroup.com/2003/02/design-tip-43-dealing-with-nulls-in-the-dimensional-model/ based on link we will leave the Hour and Minute as null and keep the rows

- Hundred block, only 13 missing rows (most likely due to data errors). We will keep them as it still has good information

- Neighbourhood with 10% missing rows. Keep them as no way to impute and still provides good crime information

- Longitude and latitude, cannot impute as we did for Denver as neighbourhood is also missing in some cases. Keep them anyway for same reason as above

- X and Y will be dropped later 

## Dimensions

Prep columns for creating dimensions later

### Date

According to the PDF data description for Vancouver, the times and dates correspond to when the crime occurred

In [70]:
# Set date attributes
# Day of week starts at 0 - Monday
# Months and days start at 1

fmt = "%Y/%m/%d"
dates = vancouver_crimes['YEAR'].apply(str) + '/' + vancouver_crimes['MONTH'].apply(str) + '/' + vancouver_crimes['DAY'].apply(str)
datetime = pd.DatetimeIndex(pd.to_datetime(dates, format = fmt))

# vancouver_crimes['DAY_OF_YEAR'] = datetime.dayofyear
# vancouver_crimes['WEEK_OF_YEAR'] = datetime.weekofyear
# vancouver_crimes['WEEKDAY'] = datetime.weekday + 1
# vancouver_crimes['QUARTER'] = datetime.quarter
# vancouver_crimes['IS_MONTH_START'] = datetime.is_month_start
# vancouver_crimes['IS_MONTH_END'] = datetime.is_month_end
# vancouver_crimes['IS_YEAR_START'] = datetime.is_year_start
# vancouver_crimes['IS_YEAR_END'] = datetime.is_year_end
vancouver_crimes['FIRST_OCCURRENCE_DATE'] = datetime.date

### Location

In [71]:
# Add city

vancouver_crimes['CITY'] = 'Vancouver'

### Crime

In [72]:
# Add crime times

subset = ~vancouver_crimes.HOUR.isnull()


fmt = "%Y/%m/%d %H:%M:%S"
times = "2000/01/01 " + vancouver_crimes.loc[subset, 'HOUR'].apply(int).apply(str) + ':' + vancouver_crimes.loc[subset, 'MINUTE'].apply(int).apply(str) + ':00'
times = pd.DatetimeIndex(pd.to_datetime(times, format = fmt))

vancouver_crimes['FIRST_OCCURRENCE_TIME'] = None
vancouver_crimes.loc[subset, 'FIRST_OCCURRENCE_TIME'] = times.time

Crime reported time and end time are not given so they will be null  
NOTE: Technically in the Vancouver crime data description, the HOUR and MINUTE correspond to when the crime occurred - "A numeric field that indicates the month when the reported crime activity occurred" and so matches with the Day/Month/Year data as well. However, in the deliverable PDF we will assume HOUR and MINUTE are when the crime was reported

In [73]:
# Set start and end time null

vancouver_crimes['REPORTED_TIME'] = None
vancouver_crimes['LAST_OCCURRENCE_TIME'] = None

First let's add violent/non-violent category based on the type name

In [74]:
# Mappings for violent/non-violent

violent_mappings = [
    ('Break and Enter Commercial', False),
    ('Break and Enter Residential/Other', False),
    ('Homicide', True),
    ('Mischief', False),
    ('Offence Against a Person', True),
    ('Other Theft', False),
    ('Theft from Vehicle', False),
    ('Theft of Bicycle', False),
    ('Theft of Vehicle', False),
    ('Vehicle Collision or Pedestrian Struck (with Fatality)', False),
    ('Vehicle Collision or Pedestrian Struck (with Injury)', False)
]

violent_mappings = pd.DataFrame(violent_mappings, columns = ['TYPE', 'IS_VIOLENT'])

# Add is violent column
vancouver_crimes = vancouver_crimes.merge(violent_mappings, how = 'left', on = 'TYPE')

To keep consistent crime types and categories with Denver crime data, we will map each type to an Offense Type in the Denver data. We will then assign each crime type a category from the Denver data as wel. The mapping is manually created by finding the best match for given Type. For those which are too vague, None is assigned instead

In [75]:
# Create mappings for Vancouver crime types
category_mappings = [
    ('Break and Enter Commercial', 'Burglary'),
    ('Break and Enter Residential/Other', 'Burglary'),
    ('Homicide', 'Murder'),
    ('Mischief', 'Public Disorder'),
    ('Offence Against a Person', 'Other Crimes Against Persons'),
    ('Other Theft', 'Larceny'),
    ('Theft from Vehicle', 'Theft from Motor Vehicle'),
    ('Theft of Bicycle', 'Larceny'),
    ('Theft of Vehicle', 'Auto Theft'),
    ('Vehicle Collision or Pedestrian Struck (with Fatality)', 'Traffic Accident'),
    ('Vehicle Collision or Pedestrian Struck (with Injury)', 'Traffic Accident')
]

category_mappings = pd.DataFrame(category_mappings, columns = ['TYPE', 'OFFENSE_CATEGORY_NAME'])

type_mappings = [
    ('Break and Enter Commercial', 'Burglary of a business with forced entry'),
    ('Break and Enter Residential/Other', 'Burglary of a residence with forced entry'),
    ('Homicide', None),
    ('Mischief', None),
    ('Offence Against a Person', None),
    ('Other Theft', None),
    ('Theft from Vehicle', None),
    ('Theft of Bicycle', 'Bicycle theft'),
    ('Theft of Vehicle', 'Motor vehicle theft'),
    ('Vehicle Collision or Pedestrian Struck (with Fatality)', None),
    ('Vehicle Collision or Pedestrian Struck (with Injury)', None)
]

type_mappings = pd.DataFrame(type_mappings, columns = ['TYPE', 'OFFENSE_TYPE_NAME'])

# Join mappings
vancouver_crimes = vancouver_crimes.merge(type_mappings, how = 'left', on = 'TYPE')
vancouver_crimes = vancouver_crimes.merge(category_mappings, how = 'left', on = 'TYPE')

## Facts

Add facts/measures to each row since each row represents a crime

In [76]:
# Is Traffic
vancouver_crimes['IS_TRAFFIC'] = vancouver_crimes.OFFENSE_CATEGORY_NAME == 'Traffic Accident'

# Is Fatal
vancouver_crimes['IS_FATAL'] = vancouver_crimes.OFFENSE_CATEGORY_NAME == 'Murder'

# Is Nighttime
night_start = time(hour = 20, minute = 0, second = 0)
night_end = time(hour = 5, minute = 0, second = 0)
vancouver_crimes['IS_NIGHTTIME'] = (vancouver_crimes.FIRST_OCCURRENCE_TIME > night_start) | (vancouver_crimes.FIRST_OCCURRENCE_TIME < night_end)
vancouver_crimes.loc[vancouver_crimes.FIRST_OCCURRENCE_TIME.isnull(), 'IS_NIGHTTIME'] = pd.NA

## Reformat

Reformat to match closer with dimensional model

In [77]:
# Drop unused
cols_drop = ['X', 'Y', 'TYPE', 'YEAR', 'MONTH', 'DAY', 'HOUR', 'MINUTE']
vancouver_crimes.drop(cols_drop, axis = 1, inplace = True)

In [78]:
# Rename
vancouver_crimes = vancouver_crimes.rename(
    columns = 
    {'OFFENSE_TYPE_NAME': 'OFFENSE_TYPE',
     'OFFENSE_CATEGORY_NAME': 'OFFENSE_CATEGORY',
     'NEIGHBOURHOOD': 'NEIGHBORHOOD',
     'Latitude': 'LATITUDE',
     'Longitude': 'LONGITUDE',
     'HUNDRED_BLOCK': 'LOCATION'
    }
)

In [79]:
# View preprocessed data

vancouver_crimes.head()
vancouver_crimes.isnull().mean() * 100

Unnamed: 0,LOCATION,NEIGHBORHOOD,LATITUDE,LONGITUDE,FIRST_OCCURRENCE_DATE,CITY,FIRST_OCCURRENCE_TIME,REPORTED_TIME,LAST_OCCURRENCE_TIME,IS_VIOLENT,OFFENSE_TYPE,OFFENSE_CATEGORY,IS_TRAFFIC,IS_FATAL,IS_NIGHTTIME
0,,Oakridge,49.233614,-123.119712,2012-12-14,Vancouver,08:52:00,,,False,Burglary of a business with forced entry,Burglary,False,False,False
1,10XX SITKA SQ,Fairview,49.266678,-123.129029,2019-03-07,Vancouver,02:06:00,,,False,Burglary of a business with forced entry,Burglary,False,False,True
2,10XX ALBERNI ST,West End,49.285255,-123.123649,2019-08-27,Vancouver,04:12:00,,,False,Burglary of a business with forced entry,Burglary,False,False,True
3,10XX ALBERNI ST,West End,49.285181,-123.123536,2014-08-08,Vancouver,05:13:00,,,False,Burglary of a business with forced entry,Burglary,False,False,False
4,10XX ALBERNI ST,West End,49.285132,-123.123461,2005-11-14,Vancouver,03:09:00,,,False,Burglary of a business with forced entry,Burglary,False,False,True


LOCATION                   0.002050
NEIGHBORHOOD              10.377626
LATITUDE                  10.373842
LONGITUDE                 10.373842
FIRST_OCCURRENCE_DATE      0.000000
CITY                       0.000000
FIRST_OCCURRENCE_TIME      9.917103
REPORTED_TIME            100.000000
LAST_OCCURRENCE_TIME     100.000000
IS_VIOLENT                 0.000000
OFFENSE_TYPE              71.624745
OFFENSE_CATEGORY           0.000000
IS_TRAFFIC                 0.000000
IS_FATAL                   0.000000
IS_NIGHTTIME               9.917103
dtype: float64

In [80]:
# Save preprocessed crime data

vancouver_crimes.to_csv(vancouver_data_path + 'crime_vancouver_preprocessed.csv', index = False)

# Combine Denver and Vancouver data

First we must only consider crimes in Denver and Vancouver that cover the same time periods

In [81]:
# Intersection of date ranges

min_date = max(denver_crimes.FIRST_OCCURRENCE_DATE.min(), vancouver_crimes.FIRST_OCCURRENCE_DATE.min())

denver_crimes = denver_crimes.loc[denver_crimes.FIRST_OCCURRENCE_DATE >= min_date]
vancouver_crimes = vancouver_crimes.loc[vancouver_crimes.FIRST_OCCURRENCE_DATE >= min_date]

In [82]:
# Combine and view

all_crimes = pd.concat([vancouver_crimes, denver_crimes], ignore_index = True)
all_crimes.isnull().mean() * 100
all_crimes.head()

LOCATION                  6.689017
NEIGHBORHOOD              2.620570
LATITUDE                  2.618283
LONGITUDE                 2.618283
FIRST_OCCURRENCE_DATE     0.000000
CITY                      0.000000
FIRST_OCCURRENCE_TIME     2.531231
REPORTED_TIME            29.467653
LAST_OCCURRENCE_TIME     77.781030
IS_VIOLENT                0.000000
OFFENSE_TYPE             22.598433
OFFENSE_CATEGORY          0.000000
IS_TRAFFIC                0.000000
IS_FATAL                  0.000000
IS_NIGHTTIME              2.531231
dtype: float64

Unnamed: 0,LOCATION,NEIGHBORHOOD,LATITUDE,LONGITUDE,FIRST_OCCURRENCE_DATE,CITY,FIRST_OCCURRENCE_TIME,REPORTED_TIME,LAST_OCCURRENCE_TIME,IS_VIOLENT,OFFENSE_TYPE,OFFENSE_CATEGORY,IS_TRAFFIC,IS_FATAL,IS_NIGHTTIME
0,10XX SITKA SQ,Fairview,49.266678,-123.129029,2019-03-07,Vancouver,02:06:00,,,False,Burglary of a business with forced entry,Burglary,False,False,True
1,10XX ALBERNI ST,West End,49.285255,-123.123649,2019-08-27,Vancouver,04:12:00,,,False,Burglary of a business with forced entry,Burglary,False,False,True
2,10XX ALBERNI ST,West End,49.284981,-123.123053,2017-11-14,Vancouver,20:00:00,,,False,Burglary of a business with forced entry,Burglary,False,False,False
3,10XX ALBERNI ST,West End,49.284794,-123.122946,2018-03-02,Vancouver,06:17:00,,,False,Burglary of a business with forced entry,Burglary,False,False,False
4,10XX ALBERNI ST,West End,49.284715,-123.122824,2015-02-04,Vancouver,20:53:00,,,False,Burglary of a business with forced entry,Burglary,False,False,True


In [83]:
# Save
all_crimes.to_csv('../data/all_crimes.csv', index = False)