In [12]:
from datetime import datetime, date, time, timedelta
import dtale
import eventful
import holidays
import numpy as np
import pandas as pd

# Notebook-wide pandas display limits: show up to 100 rows per frame.
# max_columns is set to 0 (presumably so wide frames are not column-truncated
# in the notebook - verify against the pandas display docs).
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 0)

# Display all outputs: echo every top-level expression in a cell,
# not just the last one
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

ModuleNotFoundError: No module named 'md5'

# Purpose

Create a DataFrame for each dimension table and for the fact table.

In [2]:
# Paths to data
denver_data_path = '../data/denver_crime_data/'
vancouver_data_path = '../data/vancouver_crime_data/'
dimensions_path = '../data/dimensional_model/'

# Load all crimes.
# low_memory = False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk, so a column cannot end up holding a mix of types.
# NOTE(review): the warning saved in this cell's output looks like pandas'
# mixed-type DtypeWarning - confirm it no longer appears after this change.
all_crimes = pd.read_csv('../data/all_crimes.csv', low_memory = False)

# Parse the occurrence timestamps so that .min(), the .dt accessor and the
# date joins in later cells operate on real datetimes rather than strings.
all_crimes['FIRST_OCCURRENCE_DATE'] = pd.to_datetime(all_crimes.FIRST_OCCURRENCE_DATE)

  interactivity=interactivity, compiler=compiler, result=result)


# Date Dimension

In [3]:
# Create date dim with one row per calendar day, from the earliest crime date
# through the current date (inclusive)

# Date dim date ranges
first_crime_date = all_crimes.FIRST_OCCURRENCE_DATE.min()
if pd.isna(first_crime_date):
    raise ValueError('all_crimes has no valid FIRST_OCCURRENCE_DATE to start the date dimension from')

# Truncate both bounds to midnight so every DATE is a whole calendar day
start_date = first_crime_date.normalize()
end_date = pd.Timestamp.now().normalize()

# Holiday information
can_holidays = holidays.Canada()
us_holidays = holidays.UnitedStates()

# Get all dates.
# date_range includes both end points, so the last row is today. The previous
# hand-written loop incremented before appending and therefore always added
# one extra day past the current date (and left start_date mutated).
day_delta = timedelta(days = 1)
dates = list(pd.date_range(start = start_date, end = end_date, freq = day_delta))

# Get some date attributes
# (the loop variable is `d`, not `date`, to avoid shadowing datetime.date)
days = [d.day for d in dates]
months = [d.month for d in dates]
years = [d.year for d in dates]
day_of_years = [d.dayofyear for d in dates]
week_of_years = [d.weekofyear for d in dates]
weekdays = [d.weekday() + 1 for d in dates]  # 1 = Monday ... 7 = Sunday
quarters = [d.quarter for d in dates]
month_starts = [d.is_month_start for d in dates]
month_ends = [d.is_month_end for d in dates]
year_starts = [d.is_year_start for d in dates]
year_ends = [d.is_year_end for d in dates]

# Get holidays: .get() returns the holiday name, or None on a non-holiday
date_can_holidays = [can_holidays.get(d) for d in dates]
date_is_can_holidays = [holiday is not None for holiday in date_can_holidays]
date_us_holidays = [us_holidays.get(d) for d in dates]
date_is_us_holidays = [holiday is not None for holiday in date_us_holidays]

# Date dimension
date_dim = pd.DataFrame(
    {
        'DATE': dates,
        'DAY': days,
        'MONTH': months,
        'YEAR': years,
        'DAY_OF_YEAR': day_of_years,
        'WEEK_OF_YEAR': week_of_years,
        'WEEKDAY': weekdays,
        'QUARTER': quarters,
        'IS_MONTH_START': month_starts,
        'IS_MONTH_END': month_ends,
        'IS_YEAR_START': year_starts,
        'IS_YEAR_END': year_ends,
        'IS_CAN_HOLIDAY': date_is_can_holidays,
        'CAN_HOLIDAY_NAME': date_can_holidays,
        'IS_US_HOLIDAY': date_is_us_holidays,
        'US_HOLIDAY_NAME': date_us_holidays
    }
)

# Add surrogate date key as just date YYYYMMDD (From Kimball textbook)
date_dim['DATE_PK'] = date_dim.DATE.dt.strftime('%Y%m%d').astype('int64')

# One row per day means the surrogate key must be unique
assert date_dim.DATE_PK.is_unique, 'DATE_PK must be unique in the date dimension'

date_dim.head(10)

Unnamed: 0,DATE,DAY,MONTH,YEAR,DAY_OF_YEAR,WEEK_OF_YEAR,WEEKDAY,QUARTER,IS_MONTH_START,IS_MONTH_END,IS_YEAR_START,IS_YEAR_END,IS_CAN_HOLIDAY,CAN_HOLIDAY_NAME,IS_US_HOLIDAY,US_HOLIDAY_NAME,DATE_PK
0,2015-01-02,2,1,2015,2,1,5,1,False,False,False,False,False,,False,,20150102
1,2015-01-03,3,1,2015,3,1,6,1,False,False,False,False,False,,False,,20150103
2,2015-01-04,4,1,2015,4,1,7,1,False,False,False,False,False,,False,,20150104
3,2015-01-05,5,1,2015,5,2,1,1,False,False,False,False,False,,False,,20150105
4,2015-01-06,6,1,2015,6,2,2,1,False,False,False,False,False,,False,,20150106
5,2015-01-07,7,1,2015,7,2,3,1,False,False,False,False,False,,False,,20150107
6,2015-01-08,8,1,2015,8,2,4,1,False,False,False,False,False,,False,,20150108
7,2015-01-09,9,1,2015,9,2,5,1,False,False,False,False,False,,False,,20150109
8,2015-01-10,10,1,2015,10,2,6,1,False,False,False,False,False,,False,,20150110
9,2015-01-11,11,1,2015,11,2,7,1,False,False,False,False,False,,False,,20150111


In [4]:
# Add surrogate date key to the all_crimes fact table.
# Both sides are joined on the calendar day (timestamp truncated to midnight),
# so a time-of-day component on either side cannot silently prevent a match.
date_keys = date_dim[['DATE_PK']].assign(_DAY = date_dim.DATE.dt.normalize())

all_crimes = (
    all_crimes
    # Drop a DATE_PK left by an earlier run of this cell so that re-running it
    # does not produce DATE_PK_x / DATE_PK_y columns
    .drop('DATE_PK', axis = 1, errors = 'ignore')
    .assign(_DAY = lambda crimes: crimes.FIRST_OCCURRENCE_DATE.dt.normalize())
    # many_to_one: fail loudly if the date dimension ever holds a day twice,
    # because that would duplicate fact rows
    .merge(date_keys, how = 'left', on = '_DAY', validate = 'many_to_one')
    .drop('_DAY', axis = 1)
)

# Save dimension
date_dim.to_csv(dimensions_path + 'date_dimension.csv', index = False)

# Location Dimension

No yearly demographic (population) data could be found for Denver or Vancouver, either city-wide or per neighborhood. So instead, we will just store the city and neighborhood population for a single year.

- https://www12.statcan.gc.ca/census-recensement/2016/dp-pd/prof/details/page.cfm?Lang=E&Geo1=CSD&Code1=5915022&Geo2=PR&Code2=01&Data=Count&SearchText=5915022&SearchType=Begins&SearchPR=01&B1=All&Custom=&TABID=3
- http://worldpopulationreview.com/us-cities/denver-population/

In [5]:
# Location dimension: one row per distinct combination of the location columns
location_cols = ['LOCATION', 'NEIGHBORHOOD', 'LATITUDE', 'LONGITUDE', 'CITY']

location_dim = all_crimes[location_cols].drop_duplicates(keep = 'first')

# Add demographic data - Population and Crime Rate

# Add population
denver_population = 716492 # Copy pasted from site above

vancouver_demographics = pd.read_csv(vancouver_data_path + 'CensusProfile2016-ProfilRecensement2016-20200221013316.csv', encoding = 'ISO-8859-1', skiprows = 1)
# NOTE(review): row label 1 of the Total column is taken to be the city
# population - confirm against the census file
vancouver_population = int(vancouver_demographics.Total[1])

# YEAR is the reference year each population figure applies to
city_populations = pd.DataFrame({'CITY': ['Denver', 'Vancouver'],
                                 'CITY_POPULATION': [denver_population, vancouver_population],
                                 'YEAR': [2018, 2016]})

# Get number of crimes for given city and year.
# The year is computed on the side instead of being written to (and later
# dropped from) all_crimes, so the fact table is never mutated here.
reference_year_by_city = city_populations.set_index('CITY').YEAR
crime_year = all_crimes.FIRST_OCCURRENCE_DATE.dt.year
in_reference_year = crime_year == all_crimes.CITY.map(reference_year_by_city)

num_crimes = (
    all_crimes.loc[in_reference_year, 'CITY']
    .value_counts()
    # A city with no crimes in its reference year must count as 0
    .reindex(city_populations.CITY.tolist(), fill_value = 0)
    .rename_axis('CITY')
    .to_frame('NUMBER_CRIMES')
)

# Calculate crime rate (crimes per 100,000 residents) once per city
city_crime_rates = city_populations.assign(
    CRIME_RATE = lambda cities: (cities.CITY.map(num_crimes.NUMBER_CRIMES) / cities.CITY_POPULATION) * 1e5
)[['CITY', 'CRIME_RATE']]

# Attach the rate to every location of that city; population, year and crime
# count are intermediate values and are deliberately not kept in the dimension
location_dim = location_dim.merge(city_crime_rates, how = 'left', on = 'CITY')

# Add surrogate PK
location_dim['LOCATION_PK'] = np.arange(1, location_dim.shape[0] + 1)

location_dim.head(10)

Unnamed: 0,LOCATION,NEIGHBORHOOD,LATITUDE,LONGITUDE,CITY,CRIME_RATE,LOCATION_PK
0,10XX SITKA SQ,Fairview,49.266678,-123.129029,Vancouver,5995.38232,1
1,10XX ALBERNI ST,West End,49.285255,-123.123649,Vancouver,5995.38232,2
2,10XX ALBERNI ST,West End,49.284981,-123.123053,Vancouver,5995.38232,3
3,10XX ALBERNI ST,West End,49.284794,-123.122946,Vancouver,5995.38232,4
4,10XX ALBERNI ST,West End,49.284715,-123.122824,Vancouver,5995.38232,5
5,10XX ALBERNI ST,West End,49.284666,-123.122749,Vancouver,5995.38232,6
6,10XX ALBERNI ST,West End,49.284445,-123.122412,Vancouver,5995.38232,7
7,10XX ALBERNI ST,West End,49.284519,-123.122348,Vancouver,5995.38232,8
8,10XX ALBERNI ST,West End,49.284396,-123.122337,Vancouver,5995.38232,9
9,10XX BARCLAY ST,West End,49.282661,-123.126206,Vancouver,5995.38232,10


In [6]:
# Add surrogate key to crime fact table.
# The merge brings over every non-key column of location_dim (LOCATION_PK and
# any other dimension attribute it carries). Copies left by an earlier run of
# this cell are dropped first, so re-running it does not create _x / _y
# suffixed duplicates.
dimension_only_cols = [col for col in location_dim.columns if col not in location_cols]

all_crimes = (
    all_crimes
    .drop(dimension_only_cols, axis = 1, errors = 'ignore')
    .merge(location_dim, how = 'left', on = location_cols)
)

# Save dimension
location_dim.to_csv(dimensions_path + 'location_dimension.csv', index = False)

# Events