In [131]:
import pandas as pd
import numpy as np
import math
from datetime import datetime

### Load data
Load data from all dimensions

In [132]:
# Date dimensions (onset, reported, test, specimen), read in that order.
(
    onset_date_dimension,
    reported_date_dimension,
    test_date_dimension,
    specimen_date_dimension,
) = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../Dates/Onset_Date_dimension.csv",
        "../Dates/Reported_Date_dimension.csv",
        "../Dates/Test_Date_dimension.csv",
        "../Dates/Specimen_Date_dimension.csv",
    )
)

# Patient and public health unit (PHU) dimensions.
patient_dimension, phu_dimension = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../PHU & Patient dimensions/Patient_dimension.csv",
        "../PHU & Patient dimensions/PHU_dimension.csv",
    )
)

# Mobility, special measures (restrictions) and weather dimensions.
mobility_dimension, special_measures_dimension, weather_dimension = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../Mobility/mobility_dimension.csv",
        "../Special_Measures/Restrictions.csv",
        "../Weather/weather_dimension.csv",
    )
)

In [133]:
# Empty fact table holding only the PHU, mobility, weather and special-measures
# surrogate keys. Date keys, the patient key and the Resolved/Unresolved/Fatal
# measures are not part of this table.
fact_table = pd.DataFrame(
    columns=[
        "PHU_key",
        "Mobility_key",
        "Weather_key",
        "Special_measures_key",
    ]
)

### Rename columns
Rename some columns to avoid duplicate keys

In [134]:
# The weather and special-measures dimensions both call their key
# "surrogate_key"; give each one a dimension-specific name.
weather_dimension = weather_dimension.rename(
    {"surrogate_key": "Weather_key"}, axis="columns"
)
special_measures_dimension = special_measures_dimension.rename(
    {"surrogate_key": "Special_measures_key"}, axis="columns"
)

# Every date dimension calls its date column "Date_Full_Format"; rename each
# one after the dimension it belongs to.
specimen_date_dimension = specimen_date_dimension.rename(
    {"Date_Full_Format": "Specimen_Date"}, axis="columns"
)
test_date_dimension = test_date_dimension.rename(
    {"Date_Full_Format": "Test_Date"}, axis="columns"
)
reported_date_dimension = reported_date_dimension.rename(
    {"Date_Full_Format": "Reported_Date"}, axis="columns"
)
onset_date_dimension = onset_date_dimension.rename(
    {"Date_Full_Format": "Onset_Date"}, axis="columns"
)

### Normalize Location attributes
Replace location names with just "Ottawa" or "Toronto". For example, "Ottawa Division" is replaced with "Ottawa".

In [135]:
# Collapse mobility subregion labels to the bare city name.
for raw_subregions, city_name in (
    (["Ottawa Division"], "Ottawa"),
    (["Toronto Division"], "Toronto"),
):
    mobility_dimension["Subregion"] = mobility_dimension["Subregion"].replace(
        raw_subregions, city_name
    )

# Collapse weather station names to the bare city name.
for raw_stations, city_name in (
    (["OTTAWA INTL A", "OTTAWA CDA RCS"], "Ottawa"),
    (["TORONTO CITY", "TORONTO INTL A"], "Toronto"),
):
    weather_dimension["Station Name"] = weather_dimension["Station Name"].replace(
        raw_stations, city_name
    )

### Surrogate key pipeline
Assign surrogate keys, starting from 0, to each dimension. Dimensions that were already assigned keys during preprocessing are skipped.

In [136]:
def _renumber_surrogate_key(dimension, key_column):
    """Return a copy of `dimension` whose `key_column` is rebuilt as 0..n-1.

    The existing `key_column` is dropped and a fresh one holding
    `np.arange(len(dimension))` is inserted as the first column.
    """
    renumbered = dimension.drop(columns=[key_column])
    renumbered.insert(0, key_column, np.arange(len(renumbered)))
    return renumbered


phu_dimension = _renumber_surrogate_key(phu_dimension, "PHU_Surrogate_Key")
patient_dimension = _renumber_surrogate_key(patient_dimension, "Patient_Surrogate_Key")
specimen_date_dimension = _renumber_surrogate_key(
    specimen_date_dimension, "Specimen_Date_Surrogate_Key"
)
test_date_dimension = _renumber_surrogate_key(
    test_date_dimension, "Test_Date_Surrogate_Key"
)
reported_date_dimension = _renumber_surrogate_key(
    reported_date_dimension, "Reported_Date_Surrogate_Key"
)
onset_date_dimension = _renumber_surrogate_key(
    onset_date_dimension, "Onset_Date_Surrogate_Key"
)

### Map Patient to Date dimensions
Create an intersection of the Patient dimension with the specimen, test, reported and onset date dimensions, based on the original data set.

In [137]:
# Place the four date dimensions side by side. Column-wise concat aligns on the
# row index, so row i of each dimension ends up on the same output row.
dates_dimensions = [
    onset_date_dimension,
    reported_date_dimension,
    test_date_dimension,
    specimen_date_dimension,
]
dates_merged = pd.concat(dates_dimensions, axis="columns")

# Columns sharing a name across the dimensions appear several times; keep only
# the first occurrence of each name.
repeated_name = dates_merged.columns.duplicated()
dates_merged = dates_merged.loc[:, ~repeated_name]

In [138]:
# Keep only what the fact table needs from the merged dates: calendar
# attributes and the original *_Date_Key columns are removed.
unused_date_columns = [
    "Day",
    "Month",
    "Year",
    "Day_of_Week",
    "Week_of_Year",
    "Weekend",
    "Holiday",
    "Season",
    "Onset_Date_Key",
    "Reported_Date_Key",
    "Test_Date_Key",
    "Specimen_Date_Key",
]
dates_merged = dates_merged.drop(unused_date_columns, axis="columns")

In [None]:
# Load the original COVID case data.
covid_data = pd.read_csv("covid_data.csv")

# Keep only cases reported by an Ottawa or Toronto PHU whose reported date lies
# in the window 2020-10-27 .. 2021-02-27 (both ends inclusive).
# The filter is vectorised: the window bounds are parsed once, and no DataFrame
# is grown row by row.
startDate = datetime.strptime("2020-10-27", "%Y-%m-%d")
endDate = datetime.strptime("2021-02-27", "%Y-%m-%d")

# A missing Case_Reported_Date becomes NaT and fails both comparisons, so such
# rows are excluded instead of raising. Malformed date strings still raise.
reported_on = pd.to_datetime(covid_data["Case_Reported_Date"], format="%Y-%m-%d")
in_window = (reported_on >= startDate) & (reported_on <= endDate)

# Case-insensitive city match; a missing Reporting_PHU never matches.
phu_name = covid_data["Reporting_PHU"].fillna("").astype(str).str.lower()
is_target_city = phu_name.str.contains("ottawa|toronto")

filtered_columns = [
    "Accurate_Episode_Date",
    "Case_Reported_Date",
    "Test_Reported_Date",
    "Specimen_Date",
    "Age_Group",
    "Client_Gender",
    "Case_AcquisitionInfo",
    "Outbreak_Related",
    "Reporting_PHU",
]
# "Index" holds each kept row's label in covid_data; the result itself is
# re-indexed from 0.
filtered_covid_data = (
    covid_data.loc[is_target_city & in_window, filtered_columns]
    .rename_axis("Index")
    .reset_index()
)

# Attach the date surrogate keys: a case matches only when all four of its
# dates equal the four dates found on a single row of dates_merged.
# NOTE(review): this merge reads the unfiltered covid_data, so
# filtered_covid_data is not used by it. Confirm whether the filtered frame
# was meant to be merged here instead.
covid_data_merge_dates = pd.merge(covid_data, dates_merged,
                           left_on=["Accurate_Episode_Date", "Case_Reported_Date", "Test_Reported_Date", "Specimen_Date"],
                           right_on=["Onset_Date", "Reported_Date", "Test_Date", "Specimen_Date"],
                           how="inner")
len(covid_data_merge_dates)

In [140]:
# Attach the patient dimension to each case by matching age group, gender,
# acquisition info and outbreak flag; cases without a matching patient row are
# dropped (inner join).
case_attribute_columns = ["Age_Group", "Client_Gender", "Case_AcquisitionInfo", "Outbreak_Related"]
patient_attribute_columns = ["Age", "Gender", "Acquisition_Group", "Outbreak_Related"]
covid_data_final = covid_data_merge_dates.merge(
    patient_dimension,
    how="inner",
    left_on=case_attribute_columns,
    right_on=patient_attribute_columns,
)

### Map PHU, Mobility, Special Measures and Weather
Build the intersection of these dimensions, matching on date and location.

In [141]:
# Pair each mobility row with the weather row for the same day and city.
# Only (date, city) combinations present in both dimensions survive (inner join).
fact_table_data = mobility_dimension.merge(
    weather_dimension,
    how="inner",
    left_on=["Date", "Subregion"],
    right_on=["Date/Time", "Station Name"],
)

In [142]:
# Start both key columns as empty strings; rows that match no rule keep "".
for key_column in ("Special_measures_key", "PHU_key"):
    fact_table_data[key_column] = ""


def _assign_special_measure(row_label, day_text, measures):
    """Store on row `row_label` the key of each measure in force on `day_text`.

    A measure is in force when start_date <= day < end_date (end exclusive).
    When several measures match, the last one in `measures` wins because each
    match overwrites the previous one.
    """
    for _, measure in measures.iterrows():
        window_start = datetime.strptime(measure["start_date"], "%Y-%m-%d")
        window_end = datetime.strptime(measure["end_date"], "%Y-%m-%d")
        day = datetime.strptime(day_text, "%Y-%m-%d")
        if window_start <= day < window_end:
            fact_table_data.at[row_label, "Special_measures_key"] = measure["Special_measures_key"]


# City -> (PHU key, special-measures rows searched for that city).
# Ottawa searches the first 5 rows of the special measures dimension and
# Toronto the last 6 -- presumably the file lists Ottawa's measures first and
# Toronto's last; verify against Restrictions.csv.
# NOTE(review): PHU keys 1 (Ottawa) and 0 (Toronto) are hard-coded; confirm
# they match the PHU dimension's surrogate keys.
city_rules = {
    "Ottawa": (1, special_measures_dimension.head(5)),
    "Toronto": (0, special_measures_dimension.tail(6)),
}

# Map dimensions to fact table by location and/or date range
for row_label, fact_row in fact_table_data.iterrows():
    rule = city_rules.get(fact_row["Station Name"])
    if rule is None:
        # Neither Ottawa nor Toronto: leave both keys as "".
        continue
    phu_key, city_measures = rule
    fact_table_data.at[row_label, "PHU_key"] = phu_key
    _assign_special_measure(row_label, fact_row["Date"], city_measures)

In [143]:
# Remove the descriptive mobility/weather attributes and the weather-side join
# columns; the fact table needs none of them.
unused_fact_columns = [
    "Province",
    "Grocery_and_phramacy",
    "Parks",
    "Transit_stations",
    "Workplaces",
    "Residential",
    "Retail_and_recreation",
    "Mean Temp (°C)",
    "Min Temp (°C)",
    "Max Temp (°C)",
    "Total Precip (mm)",
    "Station Name",
    "Date/Time",
]
fact_table_data = fact_table_data.drop(unused_fact_columns, axis="columns")

### Map COVID Data to Fact Table Data
Map datasets based on Reported date and city

In [150]:
# Add empty-string placeholder columns for the date and patient surrogate keys.
for placeholder_column in (
    "Onset_Date_Key",
    "Reported_Date_Key",
    "Test_Date_Key",
    "Specimen_Date_Key",
    "Patient_Key",
):
    fact_table_data[placeholder_column] = ""

0                             Peel Public Health
1                             Peel Public Health
2                          Toronto Public Health
3                          Toronto Public Health
4                          Toronto Public Health
                          ...                   
3243          York Region Public Health Services
3244                Middlesex-London Health Unit
3245                       Toronto Public Health
3246                       Toronto Public Health
3247    Wellington-Dufferin-Guelph Public Health
Name: Reporting_PHU, Length: 3248, dtype: object

In [144]:
# Build the fact table from the four surrogate-key columns of fact_table_data.
# This is a single column selection rather than a row-by-row append:
# appending with .loc inside iterrows is quadratic, and it duplicated every row
# each time the cell was re-run. Rebuilding the frame makes the cell idempotent.
# Columns keep the dtypes of fact_table_data rather than all being object.
fact_columns = ["PHU_key", "Mobility_key", "Weather_key", "Special_measures_key"]
fact_table = fact_table_data[fact_columns].reset_index(drop=True)

In [145]:
# Persist the fact table next to the notebook, without the row index.
fact_table.to_csv(path_or_buf="fact_table.csv", index=False)

In [146]:
# Display the assembled fact-table source data for a visual check.
fact_table_data

Unnamed: 0,Mobility_key,Date,Subregion,Weather_key,Special_measures_key,PHU_key
0,0,2020-10-27,Ottawa,0,0,1
1,1,2020-10-28,Ottawa,1,0,1
2,2,2020-10-29,Ottawa,2,0,1
3,3,2020-10-30,Ottawa,3,0,1
4,4,2020-10-31,Ottawa,4,0,1
...,...,...,...,...,...,...
241,242,2021-02-22,Toronto,241,10,0
242,243,2021-02-23,Toronto,242,10,0
243,244,2021-02-24,Toronto,243,10,0
244,245,2021-02-25,Toronto,244,10,0
