## Import packages

In [4]:
# preprocessing
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
from functools import reduce

# viz
import altair as alt
alt.data_transformers.enable("vegafusion")


DataTransformerRegistry.enable('vegafusion')

## Loading external datasets

In [23]:
# URLs of the datasets
url_employment_by_industry = 'https://www150.statcan.gc.ca/t1/tbl1/en/tv.action?pid=1410035501&pickMembers%5B0%5D=1.11&pickMembers%5B1%5D=3.1&pickMembers%5B2%5D=4.1&cubeTimeFrame.startMonth=01&cubeTimeFrame.startYear=1997&cubeTimeFrame.endMonth=10&cubeTimeFrame.endYear=2023&referencePeriods=19970101%2C20231001'
url_gdp_by_industry = 'https://www150.statcan.gc.ca/t1/tbl1/en/tv.action?pid=3610043401'
url_investment_in_building_construction = 'https://www150.statcan.gc.ca/t1/tbl1/en/tv.action?pid=3410017501'
url_consumer_price_index = 'https://www150.statcan.gc.ca/t1/tbl1/en/tv.action?pid=1810025601'

# loading the datasets from local data
df_consumer_price_index = pd.read_csv('data/consumer_price_index.csv')
df_employment_by_industry = pd.read_csv('data/employment_by_industry.csv')
df_gdp_by_industry = pd.read_csv('data/gdp_by_industry.csv')
df_investment_in_building_construction = pd.read_csv('data/investment_in_building_construction.csv')

# Standardize date formats to YYYY-MM
df_consumer_price_index['REF_DATE'] = pd.to_datetime(df_consumer_price_index['REF_DATE']).dt.to_period('M')
df_employment_by_industry['REF_DATE'] = pd.to_datetime(df_employment_by_industry['REF_DATE']).dt.to_period('M')
df_gdp_by_industry['REF_DATE'] = pd.to_datetime(df_gdp_by_industry['REF_DATE']).dt.to_period('M')
df_investment_in_building_construction['REF_DATE'] = pd.to_datetime(df_investment_in_building_construction['REF_DATE']).dt.to_period('M')

# Merging the datasets by 'REF_DATE' only and adding suffixes to 'GEO' columns to specify the dataset source
merged_df = pd.merge(df_consumer_price_index, df_employment_by_industry, on='REF_DATE', how='inner', suffixes=('_cpi', '_emp'))
merged_df = pd.merge(merged_df, df_gdp_by_industry, on='REF_DATE', how='inner', suffixes=('', '_gdp'))
merged_df = pd.merge(merged_df, df_investment_in_building_construction, on='REF_DATE', how='inner', suffixes=('', '_const'))

# Rename columns for clarity
merged_df.rename(columns={'VALUE': 'CPI', 'VALUE_emp': 'Employment_Value', 'VALUE_gdp': 'GDP_Value', 'VALUE_const': 'Construction_Value'}, inplace=True)

merged_df

Unnamed: 0,REF_DATE,GEO_cpi,Alternative measures,UOM_cpi,SCALAR_FACTOR_cpi,VALUE_cpi,GEO_emp,North American Industry Classification System (NAICS),Statistics,Data type,...,UOM,SCALAR_FACTOR,CPI,GEO_const,Type of structure,Type of work,Investment Value,UOM_const,SCALAR_FACTOR_const,Construction_Value
0,2010-01,Canada,Measure of core inflation based on a factor mo...,Percent,units,1.8,British Columbia,"Total employed, all industries",Estimate,Seasonally adjusted,...,Dollars,millions,1601650,"Vancouver, British Columbia",Total residential and non-residential,"Types of work, total",Seasonally adjusted - current,Dollars,units,795594700
1,2010-02,Canada,Measure of core inflation based on a factor mo...,Percent,units,1.8,British Columbia,"Total employed, all industries",Estimate,Seasonally adjusted,...,Dollars,millions,1606768,"Vancouver, British Columbia",Total residential and non-residential,"Types of work, total",Seasonally adjusted - current,Dollars,units,805290960
2,2010-03,Canada,Measure of core inflation based on a factor mo...,Percent,units,1.5,British Columbia,"Total employed, all industries",Estimate,Seasonally adjusted,...,Dollars,millions,1614763,"Vancouver, British Columbia",Total residential and non-residential,"Types of work, total",Seasonally adjusted - current,Dollars,units,813680925
3,2010-04,Canada,Measure of core inflation based on a factor mo...,Percent,units,1.4,British Columbia,"Total employed, all industries",Estimate,Seasonally adjusted,...,Dollars,millions,1613317,"Vancouver, British Columbia",Total residential and non-residential,"Types of work, total",Seasonally adjusted - current,Dollars,units,773128728
4,2010-05,Canada,Measure of core inflation based on a factor mo...,Percent,units,1.1,British Columbia,"Total employed, all industries",Estimate,Seasonally adjusted,...,Dollars,millions,1622249,"Vancouver, British Columbia",Total residential and non-residential,"Types of work, total",Seasonally adjusted - current,Dollars,units,759925015
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
159,2023-04,Canada,Measure of core inflation based on a factor mo...,Percent,units,5.6,British Columbia,"Total employed, all industries",Estimate,Seasonally adjusted,...,Dollars,millions,2092622,"Vancouver, British Columbia",Total residential and non-residential,"Types of work, total",Seasonally adjusted - current,Dollars,units,1834160767
160,2023-05,Canada,Measure of core inflation based on a factor mo...,Percent,units,5.2,British Columbia,"Total employed, all industries",Estimate,Seasonally adjusted,...,Dollars,millions,2095744,"Vancouver, British Columbia",Total residential and non-residential,"Types of work, total",Seasonally adjusted - current,Dollars,units,1798335645
161,2023-06,Canada,Measure of core inflation based on a factor mo...,Percent,units,5.1,British Columbia,"Total employed, all industries",Estimate,Seasonally adjusted,...,Dollars,millions,2091606,"Vancouver, British Columbia",Total residential and non-residential,"Types of work, total",Seasonally adjusted - current,Dollars,units,1801010672
162,2023-07,Canada,Measure of core inflation based on a factor mo...,Percent,units,4.8,British Columbia,"Total employed, all industries",Estimate,Seasonally adjusted,...,Dollars,millions,2093784,"Vancouver, British Columbia",Total residential and non-residential,"Types of work, total",Seasonally adjusted - current,Dollars,units,1828066588


## Loading dataset

In [None]:
# csv_url = 'https://opendata.vancouver.ca/api/explore/v2.1/catalog/datasets/business-licences/exports/csv?lang=en&timezone=America%2FLos_Angeles&use_labels=true&delimiter=%3B'
# business = pd.read_csv(csv_url, delimiter = ';')

# It takes a while to load data from the url, so... here's the shortcut!
# Just download the file above to your local machine, and put the file in the data folder
business = pd.read_csv('data/business-licences.csv', delimiter = ';')

## Preprocessing

### Cleaning data
- Drop rows where `ExpiredDate` and `IssuedDate` are NA.
- Transform `ExpiredDate` and `IssuedDate` to date.
- Calculate the survival interval of each company, which is the difference between the maximum of ExpiredDate and the minimum of IssuedDate.
- Keep only the newest issued record of each company.
- Filter to keep those records where the latest `ExpiredDate` is before or equal to year 2022 because for those licenses issued in year 2023, the dafault `ExpiredDate` are `2023-12-31` and we cannot know whether it would survive until then.

In [None]:
# Drop rows where ExpiredDate and IssuedDate are NA
business = business.dropna(subset = ["ExpiredDate", "IssuedDate"])

# Transform ExpiredDate and IssuedDate to date
business[["ExpiredDate", "IssuedDate"]] = business[["ExpiredDate", "IssuedDate"]].apply(pd.to_datetime, utc=True)
business['ExpiredDate'] = business['ExpiredDate'].dt.date
business['IssuedDate'] = business['IssuedDate'].dt.date

# Calculate the survival interval of each company
business['survival_days'] = (business.groupby('BusinessName')['ExpiredDate'].transform('max')-
                            business.groupby('BusinessName')['IssuedDate'].transform('min'))
business['survival_days'] = pd.to_timedelta(business['survival_days']).dt.days

# Keep only the newest issued record of each company
business.sort_values(by='ExpiredDate', ascending=True)
business = business.drop_duplicates(subset='BusinessName', keep='last')

# Filter to keep those records where the latest `ExpiredDate` is before or equal to year 2022.
business = business[business['ExpiredDate'] <= dt.date(2022, 12, 31)]

### Response Variable for Classification: survival_status

In [None]:
# To balance the amount of True & False, I set the threshold to 2 years 
survival_threshold = 730
business['survival_status'] = business['survival_days'] >= survival_threshold

# Adjust Boolean to 0, 1
business["survival_status"] = business["survival_status"].astype(int)
business