## Import packages

In [1]:
# preprocessing
import pandas as pd
import numpy as np
import datetime as dt

# viz
import altair as alt
alt.data_transformers.enable("vegafusion")
import matplotlib.pyplot as plt

## Loading dataset

In [2]:
# csv_url = 'https://opendata.vancouver.ca/api/explore/v2.1/catalog/datasets/business-licences/exports/csv?lang=en&timezone=America%2FLos_Angeles&use_labels=true&delimiter=%3B'
# business = pd.read_csv(csv_url, delimiter = ';')


# It takes a while to load data from the url, so... here's the shortcut!
# Just download the file above to your local machine, and put the file in the data folder
business = pd.read_csv('data/business-licences.csv', delimiter = ';')
gdp  = pd.read_csv('data/gdp.csv')

  business = pd.read_csv('data/business-licences.csv', delimiter = ';')
  gdp  = pd.read_csv('data/gdp.csv')


## Preprocessing

### Business Lisence data
- Drop rows where `ExpiredDate` and `IssuedDate` are NA.
- Transform `ExpiredDate` and `IssuedDate` to date.
- Calculate the survival interval of each company, which is the difference between the maximum of ExpiredDate and the minimum of IssuedDate.
- Keep only the newest issued record of each company.
- Filter to keep those records where the latest `ExpiredDate` is before or equal to year 2022 because for those licenses issued in year 2023, the dafault `ExpiredDate` are `2023-12-31` and we cannot know whether it would survive until then.

In [3]:
# Drop rows where ExpiredDate and IssuedDate are NA
business = business.dropna(subset = ["ExpiredDate", "IssuedDate"])

# Transform ExpiredDate and IssuedDate to date
business[["ExpiredDate", "IssuedDate"]] = business[["ExpiredDate", "IssuedDate"]].apply(pd.to_datetime, utc=True)
business['ExpiredDate'] = business['ExpiredDate'].dt.date
business['IssuedDate'] = business['IssuedDate'].dt.date

# Calculate the survival interval of each company
business['survival_days'] = (business.groupby('BusinessName')['ExpiredDate'].transform('max')-
                            business.groupby('BusinessName')['IssuedDate'].transform('min'))
business['survival_days'] = pd.to_timedelta(business['survival_days']).dt.days

# Keep only the newest issued record of each company
business.sort_values(by='ExpiredDate', ascending=True)
business = business.drop_duplicates(subset='BusinessName', keep='last')

# Filter to keep those records where the latest `ExpiredDate` is before or equal to year 2022.
business = business[business['ExpiredDate'] <= dt.date(2022, 12, 31)]

business['FOLDERYEAR'] = business['FOLDERYEAR'].apply(lambda x : '20' + str(x))

### Macroeconomics Data
- Create a column `REF_YEAR` representing the year of `REF_DATE`
- Adjust `REF_DATE` from `%y-%m` to `%y-%m-01` by applying `pd.to_datetime`
- Keep rows where `North American Industry Classification System (NAICS) == 'All industries'`, since it is time-consuming to manually map the `BusinessType` in business lisence dataset to the related industries, we will merely consider the overall GDP performance in this project.
- Keep rows where `REF_YEAR >= 2012`
- Keep columns `REF_YEAR`, `REF_DATE` and `VALUE`

In [8]:
gdp  = pd.read_csv('data/gdp.csv')
gdp = gdp[gdp['North American Industry Classification System (NAICS)'] == 'All industries [T001]'][['REF_DATE', 'VALUE']]
gdp['REF_YEAR'] = gdp['REF_DATE'].apply(lambda x : int(str(x)[:4]))
gdp = gdp[gdp['REF_YEAR'] >= 2012]
gdp['REF_DATE'] = pd.to_datetime(gdp['REF_DATE'])
gdp['REF_YEAR'] = gdp['REF_YEAR'].astype(str)
gdp = gdp.rename(columns = {'VALUE': 'GdpValue', 'REF_YEAR': 'FOLDERYEAR'})

  gdp  = pd.read_csv('data/gdp.csv')


### Combine business lisence and macroeconomics data
- Map the yearly GDP value to the first lisence issued year of each company
- _Discussion: Since the threshold of survival for now is 2 years, whether or not we map the average of `GDP value of the first lisence issued year and the next year` instead of merely the first lisence issued year?_

In [9]:
business = business.merge(gdp, on='FOLDERYEAR', how='inner')
business

Unnamed: 0,FOLDERYEAR,LicenceRSN,LicenceNumber,LicenceRevisionNumber,BusinessName,BusinessTradeName,Status,IssuedDate,ExpiredDate,BusinessType,...,PostalCode,LocalArea,NumberofEmployees,FeePaid,ExtractDate,Geom,geo_point_2d,survival_days,REF_DATE,GdpValue
0,2013,1786109,13-166693,0,David Andrew Goodfellow (David Goodfellow),Bramblebutt Productions,Issued,2012-12-22,2013-12-31,Entertainment Services,...,,Mount Pleasant,1.0,129.0,2019-07-21T13:49:06-07:00,,,374.0,2013-01-01,1729586.0
1,2013,1786109,13-166693,0,David Andrew Goodfellow (David Goodfellow),Bramblebutt Productions,Issued,2012-12-22,2013-12-31,Entertainment Services,...,,Mount Pleasant,1.0,129.0,2019-07-21T13:49:06-07:00,,,374.0,2013-01-01,1728397.0
2,2013,1786109,13-166693,0,David Andrew Goodfellow (David Goodfellow),Bramblebutt Productions,Issued,2012-12-22,2013-12-31,Entertainment Services,...,,Mount Pleasant,1.0,129.0,2019-07-21T13:49:06-07:00,,,374.0,2013-01-01,136666.0
3,2013,1786109,13-166693,0,David Andrew Goodfellow (David Goodfellow),Bramblebutt Productions,Issued,2012-12-22,2013-12-31,Entertainment Services,...,,Mount Pleasant,1.0,129.0,2019-07-21T13:49:06-07:00,,,374.0,2013-02-01,1734805.0
4,2013,1786109,13-166693,0,David Andrew Goodfellow (David Goodfellow),Bramblebutt Productions,Issued,2012-12-22,2013-12-31,Entertainment Services,...,,Mount Pleasant,1.0,129.0,2019-07-21T13:49:06-07:00,,,374.0,2013-02-01,1736155.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1832071,2021,3724406,21-141890,0,Altra Urban Construction Ltd,Altra Urban Construction,Issued,2020-11-26,2021-12-31,Plumber & Gas Contractor,...,,Victoria-Fraserview,1.0,155.0,2023-11-01T02:38:57-07:00,,,3683.0,2021-11-01,2028221.0
1832072,2021,3724406,21-141890,0,Altra Urban Construction Ltd,Altra Urban Construction,Issued,2020-11-26,2021-12-31,Plumber & Gas Contractor,...,,Victoria-Fraserview,1.0,155.0,2023-11-01T02:38:57-07:00,,,3683.0,2021-11-01,173272.0
1832073,2021,3724406,21-141890,0,Altra Urban Construction Ltd,Altra Urban Construction,Issued,2020-11-26,2021-12-31,Plumber & Gas Contractor,...,,Victoria-Fraserview,1.0,155.0,2023-11-01T02:38:57-07:00,,,3683.0,2021-12-01,2022291.0
1832074,2021,3724406,21-141890,0,Altra Urban Construction Ltd,Altra Urban Construction,Issued,2020-11-26,2021-12-31,Plumber & Gas Contractor,...,,Victoria-Fraserview,1.0,155.0,2023-11-01T02:38:57-07:00,,,3683.0,2021-12-01,2029942.0


### Response Variable for Classification: survival_status

In [10]:
# To balance the amount of True & False, I set the threshold to 2 years 
survival_threshold = 730
business['survival_status'] = business['survival_days'] >= survival_threshold

# Adjust Boolean to 0, 1
business["survival_status"] = business["survival_status"].astype(int)
business

Unnamed: 0,FOLDERYEAR,LicenceRSN,LicenceNumber,LicenceRevisionNumber,BusinessName,BusinessTradeName,Status,IssuedDate,ExpiredDate,BusinessType,...,LocalArea,NumberofEmployees,FeePaid,ExtractDate,Geom,geo_point_2d,survival_days,REF_DATE,GdpValue,survival_status
0,2013,1786109,13-166693,0,David Andrew Goodfellow (David Goodfellow),Bramblebutt Productions,Issued,2012-12-22,2013-12-31,Entertainment Services,...,Mount Pleasant,1.0,129.0,2019-07-21T13:49:06-07:00,,,374.0,2013-01-01,1729586.0,0
1,2013,1786109,13-166693,0,David Andrew Goodfellow (David Goodfellow),Bramblebutt Productions,Issued,2012-12-22,2013-12-31,Entertainment Services,...,Mount Pleasant,1.0,129.0,2019-07-21T13:49:06-07:00,,,374.0,2013-01-01,1728397.0,0
2,2013,1786109,13-166693,0,David Andrew Goodfellow (David Goodfellow),Bramblebutt Productions,Issued,2012-12-22,2013-12-31,Entertainment Services,...,Mount Pleasant,1.0,129.0,2019-07-21T13:49:06-07:00,,,374.0,2013-01-01,136666.0,0
3,2013,1786109,13-166693,0,David Andrew Goodfellow (David Goodfellow),Bramblebutt Productions,Issued,2012-12-22,2013-12-31,Entertainment Services,...,Mount Pleasant,1.0,129.0,2019-07-21T13:49:06-07:00,,,374.0,2013-02-01,1734805.0,0
4,2013,1786109,13-166693,0,David Andrew Goodfellow (David Goodfellow),Bramblebutt Productions,Issued,2012-12-22,2013-12-31,Entertainment Services,...,Mount Pleasant,1.0,129.0,2019-07-21T13:49:06-07:00,,,374.0,2013-02-01,1736155.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1832071,2021,3724406,21-141890,0,Altra Urban Construction Ltd,Altra Urban Construction,Issued,2020-11-26,2021-12-31,Plumber & Gas Contractor,...,Victoria-Fraserview,1.0,155.0,2023-11-01T02:38:57-07:00,,,3683.0,2021-11-01,2028221.0,1
1832072,2021,3724406,21-141890,0,Altra Urban Construction Ltd,Altra Urban Construction,Issued,2020-11-26,2021-12-31,Plumber & Gas Contractor,...,Victoria-Fraserview,1.0,155.0,2023-11-01T02:38:57-07:00,,,3683.0,2021-11-01,173272.0,1
1832073,2021,3724406,21-141890,0,Altra Urban Construction Ltd,Altra Urban Construction,Issued,2020-11-26,2021-12-31,Plumber & Gas Contractor,...,Victoria-Fraserview,1.0,155.0,2023-11-01T02:38:57-07:00,,,3683.0,2021-12-01,2022291.0,1
1832074,2021,3724406,21-141890,0,Altra Urban Construction Ltd,Altra Urban Construction,Issued,2020-11-26,2021-12-31,Plumber & Gas Contractor,...,Victoria-Fraserview,1.0,155.0,2023-11-01T02:38:57-07:00,,,3683.0,2021-12-01,2029942.0,1
