## Import packages

In [6]:
# preprocessing
import pandas as pd
import numpy as np
import datetime as dt
from functools import reduce

# viz
import altair as alt
alt.data_transformers.enable("vegafusion")
import matplotlib.pyplot as plt

## Loading dataset

In [17]:
# csv_url = 'https://opendata.vancouver.ca/api/explore/v2.1/catalog/datasets/business-licences/exports/csv?lang=en&timezone=America%2FLos_Angeles&use_labels=true&delimiter=%3B'
# business = pd.read_csv(csv_url, delimiter = ';')


# It takes a while to load data from the url, so... here's the shortcut!
# Just download the file above to your local machine, and put the file in the data folder
business = pd.read_csv('data/business-licences.csv', delimiter = ';')

raw_econ_index_data_dict = {
    'GDP': pd.read_csv('data/gdp_by_industry.csv'),
    'ConsumerPrice': pd.read_csv('data/consumer_price_index.csv'),
    'Employment': pd.read_csv('data/employment_by_industry.csv'),
    'InvestmentConstruction': pd.read_csv('data/investment_in_building_construction.csv')
}

  business = pd.read_csv('data/business-licences.csv', delimiter = ';')


## Preprocessing

### Business Lisence data
- Drop rows where `ExpiredDate` and `IssuedDate` are NA.
- Transform `ExpiredDate` and `IssuedDate` to date.
- Calculate the survival interval of each company, which is the difference between the maximum of ExpiredDate and the minimum of IssuedDate.
- Keep only the newest issued record of each company.
- Filter to keep those records where the latest `ExpiredDate` is before or equal to year 2022 because for those licenses issued in year 2023, the dafault `ExpiredDate` are `2023-12-31` and we cannot know whether it would survive until then.

In [8]:
# Drop rows where ExpiredDate and IssuedDate are NA
business = business.dropna(subset = ["ExpiredDate", "IssuedDate"])

# Transform ExpiredDate and IssuedDate to date
business[["ExpiredDate", "IssuedDate"]] = business[["ExpiredDate", "IssuedDate"]].apply(pd.to_datetime, utc=True)
business['ExpiredDate'] = business['ExpiredDate'].dt.date
business['IssuedDate'] = business['IssuedDate'].dt.date

# Calculate the survival interval of each company
business['survival_days'] = (business.groupby('BusinessName')['ExpiredDate'].transform('max')-
                            business.groupby('BusinessName')['IssuedDate'].transform('min'))
business['survival_days'] = pd.to_timedelta(business['survival_days']).dt.days

# Keep only the first issued record of each company (to obtain the year when a company starts it business)
business.sort_values(by='ExpiredDate', ascending=True)
business = business.drop_duplicates(subset='BusinessName', keep='first')

# Filter to keep those records where the latest `ExpiredDate` is before or equal to year 2022.
business = business[business['ExpiredDate'] <= dt.date(2022, 12, 31)]

business['FOLDERYEAR'] = business['FOLDERYEAR'].apply(lambda x : '20' + str(x))

### Macroeconomics Data
- Create a column `REF_YEAR` representing the year of `REF_DATE`
- Keep rows where `North American Industry Classification System (NAICS) == 'All industries'`, since it is time-consuming to manually map the `BusinessType` in business license dataset to the related industries, we will merely consider the overall GDP performance in this project.
- Keep rows where `REF_YEAR >= 2012`
- Keep columns `REF_YEAR` and `VALUE`

_Delete for now: Adjust `REF_DATE` from `%y-%m` to `%y-%m-01` by applying `pd.to_datetime`_

In [18]:
econList = []
for index_name, data in raw_econ_index_data_dict.items():
    data = data[['REF_DATE', 'VALUE']]
    data['REF_YEAR'] = data['REF_DATE'].apply(lambda x : int(str(x)[:4]))
    data = data[data['REF_YEAR'] >= 2012]
    data['REF_YEAR'] = data['REF_YEAR'].astype(str)
    data = data.drop(columns=['REF_DATE'])
    econList.append(data.rename(columns = {'VALUE': f'{index_name}Value', 
                                            'REF_YEAR': 'FOLDERYEAR'}
                                ).groupby('FOLDERYEAR').mean().reset_index())
    
econ = reduce(lambda df1, df2 : pd.merge(df1, df2, on='FOLDERYEAR', how='inner'), econList).drop_duplicates()
econ

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['REF_YEAR'] = data['REF_DATE'].apply(lambda x : int(str(x)[:4]))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['REF_YEAR'] = data['REF_DATE'].apply(lambda x : int(str(x)[:4]))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['REF_YEAR'] = data['REF_DATE'].apply(lambda x : int(str(x)[:

Unnamed: 0,FOLDERYEAR,GDPValue,ConsumerPriceValue,EmploymentValue,InvestmentConstructionValue
0,2012,1710429.0,1.758333,2296.708333,912810600.0
1,2013,1754173.0,1.266667,2320.475,987533900.0
2,2014,1803636.0,1.475,2348.983333,1036283000.0
3,2015,1820026.0,1.891667,2390.0,1146144000.0
4,2016,1839614.0,1.708333,2468.166667,1267791000.0
5,2017,1901971.0,1.225,2560.616667,1287776000.0
6,2018,1958470.0,1.8,2607.116667,1434572000.0
7,2019,1996744.0,2.166667,2676.116667,1657493000.0
8,2020,1897187.0,1.633333,2509.85,1519591000.0
9,2021,1991978.0,2.566667,2665.416667,1504690000.0


### Combine business lisence and macroeconomics data
- Map the yearly GDP value to the first lisence issued year of each company
- _Discussion: Since the threshold of survival for now is 2 years, whether or not we map the average of `GDP value of the first lisence issued year and the next year` instead of merely the first license issued year?_

In [14]:
business = business.merge(econ, on='FOLDERYEAR', how='inner')
business

Unnamed: 0,FOLDERYEAR,LicenceRSN,LicenceNumber,LicenceRevisionNumber,BusinessName,BusinessTradeName,Status,IssuedDate,ExpiredDate,BusinessType,...,NumberofEmployees,FeePaid,ExtractDate,Geom,geo_point_2d,survival_days,GDPValue,ConsumerPriceValue,EmploymentValue,InvestmentConstructionValue
0,2013,1786043,13-166627,0,Melissa Cheryl Aston (Melissa Aston),Kazoomko Productions,Issued,2012-12-29,2013-12-31,Entertainment Services,...,0.0,129.0,2019-07-21T13:49:06-07:00,,,1828.0,1.754173e+06,1.266667,2320.475000,9.875339e+08
1,2013,1786044,13-166628,0,Corus Radio Company,CHMJ AM730 and CFOX 99.3FM,Issued,2013-01-14,2013-12-31,Entertainment Services,...,0.0,129.0,2019-07-21T13:49:06-07:00,"{""coordinates"": [-123.119500778402, 49.2822434...","49.2822434350563, -123.119500778402",2908.0,1.754173e+06,1.266667,2320.475000,9.875339e+08
2,2013,1786048,13-166632,0,Jamieson Productions Inc,Jamieson Prod Inc,Issued,2013-09-12,2013-12-31,Entertainment Services,...,0.0,191.0,2019-07-21T13:49:06-07:00,,,1936.0,1.754173e+06,1.266667,2320.475000,9.875339e+08
3,2013,1786055,13-166639,0,(Jessica Minnie),Petite Pearl Wedding and Event Planning,Issued,2013-06-17,2013-12-31,Entertainment Services,...,0.0,191.0,2019-07-21T13:49:06-07:00,,,3849.0,1.754173e+06,1.266667,2320.475000,9.875339e+08
4,2013,1786065,13-166649,0,Holly Perrin Yoos (Holly Yoos),Copperplate Communications,Issued,2012-11-29,2013-12-31,Entertainment Services,...,0.0,129.0,2019-07-21T13:49:06-07:00,,,4049.0,1.754173e+06,1.266667,2320.475000,9.875339e+08
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
92312,2021,3859186,21-260945,0,Betterbite Ltd,,Issued,2021-08-16,2021-12-31,Moving/Transfer Service,...,1.0,138.0,2023-11-01T02:38:58-07:00,,,867.0,1.991978e+06,2.566667,2665.416667,1.504690e+09
92313,2021,3859492,21-261247,0,Vancouver Charcuterie Inc,Charcuterie Vancouver,Issued,2021-10-06,2021-12-31,Ltd Service Food Establishment,...,1.0,155.0,2023-11-01T02:38:58-07:00,"{""coordinates"": [-123.167908743383, 49.2680903...","49.2680903409461, -123.167908743383",816.0,1.991978e+06,2.566667,2665.416667,1.504690e+09
92314,2021,3859939,21-261673,0,Ian Martin Information Technology Inc,,Issued,2021-08-11,2021-12-31,Employment Agency,...,2.0,125.0,2023-11-01T02:38:58-07:00,"{""coordinates"": [-123.121522823224, 49.2870270...","49.2870270555211, -123.121522823224",142.0,1.991978e+06,2.566667,2665.416667,1.504690e+09
92315,2021,3860003,21-261730,0,Glee Road Productions Ltd,,Issued,2021-07-19,2021-12-31,Production Company,...,60.0,138.0,2023-11-01T02:38:58-07:00,"{""coordinates"": [-123.06398007395, 49.28178427...","49.2817842705027, -123.06398007395",165.0,1.991978e+06,2.566667,2665.416667,1.504690e+09


### Response Variable for Classification: survival_status

In [15]:
# To balance the amount of True & False, I set the threshold to 2 years 
survival_threshold = 730
business['survival_status'] = business['survival_days'] >= survival_threshold

# Adjust Boolean to 0, 1
business["survival_status"] = business["survival_status"].astype(int)
business

Unnamed: 0,FOLDERYEAR,LicenceRSN,LicenceNumber,LicenceRevisionNumber,BusinessName,BusinessTradeName,Status,IssuedDate,ExpiredDate,BusinessType,...,FeePaid,ExtractDate,Geom,geo_point_2d,survival_days,GDPValue,ConsumerPriceValue,EmploymentValue,InvestmentConstructionValue,survival_status
0,2013,1786043,13-166627,0,Melissa Cheryl Aston (Melissa Aston),Kazoomko Productions,Issued,2012-12-29,2013-12-31,Entertainment Services,...,129.0,2019-07-21T13:49:06-07:00,,,1828.0,1.754173e+06,1.266667,2320.475000,9.875339e+08,1
1,2013,1786044,13-166628,0,Corus Radio Company,CHMJ AM730 and CFOX 99.3FM,Issued,2013-01-14,2013-12-31,Entertainment Services,...,129.0,2019-07-21T13:49:06-07:00,"{""coordinates"": [-123.119500778402, 49.2822434...","49.2822434350563, -123.119500778402",2908.0,1.754173e+06,1.266667,2320.475000,9.875339e+08,1
2,2013,1786048,13-166632,0,Jamieson Productions Inc,Jamieson Prod Inc,Issued,2013-09-12,2013-12-31,Entertainment Services,...,191.0,2019-07-21T13:49:06-07:00,,,1936.0,1.754173e+06,1.266667,2320.475000,9.875339e+08,1
3,2013,1786055,13-166639,0,(Jessica Minnie),Petite Pearl Wedding and Event Planning,Issued,2013-06-17,2013-12-31,Entertainment Services,...,191.0,2019-07-21T13:49:06-07:00,,,3849.0,1.754173e+06,1.266667,2320.475000,9.875339e+08,1
4,2013,1786065,13-166649,0,Holly Perrin Yoos (Holly Yoos),Copperplate Communications,Issued,2012-11-29,2013-12-31,Entertainment Services,...,129.0,2019-07-21T13:49:06-07:00,,,4049.0,1.754173e+06,1.266667,2320.475000,9.875339e+08,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
92312,2021,3859186,21-260945,0,Betterbite Ltd,,Issued,2021-08-16,2021-12-31,Moving/Transfer Service,...,138.0,2023-11-01T02:38:58-07:00,,,867.0,1.991978e+06,2.566667,2665.416667,1.504690e+09,1
92313,2021,3859492,21-261247,0,Vancouver Charcuterie Inc,Charcuterie Vancouver,Issued,2021-10-06,2021-12-31,Ltd Service Food Establishment,...,155.0,2023-11-01T02:38:58-07:00,"{""coordinates"": [-123.167908743383, 49.2680903...","49.2680903409461, -123.167908743383",816.0,1.991978e+06,2.566667,2665.416667,1.504690e+09,1
92314,2021,3859939,21-261673,0,Ian Martin Information Technology Inc,,Issued,2021-08-11,2021-12-31,Employment Agency,...,125.0,2023-11-01T02:38:58-07:00,"{""coordinates"": [-123.121522823224, 49.2870270...","49.2870270555211, -123.121522823224",142.0,1.991978e+06,2.566667,2665.416667,1.504690e+09,0
92315,2021,3860003,21-261730,0,Glee Road Productions Ltd,,Issued,2021-07-19,2021-12-31,Production Company,...,138.0,2023-11-01T02:38:58-07:00,"{""coordinates"": [-123.06398007395, 49.28178427...","49.2817842705027, -123.06398007395",165.0,1.991978e+06,2.566667,2665.416667,1.504690e+09,0
