## Import packages

In [31]:
# preprocessing
import pandas as pd
import numpy as np
import datetime as dt

# viz
import altair as alt
alt.data_transformers.enable("vegafusion")
import matplotlib.pyplot as plt


## Loading dataset

In [32]:
# csv_url = 'https://opendata.vancouver.ca/api/explore/v2.1/catalog/datasets/business-licences/exports/csv?lang=en&timezone=America%2FLos_Angeles&use_labels=true&delimiter=%3B'
# business = pd.read_csv(csv_url, delimiter = ';')

# It takes a while to load data from the url, so... here's the shortcut!
# Just download the file above to your local machine, and put the file in the data folder
business = pd.read_csv('data/business-licences.csv', delimiter = ';')

  business = pd.read_csv('data/business-licences.csv', delimiter = ';')


## Preprocessing

### Cleaning data
- Drop rows where `ExpiredDate` and `IssuedDate` are NA.
- Transform `ExpiredDate` and `IssuedDate` to date.
- Calculate the survival interval of each company, which is the difference between the maximum of ExpiredDate and the minimum of IssuedDate.
- Keep only the newest issued record of each company.
- Filter to keep those records where the latest `ExpiredDate` is before or equal to year 2022 because for those licenses issued in year 2023, the dafault `ExpiredDate` are `2023-12-31` and we cannot know whether it would survive until then.

In [33]:
# Drop rows where ExpiredDate and IssuedDate are NA
business = business.dropna(subset = ["ExpiredDate", "IssuedDate"])

# Transform ExpiredDate and IssuedDate to date
business[["ExpiredDate", "IssuedDate"]] = business[["ExpiredDate", "IssuedDate"]].apply(pd.to_datetime, utc=True)
business['ExpiredDate'] = business['ExpiredDate'].dt.date
business['IssuedDate'] = business['IssuedDate'].dt.date

# Calculate the survival interval of each company
business['survival_days'] = (business.groupby('BusinessName')['ExpiredDate'].transform('max')-
                            business.groupby('BusinessName')['IssuedDate'].transform('min'))
business['survival_days'] = pd.to_timedelta(business['survival_days']).dt.days

# Keep only the newest issued record of each company
business.sort_values(by='ExpiredDate', ascending=True)
business = business.drop_duplicates(subset='BusinessName', keep='last')

# Filter to keep those records where the latest `ExpiredDate` is before or equal to year 2022.
business = business[business['ExpiredDate'] <= dt.date(2022, 12, 31)]

### Response Variable for Classification: survival_status

In [34]:
survival_threshold = 730
business['survival_status'] = business['survival_days'] >= survival_threshold
business["survival_status"] = business["survival_status"].astype(int)
business

Unnamed: 0,FOLDERYEAR,LicenceRSN,LicenceNumber,LicenceRevisionNumber,BusinessName,BusinessTradeName,Status,IssuedDate,ExpiredDate,BusinessType,...,Country,PostalCode,LocalArea,NumberofEmployees,FeePaid,ExtractDate,Geom,geo_point_2d,survival_days,survival_status
553,15,2335251,15-105551,0,Augustin Eduardo Carrasco Barrera (Augustin Ba...,MexiChurros Cafe,Issued,2014-12-08,2015-12-31,Caterer,...,CA,V6A 1Z5,Strathcona,1.0,379.0,2019-07-21T13:49:14-07:00,"{""coordinates"": [-123.078587976369, 49.2792344...","49.2792344627795, -123.078587976369",388.0,0
712,15,2335735,15-106035,0,Eweb Domains Inc,Eweb Development Group,Issued,2014-12-03,2015-12-31,Computer Services,...,CA,V6B 2P6,Downtown,1.0,143.0,2019-07-21T13:49:14-07:00,"{""coordinates"": [-123.116083523591, 49.2770930...","49.2770930304835, -123.116083523591",393.0,0
1271,15,2338409,15-108708,0,So Well Development,,Issued,2015-04-20,2015-12-31,Contractor,...,CA,,Victoria-Fraserview,1.0,184.0,2019-07-21T13:49:14-07:00,,,255.0,0
1557,15,2340320,15-110617,0,Techwise Services Limited,,Issued,2015-02-10,2015-12-31,Contractor - Special Trades,...,CA,V6A 2A9,Strathcona,1.0,,2019-07-21T13:49:14-07:00,"{""coordinates"": [-123.078354441262, 49.2788469...","49.2788469290828, -123.078354441262",324.0,0
2198,17,2800432,17-124135,0,Nu Nu San,Nu Nu's Sandwich Bar,Gone Out of Business,2016-11-30,2017-12-31,Ltd Service Food Establishment,...,CA,V6J 1W6,Fairview,0.0,503.0,2020-01-01T02:32:03-08:00,"{""coordinates"": [-123.138969787586, 49.2638982...","49.2638982459797, -123.138969787586",396.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
646897,22,4040891,22-215254,0,(Mario Bessette),,Issued,2022-05-30,2022-12-31,Retail Dealer,...,CA,,Mount Pleasant,0.0,185.0,2023-11-01T02:39:02-07:00,,,215.0,0
646901,22,4040898,22-215261,0,DGrand Design & Build Inc,DG Design & Build,Issued,2022-05-05,2022-12-31,Contractor,...,CA,,Marpole,0.0,210.0,2023-11-01T02:39:02-07:00,,,240.0,0
646902,22,4040899,22-215262,0,Spell Love Road Productions Ltd,,Issued,2022-05-16,2022-12-31,Production Company,...,CA,V5L 1R2,Grandview-Woodland,60.0,185.0,2023-11-01T02:39:02-07:00,"{""coordinates"": [-123.06398007395, 49.28178427...","49.2817842705027, -123.06398007395",229.0,0
646921,22,4041005,22-215360,0,TT Fasteners Ltd,,Issued,2022-05-20,2022-12-31,Retail Dealer,...,CA,,Kerrisdale,1.0,172.0,2023-11-01T02:39:02-07:00,,,225.0,0


## EDA & Visualization

In [35]:
business.describe()

Unnamed: 0,FOLDERYEAR,LicenceRSN,LicenceRevisionNumber,NumberofEmployees,FeePaid,survival_days,survival_status
count,50884.0,50884.0,50884.0,50884.0,50564.0,50884.0,50884.0
mean,16.496246,2716622.0,0.024153,4.381574,234.853841,1179.180725,0.568017
std,2.948977,691169.5,0.160412,27.305559,697.643942,959.213107,0.495357
min,13.0,1771189.0,0.0,0.0,1.0,-147.0,0.0
25%,14.0,2149366.0,0.0,0.0,129.0,379.0,0.0
50%,16.0,2589331.0,0.0,1.0,152.0,859.0,1.0
75%,19.0,3273749.0,0.0,3.0,187.0,1805.0,1.0
max,22.0,4201432.0,3.0,1966.0,49089.0,3697.0,1.0


In [36]:
business.info()

<class 'pandas.core.frame.DataFrame'>
Index: 50884 entries, 553 to 646960
Data columns (total 27 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   FOLDERYEAR             50884 non-null  int64  
 1   LicenceRSN             50884 non-null  int64  
 2   LicenceNumber          50884 non-null  object 
 3   LicenceRevisionNumber  50884 non-null  int64  
 4   BusinessName           50884 non-null  object 
 5   BusinessTradeName      24160 non-null  object 
 6   Status                 50884 non-null  object 
 7   IssuedDate             50884 non-null  object 
 8   ExpiredDate            50884 non-null  object 
 9   BusinessType           50884 non-null  object 
 10  BusinessSubType        31724 non-null  object 
 11  Unit                   12811 non-null  object 
 12  UnitType               12799 non-null  object 
 13  House                  24550 non-null  object 
 14  Street                 24551 non-null  object 
 15  City

### survival_status value_counts

In [37]:
business['survival_status'].value_counts()

survival_status
1    28903
0    21981
Name: count, dtype: int64

### survival_status rate v.s. BusinessType

In [41]:
busi_type_rate = business[['BusinessType', 'survival_status']]
busi_type_rate = busi_type_rate.groupby('BusinessType').agg(count=('survival_status', 'size'), survival_rate=('survival_status', 'mean')).reset_index()

alt.data_transformers.enable('vegafusion')

alt.Chart(busi_type_rate).mark_point().encode(
    x=alt.X('survival_rate'),
    y=alt.Y('count'),
    tooltip='BusinessType'
)

ImportError: The "vegafusion" data transformer and chart.transformed_data feature requires
version 1.4.0 or greater of the 'vegafusion-python-embed' and 'vegafusion' packages.
These can be installed with pip using:
    pip install "vegafusion[embed]>=1.4.0"
Or with conda using:
    conda install -c conda-forge "vegafusion-python-embed>=1.4.0" "vegafusion>=1.4.0"

ImportError: vegafusion

alt.Chart(...)

### survival_status rate v.s. City
- Limited in Vancouver, for most of the businesses in this dataset are in Vancouver, BC
- Most of the businesses are located in Downtown 
- There are no significant differences among the survival rates across areas.

In [None]:
business.groupby('Province').size().reset_index()

Unnamed: 0,Province,0
0,78,1
1,AB,101
2,AL,1
3,AZ,1
4,Ab,1
5,BC,50308
6,British Columbia,1
7,CA,73
8,CO,3
9,CT,2


In [None]:
business[business['Province'] == 'BC'].groupby('City').size().reset_index().sort_values(by=0, ascending = False)

Unnamed: 0,City,0
166,Vancouver,41343
18,Burnaby,1652
155,Surrey,1555
131,Richmond,931
107,North Vancouver,881
...,...,...
75,Lantzville,1
74,Langley Township,1
72,Langey,1
70,Ladysmith,1


In [None]:
city_rate = business[business['Province'] == 'BC'][['City', 'survival_status']]
city_rate = city_rate.groupby('City').agg(count=('survival_status', 'size'), survival_rate=('survival_status', 'mean')).reset_index()

alt.Chart(city_rate).mark_point().encode(
    x=alt.X('survival_rate'),
    y=alt.Y('count'),
    tooltip='City'
)

In [None]:
local_area_rate = business[business['City'] == 'Vancouver'][['LocalArea', 'survival_status']]
local_area_rate = local_area_rate.groupby('LocalArea').agg(count=('survival_status', 'size'), survival_rate=('survival_status', 'mean')).reset_index()

alt.Chart(local_area_rate).mark_point().encode(
    x=alt.X('survival_rate'),
    y=alt.Y('count'),
    tooltip='LocalArea'
)

### survival_status v.s. NumberofEmployees
As last part, we focus in the Vancouver city only.
- There seems no specific threshold in regards of NumberofEmployees. But we can still use LogisticRegression to see whether it does.

In [None]:
alt.Chart(business[business['City'] == 'Vancouver'][['NumberofEmployees', 'survival_status']]).mark_point().encode(
    x=alt.X('NumberofEmployees'),
    y=alt.Y('survival_status'),
)

ImportError: The "vegafusion" data transformer and chart.transformed_data feature requires
version 1.4.0 or greater of the 'vegafusion-python-embed' and 'vegafusion' packages.
These can be installed with pip using:
    pip install "vegafusion[embed]>=1.4.0"
Or with conda using:
    conda install -c conda-forge "vegafusion-python-embed>=1.4.0" "vegafusion>=1.4.0"

ImportError: vegafusion

alt.Chart(...)

In [42]:
from pandas_profiling import ProfileReport

profile = ProfileReport(business, title="Pandas Profiling Report")  # , minimal=True)
profile.to_notebook_iframe()

PydanticImportError: `BaseSettings` has been moved to the `pydantic-settings` package. See https://docs.pydantic.dev/2.5/migration/#basesettings-has-moved-to-pydantic-settings for more details.

For further information visit https://errors.pydantic.dev/2.5/u/import-error