## Import packages

In [87]:
import pandas as pd
import datetime as dt

## Loading dataset

In [88]:
# csv_url = 'https://opendata.vancouver.ca/api/explore/v2.1/catalog/datasets/business-licences/exports/csv?lang=en&timezone=America%2FLos_Angeles&use_labels=true&delimiter=%3B'
# business = pd.read_csv(csv_url, delimiter = ';')

# It takes a while to load data from the url, so... here's the shortcut!
# Just download the file above to your local machine, and put the file in the data folder
business = pd.read_csv('data/business-licences.csv', delimiter = ';')

  business = pd.read_csv('business-licences.csv', delimiter = ';')


## Preprocessing

### Cleaning data
- Drop rows where `ExpiredDate` and `IssuedDate` are NA.
- Transform `ExpiredDate` and `IssuedDate` to date.
- Calculate the survival interval of each company, which is the difference between the maximum of ExpiredDate and the minimum of IssuedDate.
- Keep only the newest issued record of each company.
- Filter to keep those records where the latest `ExpiredDate` is before or equal to year 2022 because for those licenses issued in year 2023, the dafault `ExpiredDate` are `2023-12-31` and we cannot know whether it would survive until then.

In [89]:
# Drop rows where ExpiredDate and IssuedDate are NA
business = business.dropna(subset = ["ExpiredDate", "IssuedDate"])

# Transform ExpiredDate and IssuedDate to date
business[["ExpiredDate", "IssuedDate"]] = business[["ExpiredDate", "IssuedDate"]].apply(pd.to_datetime, utc=True)
business['ExpiredDate'] = business['ExpiredDate'].dt.date
business['IssuedDate'] = business['IssuedDate'].dt.date

# Calculate the survival interval of each company
business['survival_days'] = (business.groupby('BusinessName')['ExpiredDate'].transform('max')-
                            business.groupby('BusinessName')['IssuedDate'].transform('min'))

# Keep only the newest issued record of each company
business.sort_values(by='ExpiredDate', ascending=True)
business = business.drop_duplicates(subset='BusinessName', keep='last')

# Filter to keep those records where the latest `ExpiredDate` is before or equal to year 2022.
business = business[business['ExpiredDate'] <= dt.date(2022, 12, 31)]

In [86]:
# business[(business["IssuedDate"] >= dt.date(2023, 1, 1)) & (business["Status"] == 'Issued')]
# business.iloc[381746, :]
# business[business['BusinessName'] == 'Lehail Construction Ltd']
# business.sort_values(by='ExpiredDate', ascending=True)

### Response Variable for Classification: survival_status

In [91]:
survival_threshold = dt.timedelta(days=365)
business['survival_status'] = business['survival_days'] >= survival_threshold
business

Unnamed: 0,FOLDERYEAR,LicenceRSN,LicenceNumber,LicenceRevisionNumber,BusinessName,BusinessTradeName,Status,IssuedDate,ExpiredDate,BusinessType,...,Country,PostalCode,LocalArea,NumberofEmployees,FeePaid,ExtractDate,Geom,geo_point_2d,survival_days,survival_status
21,13,1786109,13-166693,0,David Andrew Goodfellow (David Goodfellow),Bramblebutt Productions,Issued,2012-12-22,2013-12-31,Entertainment Services,...,CA,,Mount Pleasant,1.0,129.0,2019-07-21T13:49:06-07:00,,,"374 days, 0:00:00",True
26,13,1786136,13-166720,0,William David McKnight (William McKnight),,Issued,2012-12-28,2013-12-31,Entertainment Services,...,CA,,Grandview-Woodland,0.0,129.0,2019-07-21T13:49:06-07:00,,,"368 days, 0:00:00",True
27,13,1786138,13-166722,0,(Thomas MacDonald),Mr. MacDonald Music,Issued,2013-01-30,2013-12-31,Entertainment Services,...,CA,,Grandview-Woodland,0.0,169.0,2019-07-21T13:49:06-07:00,,,"335 days, 0:00:00",False
29,13,1786144,13-166728,0,(Janet Morrison),Janet Love Morrison Editing,Issued,2012-12-22,2013-12-31,Entertainment Services,...,CA,,West End,1.0,129.0,2019-07-21T13:49:06-07:00,,,"374 days, 0:00:00",True
36,13,1786165,13-166749,0,Laura Rose Martin Barreca (Laura Barreca),,Issued,2013-06-14,2013-12-31,Entertainment Services,...,CA,,Mount Pleasant,0.0,151.0,2019-07-21T13:49:06-07:00,,,"200 days, 0:00:00",False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
646924,21,3724155,21-141639,0,Six Oh Four Printing Ltd,Made in Print,Issued,2020-12-01,2021-12-31,Printing Services,...,CA,V6B 1C2,Downtown,5.0,155.0,2023-11-01T02:38:57-07:00,"{""coordinates"": [-123.114538756358, 49.2766297...","49.2766297368584, -123.114538756358","2348 days, 0:00:00",True
646928,21,3724163,21-141647,0,Saing Chun Derek Chan (Saing Chan),RSVP Wedding Invitations & Printing,Issued,2021-02-10,2021-12-31,Printing Services,...,CA,V6A 1H9,Strathcona,1.0,195.0,2023-11-01T02:38:57-07:00,"{""coordinates"": [-123.08364749238, 49.28233634...","49.2823363433113, -123.08364749238","3672 days, 0:00:00",True
646935,21,3724233,21-141717,0,Ricardo W Thaller (Ricardo Thaller),,Issued,2021-04-07,2021-12-31,Plumber & Gas Contractor,...,CA,,South Cambie,1.0,390.0,2023-11-01T02:38:57-07:00,,,"3220 days, 0:00:00",True
646950,21,3724334,21-141818,0,Vital Plumbing And Heating Inc,,Inactive,2021-02-08,2021-12-31,Plumber & Gas Contractor,...,CA,,West End,0.0,195.0,2023-11-01T02:38:57-07:00,,,"1255 days, 0:00:00",True


## EDA & Visualization