## Import packages

In [None]:
# preprocessing
from DataPreprocess import *
from DataFetch import fetch_business_license, fetch_econ_indicators

# viz
import altair as alt
alt.data_transformers.enable("vegafusion")
import matplotlib.pyplot as plt

## Loading dataset

In [None]:
# Explicit import: `pd` was otherwise only in scope if `from DataPreprocess import *`
# happens to re-export pandas.
import pandas as pd

# Option 1: fetch the data by URL (already modularized in DataFetch).
# business = fetch_business_license()
# raw_econ_index_data_dict = fetch_econ_indicators()

# Option 2 (used here): loading from the URLs takes a while, so read local copies.
# Download the files to your local machine and put them in the data folder first.
business = pd.read_csv('data/business-licences.csv', delimiter=';')  # ';'-delimited file

# Raw economic-indicator tables, keyed by indicator name.
raw_econ_index_data_dict = {
    'GDP': pd.read_csv('data/gdp_by_industry.csv'),
    'ConsumerPrice': pd.read_csv('data/consumer_price_index.csv'),
    'Employment': pd.read_csv('data/employment_by_industry.csv'),
    'InvestmentConstruction': pd.read_csv('data/investment_in_building_construction.csv'),
}

## Preprocessing

In [None]:
# Run the project's cleaning step on the raw licence table, then display the result.
# survival_threshold is 2 * 365 = 730 (presumably days, i.e. two years — confirm in DataPreprocess).
# NOTE(review): this step is expected to drop rows where ExpiredDate and IssuedDate
# are NA — verify against business_datacleaning.
business = business_datacleaning(
    business=business,
    survival_threshold=2 * 365,
)
business

In [None]:
# Pass the dict of raw economic-indicator tables (keyed by indicator name) through
# the project's cleaning helper, then display the result.
econ = econ_datacleaning(raw_econ_index_data_dict)
econ

In [None]:
# Combine the cleaned business and economic data into one table.
# NOTE(review): judging by the helper's name the join key is the year — confirm in DataPreprocess.
business_econ = merge_business_econ_by_year(business, econ)

In [None]:
# List the columns available in the merged table.
business_econ.columns

## EDA & Visualization

In [None]:
# Print a summary of the merged table (columns, dtypes, non-null counts).
business_econ.info()

In [None]:
# Summary statistics for every column; include='all' covers non-numeric columns too.
# NOTE(review): this describes `business`, not the merged `business_econ` — confirm that is intended.
business.describe(include='all')

### Numeric Features

In [None]:
# Economic-indicator columns whose densities are plotted together.
# NumberofEmployees and FeePaid are saved for later due to their large variance.
numeric_features = [
    'GDPValue',
    'ConsumerPriceValue',
    'EmploymentValue',
    'InvestmentConstructionValue',
]

In [None]:
# One density chart per numeric feature: the density transform is grouped by
# survival_status, and the resulting areas are overlaid (unstacked) and
# semi-transparent so the groups can be compared.
charts_numeric = [
    (
        alt.Chart(business_econ)
        .transform_density(
            feature,
            groupby=['survival_status'],
            as_=[feature, 'density'],
        )
        .mark_area(opacity=0.5)
        .encode(
            x=alt.X(feature, title=feature).stack(False),
            y='density:Q',
            color=alt.Color('survival_status:O').scale(scheme='dark2'),
        )
        .properties(width=180, height=120)
    )
    for feature in numeric_features
]

# Lay the charts out two per row: each consecutive pair is concatenated
# horizontally, and the resulting rows are stacked vertically.
chart_grid = alt.vconcat(
    *(
        alt.hconcat(*charts_numeric[start:start + 2])
        for start in range(0, len(charts_numeric), 2)
    )
)

In [None]:
# Density of NumberofEmployees, one overlaid area per survival_status.
# The x-axis scale domain is pinned to [0, 5000].
employee = (
    alt.Chart(business_econ)
    .transform_density(
        'NumberofEmployees',
        groupby=['survival_status'],
        as_=['NumberofEmployees', 'density'],
    )
    .mark_area(opacity=0.5)
    .encode(
        x=alt.X(
            'NumberofEmployees',
            title='NumberofEmployees',
            scale=alt.Scale(domain=[0, 5000]),
        ).stack(False),
        y='density:Q',
        color=alt.Color('survival_status:O').scale(scheme='dark2'),
    )
    .properties(width=180, height=120)
)

In [None]:
# Density of FeePaid, one overlaid area per survival_status.
# The x-axis scale domain is pinned to [0, 5000].
feepaid = (
    alt.Chart(business_econ)
    .transform_density(
        'FeePaid',
        groupby=['survival_status'],
        as_=['FeePaid', 'density'],
    )
    .mark_area(opacity=0.5)
    .encode(
        x=alt.X(
            'FeePaid',
            title='FeePaid',
            scale=alt.Scale(domain=[0, 5000]),
        ).stack(False),
        y='density:Q',
        color=alt.Color('survival_status:O').scale(scheme='dark2'),
    )
    .properties(width=180, height=120)
)


In [None]:
# Display the two-per-row grid of density charts for the economic indicators.
chart_grid

In [None]:
# Stack the NumberofEmployees and FeePaid density charts vertically and display them.
alt.vconcat(employee, feepaid)

### Categorical Features

In [None]:
# Categorical columns explored in this section.
categorical_features = [
    'Province',
    'LocalArea',
    'BusinessType',
]

In [None]:
# Record counts per LocalArea (bars sorted by descending count), coloured by
# survival_status and split into one wrapped panel per survival_status,
# at most two panels per row.
(
    alt.Chart(business_econ)
    .mark_bar(opacity=0.5)
    .encode(
        alt.X('LocalArea', sort='-y').stack(False),
        y='count()',
        color=alt.Color('survival_status:O').scale(scheme='dark2'),
    )
    .facet('survival_status:O', columns=2)
)

In [None]:
# Record counts per province.
# Since BC contains most of the data, we first look into records in BC Province.
business_econ['Province'].value_counts()

In [None]:
# Record counts for BC only, coloured by survival_status and split into one
# panel per survival_status.
# The facet is given as a single wrapped field rather than `column=...`:
# `columns` only applies to a wrapped facet and is ignored when combined with
# a row/column facet, so the previous `column=..., columns=2` had no effect.
# This also matches the facet form used by the LocalArea chart.
(
    alt.Chart(business_econ)
    .transform_filter(alt.datum.Province == 'BC')
    .mark_bar(opacity=0.5)
    .encode(
        x='Province:N',
        y='count()',
        color=alt.Color('survival_status:O', scale=alt.Scale(scheme='dark2')),
    )
    .facet('survival_status:O', columns=2)
)

In [None]:
# Record counts per business type (value_counts lists the most frequent first).
business_econ['BusinessType'].value_counts()

In [None]:
# The 20 most frequent business types (value_counts is sorted by descending count).
top_20_business_types = business_econ['BusinessType'].value_counts().head(20).index.tolist()
# Legacy alias: this list was previously (misleadingly) named after provinces;
# kept so any later cell that still uses the old name keeps working.
top_20_provinces = top_20_business_types

# Filter to include only the top 20 business types
filtered = business_econ[business_econ['BusinessType'].isin(top_20_business_types)]

# Record counts per business type (bars sorted by descending count), coloured by
# survival_status and split into one panel per survival_status.
# The facet is given as a single wrapped field rather than `column=...`:
# `columns` only applies to a wrapped facet and is ignored when combined with
# a row/column facet, so the previous `column=..., columns=2` had no effect.
(
    alt.Chart(filtered)
    .mark_bar(opacity=0.5)
    .encode(
        x=alt.X('BusinessType:N', sort='-y'),
        y='count()',
        color=alt.Color('survival_status:O', scale=alt.Scale(scheme='dark2')),
    )
    .facet('survival_status:O', columns=2)
)