## Import packages

In [1]:
# stdlib
from pathlib import Path

# preprocessing
from DataPreprocess import *
from DataFetch import fetch_business_license, fetch_econ_indicators

# viz
import altair as alt
alt.data_transformers.enable("vegafusion")
import matplotlib.pyplot as plt

## Loading dataset

In [None]:
# Load the raw inputs, preferring local CSV copies over the slow URL download.
# Fast path: download the source files once and place them in the data folder.
# Previously this cell always ran the URL fetch and then overwrote the result
# with the local reads, so the fetch was wasted and a missing local file crashed
# the cell; now each source is loaded exactly once.
DATA_DIR = Path('../data')
BUSINESS_CSV = DATA_DIR / 'business-licences.csv'
ECON_CSVS = {
    'GDP': DATA_DIR / 'gdp_by_industry.csv',
    'ConsumerPrice': DATA_DIR / 'consumer_price_index.csv',
    'Employment': DATA_DIR / 'employment_by_industry.csv',
    'InvestmentConstruction': DATA_DIR / 'investment_in_building_construction.csv',
}

# Business licences: the local export is read as a semicolon-delimited file.
# NOTE(review): fetch_business_license() is assumed to return an equivalent
# DataFrame, since the original cell used both sources interchangeably.
if BUSINESS_CSV.exists():
    business = pd.read_csv(BUSINESS_CSV, delimiter=';')
else:
    business = fetch_business_license()

# Economic indicators: use the local copies only when every file is present,
# so the dict never mixes local and fetched tables; otherwise fetch them all.
# NOTE(review): fetch_econ_indicators() is assumed to return a dict with the
# same four keys — confirm against DataFetch.
if all(csv_path.exists() for csv_path in ECON_CSVS.values()):
    raw_econ_index_data_dict = {
        indicator: pd.read_csv(csv_path)
        for indicator, csv_path in ECON_CSVS.items()
    }
else:
    raw_econ_index_data_dict = fetch_econ_indicators()

Now loading: business_license data
Now loading: GDP data
Now loading: ConsumerPrice data
Now loading: Employment data
Now loading: InvestmentConstruction data


## Preprocessing

### Business License data
#### Clean-up
- Drop rows where `ExpiredDate` and `IssuedDate` are NA.
- Transform `ExpiredDate` and `IssuedDate` to date.
- Calculate the survival interval of each company, which is the difference between the maximum of ExpiredDate and the minimum of IssuedDate.
- Keep only the newest issued record of each company.
- Keep only records whose latest `ExpiredDate` is in or before 2022: licenses issued in 2023 have a default `ExpiredDate` of `2023-12-31`, so we cannot yet know whether those businesses will survive until then.

#### Response Variable for Classification: survival_status
- To balance the number of True and False labels, set the survival threshold to 2 years.
- Adjust Boolean value to 0, 1

In [None]:
# Run the licence clean-up described in the steps above via business_datacleaning.
# survival_threshold is 365 * 2, i.e. the 2-year cut-off (presumably counted in days —
# confirm against business_datacleaning).
# NOTE(review): this rebinds `business`, so re-running the cell passes the
# already-cleaned frame back into the cleaning function.
business = business_datacleaning(business = business, survival_threshold = 365 * 2)
business

### Macroeconomics Data
- Create a column `REF_YEAR` representing the year of `REF_DATE`
- Keep rows where `North American Industry Classification System (NAICS) == 'All industries'`. Manually mapping each `BusinessType` in the business license dataset to its related industries would be time-consuming, so this project only considers the overall GDP performance.
- Keep rows where `REF_YEAR >= 2012`
- Keep columns `REF_YEAR` and `VALUE`

In [None]:
# Clean the raw indicator tables (the dict keyed by GDP, ConsumerPrice, Employment,
# InvestmentConstruction) following the steps listed above, then display the result.
econ = econ_datacleaning(raw_econ_index_data_dict)
econ

### Combine business license and macroeconomics data
- Map the yearly GDP value to the year each company's first license was issued (the year the company started its business).

In [None]:
# Combine the cleaned licence records with the cleaned economic data by year.
# Per the note above, the join year is each company's first licence issue year
# (TODO confirm in merge_business_econ_by_year).
business_econ = merge_business_econ_by_year(business, econ)

In [None]:
# Display the result of the merge for inspection.
business_econ