In [None]:
from DataPreprocess import *
from DataFetch import *

import pandas as pd
import numpy as np

#sklearn tool
from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_transformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import cross_validate

# Preprocess / transform
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import (
    LabelEncoder,
    OneHotEncoder,
    OrdinalEncoder,
    StandardScaler,
)

# models
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.linear_model import LogisticRegression

### Analysis

We used Logistic Regression and BernoulliNB to predict business survival in Vancouver because the target, whether a business survives or not, is binary.

##### Why Logistic Regression and BernoulliNB?
Logistic Regression is effective when the outcome is binary, making it appropriate for predicting whether a business survives or not. We also chose it for its interpretability: it is a linear model that provides a coefficient for each predictor variable, making it easy to interpret the impact of each variable on the predicted outcome. This can be crucial for understanding the economic and demographic factors influencing business survival.

BernoulliNB, a variant of Naive Bayes, accommodates binary outcomes, aligning with the nature of the task where businesses either survive or fail. It excels in handling categorical features and is effective in scenarios with sparse data, making it well-suited for the diverse and potentially sparse economic and demographic factors influencing business longevity in the city.

##### Train Test Split
We are using 70% of our data as training data and the remaining 30% is used as test data.


##### Results
Logistic Regression performs better: its mean 10-fold cross-validation accuracy for predicting whether a business will survive is 79.1% (mean train accuracy 79.2%). BernoulliNB is lower, with a mean cross-validation accuracy of 74.5% (mean train accuracy 74.6%).

In [2]:
# Live-fetch alternative to the local CSV snapshots used below:
# business = fetch_business_license()
# raw_econ_index_data_dict = fetch_econ_indices()

# Business licences come from a semicolon-delimited CSV; the cleaning step is
# given a survival threshold of 730 days.
business = business_datacleaning(
    pd.read_csv('data/business-licences.csv', delimiter=';'),
    survival_threshold=730,
)

# One raw frame per economic index, keyed by index name.
raw_econ_index_data_dict = {
    index_name: pd.read_csv(csv_path)
    for index_name, csv_path in (
        ('GDP', 'data/gdp_by_industry.csv'),
        ('ConsumerPrice', 'data/consumer_price_index.csv'),
        ('Employment', 'data/employment_by_industry.csv'),
        ('InvestmentConstruction', 'data/investment_in_building_construction.csv'),
    )
}

# Keep only licences whose City is Vancouver, then attach the yearly indices.
business = business.loc[business['City'].eq('Vancouver')]
econ = econ_datacleaning(raw_econ_index_data_dict)
business_econ = merge_business_econ_by_year(business, econ)
business_econ

  business = business_datacleaning(pd.read_csv('data/business-licences.csv', delimiter = ';'), survival_threshold = 730)


Unnamed: 0,FOLDERYEAR,LicenceRSN,LicenceNumber,LicenceRevisionNumber,BusinessName,BusinessTradeName,Status,IssuedDate,ExpiredDate,BusinessType,...,FeePaid,ExtractDate,Geom,geo_point_2d,survival_days,survival_status,GDPValue,ConsumerPriceValue,EmploymentValue,InvestmentConstructionValue
0,2015,2333488,15-103790,0,Hollyhock Properties Ltd,,Issued,2014-12-03,2015-12-31,Apartment House Strata,...,64.0,2019-07-21T13:49:14-07:00,"{""coordinates"": [-123.116856730836, 49.2678622...","49.2678622929998, -123.116856730836",4048.0,1,1.820026e+06,1.891667,2390.000000,1.146144e+09
1,2015,2333496,15-103798,0,(Zandra Paleczny),,Issued,2014-11-06,2015-12-31,Apartment House Strata,...,64.0,2019-07-21T13:49:14-07:00,"{""coordinates"": [-123.133925222671, 49.2796620...","49.2796620031115, -123.133925222671",4045.0,1,1.820026e+06,1.891667,2390.000000,1.146144e+09
2,2015,2333501,15-103803,0,(Dave Dixon),,Issued,2014-11-14,2015-12-31,Apartment House Strata,...,64.0,2019-07-21T13:49:14-07:00,"{""coordinates"": [-123.124998311257, 49.2836868...","49.2836868407842, -123.124998311257",2208.0,1,1.820026e+06,1.891667,2390.000000,1.146144e+09
3,2015,2333502,15-103804,0,Henry B Yuen (Henry Yuen),,Issued,2014-12-05,2015-12-31,Apartment House Strata,...,64.0,2019-07-21T13:49:14-07:00,"{""coordinates"": [-123.132003087572, 49.2741705...","49.2741705397492, -123.132003087572",4007.0,1,1.820026e+06,1.891667,2390.000000,1.146144e+09
4,2015,2333506,15-103808,0,Tsang & Lee Enterprises Inc,,Issued,2015-01-07,2015-12-31,Apartment House Strata,...,64.0,2019-07-21T13:49:14-07:00,"{""coordinates"": [-123.117431658016, 49.2687000...","49.2687000536747, -123.117431658016",4015.0,1,1.820026e+06,1.891667,2390.000000,1.146144e+09
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
79390,2022,4042383,22-216683,0,(XueYang Hu),,Issued,2022-05-27,2022-12-31,Apartment House Strata,...,123.0,2023-11-01T02:39:02-07:00,,,583.0,0,2.064208e+06,5.425000,2748.241667,1.794664e+09
79391,2022,4042384,22-216684,0,Frank F Wu & Su-Chi L Wu,,Issued,2022-09-14,2022-12-31,Apartment House Strata,...,143.0,2023-11-01T02:39:02-07:00,,,473.0,0,2.064208e+06,5.425000,2748.241667,1.794664e+09
79392,2022,4042552,22-216852,0,Evermark Real Estate Services Inc,Evermark Real Estate Services,Issued,2022-04-21,2022-12-31,Real Estate Dealer,...,185.0,2023-11-01T02:39:02-07:00,"{""coordinates"": [-123.133618556976, 49.2039954...","49.2039954491368, -123.133618556976",619.0,0,2.064208e+06,5.425000,2748.241667,1.794664e+09
79393,2022,4042611,22-142223,1,RG Plumbing Ltd,,Issued,2022-04-22,2022-12-31,Plumber & Sprinkler Contractor,...,11.0,2023-11-01T02:39:02-07:00,,,618.0,0,2.064208e+06,5.425000,2748.241667,1.794664e+09


In [3]:
# Show the column names of the frame returned by econ_datacleaning().
econ.columns

Index(['FOLDERYEAR', 'GDPValue', 'ConsumerPriceValue', 'EmploymentValue',
       'InvestmentConstructionValue'],
      dtype='object')

In [4]:
## Create the column transformer
# imp = make_column_transformer(
#     ("drop", drop_features),
#     (SimpleImputer(strategy="most_frequent"), word_features + categorical_features),  # missing_values='NaN'
#     (SimpleImputer(strategy="median"), numeric_features),  # missing_values='NaN'
# )
# preprocessor = make_column_transformer(  
#     (CountVectorizer(binary=True), [0]),  # BusinessType
#     (OneHotEncoder(drop="if_binary", sparse_output=False, handle_unknown='ignore'), [1, 2]),  # categorical
#     (StandardScaler(), [3, 4])  # numeric
# )


In [5]:
def transform(df, word_features, categorical_features, numeric_features):
    """Fit the preprocessing pipelines on ``df`` and return one dense feature matrix.

    Three independent pipelines are fitted on the given frame and their outputs
    are concatenated column-wise in this order: word features, categorical
    features, numeric features.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column named in the three feature lists.
    word_features : list of str
        Text column(s) encoded as binary bag-of-words indicators. The imputed
        values are flattened to 1-D before vectorizing, so with more than one
        column the flattened array would hold more entries than ``df`` has rows.
    categorical_features : list of str
        Columns imputed with the most frequent value, then one-hot encoded.
    numeric_features : list of str
        Columns imputed with the median, then standardized.

    Returns
    -------
    numpy.ndarray
        Dense 2-D array with one row per row of ``df``.

    Notes
    -----
    Every call refits all transformers on ``df`` and the fitted objects are not
    returned. Calling this separately on another frame (e.g. a test split)
    therefore learns a new vocabulary, new categories and new scaling from
    that frame.
    """
    word_transformer = make_pipeline(
        SimpleImputer(strategy="most_frequent"),
        # CountVectorizer expects a 1-D sequence of strings, while the imputer
        # returns a 2-D array. np.ravel flattens it exactly like
        # np.reshape(X, -1) without relying on the `newshape` keyword, which
        # NumPy deprecated in 2.1.
        FunctionTransformer(np.ravel),
        CountVectorizer(binary=True)
    )

    categorical_transformer = make_pipeline(
        SimpleImputer(strategy="most_frequent"),
        OneHotEncoder(drop="if_binary", sparse_output=False, handle_unknown='ignore')
    )

    numeric_transformer = make_pipeline(
        SimpleImputer(strategy="median"),
        StandardScaler()
    )

    word_trans_arr = word_transformer.fit_transform(df[word_features])
    categorical_trans_arr = categorical_transformer.fit_transform(df[categorical_features])
    numeric_trans_arr = numeric_transformer.fit_transform(df[numeric_features])

    # CountVectorizer yields a sparse matrix; densify it so all three parts can
    # be stacked with np.hstack.
    return np.hstack((word_trans_arr.toarray(), categorical_trans_arr, numeric_trans_arr))


In [6]:
# Feature groups, one per preprocessing pipeline used by transform().
# NOTE(review): `business` is filtered to City == 'Vancouver' before the merge,
# so 'City' is presumably constant here; confirm it is still wanted.
word_features = ['BusinessType']
categorical_features = ['City', 'LocalArea']
numeric_features = [
    'NumberofEmployees',
    'FeePaid',
    'GDPValue',
    'ConsumerPriceValue',
    'EmploymentValue',
    'InvestmentConstructionValue',
]
feature_columns = [*word_features, *categorical_features, *numeric_features]

# 70/30 split with a fixed seed so the partition is reproducible.
train_df, test_df = train_test_split(business_econ, test_size=0.3, random_state=123)

X_train, y_train = train_df[feature_columns], train_df["survival_status"]
X_test, y_test = test_df[feature_columns], test_df["survival_status"]

# Only the training features are transformed in this cell.
X_train_transformed = transform(X_train, word_features, categorical_features, numeric_features)

In [8]:
# 10-fold cross-validation of Bernoulli Naive Bayes on the transformed training
# features; train scores are kept alongside the validation scores.
bnb = BernoulliNB()
bnb_cv_results = cross_validate(
    estimator=bnb,
    X=X_train_transformed,
    y=y_train,
    cv=10,
    return_train_score=True,
)
pd.DataFrame(bnb_cv_results)

Unnamed: 0,fit_time,score_time,test_score,train_score
0,0.083821,0.005145,0.751169,0.746031
1,0.051737,0.00483,0.746312,0.746291
2,0.051149,0.004826,0.747751,0.745572
3,0.052329,0.004608,0.740914,0.746171
4,0.050895,0.004974,0.743793,0.745772
5,0.05262,0.005117,0.741994,0.746191
6,0.05592,0.005003,0.745186,0.746716
7,0.060156,0.005425,0.741587,0.746736
8,0.062785,0.005698,0.744466,0.746936
9,0.058283,0.005274,0.746986,0.746976


In [7]:
# 10-fold cross-validation of Logistic Regression on the transformed training
# features; the seed is fixed and the iteration limit is set to 1000.
logreg = LogisticRegression(max_iter=1000, random_state=123)
logreg_cv_results = cross_validate(
    estimator=logreg,
    X=X_train_transformed,
    y=y_train,
    cv=10,
    return_train_score=True,
)
pd.DataFrame(logreg_cv_results)

Unnamed: 0,fit_time,score_time,test_score,train_score
0,5.330089,0.001129,0.790752,0.791295
1,4.685447,0.001117,0.79525,0.791455
2,4.428624,0.001107,0.792012,0.791975
3,5.179295,0.001085,0.785534,0.791975
4,5.024096,0.00111,0.787693,0.792235
5,3.567812,0.001132,0.79435,0.791555
6,4.858253,0.002255,0.795573,0.792259
7,4.522573,0.001158,0.791974,0.791899
8,4.145999,0.001141,0.785316,0.792179
9,4.459368,0.001088,0.788555,0.792419


#### Conclusion and Improvements

The Logistic Regression model gives a decent accuracy of ~80% here and can be used as an assistive model when deciding whether a business licence will be renewed or not.

We can further improve the model results:

- By trying out more complex models such as Random Forest, Neural Networks, etc. (which are currently outside the scope of our MDS syllabus).
- By combining other economic and socio-economic factors in our dataset.

In [9]:
# Still takes too much time, over 20 mins --> https://stackoverflow.com/questions/53940258/svc-classifier-taking-too-much-time-for-training
# svc_auto_gamma = SVC(gamma='auto')
# pd.DataFrame(cross_validate(svc_auto_gamma, X_train_transformed, y_train, cv=5, return_train_score=True))

In [None]:
# Takes too much time, over 20 mins --> https://stackoverflow.com/questions/53940258/svc-classifier-taking-too-much-time-for-training
# svc = SVC(gamma=0.01)
# pd.DataFrame(cross_validate(svc, X_train_transformed, y_train, cv=5, return_train_score=True))

KeyboardInterrupt: 