In [10]:
from DataCleaning import datacleaning
import pandas as pd
import numpy as np

#sklearn tool
from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_transformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import cross_validate

# Preprocess / transform
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import (
    LabelEncoder,
    OneHotEncoder,
    OrdinalEncoder,
    StandardScaler,
)

# models
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.linear_model import LogisticRegression

# Alternative source (disabled): pull the export directly from
# opendata.vancouver.ca instead of reading the local copy.
# csv_url = 'https://opendata.vancouver.ca/api/explore/v2.1/catalog/datasets/business-licences/exports/csv?lang=en&timezone=America%2FLos_Angeles&use_labels=true&delimiter=%3B'
# business = pd.read_csv(csv_url, delimiter = ';')

# Read the local semicolon-separated file and hand it to the project's
# datacleaning helper with survival_threshold=730.
business = datacleaning(
    pd.read_csv('data/business-licences.csv', sep=';'),
    survival_threshold=730,
)

In [4]:
# Keep only the rows whose 'City' value is exactly 'Vancouver'.
business = business.loc[business['City'].eq('Vancouver')]

In [5]:
## Draft column transformer (entirely commented out; preprocessing is done by transform() instead)
# imp = make_column_transformer(
#     ("drop", drop_features),
#     (SimpleImputer(strategy="most_frequent"), word_features + categorical_features),  # missing_values='NaN'
#     (SimpleImputer(strategy="median"), numeric_features),  # missing_values='NaN'
# )
# preprocessor = make_column_transformer(  
#     (CountVectorizer(binary=True), [0]),  # BusinessType
#     (OneHotEncoder(drop="if_binary", sparse_output=False, handle_unknown='ignore'), [1, 2]),  # categorical
#     (StandardScaler(), [3, 4])  # numeric
# )


In [6]:
def transform(df, word_features, categorical_features, numeric_features):
    """Fit the preprocessing on ``df`` and return a single dense feature matrix.

    Three independent pipelines are fitted on ``df`` and their outputs are
    stacked column-wise in this order: bag-of-words columns, one-hot columns,
    scaled numeric columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column named in the three feature lists.
    word_features : list of str
        Exactly one text column. It is imputed with the most frequent value
        and encoded with a binary ``CountVectorizer``.
    categorical_features : list of str
        Columns imputed with the most frequent value and one-hot encoded.
    numeric_features : list of str
        Columns imputed with the median and standardized.

    Returns
    -------
    numpy.ndarray
        2-D array with one row per row of ``df``.

    Raises
    ------
    ValueError
        If ``word_features`` does not name exactly one column.

    Notes
    -----
    The pipelines are created and fitted inside this call and are not
    returned, so the fitted state cannot be re-applied to another frame.
    """
    # drop_features = ['Status', 'BusinessSubType', 'FOLDERYEAR', 'LicenceRSN', 'LicenceNumber', 'LicenceRevisionNumber',
    #     'BusinessName', 'BusinessTradeName', 'IssuedDate', 'ExpiredDate',
    #     'Unit', 'UnitType', 'House', 'Street', 'ExtractDate', 'Geom', 'geo_point_2d']

    # The imputed text block is flattened into one document per row, which is
    # only row-aligned with the other blocks when there is a single column.
    if len(word_features) != 1:
        raise ValueError(
            "word_features must name exactly one column, got "
            f"{len(word_features)}"
        )

    word_transformer = make_pipeline(
        SimpleImputer(strategy="most_frequent"),
        # CountVectorizer expects a 1-D sequence of documents, so flatten the
        # (n_rows, 1) imputer output. np.ravel is used instead of
        # np.reshape(..., newshape=-1) because the `newshape` keyword is
        # deprecated in NumPy 2.1+.
        FunctionTransformer(np.ravel),
        CountVectorizer(binary=True)
    )

    categorical_transformer = make_pipeline(
        SimpleImputer(strategy="most_frequent"),
        OneHotEncoder(drop="if_binary", sparse_output=False, handle_unknown='ignore')
    )

    numeric_transformer = make_pipeline(
        SimpleImputer(strategy="median"),
        StandardScaler()
    )

    word_trans_arr = word_transformer.fit_transform(df[word_features])
    categorical_trans_arr = categorical_transformer.fit_transform(df[categorical_features])
    numeric_trans_arr = numeric_transformer.fit_transform(df[numeric_features])

    # CountVectorizer returns a sparse matrix; densify it so all three blocks
    # can be concatenated with np.hstack.
    return np.hstack((word_trans_arr.toarray(), categorical_trans_arr, numeric_trans_arr))

# Alternative estimator, currently disabled:
# logreg = LogisticRegression(random_state=123, max_iter=1000)

# Support-vector classifier with a fixed gamma; 'rbf' is SVC's default kernel
# and is only spelled out here for readability.
svc = SVC(kernel="rbf", gamma=0.01)

In [7]:
# Columns fed to the model, grouped by the preprocessing each group receives.
word_features = ['BusinessType']
categorical_features = ['City', 'LocalArea']
numeric_features = ['NumberofEmployees', 'FeePaid']

# 70/30 split with a fixed seed so the partition is reproducible.
train_df, test_df = train_test_split(business, test_size=0.3, random_state=123)

# Same column selection applied to both partitions.
X_train, X_test = (
    split[[*word_features, *categorical_features, *numeric_features]]
    for split in (train_df, test_df)
)
y_train, y_test = (split["survival_status"] for split in (train_df, test_df))

# Only the training features are transformed in this cell.
X_train_transformed = transform(
    X_train,
    word_features=word_features,
    categorical_features=categorical_features,
    numeric_features=numeric_features,
)

In [8]:
# 10-fold cross-validation of logistic regression on the transformed training
# features; the per-fold timings and scores are shown as a table.
logreg = LogisticRegression(max_iter=1000, random_state=123)
pd.DataFrame(
    cross_validate(
        estimator=logreg,
        X=X_train_transformed,
        y=y_train,
        cv=10,
        return_train_score=True,
    )
)

Unnamed: 0,fit_time,score_time,test_score,train_score
0,1.222418,0.00089,0.619606,0.620449
1,1.102384,0.001539,0.614774,0.622444
2,1.034654,0.000813,0.61913,0.622497
3,0.945515,0.000799,0.620511,0.620809
4,0.949513,0.00091,0.615331,0.621154
5,0.897459,0.001047,0.613605,0.621883
6,0.88307,0.000789,0.614986,0.621077
7,0.952803,0.000783,0.617749,0.621998
8,0.823973,0.000801,0.617403,0.620118
9,0.830648,0.000896,0.610152,0.621499


In [11]:
# 10-fold cross-validation of Bernoulli naive Bayes (default settings) on the
# transformed training features; per-fold results are shown as a table.
bnb = BernoulliNB()
pd.DataFrame(
    cross_validate(
        estimator=bnb,
        X=X_train_transformed,
        y=y_train,
        cv=10,
        return_train_score=True,
    )
)

Unnamed: 0,fit_time,score_time,test_score,train_score
0,0.083352,0.002824,0.606144,0.619413
1,0.026893,0.002996,0.590266,0.6216
2,0.027567,0.002637,0.610497,0.619428
3,0.028582,0.002369,0.611533,0.618852
4,0.025412,0.002247,0.605318,0.619619
5,0.02873,0.002423,0.617058,0.620885
6,0.028624,0.002701,0.63018,0.617778
7,0.065262,0.002565,0.606354,0.61797
8,0.058476,0.002688,0.620511,0.619044
9,0.072502,0.003198,0.617749,0.617701


In [12]:
# 10-fold cross-validation of the support-vector classifier on the transformed
# training features; 'rbf' is SVC's default kernel, written out explicitly.
svc = SVC(kernel="rbf", gamma=0.01)
pd.DataFrame(
    cross_validate(
        estimator=svc,
        X=X_train_transformed,
        y=y_train,
        cv=10,
        return_train_score=True,
    )
)

Unnamed: 0,fit_time,score_time,test_score,train_score
0,81.745414,6.320986,0.61719,0.613351
1,80.600726,6.273905,0.612012,0.61385
2,78.426644,6.243794,0.611188,0.613942
3,80.630651,6.413934,0.614986,0.61352
4,78.372289,6.259256,0.614296,0.613596
5,76.482762,6.322846,0.615677,0.613481
6,79.746148,6.400947,0.610497,0.614018
7,77.98734,6.227241,0.621202,0.612867
8,79.647548,6.232965,0.609461,0.614172
9,79.083524,6.257414,0.609116,0.61421
