In [39]:
import pandas as pd
import numpy as np
import scipy.stats as stats
import ourfunctions

from matplotlib import pyplot as plt
import seaborn as sns

import logging

#from sklearnex import patch_sklearn
#patch_sklearn(verbose=False)
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler, LabelEncoder, FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, AdaBoostRegressor, GradientBoostingRegressor
from sklearn.metrics import plot_confusion_matrix, recall_score, accuracy_score, precision_score, f1_score
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImPipeline
from xgboost import XGBRegressor
from sklearn import svm
from catboost import CatBoostClassifier

In [40]:
X = pd.read_csv('data/Training-set-values.csv')
y = pd.read_csv('data/Training-set-labels.csv')
y = pd.DataFrame(LabelEncoder().fit_transform(y.status_group))

X['date_recorded'] = pd.to_datetime(X['date_recorded']).astype(np.int64)

#### Preprocessors

In [41]:
# Super basic numeric transformer

numeric_transformer = Pipeline(
    steps=[('imputer', SimpleImputer(strategy='median'))]
)

numeric_preprocessor = ColumnTransformer(
    transformers=[
        ("numeric", numeric_transformer, make_column_selector(dtype_include=np.number)),
    ]
)

### Models

In [42]:
# kNearestNeighbors
knn = {'classifier': KNeighborsClassifier(n_jobs=3), 'preprocessor': None}

# Logistic Regressoion
log_reg_basic = {'classifier': LogisticRegression(C=1e6, n_jobs=3), 'preprocessor': None}

# Decision Trees
DecisionTrees = {'classifier': DecisionTreeClassifier,'preprocessor': None}
# Decision Trees - adjusted
DecisionTreesAd = {'classifier': DecisionTreeClassifier(criterion=['gini','entropy'], max_depth=[90,100], min_samples_split=[2,3], class_weight=['balaced']),'preprocessor': numeric_preprocessor}

# Random Forest with numeric processor
RandomFM_basic = {'classifier': RandomForestClassifier(max_depth=20, min_samples_split=4, n_jobs=3), 'preprocessor': numeric_preprocessor}
# Random Forest no processor
RandomFM_all_cols = {'classifier': RandomForestClassifier(max_depth=20, min_samples_split=4, n_jobs=3), 'preprocessor': None}
# Random Forest default
# Included for RandomCVSearch later on
RandomFM_default = {'classifier': RandomForestClassifier(n_jobs=3), 'preprocessor': None} 

# Adaptive Boosting
AdaBoost = {'classifier': AdaBoostClassifier(), 'preprocessor': numeric_preprocessor}
# Gradient Boost
GradBoost = {'classifier': GradientBoostingClassifier,'preprocessor': numeric_preprocessor}
# XGradient Boosting
XGBoost = {'classifier': XGBRegressor(objective='reg:squarederror'), 'preprocessor': numeric_preprocessor}
# CatBoost 
CatBoost = {'classifier': CatBoostClassifier(max_depth=3),'preprocessor': numeric_preprocessor}

# Support Vector Machine
SVM = {'classifier': svm.SVC,'preprocessor': numeric_preprocessor}

models = {'knn': knn, 
    'log_reg_basic': log_reg_basic, 
    'DecisionTrees': DecisionTrees,
    'DecisionTreesAd': DecisionTreesAd,
    'RandomFM_basic': RandomFM_basic, 
    'RandomFM_all_cols': RandomFM_all_cols, 
    'RandomFM_default': RandomFM_default,
    'AdaBoost': AdaBoost,
    'GradBoost': GradBoost,
    'XGBoost': XGBoost,
    'CatBoost': CatBoost,
    "SVM": SVM
    }


### Modeler

In [43]:
model_run = ourfunctions.Modeler(models, X=X, y=y)

# after the model_run object is created so we can add onto the default preprocessor.
log_reg_regularized = {'classifier': LogisticRegression(n_jobs=3), 'preprocessor': model_run.create_default_prep(num_add=[('scaling', StandardScaler())])}
model_run.add_model('log_reg_regularized', log_reg_regularized)

### Search parameters and kwargs

In [44]:
LogRegRCV_params = dict(penalty=['l1', 'l2', 'elasticnet'],
                        C=stats.uniform(loc=1, scale=10),
                        max_iter=list(range(100,400)))

DecisionTree_params = dict(criterion=['gini', 'entropy'],
                        max_depth = list(range(20,50)),
                        min_samples_split = list(range(2, 10)))

RandForestRCV_params = dict(n_estimators=list(range(100,300)),
                            criterion=['gini', 'entropy'],
                            max_depth = list(range(20,50)),
                            min_samples_split = list(range(2, 10)))

AdaBoost_params = dict(n_estimators=[10, 50, 100, 500],
                        learning_rate=[0.001, 0.01, 0.1, 1.0])

GradBoost_params = dict(n_estimators=[10, 30, 100],
                    criterion=['friedman_mse', 'squared_error'],
                    max_depth=[2, 6, 10],
                    min_samples_split=[5, 10],
                    min_samples_leaf=[3, 6])

XGBoost_params = dict(learning_rate=0.1,
                    n_estimators=1000,
                    max_depth=4,
                    colsample_bytree=0.8,
                    objective= 'binary:logistic',
                    nthread=4,
                    scale_pos_weight=1,
                    seed=27)

CatBoost_params = dict(max_depth =[3,4,5],
                        n_estimators = [100,200,300])

SVM_params = dict(C=[0.1,1, 10, 100], 
                gamma=[1,0.1,0.01,0.001],
                kernel=['rbf', 'poly', 'sigmoid'])

search_options = {'n_jobs': 3, 'random_state': 9280210, 'n_iter': 20}

## Grid Search

In [45]:
model_run.hyper_search('log_reg_regularized', params=LogRegRCV_params, searcher_kwargs=search_options)

In [46]:
model_run.hyper_search('RandomFM_default', params=RandForestRCV_params, searcher_kwargs=search_options, set_to_train=True)



In [47]:
model_run.hyper_search('AdaBoost', params=AdaBoost_params, searcher_kwargs=search_options, set_to_train=True)



In [48]:
model_run.hyper_search('GradBoost', params=GradBoost_params, searcher_kwargs=search_options, set_to_train=True)

TypeError: Cannot clone object. You should provide an instance of scikit-learn estimator instead of a class.

In [49]:
model_run.hyper_search('XGBoost', params=XGBoost_params, searcher_kwargs=search_options, set_to_train=True)

TypeError: Parameter value is not iterable or distribution (key='learning_rate', value=0.1)

In [None]:
model_run.hyper_search('CatBoost', params=CatBoost_params, searcher_kwargs=search_options, set_to_train=True)

In [None]:
model_run.hyper_search('SVM', params=SVM_params, searcher_kwargs=search_options, set_to_train=True)

## Test Models

In [None]:
model_run.test_model('RandomFM_default')

## Plotting

In [None]:
model_run.plot_models(save='wednesday_models_graph')