In [42]:
import pandas as pd
import numpy as np
import scipy.stats as stats
import ourfunctions

from matplotlib import pyplot as plt
import seaborn as sns

import logging

#from sklearnex import patch_sklearn
#patch_sklearn(verbose=False)
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler, LabelEncoder, FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, AdaBoostRegressor, GradientBoostingRegressor
from sklearn.metrics import plot_confusion_matrix, recall_score, accuracy_score, precision_score, f1_score
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImPipeline
from xgboost import XGBRegressor
from sklearn import svm
from catboost import CatBoostClassifier

In [43]:
X = pd.read_csv('data/Training-set-values.csv')
y = pd.read_csv('data/Training-set-labels.csv')

X['date_recorded'] = pd.to_datetime(X['date_recorded']).astype(np.int64)

#### Preprocessors

In [44]:
# Super basic numeric transformer

numeric_transformer = Pipeline(
    steps=[('imputer', SimpleImputer(strategy='median'))]
)

numeric_preprocessor = ColumnTransformer(
    transformers=[
        ("numeric", numeric_transformer, make_column_selector(dtype_include=np.number)),
    ]
)

### Models

In [45]:
# kNearestNeighbors
knn = {'classifier': KNeighborsClassifier(n_jobs=3), 'preprocessor': None}

# Logistic Regressoion
log_reg_basic = {'classifier': LogisticRegression(C=1e6, n_jobs=3), 'preprocessor': None}

# Decision Trees
DecisionTrees = {'classifier': DecisionTreeClassifier,'preprocessor': None}
# Decision Trees - adjusted
DecisionTreesAd = {'classifier': DecisionTreeClassifier(criterion=['gini','entropy'], max_depth=[90,100], min_samples_split=[2,3], class_weight=['balaced']),'preprocessor': numeric_preprocessor}

# Random Forest with numeric processor
RandomFM_basic = {'classifier': RandomForestClassifier(max_depth=20, min_samples_split=4, n_jobs=3), 'preprocessor': numeric_preprocessor}
# Random Forest no processor
RandomFM_all_cols = {'classifier': RandomForestClassifier(max_depth=20, min_samples_split=4, n_jobs=3), 'preprocessor': None}
# Random Forest default
# Included for RandomCVSearch later on
RandomFM_default = {'classifier': RandomForestClassifier(n_jobs=3), 'preprocessor': None} 

# Adaptive Boosting
AdaBoost = {'classifier': AdaBoostClassifier(), 'preprocessor': numeric_preprocessor}
# Gradient Boost
GradBoost = {'classifier': GradientBoostingClassifier,'preprocessor': numeric_preprocessor}
# XGradient Boosting
XGBoost = {'classifier': XGBRegressor(objective='reg:squarederror'), 'preprocessor': numeric_preprocessor}
# CatBoost 
CatBoost = {'classifier': CatBoostClassifier(max_depth=3),'preprocessor': numeric_preprocessor}

# Support Vector Machine
SVM = {'classifier': svm.SVC,'preprocessor': numeric_preprocessor}

models = {'knn': knn, 
    'log_reg_basic': log_reg_basic, 
    'DecisionTree': DecisionTrees,
    'DecisionTreeAd': DecisionTreesAd,
    'RandomFM_basic': RandomFM_basic, 
    'RandomFM_all_cols': RandomFM_all_cols, 
    'RandomFM_default': RandomFM_default,
    'AdaBoost': AdaBoost,
    'GradBoost': GradBoost,
    'XGBoost': XGBoost,
    'CatBoost': CatBoost,
    "SVM": SVM
    }


### Modeler

In [46]:
model_run = ourfunctions.Modeler(models, X=X, y=y)

# after the model_run object is created so we can add onto the default preprocessor.
log_reg_regularized = {'classifier': LogisticRegression(n_jobs=3), 'preprocessor': model_run.create_default_prep(num_add=[('scaling', StandardScaler())])}
model_run.add_model('log_reg_regularized', log_reg_regularized)

### Search parameters and kwargs

In [50]:
#kNN_params = 

LogRegRCV_params = dict(penalty=['l1', 'l2', 'elasticnet'],
                        C=stats.uniform(loc=1, scale=10),
                        max_iter=list(range(100,400)))

DecisionTree_params = dict(criterion=['gini', 'entropy'],
                        max_depth = list(range(20,50)),
                        min_samples_split = list(range(2, 10)),
                        class_weight = 'balanced')

RandForestRCV_params = dict(n_estimators=list(range(100,300)),
                            criterion=['gini', 'entropy'],
                            max_depth = list(range(20,50)),
                            min_samples_split = list(range(2, 10)))

AdaBoost_params = dict(n_estimators=[10, 50, 100, 500],
                        learning_rate=[0.001, 0.01, 0.1, 1.0])

GradBoost_params = dict(n_estimators=[10, 30, 100],
                    criterion=['friedman_mse', 'squared_error'],
                    max_depth=[2, 6, 10],
                    min_samples_split=[5, 10],
                    min_samples_leaf=[3, 6])

XGBoost_params = dict(learning_rate =[0.01, 0.1], 
                    n_estimators=[100, 200, 500],
                    max_depth=[2, 4, 6, 8],
                    colsample_bytree=[0.6, 0.8],
                    objective= 'binary:logistic')

CatBoost_params = dict(max_depth =[3,4,5],
                        n_estimators = [100,200,300])

SVM_params = dict(C=[0.1,1, 10, 100], 
                gamma=[1,0.1,0.01,0.001],
                kernel=['rbf', 'poly', 'sigmoid'])

search_options = {'n_jobs': 3, 'random_state': 9280210, 'n_iter': 20}

## Grid Search

In [36]:
model_run.hyper_search('log_reg_regularized', params=LogRegRCV_params, searcher_kwargs=search_options, set_to_train=True)

In [37]:
#model_run.hyper_search('kNN', params=kNN_params, searcher_kwargs=search_options, set_to_train=True)

In [51]:
model_run.hyper_search('DecisionTreeAd', params=DecisionTree_params, searcher_kwargs=search_options, set_to_train=True)

ValueError: The only valid preset for class_weight is "balanced". Given "c".

In [18]:
model_run.hyper_search('RandomFM_default', params=RandForestRCV_params, searcher_kwargs=search_options, set_to_train=True)

In [19]:
model_run.hyper_search('AdaBoost', params=AdaBoost_params, searcher_kwargs=search_options, set_to_train=True)



In [20]:
model_run.hyper_search('GradBoost', params=GradBoost_params, searcher_kwargs=search_options, set_to_train=True)

TypeError: Cannot clone object. You should provide an instance of scikit-learn estimator instead of a class.

In [21]:
model_run.hyper_search('XGBoost', params=XGBoost_params, searcher_kwargs=search_options, set_to_train=True)

XGBoostError: [19:19:14] /Users/runner/miniforge3/conda-bld/xgboost_1598185652448/work/src/objective/objective.cc:26: Unknown objective function: `c`
Objective candidate: survival:aft
Objective candidate: binary:hinge
Objective candidate: multi:softmax
Objective candidate: multi:softprob
Objective candidate: rank:pairwise
Objective candidate: rank:ndcg
Objective candidate: rank:map
Objective candidate: reg:squarederror
Objective candidate: reg:squaredlogerror
Objective candidate: reg:logistic
Objective candidate: reg:pseudohubererror
Objective candidate: binary:logistic
Objective candidate: binary:logitraw
Objective candidate: reg:linear
Objective candidate: count:poisson
Objective candidate: survival:cox
Objective candidate: reg:gamma
Objective candidate: reg:tweedie

Stack trace:
  [bt] (0) 1   libxgboost.dylib                    0x0000000170c5ce4e dmlc::LogMessageFatal::~LogMessageFatal() + 110
  [bt] (1) 2   libxgboost.dylib                    0x0000000170d48257 xgboost::ObjFunction::Create(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, xgboost::GenericParameter const*) + 759
  [bt] (2) 3   libxgboost.dylib                    0x0000000170d0ac76 xgboost::LearnerConfiguration::ConfigureObjective(xgboost::LearnerTrainParam const&, std::__1::vector<std::__1::pair<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > >, std::__1::allocator<std::__1::pair<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > > > >*) + 1926
  [bt] (3) 4   libxgboost.dylib                    0x0000000170cff90f xgboost::LearnerConfiguration::Configure() + 1327
  [bt] (4) 5   libxgboost.dylib                    0x0000000170cffea7 xgboost::LearnerImpl::UpdateOneIter(int, std::__1::shared_ptr<xgboost::DMatrix>) + 119
  [bt] (5) 6   libxgboost.dylib                    0x0000000170c54c9a XGBoosterUpdateOneIter + 154
  [bt] (6) 7   libffi.7.dylib                      0x000000010b68aead ffi_call_unix64 + 85
  [bt] (7) 8   ???                                 0x0000000305c0cf10 0x0 + 12981423888



In [22]:
model_run.hyper_search('CatBoost', params=CatBoost_params, searcher_kwargs=search_options, set_to_train=True)



Learning rate set to 0.265612
0:	learn: 0.9862021	total: 68.7ms	remaining: 20.5s
1:	learn: 0.9216414	total: 77.4ms	remaining: 11.5s
2:	learn: 0.8824521	total: 83.4ms	remaining: 8.26s
3:	learn: 0.8555248	total: 91.7ms	remaining: 6.79s
4:	learn: 0.8364673	total: 97.2ms	remaining: 5.73s
5:	learn: 0.8218166	total: 102ms	remaining: 5s
6:	learn: 0.8129340	total: 107ms	remaining: 4.5s
7:	learn: 0.8058792	total: 113ms	remaining: 4.13s
8:	learn: 0.7989142	total: 119ms	remaining: 3.86s
9:	learn: 0.7937780	total: 125ms	remaining: 3.63s
10:	learn: 0.7907538	total: 131ms	remaining: 3.44s
11:	learn: 0.7854812	total: 136ms	remaining: 3.27s
12:	learn: 0.7833159	total: 142ms	remaining: 3.13s
13:	learn: 0.7807781	total: 147ms	remaining: 3.01s
14:	learn: 0.7768205	total: 153ms	remaining: 2.91s
15:	learn: 0.7742835	total: 159ms	remaining: 2.81s
16:	learn: 0.7717098	total: 166ms	remaining: 2.76s
17:	learn: 0.7701826	total: 171ms	remaining: 2.69s
18:	learn: 0.7680539	total: 179ms	remaining: 2.65s
19:	learn:

In [23]:
model_run.hyper_search('SVM', params=SVM_params, searcher_kwargs=search_options, set_to_train=True)

TypeError: Cannot clone object. You should provide an instance of scikit-learn estimator instead of a class.

## Test Models

In [27]:
model_run.test_model('log_reg_regularized')

Exception: This model has not been fit yet.

In [26]:
model_run.test_model('kNN')

NotFittedError: This ColumnTransformer instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

In [24]:
model_run.test_model('RandomFM_default')

root - INFO - RandomFM_default test score: 0.7924579124579124


In [29]:
model_run.test_model('DecisionTreesAd')

Exception: This model has not been fit yet.

In [30]:
model_run.test_model('AdaBoost')

root - INFO - AdaBoost test score: 0.637979797979798


In [31]:
model_run.test_model('GradBoost')

Exception: This model has not been fit yet.

In [32]:
model_run.test_model('XGBoost')

Exception: This model has not been fit yet.

In [33]:
model_run.test_model('CatBoost')

root - INFO - CatBoost test score: 0.6808754208754209


In [34]:
model_run.test_model('SVM')

Exception: This model has not been fit yet.

## Plotting

In [35]:
model_run.plot_models(save='wednesday_models_graph')

KeyError: 'test_output'