In [17]:
import pandas as pd
import numpy as np
import scipy.stats as stats
import ourfunctions

from matplotlib import pyplot as plt
import seaborn as sns
#from sklearnex import patch_sklearn
#patch_sklearn(verbose=False)
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler, LabelEncoder, FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV, GridSearchCV
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.metrics import plot_confusion_matrix, recall_score, accuracy_score, precision_score, f1_score
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImPipeline
from sklearn.linear_model import LogisticRegression

from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from xgboost import XGBRegressor
from catboost import CatBoostClassifier


In [18]:
X = pd.read_csv('data/Training-set-values.csv')
y = pd.read_csv('data/Training-set-labels.csv')

X['date_recorded'] = pd.to_datetime(X['date_recorded']).astype(np.int64)

#### Preprocessors

In [19]:
# Super basic numeric transformer

numeric_transformer = Pipeline(
    steps=[('imputer', SimpleImputer(strategy='median'))]
)

numeric_preprocessor = ColumnTransformer(
    transformers=[
        ("numeric", numeric_transformer, make_column_selector(dtype_include=np.number)),
    ]
)

### Models

In [20]:
# Gradient Boost
GradBoost = {'classifier': GradientBoostingClassifier(),'preprocessor': numeric_preprocessor}
GradBoost2 = {'classifier': GradientBoostingClassifier(),'preprocessor': None}
GradBoost3 = {'classifier': GradientBoostingClassifier(),'preprocessor': None}
# XGradient Boosting
XGBoost = {'classifier': XGBRegressor(objective='reg:squarederror'), 'preprocessor': numeric_preprocessor}
# CatBoost
CatBoost = {'classifier': CatBoostClassifier(max_depth=3),'preprocessor': numeric_preprocessor}



models = {'GradientBoost': GradBoost,
    'GradientBoost2': GradBoost2,
    'GradientBoost3': GradBoost3,
    'XGBoost': XGBoost,
    'CatBoost': CatBoost
    }


### Modeler

In [21]:
model_run = ourfunctions.Modeler(models, X=X, y=y)

# after the model_run object is created so we can add onto the default preprocessor.
log_reg_regularized = {'classifier': LogisticRegression(n_jobs=3), 'preprocessor': model_run.create_default_prep(num_add=[('scaling', StandardScaler())])}
model_run.add_model('log_reg_regularized', log_reg_regularized)

### Search parameters and kwargs

In [22]:
GradBoost_params = dict(n_estimators=np.array(range(100, 400)),
                    criterion=['friedman_mse', 'squared_error'],
                    max_depth=np.array(range(2, 10)),
                    min_samples_split=np.array(range(2, 10)),
                    min_samples_leaf=np.array(range(1, 10)),
                    learning_rate=stats.uniform(loc=0.01, scale=1))

GradBoost3_params = dict(n_estimators=np.array(range(200, 1000)),
                    criterion=['friedman_mse', 'squared_error'],
                    max_depth=np.array(range(2, 10)),
                    min_samples_split=np.array(range(2, 10)),
                    min_samples_leaf=np.array(range(1, 10)),
                    learning_rate=stats.uniform(loc=0.001, scale=1))

XGBoost_params = dict(learning_rate =stats.uniform(loc=0.1, scale=0.1),
                    n_estimators=np.array(range(100,1200)),
                    max_depth=np.array(range(4,30)))

CatBoost_params = dict(max_depth =[3,4,5],
                         n_estimators = [100,200,300])

search_options = {'n_jobs': 3, 'random_state': 9280210, 'n_iter': 20}

## RandomizedSearchCV

In [23]:
model_run.hyper_search('GradientBoost', params=GradBoost_params, searcher_kwargs=search_options, set_to_train=True)

In [24]:
model_run.hyper_search('GradientBoost2', params=GradBoost_params, searcher_kwargs=search_options, set_to_train=True)



In [25]:
model_run.hyper_search('GradientBoost3', params=GradBoost3_params, searcher_kwargs=search_options, set_to_train=True)

In [26]:
model_run.hyper_search('XGBoost', params=XGBoost_params, searcher_kwargs=search_options, set_to_train=True)



In [27]:
model_run.hyper_search('CatBoost', params=CatBoost_params, searcher_kwargs=search_options, set_to_train=True)



Learning rate set to 0.265612
0:	learn: 0.9862021	total: 72.3ms	remaining: 21.6s
1:	learn: 0.9216414	total: 78.8ms	remaining: 11.7s
2:	learn: 0.8824521	total: 84.7ms	remaining: 8.38s
3:	learn: 0.8555248	total: 90.3ms	remaining: 6.68s
4:	learn: 0.8364673	total: 95.2ms	remaining: 5.62s
5:	learn: 0.8218166	total: 100ms	remaining: 4.9s
6:	learn: 0.8129340	total: 105ms	remaining: 4.39s
7:	learn: 0.8058792	total: 111ms	remaining: 4.03s
8:	learn: 0.7989142	total: 116ms	remaining: 3.75s
9:	learn: 0.7937780	total: 121ms	remaining: 3.52s
10:	learn: 0.7907538	total: 127ms	remaining: 3.33s
11:	learn: 0.7854812	total: 132ms	remaining: 3.17s
12:	learn: 0.7833159	total: 137ms	remaining: 3.02s
13:	learn: 0.7807781	total: 142ms	remaining: 2.9s
14:	learn: 0.7768205	total: 148ms	remaining: 2.8s
15:	learn: 0.7742835	total: 153ms	remaining: 2.71s
16:	learn: 0.7717098	total: 159ms	remaining: 2.64s
17:	learn: 0.7701826	total: 164ms	remaining: 2.57s
18:	learn: 0.7680539	total: 169ms	remaining: 2.5s
19:	learn:

## Test Models

In [43]:
# Gradient Boost
gb_model=model_run.get_model('GradientBoost')['model_pipeline']
Gradient_Boost = gb_model.score(X=model_run._X_test, y=model_run._y_test)
Gradient_Boost

0.705993265993266

In [44]:
# Gradient Boost 2
gb2_model= model_run.get_model('GradientBoost2')['model_pipeline']
Gradient_Boost2 = gb2_model.score(X=model_run._X_test, y=model_run._y_test)
Gradient_Boost2 

0.8046464646464646

In [45]:
# Gradient Boost 3
gb3_model= model_run.get_model('GradientBoost3')['model_pipeline']
Gradient_Boost3 = gb3_model.score(X=model_run._X_test, y=model_run._y_test)
Gradient_Boost3 

0.8052525252525252

In [46]:
# XGBoost
xgb_model= model_run.get_model('XGBoost')['model_pipeline']
XG_Boost = xgb_model.score(X=model_run._X_test, y=model_run._y_test)
XG_Boost

0.29846332133089515

In [47]:
# CatBoost
cb_model= model_run.get_model('CatBoost')['model_pipeline']
Cat_Boost = cb_model.score(X=model_run._X_test, y=model_run._y_test)
Cat_Boost

0.6808754208754209

In [48]:
boost_models = {'Gradient_Boost': 0.705993265993266,
    'Gradient_Boost2': 0.8046464646464646,
    'Gradient_Boost3': 0.8052525252525252,
    'XG_Boost': 0.29846332133089515,
    'Cat_Boost': 0.6808754208754209
    }

## Plotting

In [40]:
plot_models(self, sns_style='darkgrid', sns_context='talk', palette='coolwarm', save=None, labels=None):
        """
        Skylar slide style, with thanks to Matt. Has options for seaborn plotting. If you want to save the plot,
        give the save option a filename, exactly as would be done with plt.savefig() Labels must be provided as a
        dictionary with the model names as keys and the Label you'd like to display as a value.
        """
        logger.removeHandler(c_handler)
        logger.removeHandler(f_handler)

        xticklabels = [labels[key] for key in self._models.keys()] if labels else list(self._models.keys())
        y = [model['test_output'] for model in self._models.values()]

        sns.set_style(sns_style)
        sns.set_context(sns_context)
        fig, ax = plt.subplots(figsize=(20, 10))

        fig.set_tight_layout(True)

        sns.barplot(x=xticklabels, y=y, palette=palette)
        ax.set(ylim=(0, 1))
        ax.set_xticklabels(ax.get_xticklabels(), rotation=45, horizontalalignment='right')

        # ax2 = ax.twinx()
        # sns.lineplot(x=xticklabels, y=x_error, linewidth=5)
        # ax2.set(ylim=(0, 300000))
        # ax2.set_yticks(np.linspace(0,300000,num=6))
        # ax2.set_yticklabels(np.linspace(0,300,num=6,dtype=int))

        ax.set_ylabel('Accuracy Score')
        # ax2.set_ylabel('Error, USD (thousands)')
        ax.set_title('Model Effectiveness');

        if save:
            plt.savefig(save)

KeyError: 'test_output'

## Modeler

### Model 1

In [None]:
model_run.model_evaluation('')

In [None]:
importance_kwargs = dict(n_repeats=10, n_jobs=3)
model_run.permutation_importance('', perm_kwargs=importance_kwargs)

### Model 2

In [None]:
model_run.model_evaluation('')

In [None]:
importance_kwargs = dict(n_repeats=10, n_jobs=3)
model_run.permutation_importance('', perm_kwargs=importance_kwargs)