In [None]:
# Install or upgrade the third-party packages imported further down
# (xverse, scikit-optimize, catboost) into the user site-packages.
# NOTE(review): versions are not pinned, so each run may pick up a different release.
!pip install --upgrade xverse scikit-optimize catboost --user


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found below the Kaggle input directory.
input_file_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_file_path in input_file_paths:
    print(input_file_path)

# Any results you write to the current directory are saved as output.

In [None]:
from xverse.transformer import WOE
from xverse.graph import BarCharts
from xverse.ensemble import VotingSelector
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from catboost import CatBoostClassifier, Pool, cv, CatBoost
from catboost.utils import get_roc_curve
from catboost.utils import get_fpr_curve
from catboost.utils import get_fnr_curve
import pprint
# Metrics
from sklearn.metrics import roc_auc_score
from sklearn.metrics import make_scorer
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix, accuracy_score, matthews_corrcoef

from sklearn.linear_model import LogisticRegression

# Skopt functions
from skopt import BayesSearchCV
from skopt.callbacks import DeadlineStopper, VerboseCallback, DeltaXStopper
from skopt.space import Real, Categorical, Integer

from time import time
import shap
from sklearn.base import BaseEstimator, TransformerMixin
import warnings
import numexpr as ne
import psutil
import random

In [None]:

# Show the physical core count reported by psutil, then the core count
# detected by numexpr, one per line.
for core_count in (psutil.cpu_count(logical=False), ne.detect_number_of_cores()):
    print(core_count)

In [None]:

# One seed shared by Python's `random`, NumPy and every model/search below.
RANDOM_STATE = 1242
os.environ['PYTHONHASHSEED'] = str(RANDOM_STATE)
for seed_fn in (random.seed, np.random.seed):
    seed_fn(RANDOM_STATE)

# Thread budgets: NUM_THREADS for model training and SHAP computation,
# NUM_THREADS_FILE for building CatBoost pools, NUM_THREADS_PRED for prediction.
NUM_THREADS = NUM_THREADS_FILE = NUM_THREADS_PRED = 2

In [None]:
# Widen the notebook display. The fully-qualified option names are used because
# short names are resolved by pattern matching and `max_columns` becomes
# ambiguous once pandas registers more than one option ending in that name.
pd.set_option('display.max_colwidth', 500)
pd.set_option('display.max_columns', 500)

# Silence FutureWarning noise from the libraries used below.
warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
# Directory holding the competition CSVs; file names are appended by plain
# string concatenation, so the trailing slash is required.
input_path = '/kaggle/input/hr-analytics-case-study/'
# Kaggle working directory for outputs (not referenced in the visible cells).
output_path = '/kaggle/working/'

In [None]:
# Column to predict.
target_col = 'Attrition'
# Row identifier; also the merge key of the three input tables. It is excluded from the features.
index_col = 'EmployeeID'

In [None]:

class DataPrepare(BaseEstimator, TransformerMixin):
    """
    Prepare the HR attrition feature frame for modelling.

    Depending on the flags, missing values are imputed and/or the categorical
    columns are one-hot encoded, filtered with ``VotingSelector`` and replaced
    by their Weight-of-Evidence (WOE) values.

    Parameters:
    -----------
    cat_encode: if True, one-hot encode object columns, keep the features
        selected by ``VotingSelector`` and apply WOE binning; if False, the
        columns seen during ``fit`` are returned unchanged
    na_fill: if True, fill NA on the raw frame before any encoding
    target_col: name of the target column (never dummy-encoded)
    index_col: name of the identifier column (never dummy-encoded)


    Attributes:
    -----------
    copy: when True (default) ``transform`` works on a copy of its input
    feat_sel: VotingSelector fitted on the dummy-encoded training frame
    binner: WOE transformer fitted on the selected features
    clf: WOE transformer rebuilt from the fitted bins; used by ``transform``
    columns: columns seen during ``fit`` (used when cat_encode is False)
    dummy_columns: dummy-encoded columns seen during ``fit``
    medians, means: per-column fill values learned during ``fit``
    """
    def __init__(self,
                 cat_encode: bool = False,
                 na_fill: bool = False,
                 target_col: str = 'Attrition',
                 index_col: str = 'Id'
                 ) -> None:
        self.cat_encode = cat_encode
        # Store the caller's flag. This was previously assigned from
        # `cat_encode`, which silently ignored the `na_fill` argument.
        self.na_fill = na_fill
        self.target_col = target_col
        self.index_col = index_col
        self.copy = True
        self.binner = WOE()
        self.feat_sel = VotingSelector(minimum_votes=3)
        self.clf = WOE()
        self.columns = list()
        self.dummy_columns = list()
        self.medians = dict()
        self.means = dict()

    def _copy(self, X: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of X unless copying has been switched off."""
        return X.copy() if self.copy else X

    def _create_dummies(self, X: pd.DataFrame) -> pd.DataFrame:
        """One-hot encode the object columns and drop one reference level per variable."""
        obj_col = [col for col in X.select_dtypes(include=['object']).columns if
                   col not in [self.target_col, self.index_col]]
        # generate binary values using get_dummies
        dum_df = pd.get_dummies(X, columns=obj_col, prefix=obj_col)
        # One level per categorical variable is removed so the remaining
        # dummies are not perfectly collinear.
        reference_levels = ['BusinessTravel_Travel_Rarely',
                            'Department_Research & Development',
                            'EducationField_Life Sciences', 'Gender_Male',
                            'JobRole_Sales Executive',
                            'MaritalStatus_Married']
        remove_dummies = [col for col in dum_df.columns if col in reference_levels]
        return dum_df.drop(columns=remove_dummies)

    def _fill_na(self, X: pd.DataFrame) -> pd.DataFrame:
        """
        Impute missing values in X (modified in place) and return it.

        TotalWorkingYears falls back to YearsAtCompany; the remaining columns
        use the medians / means learned in ``fit``. Columns that are absent
        from X are skipped.
        """
        if 'TotalWorkingYears' in X.columns and 'YearsAtCompany' in X.columns:
            X['TotalWorkingYears'] = X['TotalWorkingYears'].fillna(X['YearsAtCompany'])
        for fill_values in (self.medians, self.means):
            for key, value in fill_values.items():
                if key in X.columns:
                    # fillna returns a new Series; it must be assigned back.
                    X[key] = X[key].fillna(value)
        return X

    def fit(self, X, y=None):
        """
        Fitting X

        Learns the fill values and, when ``cat_encode`` is set, fits the
        feature selector and the WOE binner.

        Parameters:
        -----------
        X: pandas.DataFrame
        y: target, required by the selector / binner when cat_encode is True

        Returns:
        --------
        self
        """
        # Fill values are learned first so they can be applied before encoding.
        self.medians['NumCompaniesWorked'] = X['NumCompaniesWorked'].median()
        self.means['EnvironmentSatisfaction'] = int(X['EnvironmentSatisfaction'].mean())
        self.means['JobSatisfaction'] = int(X['JobSatisfaction'].mean())
        self.means['WorkLifeBalance'] = int(X['WorkLifeBalance'].mean())

        if self.cat_encode:
            # Fit the encoders on the same (optionally imputed) representation
            # that transform() will feed them; the caller's frame is untouched.
            x_fit = self._fill_na(X.copy()) if self.na_fill else X
            dum_df = self._create_dummies(x_fit)
            self.dummy_columns = dum_df.columns.tolist()
            self.feat_sel.fit(dum_df, y)
            self.binner.fit(self.feat_sel.transform(dum_df), y)
            output_woe_bins = self.binner.woe_bins  # future transformation
            output_mono_bins = self.binner.mono_custom_binning
            self.clf = WOE(woe_bins=output_woe_bins, mono_custom_binning=output_mono_bins)
        else:
            self.columns = X.columns.tolist()

        return self

    def transform(self, X, y=None):
        """
        Transform X.

        Parameters:
        -----------
        X: pandas.DataFrame
        y: ignored


        Returns:
        --------
        X_transformed: pandas.DataFrame
        """
        x_transformed = self._copy(X)
        if self.na_fill:
            # Impute on the raw columns: after WOE encoding the raw medians
            # and means would no longer be meaningful fill values.
            x_transformed = self._fill_na(x_transformed)
        if self.cat_encode:
            dum_df = self._create_dummies(x_transformed)
            if self.dummy_columns:
                # Align with the training dummies so a category missing from
                # this frame does not produce a missing column.
                dum_df = dum_df.reindex(columns=self.dummy_columns, fill_value=0)
            x_transformed = self.clf.transform(self.feat_sel.transform(dum_df))
        else:
            x_transformed = x_transformed[self.columns]

        return x_transformed

    def fit_transform(self, X, y=None, **fit_params):
        """
        fit X, transform X
        Equivalent to fit(X, y).transform(X, y)

        Parameters:
        -----------
        X: pandas.DataFrame
        y: optional target, forwarded to fit


        Returns:
        --------
        X_transformed: pandas.DataFrame
        """
        return self.fit(X, y).transform(X, y)


In [None]:
# Reporting util for different optimizers
def report_perf(optimizer, X, y, title, callbacks=None):
    """
    Fit a search optimizer, print a timing/score summary and return its best parameters.

    optimizer = a sklearn or a skopt optimizer (needs fit, cv_results_,
                best_score_, best_index_ and best_params_)
    X = the training set
    y = our target
    title = a string label for the experiment
    callbacks = optional callbacks, forwarded to fit() as `callback=`

    The printed spread is three times the CV standard deviation of the best candidate.
    """
    start = time()
    # Only pass `callback` when one was supplied, since plain sklearn
    # searchers do not accept that keyword.
    fit_kwargs = {'callback': callbacks} if callbacks else {}
    optimizer.fit(X, y, **fit_kwargs)

    results_frame = pd.DataFrame(optimizer.cv_results_)
    best_score = optimizer.best_score_
    best_score_std = results_frame.iloc[optimizer.best_index_]["std_test_score"]
    best_params = optimizer.best_params_

    summary_template = (title + " took %.2f seconds,  candidates checked: %d, best CV score: %.3f "
                        + u"\u00B1" + " %.3f")
    elapsed = time() - start
    n_candidates = len(optimizer.cv_results_['params'])
    print(summary_template % (elapsed, n_candidates, best_score, 3*best_score_std))
    print('Best parameters:')
    pprint.pprint(best_params)
    print()
    return best_params

In [None]:
# Load the general employee data plus the manager and employee survey tables.
res_df, ms, es = (
    pd.read_csv(input_path + csv_name)
    for csv_name in ('general_data.csv',
                     'manager_survey_data.csv',
                     'employee_survey_data.csv')
)

In [None]:
# Join both survey tables onto the general data by employee id, then release
# the now-redundant survey frames.
res_df = (
    res_df
    .merge(ms, on='EmployeeID')
    .merge(es, on='EmployeeID')
)
del ms, es

In [None]:
# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() only appended a stray "None" line.
res_df.info()

In [None]:
# Summary statistics for every column (numeric and non-numeric), transposed
# so that each original column becomes a row.
res_df.describe(include='all').T

In [None]:
# Number of distinct values per column.
# NOTE(review): presumably used to spot constant columns before the drop in the next cell.
res_df.nunique()

In [None]:
# Remove columns that are not used as features. Re-assigning (instead of
# inplace=True) together with errors='ignore' keeps the cell re-runnable:
# a second execution no longer raises KeyError for already-dropped columns.
res_df = res_df.drop(columns=['Over18', 'StandardHours', 'EmployeeCount'], errors='ignore')

In [None]:
# Encode the target as 1 (Yes) / 0 (No). The result is assigned back to the
# frame: an inplace replace on `res_df['Attrition']` operates on a column
# selection and is not guaranteed to update `res_df` under pandas copy-on-write.
# The cell stays idempotent because already-encoded 0/1 values pass through unchanged.
res_df['Attrition'] = res_df['Attrition'].replace({'Yes': 1, 'No': 0}).astype(int)

In [None]:
# Level frequencies of every object-typed (categorical) column.
for col, column_values in res_df.select_dtypes('object').items():
    print(column_values.value_counts())

In [None]:
# Everything that must not be used as a feature: the target, the row id and
# the columns already removed above (listed again so a missing column is harmless).
non_feature_cols = (target_col, 'EmployeeCount', 'Over18', 'StandardHours', index_col)
columns_to_drop = [col for col in res_df.columns if col in non_feature_cols]

y = res_df[target_col]
X = res_df.drop(columns=columns_to_drop)

In [None]:
# Stratified 75/25 train / hold-out split with a fixed seed.
split_options = dict(test_size=0.25, stratify=y, random_state=RANDOM_STATE)
X_train, X_holdout, y_train, y_holdout = train_test_split(X, y, **split_options)
print(X_train.shape, y_train.shape, X_holdout.shape, y_holdout.shape)

In [None]:
%matplotlib inline
num_col = [col for col in X_train.select_dtypes(include=['number']).columns if col not in [target_col, index_col]]

X_train[num_col].hist(figsize=(16, 20), bins=50, xlabelsize=8, ylabelsize=8);

In [None]:
# Pairwise correlations of the numeric features only. X_train still contains
# object (categorical) columns, which DataFrame.corr() cannot correlate and
# which newer pandas versions refuse instead of silently skipping.
corr = X_train.select_dtypes(include=['number']).corr()
plt.figure(figsize=(12, 10))
plt.style.use('bmh')

# Only cells with |r| > 0.49 are shown; weaker correlations are masked to NaN.
sns.heatmap(corr[abs(corr)>0.49], # (corr >= 0.5) | (corr <= -0.4)
            cmap='viridis', vmax=1.0, vmin=-1.0, linewidths=0.1,
            annot=True, annot_kws={"size": 8}, square=True);

In [None]:
# Fit a stand-alone WOE binner on the raw training features; its `woe_df`
# table is plotted in the next cell.
binner = WOE()
binner.fit(X_train, y_train)

In [None]:
%matplotlib inline
plt.figure.max_open_warning=30
woe_df = binner.woe_df
charts = BarCharts(bar_type='v')
charts.plot(woe_df);

In [None]:
# Data for logistic regression: dummy-encode, select features, WOE-transform.
# The preparer is fitted on the training split only and then applied to both splits.
prep = DataPrepare(cat_encode=True, na_fill=True)
prep.fit(X_train, y_train)
train_x = prep.transform(X_train, y_train)
train_x_h = prep.transform(X_holdout)
train_y = y_train.copy()

In [None]:
# Run the voting-based feature selector on the prepared training data
# (a feature needs at least 3 votes), list the selection techniques it has
# available and display the per-technique feature importances.
feat_sel = VotingSelector(minimum_votes=3)
feat_sel.fit(train_x, train_y)
print(feat_sel.available_techniques)
feat_sel.feature_importances_

In [None]:
# Vote table produced by the selector fitted in the previous cell.
feat_sel.feature_votes_

In [None]:
# 5-fold stratified CV, shared by every hyper-parameter search in this notebook.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

# The solver is set explicitly because the search space below samples both
# 'l1' and 'l2' penalties: liblinear supports both, whereas a solver without
# L1 support (such as lbfgs) raises as soon as an 'l1' candidate is fitted.
# liblinear ignores n_jobs and is inherently one-vs-rest, so `n_jobs` and
# `multi_class` are not passed; parallelism is left to BayesSearchCV.
logistic = LogisticRegression(class_weight='balanced', dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=200,
          solver='liblinear',
          tol=0.0001, verbose=0, warm_start=False, random_state=RANDOM_STATE)

# Search space: inverse regularisation strength on a log scale, and penalty type.
distributions = dict(C=Real(1e-10, 1e2, 'log-uniform'),
                     penalty=Categorical(['l2', 'l1']))

# Bayesian optimisation (Gaussian-process surrogate) of ROC AUC.
opt_lr = BayesSearchCV(logistic,
                    distributions,
                    scoring="roc_auc",
                    cv=skf,
                    n_iter=100,
                    n_jobs=NUM_THREADS,
                    return_train_score=False,
                    refit=True,
                    optimizer_kwargs={'base_estimator': 'GP'},
                    random_state=RANDOM_STATE)


In [None]:
%%time
best_params = report_perf(opt_lr, train_x, train_y, 'LogReg', 
                          callbacks=[VerboseCallback(100), 
                                     DeadlineStopper(60*10)]) #DeadlineStopper(60*60*15)])#15 hours

In [None]:
# Mean CV score as a function of C (log-scaled x axis).
# `sort_columns` is no longer passed: DataFrame.plot dropped that keyword and
# it never affected a plot with explicit x and y anyway.
(pd.DataFrame(opt_lr.cv_results_)
   .sort_values(by=['param_C', 'mean_test_score'])
   .plot(x='param_C', y='mean_test_score', logx=True, figsize=(12, 8)));

In [None]:
# Cross-validated score of the best candidate vs. its score on the hold-out set.
print("val. score: %s" % opt_lr.best_score_)
print("test score: %s" % opt_lr.score(train_x_h, y_holdout))

best_lr = opt_lr.best_estimator_
y_proba = best_lr.predict_proba(train_x_h)
y_pred = best_lr.predict(train_x_h)

# Scalar metrics first, then the confusion matrix and the per-class report.
for label, metric in (('Accuracy:', accuracy_score), ('MCC:', matthews_corrcoef)):
    print(label, metric(y_holdout,y_pred))
for make_report in (confusion_matrix, classification_report):
    print(make_report(y_holdout,y_pred))


In [None]:
# Data for CatBoost: no encoding and no imputation, so the raw columns are
# passed through. This overwrites train_x / train_x_h from the LogReg section.
prep_cb = DataPrepare()
prep_cb.fit(X_train, y_train)
train_x = prep_cb.transform(X_train, y_train)
train_x_h = prep_cb.transform(X_holdout)
train_y = y_train.copy()

In [None]:
# Object-typed columns are handed to CatBoost as categorical features.
object_columns = train_x.select_dtypes(include=['object']).columns
cat_features = [col for col in object_columns if col not in (target_col, index_col)]
cat_features

In [None]:
#usual catboost

# Fixed CatBoost settings; the parameters listed in `search_spaces` below are
# overridden per candidate by the Bayesian search.
params = {'iterations': 500,
          'thread_count': NUM_THREADS,
          'learning_rate': 0.05,
          'loss_function': 'Logloss',
          'depth': 6,
          'metadata': {'model_dtypes': str(train_x.dtypes.to_dict())},
          'name': 'cb_sample',
          'random_state': RANDOM_STATE,
          'cat_features': cat_features,
          'custom_metric': ['Accuracy', 'Precision', 'Recall', 'F1'],
          'eval_metric': 'AUC:hints=skip_train~false',
          'early_stopping_rounds': 100,
          'border_count': 30,
#           'boost_from_average': True,
          'metric_period': 25,
#           'task_type': 'GPU',
          'verbose': False}

clf = CatBoostClassifier(**params)

# Defining your search space
search_spaces = {#'iterations': Integer(10, 1000),
                'depth': Integer(2, 6),
                'random_strength': Real(1e-2, 1e3, 'log-uniform'),
                'bagging_temperature': Real(0.5, 1.5),
                'border_count': Integer(5, 50),
                'one_hot_max_size': Integer(2, 15),
                'l2_leaf_reg': Integer(1, 100, 'log-uniform'),
                'max_ctr_complexity': Integer(2, 4),
                }

# `iid` is no longer passed: it is not accepted by current scikit-optimize
# releases (which the install cell upgrades to), and the logistic-regression
# search above does not pass it either. Reuses the `skf` splitter defined for
# the logistic-regression search.
opt = BayesSearchCV(clf,
                    search_spaces,
                    scoring="roc_auc",
                    cv=skf,
                    n_iter=100,
                    n_jobs=1,  # use just 1 job with CatBoost in order to avoid segmentation fault
                    return_train_score=True,
                    optimizer_kwargs={'base_estimator': 'GP'},
                    random_state=RANDOM_STATE)

In [None]:
%%time
best_params = report_perf(opt, train_x, train_y, 'CatBoost', 
                          callbacks=[VerboseCallback(100), 
                                     DeadlineStopper(60*10)]) #DeadlineStopper(60*60*15)])#15 hours

In [None]:
# Best hyper-parameters returned by the CatBoost search in the previous cell.
best_params

In [None]:
# Mean CV score per l2_leaf_reg value (log-scaled x axis); candidates that
# share a value are averaged. `sort_columns` is no longer passed because
# DataFrame.plot dropped that keyword.
col_to_plot='param_l2_leaf_reg'
(pd.DataFrame(opt.cv_results_)
   .groupby([col_to_plot])['mean_test_score'].mean()
   .reset_index()
   .plot(x=col_to_plot, y='mean_test_score', logx=True, figsize=(12,8)));

In [None]:
# Mean CV score per tree depth; candidates that share a depth are averaged.
# `sort_columns` is no longer passed because DataFrame.plot dropped that keyword.
col_to_plot='param_depth'
(pd.DataFrame(opt.cv_results_)
   .groupby([col_to_plot])['mean_test_score'].mean()
   .reset_index()
   .plot(x=col_to_plot, y='mean_test_score', figsize=(12,8)));

In [None]:
# Cross-validated score of the best candidate vs. its score on the hold-out set.
print("val. score: %s" % opt.best_score_)
print("test score: %s" % opt.score(train_x_h, y_holdout))

best_cb = opt.best_estimator_
y_proba = best_cb.predict_proba(train_x_h)
y_pred = best_cb.predict(train_x_h)

# Scalar metrics first, then the confusion matrix and the per-class report.
for label, metric in (('Accuracy:', accuracy_score), ('MCC:', matthews_corrcoef)):
    print(label, metric(y_holdout,y_pred))
for make_report in (confusion_matrix, classification_report):
    print(make_report(y_holdout,y_pred))

In [None]:
# Parameters reported by the refitted best CatBoost model.
opt.best_estimator_.get_all_params()

In [None]:
# SHAP analysis of the best CatBoost model on the hold-out set.
cv_dataset = Pool(data=train_x_h,
                  label=y_holdout,
                  cat_features=cat_features,
                  feature_names=train_x_h.columns.tolist(),
                  thread_count=NUM_THREADS_FILE)
raw_shap = opt.best_estimator_.get_feature_importance(
    cv_dataset,
    type='ShapValues',
    thread_count=NUM_THREADS,
    verbose=cv_dataset.num_row()//5,
)
# The last column is split off as the expected value; the remaining columns
# are the per-feature contributions.
shap_expected_value = raw_shap[0,-1]
shap_values = raw_shap[:,:-1]

# Default summary plot first, then the bar-chart variant.
for plot_options in ({'auto_size_plot': True}, {'plot_type': 'bar'}):
    shap.summary_plot(shap_values,train_x_h, max_display=30, **plot_options)

## Catboost in Stochastic Gradient Langevin Boosting (SGLB) mode
The CatBoost team recently introduced a new feature: the SGLB training mode. Let's test it.

https://arxiv.org/abs/2001.07248
> In this paper, we introduce Stochastic Gradient
> Langevin Boosting (SGLB) — a powerful and efficient machine learning framework, which may
> deal with a wide range of loss functions and has
> provable generalization guarantees. The method
> is based on a special form of Langevin Diffusion
> equation specifically designed for gradient boosting. This allows us to **guarantee the global convergence**, while standard gradient boosting algorithms can guarantee only local optima, which is
> a problem for multimodal loss functions. To illustrate the advantages of SGLB, we apply it to a
> classification task with 0-1 loss function, which
> is known to be multimodal, and to a standard Logistic regression task that is convex. The **algorithm is implemented as a part of the CatBoost**
> gradient boosting library and **outperforms classic**
> gradient boosting methods.

https://github.com/catboost/catboost/releases/tag/v0.21

> The main feature of this release is the Stochastic Gradient Langevin Boosting (SGLB) mode that can improve quality of your models with **non-convex loss functions**. To use it specify langevin option and tune diffusion_temperature and model_shrink_rate. 

In [None]:
#catboost with SGLB functionality

# Same fixed settings as the plain CatBoost run, plus `langevin` to switch on
# SGLB. NOTE: this rebinds the `params` name used by the plain CatBoost cell.
params = {'iterations': 500,
          'thread_count': NUM_THREADS,
          'learning_rate': 0.05,
          'loss_function': 'Logloss',
          'depth': 6,
          'metadata': {'model_dtypes': str(train_x.dtypes.to_dict())},
          'name': 'cb_sglb',
          'random_state': RANDOM_STATE,
          'cat_features': cat_features,
          'custom_metric': ['Accuracy', 'Precision', 'Recall', 'F1'],
          'eval_metric': 'AUC:hints=skip_train~false',
          'early_stopping_rounds': 100,
          'border_count': 30,
          'langevin': True,# option and tune diffusion_temperature and model_shrink_rate
#           'boost_from_average': True,
          'metric_period': 25,
#           'task_type': 'GPU',
          'verbose': False}

clf_sglb = CatBoostClassifier(**params)
# Defining your search space
search_spaces_sglb = {#'iterations': Integer(10, 1000),
                'depth': Integer(2, 6),
                'random_strength': Real(1e-2, 1e3, 'log-uniform'),
                'bagging_temperature': Real(0.5, 1.5),
                'border_count': Integer(5, 50),
                'one_hot_max_size': Integer(2, 15),
                'l2_leaf_reg': Integer(1, 100, 'log-uniform'),
                'max_ctr_complexity': Integer(2, 4),
                'diffusion_temperature': Real(0.01, 10.0, 'log-uniform'),
#                 'model_shrink_rate': Real(0.1, 3.0),
                }

# `iid` is no longer passed: it is not accepted by current scikit-optimize
# releases (which the install cell upgrades to), and the logistic-regression
# search does not pass it either. Reuses the shared `skf` splitter.
opt_sglb = BayesSearchCV(clf_sglb,
                    search_spaces_sglb,
                    scoring="roc_auc",
                    cv=skf,
                    n_iter=100,
                    n_jobs=1,  # use just 1 job with CatBoost in order to avoid segmentation fault
                    return_train_score=True,
                    refit=True,
                    optimizer_kwargs={'base_estimator': 'GP'},
                    random_state=RANDOM_STATE)

In [None]:
%%time
best_params = report_perf(opt_sglb, train_x, train_y, 'CatBoostSGLB', 
                          callbacks=[VerboseCallback(100), 
                                     DeadlineStopper(60*10)]) #DeadlineStopper(60*60*15)])#15 hours

In [None]:
# Best hyper-parameters returned by the SGLB search in the previous cell.
best_params

In [None]:
# Mean CV score per diffusion_temperature value; candidates that share a
# value are averaged. `sort_columns` is no longer passed because
# DataFrame.plot dropped that keyword.
col_to_plot='param_diffusion_temperature'
(pd.DataFrame(opt_sglb.cv_results_)
   .groupby([col_to_plot])['mean_test_score'].mean()
   .reset_index()
   .plot(x=col_to_plot, y='mean_test_score', figsize=(12,8)));

In [None]:
# Mean CV score per tree depth for the SGLB search; candidates that share a
# depth are averaged. `sort_columns` is no longer passed because
# DataFrame.plot dropped that keyword.
col_to_plot='param_depth'
(pd.DataFrame(opt_sglb.cv_results_)
   .groupby([col_to_plot])['mean_test_score'].mean()
   .reset_index()
   .plot(x=col_to_plot, y='mean_test_score', figsize=(12,8)));

In [None]:
# Mean CV score per l2_leaf_reg value for the SGLB search (linear x axis).
# `sort_columns` is no longer passed because DataFrame.plot dropped that keyword.
col_to_plot='param_l2_leaf_reg'
(pd.DataFrame(opt_sglb.cv_results_)
   .groupby([col_to_plot])['mean_test_score'].mean()
   .reset_index()
   .plot(x=col_to_plot, y='mean_test_score', figsize=(12,8)));

In [None]:
# Mean CV score per max_ctr_complexity value for the SGLB search.
# `sort_columns` is no longer passed because DataFrame.plot dropped that keyword.
col_to_plot='param_max_ctr_complexity'
(pd.DataFrame(opt_sglb.cv_results_)
   .groupby([col_to_plot])['mean_test_score'].mean()
   .reset_index()
   .plot(x=col_to_plot, y='mean_test_score', figsize=(12,8)));

In [None]:
# Cross-validated score of the best candidate vs. its score on the hold-out set.
print("val. score: %s" % opt_sglb.best_score_)
print("test score: %s" % opt_sglb.score(train_x_h, y_holdout))

best_sglb = opt_sglb.best_estimator_
y_proba = best_sglb.predict_proba(train_x_h)
y_pred = best_sglb.predict(train_x_h)

# Scalar metrics first, then the confusion matrix and the per-class report.
for label, metric in (('Accuracy:', accuracy_score), ('MCC:', matthews_corrcoef)):
    print(label, metric(y_holdout,y_pred))
for make_report in (confusion_matrix, classification_report):
    print(make_report(y_holdout,y_pred))

In [None]:
# Parameters reported by the refitted best SGLB model.
opt_sglb.best_estimator_.get_all_params()

In [None]:
# Rank the SGLB candidates: best test score first; ties are broken by the
# lower train score, then by the shorter fit time.
ranking_order = {'mean_test_score': False, 'mean_train_score': True, 'mean_fit_time': True}
pd.DataFrame(opt_sglb.cv_results_).sort_values(by=list(ranking_order),
                                               ascending=list(ranking_order.values()))

In [None]:
# SHAP analysis of the best SGLB model on the hold-out set.
# In SGLB mode this has been seen to fail with
# CatBoostError: catboost/libs/fstr/shap_values.cpp:810: Cannot calc shap values, model contains non zero approx for zero-weight leaf
# so the error is caught and reported instead of aborting a full notebook run.
from catboost import CatBoostError

cv_dataset = Pool(data=train_x_h,
                  label=y_holdout,
                  cat_features=cat_features,
                  feature_names=train_x_h.columns.tolist(),
                  thread_count=NUM_THREADS_FILE)
try:
    sglb_shap = opt_sglb.best_estimator_.get_feature_importance(cv_dataset, type='ShapValues', thread_count = NUM_THREADS,
                                                       verbose = cv_dataset.num_row()//5)
except CatBoostError as shap_error:
    print('SHAP values are not available for the SGLB model:', shap_error)
else:
    # The last column is split off as the expected value; the remaining
    # columns are the per-feature contributions.
    shap_expected_value = sglb_shap[0,-1]
    shap_values = sglb_shap[:,:-1]

    shap.summary_plot(shap_values,train_x_h, max_display=30, auto_size_plot=True)

    shap.summary_plot(shap_values,train_x_h, max_display=30, plot_type='bar')