In [1]:
from datetime import date
import time

import pandas as pd
import numpy as np
import dask.dataframe as dd
from sklearn.preprocessing import OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from lightgbm import LGBMClassifier
from skopt import BayesSearchCV
from category_encoders import TargetEncoder

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [2]:
dtypes = {
    'EmployeeNo': 'category',
    'Division': 'category',
    'Qualification': 'category',
    'Gender': 'category',
    'Channel_of_Recruitment': 'category',
    'Trainings_Attended': 'int64',
    'Year_of_birth': 'int64',
    'Last_performance_score': 'float64',
    'Year_of_recruitment': 'int64',
    'Targets_met': 'category',
    'Previous_Award': 'category',
    'Training_score_average': 'int64',
    'State_Of_Origin': 'category',
    'Foreign_schooled': 'category',
    'Marital_Status': 'category',
    'Past_Disciplinary_Action': 'category',
    'Previous_IntraDepartmental_Movement': 'category',
    'No_of_previous_employers': 'category',
    'Promoted_or_Not': 'int64'
}

In [3]:
train_df = pd.read_csv('train.csv', dtype=dtypes)
test_df = pd.read_csv('test.csv', dtype=dtypes)

In [4]:
test_df.head()

Unnamed: 0,EmployeeNo,Division,Qualification,Gender,Channel_of_Recruitment,Trainings_Attended,Year_of_birth,Last_performance_score,Year_of_recruitment,Targets_met,Previous_Award,Training_score_average,State_Of_Origin,Foreign_schooled,Marital_Status,Past_Disciplinary_Action,Previous_IntraDepartmental_Movement,No_of_previous_employers
0,YAK/S/00005,Information Technology and Solution Support,First Degree or HND,Male,Agency and others,2,1976,7.5,2017,0,0,65,FCT,Yes,Married,No,No,1
1,YAK/S/00011,Information Technology and Solution Support,,Male,Direct Internal process,2,1991,0.0,2018,0,0,69,OGUN,Yes,Married,No,No,1
2,YAK/S/00015,Research and Innovation,"MSc, MBA and PhD",Male,Direct Internal process,2,1984,7.5,2012,0,0,76,KANO,Yes,Married,No,No,1
3,YAK/S/00016,Customer Support and Field Operations,First Degree or HND,Male,Agency and others,2,1984,2.5,2009,0,0,52,RIVERS,No,Single,No,No,1
4,YAK/S/00017,Information Technology and Solution Support,First Degree or HND,Male,Agency and others,2,1983,7.5,2014,1,0,69,FCT,Yes,Married,No,No,0


In [5]:
# Handle missing Qualifications
train_df['Qualification'] = train_df['Qualification'].cat.add_categories('Unknown').fillna('Unknown')
test_df['Qualification'] = test_df['Qualification'].cat.add_categories('Unknown').fillna('Unknown')

In [6]:
target_column = 'Promoted_or_Not'

In [7]:
# Select all categorical columns for encoding
categorical_columns = list(train_df.select_dtypes('category').columns)
categorical_columns = list(set(categorical_columns) - set({'Promoted_or_Not', 'EmployeeNo', 'Gender', 'State_Of_Origin'}))

## Transformers

In [8]:
class FeatureEngineer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        self.columns = list(X.columns)
        return self
    
    def transform(self, X, y=None):
        this_year = date.today().year
        age =  this_year - X['Year_of_birth']
        num_yrs_at_firm = this_year - X['Year_of_recruitment']
        self.columns.extend(['age', 'num_yrs_at_firm'])
        return pd.DataFrame(np.c_[X, age, num_yrs_at_firm], columns=self.columns)


class ColumnsSelector(BaseEstimator, TransformerMixin):
    def __init__(self, drop_cols):
        self.drop_cols = drop_cols
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        relevant_cols = list(set(X.columns) - set(self.drop_cols))
        return X[relevant_cols]
    

class CategoricalEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, cols, target):
        self.cols = cols
        self.target = target
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        y = X[self.target]
        return TargetEncoder(cols=self.cols).fit_transform(X, y)

In [9]:
# Pipeline
pipeline = Pipeline([
    ('cat', CategoricalEncoder(cols=categorical_columns, target='Promoted_or_Not')),
    ('feature_engineering', FeatureEngineer()),
    ('selector', ColumnsSelector(drop_cols=['EmployeeNo', 'Gender', 'State_Of_Origin', 'Year_of_birth', 'Year_of_recruitment']))
])

In [10]:
encoded_df = pipeline.fit_transform(train_df)

In [11]:
encoded_df['Promoted_or_Not'] = encoded_df['Promoted_or_Not'].astype('category')

In [12]:
X = encoded_df.drop(['Promoted_or_Not'], axis=1)
y = encoded_df['Promoted_or_Not']

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Hyperparameter Tuning

In [45]:
# param_hyperopt= {
#     'learning_rate': hp.loguniform('learning_rate', np.log(0.01), np.log(1)),
#     'max_depth': scope.int(hp.quniform('max_depth', 5, 15, 1)),
#     'n_estimators': scope.int(hp.quniform('n_estimators', 5, 35, 1)),
#     'num_leaves': scope.int(hp.quniform('num_leaves', 5, 50, 1)),
#     'boosting_type': hp.choice('boosting_type', ['gbdt', 'dart']),
#     'colsample_bytree': hp.uniform('colsample_by_tree', 0.6, 1.0),
#     'reg_lambda': hp.uniform('reg_lambda', 0.0, 1.0),
# }

In [46]:
GradientBoostingClassifier()

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              n_iter_no_change=None, presort='auto', random_state=None,
              subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False)

In [60]:
opt = BayesSearchCV(
    GradientBoostingClassifier(),
    {
        'loss': ['deviance', 'exponential'],
        'learning_rate': (0.01, 1.0, 'log-uniform'),
        'subsample': (0.01, 1.0, 'log-uniform'),
        'max_depth': (5, 15, 'log-uniform'),
        'max_features': ['auto', 'sqrt', 'log2']
    },
    n_iter=2,
    cv=10
)

In [61]:
opt.fit(X_train, y_train)

BayesSearchCV(cv=10, error_score='raise',
       estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_sampl...      subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False),
       fit_params=None, iid=True, n_iter=2, n_jobs=1, n_points=1,
       optimizer_kwargs=None, pre_dispatch='2*n_jobs', random_state=None,
       refit=True, return_train_score=False, scoring=None,
       search_spaces={'loss': ['deviance', 'exponential'], 'learning_rate': (0.01, 1.0, 'log-uniform'), 'subsample': (0.01, 1.0, 'log-uniform'), 'max_depth': (5, 15, 'log-uniform'), 'max_features': ['auto', 'sqrt', 'log2']},
       verbose=0)

In [50]:
opt.best_params_

{'learning_rate': 0.24069470383761565, 'loss': 'deviance'}

In [51]:
opt.best_score_

0.9418578093901921

In [52]:
opt.score(X_test, y_test)

0.9393188046457002