# Import

In [1]:
# Basic
import pandas as pd
import numpy as np

import warnings

warnings.simplefilter('ignore')

# ML Toolkit
from robusta.preprocessing import *

from robusta.crossval import *
from robusta.pipeline import *
from sklearn.metrics import *

# Data
from catboost.datasets import amazon

# Model
from robusta.testing import all_transformers, get_estimator, ESTIMATORS

%load_ext memory_profiler

Using TensorFlow backend.


# Data

In [2]:
# Load the Amazon dataset bundled with catboost as (train, test) frames.
train, test = amazon()

# Separate the target column from the features; reassign rather than
# mutating the loaded frame in place.
y_train = train['ACTION']
train = train.drop(columns='ACTION')

# train keeps its existing index, now labelled 'id';
# test promotes its own 'id' column to the index.
train.index.name = 'id'
test = test.set_index('id')

train

Unnamed: 0_level_0,RESOURCE,MGR_ID,ROLE_ROLLUP_1,ROLE_ROLLUP_2,ROLE_DEPTNAME,ROLE_TITLE,ROLE_FAMILY_DESC,ROLE_FAMILY,ROLE_CODE
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,39353,85475,117961,118300,123472,117905,117906,290919,117908
1,17183,1540,117961,118343,123125,118536,118536,308574,118539
2,36724,14457,118219,118220,117884,117879,267952,19721,117880
3,36135,5396,117961,118343,119993,118321,240983,290919,118322
4,42680,5905,117929,117930,119569,119323,123932,19793,119325
...,...,...,...,...,...,...,...,...,...
32764,23497,16971,117961,118300,119993,118321,240983,290919,118322
32765,25139,311198,91261,118026,122392,121143,173805,249618,121145
32766,34924,28805,117961,118327,120299,124922,152038,118612,124924
32767,80574,55643,118256,118257,117945,280788,280788,292795,119082


# Preprocessing

In [3]:
from inspect import getfullargspec

def check_arg(func, arg):
    """Tell whether `func` declares a positional parameter called `arg`.

    Parameters
    ----------
    func : callable
        Function or (bound) method to inspect.
    arg : str
        Parameter name to look for.

    Returns
    -------
    bool
        True when `arg` appears in ``getfullargspec(func).args``.
        Keyword-only parameters and ``*args`` / ``**kwargs`` names are
        not part of that list and therefore never match.
    """
    return arg in getfullargspec(func).args

In [4]:
# (name, pipeline) pairs, later handed to FeatureUnion.
transformers = []

for spec in all_transformers():
    # Keep only the transformers tagged as 'numeric'.
    if 'numeric' in spec['tags']:
        instance = spec['class']()
        name = spec['name']

        # Keep only transformers whose fit() has a parameter named 'X'.
        if check_arg(instance.fit, 'X'):
            # Prefix the output columns with '<name>__' so features coming
            # from different transformers stay distinguishable.
            step = make_pipeline(instance, ColumnRenamer(prefix=name + '__'))
            transformers.append((name, step))

In [16]:
# Stage 1: run every collected transformer and combine their outputs.
generated_features = FeatureUnion(transformers)

# Stage 2: route columns by dtype.
category_branch = make_pipeline(
    TypeSelector('category'),
    OrdinalEncoder(),
    Categorizer(),
)
numeric_branch = make_pipeline(
    TypeSelector(np.number),
)
typed_features = FeatureUnion([
    ('category', category_branch),
    ('numeric', numeric_branch),
])

prep_pipe = make_pipeline(generated_features, typed_features)

# Fit on the training features only, then apply the fitted pipeline to test.
X_train = prep_pipe.fit_transform(train)
X_test = prep_pipe.transform(test)

X_train

Unnamed: 0,KBinsDiscretizer__RESOURCE,KBinsDiscretizer__MGR_ID,KBinsDiscretizer__ROLE_ROLLUP_1,KBinsDiscretizer__ROLE_ROLLUP_2,KBinsDiscretizer__ROLE_DEPTNAME,KBinsDiscretizer__ROLE_TITLE,KBinsDiscretizer__ROLE_FAMILY_DESC,KBinsDiscretizer__ROLE_FAMILY,KBinsDiscretizer__ROLE_CODE,DowncastTransformer__RESOURCE,...,PolynomialFeatures__ROLE_TITLE^2,PolynomialFeatures__ROLE_TITLE ROLE_FAMILY_DESC,PolynomialFeatures__ROLE_TITLE ROLE_FAMILY,PolynomialFeatures__ROLE_TITLE ROLE_CODE,PolynomialFeatures__ROLE_FAMILY_DESC^2,PolynomialFeatures__ROLE_FAMILY_DESC ROLE_FAMILY,PolynomialFeatures__ROLE_FAMILY_DESC ROLE_CODE,PolynomialFeatures__ROLE_FAMILY^2,PolynomialFeatures__ROLE_FAMILY ROLE_CODE,PolynomialFeatures__ROLE_CODE^2
0,2,4,0,1,4,0,0,2,0,39353,...,1.390159e+10,1.390171e+10,3.430080e+10,1.390194e+10,1.390182e+10,3.430110e+10,1.390206e+10,8.463386e+10,3.430168e+10,1.390230e+10
1,0,0,0,3,4,2,1,3,2,17183,...,1.405078e+10,1.405078e+10,3.657713e+10,1.405114e+10,1.405078e+10,3.657713e+10,1.405114e+10,9.521791e+10,3.657805e+10,1.405149e+10
2,2,2,2,1,0,-1,4,0,-1,36724,...,1.389546e+10,3.158591e+10,2.324692e+09,1.389558e+10,7.179827e+10,5.284281e+09,3.158618e+10,3.889178e+08,2.324711e+09,1.389569e+10
3,2,1,0,3,3,1,3,2,1,36135,...,1.399986e+10,2.851335e+10,3.442183e+10,1.399998e+10,5.807281e+10,7.010653e+10,2.851359e+10,8.463386e+10,3.442212e+10,1.400010e+10
4,3,1,0,0,2,3,2,0,3,42680,...,1.423798e+10,1.478794e+10,2.361760e+09,1.423822e+10,1.535914e+10,2.452986e+09,1.478819e+10,3.917628e+08,2.361800e+09,1.423846e+10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32764,1,2,0,1,3,1,3,2,1,23497,...,1.399986e+10,2.851335e+10,3.442183e+10,1.399998e+10,5.807281e+10,7.010653e+10,2.851359e+10,8.463386e+10,3.442212e+10,1.400010e+10
32765,1,4,0,0,4,4,3,2,4,25139,...,1.467563e+10,2.105526e+10,3.023947e+10,1.467587e+10,3.020818e+10,4.338486e+10,2.105561e+10,6.230915e+10,3.023997e+10,1.467611e+10
32766,2,3,0,2,3,4,3,2,4,34924,...,1.560551e+10,1.899289e+10,1.481725e+10,1.560576e+10,2.311555e+10,1.803353e+10,1.899320e+10,1.406881e+10,1.481749e+10,1.560601e+10
32767,4,4,2,1,0,4,4,3,3,80574,...,7.884190e+10,7.884190e+10,8.221332e+10,3.343680e+10,7.884190e+10,8.221332e+10,3.343680e+10,8.572891e+10,3.486661e+10,1.418052e+10


# Task

In [9]:
# Cross-validation setting passed as the second argument to crossval().
cv: int = 5
# Metric name passed to crossval() via its `scoring` keyword.
scoring: str = 'roc_auc'

# Estimator

In [13]:
# Ask robusta's testing helper for the estimator registered as
# ('LGB', 'classifier'); the cell output shows it is an LGBMClassifier.
estimator = get_estimator('LGB', 'classifier')
# Fit once on the full preprocessed training set. As the last expression of
# the cell, the value returned by fit() is what gets displayed.
estimator.fit(X_train, y_train)

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.1, max_depth=-1,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
               random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
               subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

# Evaluation

## Before

In [18]:
# Baseline: cross-validate on the raw features (`train`), i.e. before the
# preprocessing pipeline is applied.
result = crossval(
    estimator, cv, train, y_train,
    scoring=scoring,
    return_pred=False,
    verbose=2,
    n_jobs=None,
)

[00:06:16]  LGBMClassifier

[00:06:16]  FOLD  0:   0.8288
[00:06:16]  FOLD  1:   0.8555
[00:06:17]  FOLD  2:   0.8260
[00:06:17]  FOLD  3:   0.8286
[00:06:17]  FOLD  4:   0.8533

[00:06:17]  AVERAGE:   0.8384 ± 0.0131



## After

In [19]:
# Same cross-validation setup, now on the preprocessed features (`X_train`).
# This overwrites `result`; the feature-importance cell reads this run.
result = crossval(
    estimator, cv, X_train, y_train,
    scoring=scoring,
    return_pred=False,
    verbose=2,
    n_jobs=None,
)

[00:06:18]  LGBMClassifier

[00:06:22]  FOLD  0:   0.8323
[00:06:26]  FOLD  1:   0.8593
[00:06:30]  FOLD  2:   0.8343
[00:06:34]  FOLD  3:   0.8288
[00:06:39]  FOLD  4:   0.8577

[00:06:39]  AVERAGE:   0.8425 ± 0.0132



# Feature Importances

In [38]:
# Importances from the cross-validation result, with columns labelled by
# the preprocessed feature names.
fold_importance = pd.DataFrame(result['importance'], columns=X_train.columns)

# Summarise each feature by its mean and standard deviation over the rows.
imp = pd.DataFrame({
    'mean': fold_importance.mean(),
    'std': fold_importance.std(),
})

# Show the 50 features with the highest mean importance.
imp.sort_values('mean', ascending=False).head(50)

Unnamed: 0,mean,std
SyntheticFeatures__RESOURCE+MGR_ID,52.2,8.642916
SyntheticFeatures__MGR_ID+ROLE_FAMILY_DESC,52.2,7.854935
Normalizer__ROLE_DEPTNAME,50.2,7.155418
SyntheticFeatures__RESOURCE*MGR_ID,47.8,8.613942
SyntheticFeatures__MGR_ID+ROLE_DEPTNAME,47.6,8.848729
Normalizer__ROLE_FAMILY_DESC,42.8,5.540758
SyntheticFeatures__MGR_ID*ROLE_FAMILY,39.4,9.343447
Normalizer__RESOURCE,39.0,4.358899
SyntheticFeatures__MGR_ID*ROLE_FAMILY_DESC,38.8,4.32435
SyntheticFeatures__MGR_ID-ROLE_FAMILY_DESC,38.0,5.385165
