# Import

In [1]:
# Basic
import pandas as pd
import numpy as np
import warnings

warnings.simplefilter('ignore')

# ML Toolkit
from robusta.selector import *
from robusta.crossval import *
from robusta.pipeline import *
from robusta.preprocessing import *

from sklearn.model_selection import train_test_split
from sklearn.linear_model import *
from sklearn.metrics import *

# Model
from lightgbm import LGBMClassifier

Using TensorFlow backend.


# Data

In [2]:
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=1000, n_features=50, n_informative=20, 
                           n_redundant=5, n_classes=2, n_clusters_per_class=3, 
                           random_state=666)

X = pd.DataFrame(X)
y = pd.Series(y)

X.rename(columns=lambda x: 'x{}'.format(x), inplace=True)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [3]:
X_test

Unnamed: 0,x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,...,x40,x41,x42,x43,x44,x45,x46,x47,x48,x49
521,-0.015545,3.120701,-0.952097,2.261142,-0.485901,1.727562,-0.682361,-0.736157,0.954906,0.323083,...,2.185016,-0.375094,3.046058,5.049321,-1.107050,-0.180777,-0.446804,2.495812,0.892103,-0.328104
737,0.938483,-0.114027,-0.634103,-3.072293,-0.719155,1.208492,0.035899,-0.582463,1.561299,-4.078034,...,0.127850,1.449720,-1.474860,-1.024908,1.965200,0.703258,0.151494,-4.599697,1.661065,-0.732552
740,0.426196,-1.923413,0.081620,2.799865,1.149632,-0.634046,0.423921,1.427695,-0.198778,-7.494691,...,-0.818374,-1.032857,-1.475499,-0.330181,1.264079,-0.765433,-0.678938,0.939628,2.061886,0.882823
660,0.971310,0.542094,0.002276,0.522958,-0.469007,1.102414,-0.197951,-0.075132,-0.973817,-3.635104,...,0.586837,3.191036,-0.552804,0.853129,3.049305,0.875181,1.141393,2.431720,2.034581,-0.054248
411,-0.827593,-1.197363,1.730815,4.004263,0.677573,0.147013,-0.115850,0.581984,-2.249601,0.410509,...,0.235676,-3.486864,-0.877853,2.466350,-0.616883,0.547480,0.371583,1.330100,1.617696,-0.463036
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
408,-0.092397,-2.051265,-0.368744,3.758068,-1.231090,-0.664343,-0.967801,1.899223,0.023334,1.401348,...,0.263918,3.622385,-3.186513,-1.434954,3.269006,2.380049,0.711234,-0.690256,-0.340061,-1.505044
332,0.260335,2.925519,0.063404,0.499770,-0.151398,0.707486,2.024642,-2.097941,0.260230,-5.095540,...,-0.272538,5.402958,1.055036,-3.539942,-0.587854,1.548070,0.840004,3.241976,-0.573676,0.145328
208,-1.612152,2.016041,-1.333888,1.196686,-2.193994,2.114421,-0.306809,0.374249,1.179624,0.460604,...,0.234624,1.151448,1.874006,1.401194,-0.203070,1.015166,-0.593770,0.977323,2.704582,-0.361519
613,0.854486,-1.786048,-0.161680,-5.288175,0.167380,0.178870,2.773503,1.068635,-1.428526,19.775181,...,0.837280,2.548546,-7.298736,-0.222159,-3.649191,-1.130297,0.034346,-1.371287,-0.154926,-0.257729


# Task & Model

In [4]:
scoring = 'roc_auc'
cv = 5

model = LogisticRegression()

# Baseline

In [5]:
features = X_train.columns

_, y_pred = crossval_predict(model, cv, X_train[features], y_train, X_new=X_test[features],
                             scoring=scoring, method='predict_proba',
                             verbose=2, n_jobs=None)

roc_auc_score(y_test, y_pred)

[01:14:12]  LogisticRegression

[01:14:12]  FOLD  0:   0.8495
[01:14:12]  FOLD  1:   0.9011
[01:14:12]  FOLD  2:   0.8280
[01:14:12]  FOLD  3:   0.8470
[01:14:12]  FOLD  4:   0.8074

[01:14:12]  AVERAGE:   [33m0.8466[0m ± 0.0312



0.8626

# Selector

## RFE 
#### (Recursive Feature Elimination)

In [11]:
selector = RFE(model, cv, scoring, step=0.1, n_jobs=-1)
selector.fit(X_train, y_train)

features = selector.get_features()

[01:17:13] ITER: 1/9      SUBSET: 50/50      SCORE: [33m0.8466[0m ± [36m0.0312[0m      ETA: 879 ms
[01:17:13] ITER: 2/9      SUBSET: 45/50      SCORE: [33m0.8466[0m ± 0.0312      ETA: 770 ms
[01:17:13] ITER: 3/9      SUBSET: 41/50      SCORE: [33m0.8492[0m ± [36m0.0307[0m      ETA: 662 ms
[01:17:13] ITER: 4/9      SUBSET: 37/50      SCORE: [33m0.8547[0m ± 0.0312      ETA: 550 ms
[01:17:14] ITER: 5/9      SUBSET: 34/50      SCORE: [33m0.8559[0m ± [36m0.0293[0m      ETA: 439 ms
[01:17:14] ITER: 6/9      SUBSET: 31/50      SCORE: 0.8540 ± [36m0.0285[0m      ETA: 328 ms
[01:17:14] ITER: 7/9      SUBSET: 28/50      SCORE: [33m0.8562[0m ± 0.0292      ETA: 218 ms
[01:17:14] ITER: 8/9      SUBSET: 26/50      SCORE: [33m0.8573[0m ± 0.0292      ETA: 108 ms
[01:17:14] ITER: 9/9      SUBSET: 25/50      SCORE: 0.8572 ± 0.0304      ETA: 0 ms
Iterations limit exceed!


In [12]:
result = crossval(model, cv, X_train[features], y_train, X_new=X_test[features],
                  scoring=scoring, method='predict_proba', verbose=2)

roc_auc_score(y_test, y_pred)

[01:17:16]  LogisticRegression

[01:17:16]  FOLD  0:   0.8623
[01:17:16]  FOLD  1:   0.9054
[01:17:16]  FOLD  2:   0.8314
[01:17:17]  FOLD  3:   0.8646
[01:17:17]  FOLD  4:   0.8228

[01:17:17]  AVERAGE:   [33m0.8573[0m ± 0.0292



0.8686

## RFE via Permutation Importance

In [8]:
selector = PermutationRFE(model, cv, scoring, step=0.1, n_jobs=-1)
selector.fit(X_train, y_train)

features = selector.get_features()

[01:14:15] ITER: 1/9      SUBSET: 50/50      SCORE: [33m0.8466[0m ± [36m0.0312[0m      ETA: 18 sec
[01:14:17] ITER: 2/9      SUBSET: 45/50      SCORE: [33m0.8531[0m ± [36m0.0291[0m      ETA: 15 sec
[01:14:20] ITER: 3/9      SUBSET: 41/50      SCORE: [33m0.8574[0m ± [36m0.0283[0m      ETA: 13 sec
[01:14:22] ITER: 4/9      SUBSET: 37/50      SCORE: [33m0.8621[0m ± [36m0.0256[0m      ETA: 11 sec
[01:14:24] ITER: 5/9      SUBSET: 34/50      SCORE: [33m0.8646[0m ± [36m0.0255[0m      ETA: 8 sec
[01:14:26] ITER: 6/9      SUBSET: 31/50      SCORE: [33m0.8661[0m ± [36m0.0236[0m      ETA: 6 sec
[01:14:28] ITER: 7/9      SUBSET: 28/50      SCORE: [33m0.8677[0m ± 0.0246      ETA: 4 sec
[01:14:30] ITER: 8/9      SUBSET: 26/50      SCORE: [33m0.8681[0m ± 0.0261      ETA: 2 sec
[01:14:32] ITER: 9/9      SUBSET: 25/50      SCORE: [33m0.8684[0m ± 0.0262      ETA: 0 ms
Iterations limit exceed!


In [10]:
_, y_pred = crossval_predict(model, cv, X_train[features], y_train, X_new=X_test[features],
                             scoring=scoring, method='predict_proba', verbose=2)

roc_auc_score(y_test, y_pred)

[01:14:40]  LogisticRegression

[01:14:40]  FOLD  0:   0.8586
[01:14:40]  FOLD  1:   0.9116
[01:14:40]  FOLD  2:   0.8728
[01:14:40]  FOLD  3:   0.8685
[01:14:40]  FOLD  4:   0.8302

[01:14:40]  AVERAGE:   [33m0.8684[0m ± 0.0262



0.8686

# ___TEST___