# Import

In [1]:
# Basic
import pandas as pd
import numpy as np
import warnings

warnings.simplefilter('ignore')

# Data
from catboost import datasets

# Prep
from robusta.preprocessing import *
from robusta.pipeline import *

# Eval
from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold
from robusta.crossval import crossval, crossval_predict
from sklearn.metrics import roc_auc_score

# Model
from robusta.testing import get_estimator
from robusta.stack import stack_results, StackingTransformer
from robusta.linear_model import BlendClassifier, CaruanaClassifier

Using TensorFlow backend.


# Data

In [2]:
X, _ = datasets.amazon()
X.index.name = 'id'
y = X.pop('ACTION')

X

Unnamed: 0_level_0,RESOURCE,MGR_ID,ROLE_ROLLUP_1,ROLE_ROLLUP_2,ROLE_DEPTNAME,ROLE_TITLE,ROLE_FAMILY_DESC,ROLE_FAMILY,ROLE_CODE
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,39353,85475,117961,118300,123472,117905,117906,290919,117908
1,17183,1540,117961,118343,123125,118536,118536,308574,118539
2,36724,14457,118219,118220,117884,117879,267952,19721,117880
3,36135,5396,117961,118343,119993,118321,240983,290919,118322
4,42680,5905,117929,117930,119569,119323,123932,19793,119325
...,...,...,...,...,...,...,...,...,...
32764,23497,16971,117961,118300,119993,118321,240983,290919,118322
32765,25139,311198,91261,118026,122392,121143,173805,249618,121145
32766,34924,28805,117961,118327,120299,124922,152038,118612,124924
32767,80574,55643,118256,118257,117945,280788,280788,292795,119082


id
0        1
1        1
2        1
3        1
4        1
        ..
32764    1
32765    1
32766    1
32767    1
32768    1
Name: ACTION, Length: 32769, dtype: int64

In [3]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Task

In [4]:
get_score = lambda y_test, y_pred: roc_auc_score(y_test, y_pred)

cv = RepeatedStratifiedKFold(5, 3, random_state=0)
scoring = 'roc_auc'

# Models

In [5]:
TYPE='classifier'

estimators = [
    # TREE-BASED MODELS
    # Random Forest
    get_estimator('RandomForest', TYPE, n_jobs=-1),
    # LightGBM
    get_estimator('LGB', TYPE, boosting_type='gbdt', num_leaves=3),
    get_estimator('LGB', TYPE, boosting_type='gbdt', num_leaves=7),
    get_estimator('LGB', TYPE, boosting_type='gbdt', num_leaves=15),
    get_estimator('LGB', TYPE, boosting_type='gbdt', num_leaves=31),
    get_estimator('LGB', TYPE, boosting_type='gbdt', num_leaves=63),
    get_estimator('LGB', TYPE, boosting_type='dart', num_leaves=3),
    get_estimator('LGB', TYPE, boosting_type='dart', num_leaves=7),
    get_estimator('LGB', TYPE, boosting_type='dart', num_leaves=15),
    get_estimator('LGB', TYPE, boosting_type='dart', num_leaves=31),
    get_estimator('LGB', TYPE, boosting_type='dart', num_leaves=63),
    get_estimator('LGB', TYPE, boosting_type='goss', num_leaves=3),
    get_estimator('LGB', TYPE, boosting_type='goss', num_leaves=7),
    get_estimator('LGB', TYPE, boosting_type='goss', num_leaves=15),
    get_estimator('LGB', TYPE, boosting_type='goss', num_leaves=31),
    get_estimator('LGB', TYPE, boosting_type='goss', num_leaves=63),
    # XGBoost
    get_estimator('XGB', TYPE, n_jobs=-1, booster='gbtree', max_depth=2),
    get_estimator('XGB', TYPE, n_jobs=-1, booster='gbtree', max_depth=3),
    get_estimator('XGB', TYPE, n_jobs=-1, booster='gbtree', max_depth=4),
    get_estimator('XGB', TYPE, n_jobs=-1, booster='gbtree', max_depth=5),
    get_estimator('XGB', TYPE, n_jobs=-1, booster='gbtree', max_depth=6),
    get_estimator('XGB', TYPE, n_jobs=-1, booster='gblinear', max_depth=2),
    get_estimator('XGB', TYPE, n_jobs=-1, booster='gblinear', max_depth=3),
    get_estimator('XGB', TYPE, n_jobs=-1, booster='gblinear', max_depth=4),
    get_estimator('XGB', TYPE, n_jobs=-1, booster='gblinear', max_depth=5),
    get_estimator('XGB', TYPE, n_jobs=-1, booster='gblinear', max_depth=6),
    get_estimator('XGB', TYPE, n_jobs=-1, booster='dart', max_depth=2),
    get_estimator('XGB', TYPE, n_jobs=-1, booster='dart', max_depth=3),
    get_estimator('XGB', TYPE, n_jobs=-1, booster='dart', max_depth=4),
    get_estimator('XGB', TYPE, n_jobs=-1, booster='dart', max_depth=5),
    get_estimator('XGB', TYPE, n_jobs=-1, booster='dart', max_depth=6),
    
    # SVM & LINEAR MODELS
    # SVM
    #get_estimator('SVM', TYPE, probability=True, kernel='linear', C=.1),
    #get_estimator('SVM', TYPE, probability=True, kernel='linear', C=1.0),
    #get_estimator('SVM', TYPE, probability=True, kernel='linear', C=10.0),
    #get_estimator('SVM', TYPE, probability=True, kernel='linear', C=100.0),
    #get_estimator('SVM', TYPE, probability=True, kernel='linear', C=1000.0),
    #get_estimator('SVM', TYPE, probability=True, kernel='poly', degree=2, C=.1),
    #get_estimator('SVM', TYPE, probability=True, kernel='poly', degree=2, C=1.0),
    #get_estimator('SVM', TYPE, probability=True, kernel='poly', degree=2, C=10.0),
    #get_estimator('SVM', TYPE, probability=True, kernel='poly', degree=2, C=100.0),
    #get_estimator('SVM', TYPE, probability=True, kernel='poly', degree=2, C=1000.0),
    # LogReg
    get_estimator('LogisticRegression', TYPE, C=.1),
    get_estimator('LogisticRegression', TYPE, C=1.0),
    get_estimator('LogisticRegression', TYPE, C=10.0),
    get_estimator('LogisticRegression', TYPE, C=100.0),
    get_estimator('LogisticRegression', TYPE, C=1000.0),
    
    # DENSITY-BASED MODELS
    # Gaussian Process
    #get_estimator('GaussianProcess', TYPE),
    # RBF-SVM
    #get_estimator('SVM', TYPE, probability=True, kernel='rbf', C=.1),
    #get_estimator('SVM', TYPE, probability=True, kernel='rbf', C=1.0),
    #get_estimator('SVM', TYPE, probability=True, kernel='rbf', C=10.0),
    #get_estimator('SVM', TYPE, probability=True, kernel='rbf', C=100.0),
    #get_estimator('SVM', TYPE, probability=True, kernel='rbf', C=1000.0),
    
    # NEAREST NEIGHBOURS
    get_estimator('KNeighbors', TYPE, n_jobs=-1, n_neighbors=10),
    get_estimator('KNeighbors', TYPE, n_jobs=-1, n_neighbors=20),
    get_estimator('KNeighbors', TYPE, n_jobs=-1, n_neighbors=50),
    get_estimator('KNeighbors', TYPE, n_jobs=-1, n_neighbors=100),
    get_estimator('KNeighbors', TYPE, n_jobs=-1, n_neighbors=200),
    
]

In [6]:
from collections import defaultdict

names = defaultdict(int)
results = {}

for estimator in estimators:
    name = estimator.__class__.__name__
    idx = names[name]
    names[name] += 1
    
    result = crossval(estimator, cv, X_train, y_train, 
                      X_new=X_test, scoring=scoring,
                      method='predict_proba')
    
    results[name + str(idx)] = result

[10:25:57]  RandomForestClassifier

[10:26:00]  VAL 1:   0.8341
[10:26:01]  VAL 2:   0.8533
[10:26:03]  VAL 3:   0.8216
[10:26:04]  VAL 4:   0.8399
[10:26:06]  VAL 5:   0.8379
[10:26:07]  VAL 6:   0.8036
[10:26:09]  VAL 7:   0.8282
[10:26:10]  VAL 8:   0.8547
[10:26:12]  VAL 9:   0.8298
[10:26:13]  VAL 10:   0.8202
[10:26:14]  VAL 11:   0.8402
[10:26:16]  VAL 12:   0.8445
[10:26:17]  VAL 13:   0.8273
[10:26:19]  VAL 14:   0.8245
[10:26:21]  VAL 15:   0.8120

[10:26:21]  VALID:   [33m0.8315[0m ± 0.0138

[10:26:21]  LGBMClassifier

[10:26:21]  VAL 1:   0.6949
[10:26:21]  VAL 2:   0.7329
[10:26:22]  VAL 3:   0.6900
[10:26:22]  VAL 4:   0.7021
[10:26:22]  VAL 5:   0.6995
[10:26:22]  VAL 6:   0.6667
[10:26:22]  VAL 7:   0.7367
[10:26:22]  VAL 8:   0.7108
[10:26:23]  VAL 9:   0.6796
[10:26:23]  VAL 10:   0.6941
[10:26:23]  VAL 11:   0.6994
[10:26:23]  VAL 12:   0.6972
[10:26:23]  VAL 13:   0.6951
[10:26:23]  VAL 14:   0.7034
[10:26:24]  VAL 15:   0.6974

[10:26:24]  VALID:   [33m0.7000[0

[10:27:43]  VAL 6:   0.6726
[10:27:44]  VAL 7:   0.7473
[10:27:45]  VAL 8:   0.7253
[10:27:46]  VAL 9:   0.6959
[10:27:47]  VAL 10:   0.7085
[10:27:48]  VAL 11:   0.7085
[10:27:49]  VAL 12:   0.7050
[10:27:50]  VAL 13:   0.7006
[10:27:51]  VAL 14:   0.7133
[10:27:52]  VAL 15:   0.7035

[10:27:52]  VALID:   [33m0.7096[0m ± 0.0180

[10:27:52]  XGBClassifier

[10:27:53]  VAL 1:   0.7255
[10:27:54]  VAL 2:   0.7661
[10:27:55]  VAL 3:   0.7348
[10:27:57]  VAL 4:   0.7459
[10:27:58]  VAL 5:   0.7485
[10:28:00]  VAL 6:   0.6982
[10:28:02]  VAL 7:   0.7799
[10:28:03]  VAL 8:   0.7609
[10:28:04]  VAL 9:   0.7350
[10:28:05]  VAL 10:   0.7331
[10:28:06]  VAL 11:   0.7424
[10:28:08]  VAL 12:   0.7334
[10:28:09]  VAL 13:   0.7367
[10:28:10]  VAL 14:   0.7292
[10:28:11]  VAL 15:   0.7321

[10:28:12]  VALID:   [33m0.7401[0m ± 0.0185

[10:28:12]  XGBClassifier

[10:28:13]  VAL 1:   0.7564
[10:28:15]  VAL 2:   0.7976
[10:28:16]  VAL 3:   0.7575
[10:28:18]  VAL 4:   0.7850
[10:28:19]  VAL 5:   0.772

[10:35:57]  VAL 12:   0.5255
[10:35:58]  VAL 13:   0.5530
[10:35:58]  VAL 14:   0.5381
[10:35:58]  VAL 15:   0.5460

[10:35:58]  VALID:   [33m0.5349[0m ± 0.0129

[10:35:58]  LogisticRegression

[10:35:58]  VAL 1:   0.5399
[10:35:58]  VAL 2:   0.5376
[10:35:58]  VAL 3:   0.5163
[10:35:58]  VAL 4:   0.5396
[10:35:59]  VAL 5:   0.5290
[10:35:59]  VAL 6:   0.5408
[10:35:59]  VAL 7:   0.5564
[10:35:59]  VAL 8:   0.5467
[10:35:59]  VAL 9:   0.5191
[10:35:59]  VAL 10:   0.5226
[10:35:59]  VAL 11:   0.5134
[10:35:59]  VAL 12:   0.5255
[10:35:59]  VAL 13:   0.5530
[10:35:59]  VAL 14:   0.5381
[10:36:00]  VAL 15:   0.5460

[10:36:00]  VALID:   [33m0.5349[0m ± 0.0129

[10:36:00]  LogisticRegression

[10:36:00]  VAL 1:   0.5399
[10:36:00]  VAL 2:   0.5376
[10:36:00]  VAL 3:   0.5163
[10:36:00]  VAL 4:   0.5396
[10:36:00]  VAL 5:   0.5290
[10:36:00]  VAL 6:   0.5408
[10:36:01]  VAL 7:   0.5564
[10:36:01]  VAL 8:   0.5467
[10:36:01]  VAL 9:   0.5191
[10:36:01]  VAL 10:   0.5226
[10:36:01]  VAL 1

In [7]:
best_score = None
best_name = None

for name, result in results.items():
    score = get_score(y_test, result['new_pred'])
    result['model_name'] = name
    if not best_score or best_score < score:
        best_score = score
        best_name = name

print(f'Best test score: {best_score:.4f} ({best_name})')

Best test score: 0.8505 (LGBMClassifier4)


# Stacking

In [8]:
S_train, S_test = stack_results(results)
S_train

Unnamed: 0_level_0,RandomForestClassifier0,LGBMClassifier0,LGBMClassifier1,LGBMClassifier2,LGBMClassifier3,LGBMClassifier4,LGBMClassifier5,LGBMClassifier6,LGBMClassifier7,LGBMClassifier8,...,LogisticRegression0,LogisticRegression1,LogisticRegression2,LogisticRegression3,LogisticRegression4,KNeighborsClassifier0,KNeighborsClassifier1,KNeighborsClassifier2,KNeighborsClassifier3,KNeighborsClassifier4
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
28089,0.340000,0.921559,0.744767,0.625682,0.420387,0.504731,0.928346,0.892341,0.747469,0.548484,...,0.930927,0.930927,0.930927,0.930927,0.930927,0.900000,0.916667,0.913333,0.913333,0.905000
1440,0.950000,0.957647,0.960879,0.965363,0.963356,0.973419,0.929737,0.935071,0.936829,0.939860,...,0.939036,0.939036,0.939036,0.939036,0.939036,0.900000,0.900000,0.926667,0.946667,0.961667
10533,0.860000,0.950279,0.950207,0.911228,0.926554,0.906442,0.935224,0.933902,0.920947,0.922684,...,0.942866,0.942865,0.942865,0.942865,0.942865,1.000000,1.000000,1.000000,0.996667,0.978333
1206,0.900000,0.969562,0.971382,0.971524,0.979199,0.992587,0.940513,0.941155,0.944628,0.948036,...,0.968168,0.968168,0.968168,0.968168,0.968168,0.900000,0.950000,0.973333,0.963333,0.951667
31041,0.986667,0.970906,0.970224,0.982133,0.982648,0.994338,0.944499,0.947406,0.953397,0.962255,...,0.947160,0.947157,0.947158,0.947158,0.947158,1.000000,1.000000,1.000000,0.990000,0.963333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21575,0.940000,0.966861,0.965888,0.966605,0.971756,0.974871,0.944873,0.949393,0.954032,0.954015,...,0.946566,0.946563,0.946564,0.946564,0.946564,1.000000,0.950000,0.966667,0.970000,0.970000
29802,0.980000,0.962504,0.964213,0.968610,0.971873,0.989740,0.933156,0.930041,0.942194,0.949171,...,0.942683,0.942683,0.942683,0.942683,0.942683,1.000000,1.000000,0.953333,0.940000,0.931667
5390,1.000000,0.954168,0.965130,0.971238,0.981886,0.990251,0.928309,0.942536,0.947898,0.957449,...,0.939107,0.939107,0.939107,0.939107,0.939107,0.900000,0.916667,0.940000,0.936667,0.873333
860,0.920000,0.912715,0.939533,0.939634,0.965493,0.969237,0.900845,0.910844,0.910623,0.889751,...,0.938504,0.938504,0.938504,0.938504,0.938504,0.500000,0.716667,0.826667,0.786667,0.836667


# Blending

## Simple Averaging

In [9]:
blend = BlendClassifier()

blend.fit(S_train, y_train)

y_pred = blend.predict_proba(S_test)[:, 1]
get_score(y_test, y_pred)

0.8246453764099475

## Ranks Averaging

In [10]:
blend = make_pipeline(
    QuantileTransformer(),
    BlendClassifier(),
)

blend.fit(S_train, y_train)

y_pred = blend.predict_proba(S_test)[:, 1]
get_score(y_test, y_pred)

0.764840721865225

## Weighted Averaging

In [11]:
weights = np.array([np.mean(result['val_score']) for result in results.values()])

In [12]:
blend = BlendClassifier()

blend.fit(S_train * weights, y_train)

y_pred = blend.predict_proba(S_test * weights)[:, 1]
get_score(y_test, y_pred)

0.8276970925840819

In [13]:
blend = BlendClassifier()

blend.fit(S_train * weights**2, y_train)

y_pred = blend.predict_proba(S_test * weights**2)[:, 1]
get_score(y_test, y_pred)

0.8302185721050221

## Weighted Ranks Averaging

In [14]:
blend = make_pipeline(
    QuantileTransformer(),
    BlendClassifier(),
)

blend.fit(S_train * weights, y_train)

y_pred = blend.predict_proba(S_test * weights)[:, 1]
get_score(y_test, y_pred)

0.7648444000311289

In [15]:
blend = make_pipeline(
    QuantileTransformer(),
    BlendClassifier(),
)

blend.fit(S_train * weights**2, y_train)

y_pred = blend.predict_proba(S_test * weights**2)[:, 1]
get_score(y_test, y_pred)

0.7648370436993213

# Caruana Ensembling

## Basic

In [40]:
blend = CaruanaClassifier('roc_auc', iters=10, init_iters=0, replace=False, colsample=1)
blend.fit(S_train, y_train)

y_pred = blend.predict_proba(S_test)[:, 1]
get_score(y_test, y_pred)

0.8474211604497274

## Modified

In [48]:
blend = CaruanaClassifier('roc_auc', random_state=0, tqdm=True)
blend.fit(S_train, y_train)

y_pred = blend.predict_proba(S_test)[:, 1]
get_score(y_test, y_pred)

HBox(children=(IntProgress(value=0), HTML(value='')))




0.8530156507895086

In [42]:
TOP=10

pd.Series(blend.weights_, index=S_train.columns).sort_values()[::-1][:TOP]

RandomForestClassifier0    46.0
LGBMClassifier4            14.0
LGBMClassifier13           14.0
LGBMClassifier14            9.0
LGBMClassifier5             7.0
XGBClassifier8              2.0
LGBMClassifier3             1.0
LGBMClassifier8             1.0
LGBMClassifier9             1.0
LGBMClassifier12            1.0
dtype: float64

## Modified + Ranked

In [47]:
blend = make_pipeline(
    QuantileTransformer(),
    CaruanaClassifier('roc_auc', random_state=0, tqdm=True),
)

blend.fit(S_train, y_train)

y_pred = blend.predict_proba(S_test)[:, 1]
get_score(y_test, y_pred)

HBox(children=(IntProgress(value=0), HTML(value='')))




0.854047666707062

In [44]:
TOP=10

pd.Series(blend[-1].weights_, index=S_train.columns).sort_values()[::-1][:TOP]

RandomForestClassifier0    53.0
LGBMClassifier4            22.0
LGBMClassifier13            9.0
LGBMClassifier14            7.0
LGBMClassifier9             4.0
XGBClassifier14             1.0
LGBMClassifier3             1.0
LGBMClassifier8             1.0
LGBMClassifier12            1.0
XGBClassifier4              1.0
dtype: float64