# Import

In [1]:
# Basic
import pandas as pd
import numpy as np

import warnings
warnings.simplefilter('ignore')

# Dataset
from sklearn.datasets import make_moons

# ML Toolkit
from sklearn.metrics import roc_auc_score
from robusta.crossval import crossval

# Model
from robusta.calibration import CalibratedClassifierCV
from robusta.testing import all_models

Using TensorFlow backend.


# Data

In [2]:
# Generate the two-moons toy dataset with a fixed seed so the sample is
# identical on every Restart & Run All (without random_state the points
# are redrawn on each execution).
X, y = make_moons(200, noise=0.2, random_state=0)

# Wrap the arrays as pandas objects; these are what the crossval calls
# further down receive.
X = pd.DataFrame(X)
y = pd.Series(y)

X

Unnamed: 0,0,1
0,0.258196,0.884728
1,0.908593,0.100670
2,0.874676,-0.363453
3,0.325744,-0.559665
4,1.932335,0.077514
...,...,...
195,-0.351102,1.338449
196,0.962902,0.181837
197,1.324512,0.376371
198,1.079179,-0.178871


# All Classifiers

In [3]:
# Settings for the classifier sweep: the metric name and the value passed
# as the `cv` argument of crossval.
scoring, cv = 'roc_auc', 5

In [4]:
# Model names excluded from the classifier sweep.
BLACKLIST = ['RVM', 'Blend', 'CatBoost']

# Maps model name -> the object returned by crossval for that model.
results = {}

for model_dict in all_models(['classifier']):

    name = None
    try:
        name = model_dict['name']
        if name in BLACKLIST:
            # Checked before instantiation so blacklisted models are never built.
            continue
        clf = model_dict['class']()  # default params
    except Exception as exc:
        # Best-effort sweep: a model that cannot be built with default params
        # is skipped. Only Exception is caught, so a kernel interrupt
        # (KeyboardInterrupt) still stops the loop, and the reason is reported
        # instead of being silently dropped.
        print('SKIP {}: {!r}'.format(name, exc))
        continue

    # crossval below is called with method='predict_proba'. Estimators that
    # only expose decision_function are wrapped in CalibratedClassifierCV
    # (presumably so that the wrapper supplies predict_proba); estimators
    # with neither method are skipped.
    if not hasattr(clf, 'predict_proba'):
        if hasattr(clf, 'decision_function'):
            clf = CalibratedClassifierCV(clf)
        else:
            continue

    result = crossval(clf, cv, X, y, scoring=scoring, method='predict_proba',
                      verbose=2, n_jobs=-1, n_digits=6)

    results[name] = result

[21:18:05]  LogisticRegression

[21:18:05]  FOLD  0:   0.937500
[21:18:05]  FOLD  1:   0.907500
[21:18:05]  FOLD  2:   0.970000
[21:18:05]  FOLD  3:   0.965000
[21:18:05]  FOLD  4:   0.950000

[21:18:05]  AVERAGE:   [33m0.946000[0m ± 0.022394

[21:18:05]  LogisticRegressionCV

[21:18:05]  FOLD  0:   0.935000
[21:18:05]  FOLD  1:   0.890000
[21:18:05]  FOLD  2:   0.972500
[21:18:05]  FOLD  3:   0.972500
[21:18:05]  FOLD  4:   0.952500

[21:18:06]  AVERAGE:   [33m0.944500[0m ± 0.030635

[21:18:06]  PassiveAggressiveClassifier

[21:18:06]  FOLD  0:   0.927500
[21:18:06]  FOLD  1:   0.892500
[21:18:06]  FOLD  2:   0.912500
[21:18:06]  FOLD  3:   0.980000
[21:18:06]  FOLD  4:   0.957500

[21:18:06]  AVERAGE:   [33m0.934000[0m ± 0.031289

[21:18:06]  Perceptron

[21:18:06]  FOLD  0:   0.945000
[21:18:06]  FOLD  1:   0.917500
[21:18:06]  FOLD  2:   0.970000
[21:18:06]  FOLD  3:   0.965000
[21:18:06]  FOLD  4:   0.947500

[21:18:06]  AVERAGE:   [33m0.949000[0m ± 0.018480

[21:18:06]  R

# All Regressors

In [5]:
# Settings for the regressor sweep: the metric name and the value passed
# as the `cv` argument of crossval.
scoring, cv = 'r2', 5

In [6]:
# Model names excluded from the regressor sweep.
BLACKLIST = ['RVM', 'Blend', 'CatBoost', 'RANSAC', 'SGD', 'BART']

# Maps model name -> the object returned by crossval for that model.
# NOTE: rebinding `results` here discards the classifier results collected
# earlier in the notebook.
results = {}

for model_dict in all_models(['regressor']):

    name = None
    try:
        name = model_dict['name']
        if name in BLACKLIST or 'MultiTask' in name:
            # Checked before instantiation so excluded models are never built.
            continue
        reg = model_dict['class']()  # default params
    except Exception as exc:
        # Best-effort sweep: skip models that cannot be built with default
        # params. Only Exception is caught, so a kernel interrupt still
        # stops the loop, and the reason is reported.
        print('SKIP {}: {!r}'.format(name, exc))
        continue

    try:
        result = crossval(reg, cv, X, y, scoring=scoring, method='predict',
                          verbose=2, n_jobs=-1, n_digits=6)
    except Exception as exc:
        # A failing model must not abort the whole sweep, but the failure is
        # reported rather than silently swallowed.
        print('FAIL {}: {!r}'.format(name, exc))
        continue

    results[name] = result

[21:18:15]  ARDRegression

[21:18:15]  FOLD  0:   0.615125
[21:18:15]  FOLD  1:   0.494335
[21:18:16]  FOLD  2:   0.638428
[21:18:16]  FOLD  3:   0.680171
[21:18:16]  FOLD  4:   0.585241

[21:18:16]  AVERAGE:   [33m0.602660[0m ± 0.062418

[21:18:16]  BayesianRidge

[21:18:16]  FOLD  0:   0.615694
[21:18:16]  FOLD  1:   0.497566
[21:18:16]  FOLD  2:   0.640176
[21:18:16]  FOLD  3:   0.677738
[21:18:16]  FOLD  4:   0.583886

[21:18:16]  AVERAGE:   [33m0.603012[0m ± 0.061009

[21:18:16]  ElasticNet

[21:18:16]  FOLD  0:   -0.003916
[21:18:16]  FOLD  1:   -0.003916
[21:18:16]  FOLD  2:   -0.003916
[21:18:16]  FOLD  3:   -0.015783
[21:18:16]  FOLD  4:   -0.003916

[21:18:17]  AVERAGE:   [33m-0.006289[0m ± 0.004747

[21:18:17]  ElasticNetCV

[21:18:17]  FOLD  0:   0.616386
[21:18:17]  FOLD  1:   0.497173
[21:18:17]  FOLD  2:   0.639764
[21:18:17]  FOLD  3:   0.678052
[21:18:17]  FOLD  4:   0.583948

[21:18:17]  AVERAGE:   [33m0.603065[0m ± 0.061198

[21:18:17]  HuberRegressor

[21:18


[21:18:34]  AVERAGE:   [33m0.837145[0m ± 0.057861

[21:18:34]  LGBMRegressor

[21:18:34]  FOLD  0:   0.830497
[21:18:34]  FOLD  1:   0.701906
[21:18:35]  FOLD  2:   0.840543
[21:18:35]  FOLD  3:   0.950201
[21:18:35]  FOLD  4:   0.823583

[21:18:35]  AVERAGE:   [33m0.829346[0m ± 0.078749



In [8]:
# Display the whole `results` dict: one entry per model name, holding the
# object crossval returned for that model.
results

{'ARDRegression': {'fit_time': array([0.20031476, 0.20204711, 0.21033406, 0.20480204, 0.19974804]),
  'importance': [array([0.18228286, 0.57267735]),
   array([0.16447968, 0.58403199]),
   array([0.17181951, 0.56932258]),
   array([0.19812847, 0.52910551]),
   array([0.20041318, 0.55860226])],
  'oof_pred': 0      0.086114
  1      0.653682
  2      0.913292
  3      0.925598
  4      0.853554
           ...   
  195   -0.264094
  196    0.645336
  197    0.609140
  198    0.870132
  199    0.398261
  Name: 0, Length: 200, dtype: float64,
  'pred_time': array([0.00210619, 0.00172591, 0.00181079, 0.00164986, 0.00165892]),
  'score': [0.6151245001440433,
   0.49433549434124713,
   0.6384276760797765,
   0.6801714948150881,
   0.5852412396583686],
  'score_time': array([0.0014751 , 0.00152516, 0.00174904, 0.00141597, 0.00133824]),
  'concat_time': 0.07020330429077148,
  'features': [0, 1],
  'datetime': datetime.datetime(2019, 11, 9, 21, 18, 16, 194391),
  'cv': KFold(n_splits=5, random_s