# Import

In [1]:
# Basic: data containers, numerics, and warning control
import pandas as pd
import numpy as np
import warnings

# Silence every warning for the rest of the session. This keeps cell output
# compact, but it also hides deprecation and convergence notices.
warnings.simplefilter('ignore')

# ML Toolkit
# NOTE(review): star imports — RFE, PermutationRFE and crossval_predict used
# further down presumably come from these robusta modules; confirm which one
# provides each name before narrowing the imports.
from robusta.selector import *
from robusta.crossval import *
from robusta.pipeline import *
from robusta.preprocessing import *

from sklearn.model_selection import train_test_split
from sklearn.datasets import make_regression
# NOTE(review): mean_squared_error (used by get_score) is presumably picked up
# through this star import — confirm nothing from robusta shadows it.
from sklearn.metrics import *

# Model: the linear estimator evaluated throughout the notebook
from sklearn.linear_model import Ridge

Using TensorFlow backend.


# Data

In [2]:
# make_regression and train_test_split are already imported in the Import cell,
# so they are not re-imported here.

# Synthetic regression problem: 200 samples, 50 features, 5 of them informative.
# The fixed random_state makes the generated data reproducible.
X, y = make_regression(n_samples=200, n_features=50, n_informative=5,
                       random_state=666)

# Wrap the arrays in pandas containers and label the columns x0 .. x49.
# rename() returns a new frame, so re-running the cell is idempotent and no
# object is mutated in place.
X = pd.DataFrame(X).rename(columns='x{}'.format)
y = pd.Series(y)

# Hold out 20% of the rows as a test split; the seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [3]:
# Show the target series as the cell output (pandas abbreviates long Series).
y

0      -75.242067
1       24.025248
2     -147.517904
3      -37.842019
4      -14.627993
          ...    
195     35.853590
196     22.774910
197   -126.268509
198    130.487629
199     15.891230
Length: 200, dtype: float64

# Task & Model

In [4]:
def get_score(y_true, y_pred):
    """Return the negated mean squared error between targets and predictions.

    The sign is flipped so that a larger value means a better fit, which is
    the same orientation as the 'neg_mean_squared_error' scorer name below.

    Parameters
    ----------
    y_true : array-like
        Ground-truth target values.
    y_pred : array-like
        Predicted target values.

    Returns
    -------
    float
        ``-mean_squared_error(y_true, y_pred)``.
    """
    return -mean_squared_error(y_true, y_pred)


# Scorer name handed to crossval_predict and to the feature selectors.
scoring = 'neg_mean_squared_error'

# Cross-validation setting handed to crossval_predict and to the selectors.
cv = 5

# Ridge regression with its default hyper-parameters.
model = Ridge()

# Baseline

In [5]:
# Baseline: use every available column.
features = X_train.columns

# Cross-validate on the training split and also predict for the hold-out rows
# passed as X_new; only the second returned value is kept.
_, y_pred = crossval_predict(
    model,
    cv,
    X_train[features],
    y_train,
    X_new=X_test[features],
    scoring=scoring,
    verbose=2,
    n_jobs=None,
)

# Hold-out score (negated MSE) shown as the cell output.
get_score(y_test, y_pred)

[20:25:53]  Ridge

[20:25:53]  FOLD  0:   -4.4332
[20:25:53]  FOLD  1:   -13.3185
[20:25:53]  FOLD  2:   -5.2113
[20:25:53]  FOLD  3:   -19.5226
[20:25:53]  FOLD  4:   -12.6811

[20:25:53]  AVERAGE:   [33m-11.0333[0m ± 5.6118



-11.124871666693128

# Selector

## RFE 
#### (Recursive Feature Elimination)

In [6]:
# Recursive feature elimination around the Ridge model, scored by
# cross-validation; n_jobs=-1 is passed through to the selector.
selector = RFE(estimator=model, cv=cv, scoring=scoring, n_jobs=-1)

# Fit on the training split only; the call is the cell's last expression, so
# its return value is displayed.
selector.fit(X_train, y_train)

[20:25:53] ITER: 1/26      SUBSET: 50/50      SCORE: [33m-11.0333[0m ± [36m5.6118[0m      ETA: 2 sec
[20:25:53] ITER: 2/26      SUBSET: 49/50      SCORE: [33m-10.9132[0m ± [36m5.6045[0m      ETA: 2 sec
[20:25:53] ITER: 3/26      SUBSET: 48/50      SCORE: [33m-10.8331[0m ± 5.6296      ETA: 2 sec
[20:25:53] ITER: 4/26      SUBSET: 47/50      SCORE: [33m-10.6639[0m ± 5.6079      ETA: 2 sec
[20:25:54] ITER: 5/26      SUBSET: 46/50      SCORE: [33m-10.4105[0m ± [36m5.2825[0m      ETA: 2 sec
[20:25:54] ITER: 6/26      SUBSET: 45/50      SCORE: [33m-10.2890[0m ± 5.2926      ETA: 2 sec
[20:25:54] ITER: 7/26      SUBSET: 44/50      SCORE: [33m-10.1547[0m ± 5.3142      ETA: 2 sec
[20:25:54] ITER: 8/26      SUBSET: 43/50      SCORE: [33m-9.9987[0m ± [36m5.1674[0m      ETA: 1 sec
[20:25:54] ITER: 9/26      SUBSET: 42/50      SCORE: [33m-9.8979[0m ± 5.2327      ETA: 1 sec
[20:25:54] ITER: 10/26      SUBSET: 41/50      SCORE: [33m-9.7868[0m ± 5.2371      ETA: 1 sec
[20:25

RFE(cv=5,
    estimator=Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
                    normalize=False, random_state=None, solver='auto',
                    tol=0.001),
    min_features=0.5, n_digits=4, n_jobs=-1, scoring='neg_mean_squared_error',
    step=1, use_best=True, verbose=1)

In [7]:
# Features retained by the fitted selector.
subset = selector.get_subset()

# Re-run the baseline evaluation restricted to the selected columns.
# Ridge is a regressor and has no predict_proba, so the prediction method is
# left at its default, exactly as in the baseline cell.
_, y_pred = crossval_predict(model, cv, X_train[subset], y_train, X_new=X_test[subset],
                             scoring=scoring, verbose=2, n_jobs=None)

# Hold-out score (negated MSE) shown as the cell output.
get_score(y_test, y_pred)

[20:25:56]  Ridge

[20:25:56]  FOLD  0:   -3.1975
[20:25:56]  FOLD  1:   -7.7803
[20:25:56]  FOLD  2:   -3.0137
[20:25:56]  FOLD  3:   -7.2204
[20:25:56]  FOLD  4:   -5.9828

[20:25:56]  AVERAGE:   [33m-5.4389[0m ± 1.9929



-8.485321431575544

## RFE via Permutation Importance

In [8]:
# Recursive feature elimination driven by permutation importance.
# min_features, step and verbose are forwarded unchanged to the selector.
selector = PermutationRFE(
    estimator=model,
    cv=cv,
    scoring=scoring,
    min_features=10,
    step=0.1,
    verbose=2,
    n_jobs=-1,
)

# Fit on the training split only; the call is the cell's last expression, so
# its return value is displayed.
selector.fit(X_train, y_train)

[20:26:00] ITER: 1/20      SUBSET: 50/50      SCORE: [33m-11.0333[0m ± [36m5.6118[0m      ETA: 37 sec
[20:26:02] ITER: 2/20      SUBSET: 45/50      SCORE: [33m-7.9381[0m ± [36m3.1770[0m      ETA: 36 sec
[20:26:04] ITER: 3/20      SUBSET: 41/50      SCORE: [33m-7.1174[0m ± [36m2.7743[0m      ETA: 34 sec
[20:26:06] ITER: 4/20      SUBSET: 37/50      SCORE: [33m-6.3699[0m ± [36m2.4176[0m      ETA: 32 sec
[20:26:08] ITER: 5/20      SUBSET: 34/50      SCORE: [33m-6.0039[0m ± [36m2.4088[0m      ETA: 29 sec
[20:26:10] ITER: 6/20      SUBSET: 31/50      SCORE: [33m-5.3048[0m ± [36m2.1016[0m      ETA: 26 sec
[20:26:11] ITER: 7/20      SUBSET: 28/50      SCORE: [33m-4.6051[0m ± [36m1.2229[0m      ETA: 24 sec
[20:26:13] ITER: 8/20      SUBSET: 26/50      SCORE: [33m-4.4978[0m ± 1.2613      ETA: 22 sec
[20:26:15] ITER: 9/20      SUBSET: 24/50      SCORE: [33m-4.2243[0m ± 1.3387      ETA: 20 sec
[20:26:16] ITER: 10/20      SUBSET: 22/50      SCORE: [33m-4.0772[0m ±

PermutationRFE(cv=5,
               estimator=Ridge(alpha=1.0, copy_X=True, fit_intercept=True,
                               max_iter=None, normalize=False,
                               random_state=None, solver='auto', tol=0.001),
               min_features=10, n_digits=4, n_jobs=-1, n_repeats=5,
               random_state=0, scoring='neg_mean_squared_error', step=0.1,
               use_best=True, verbose=2)

In [10]:
# Features retained by the fitted selector.
subset = selector.get_subset()

# Re-run the baseline evaluation restricted to the selected columns.
# Ridge is a regressor and has no predict_proba, so the prediction method is
# left at its default, exactly as in the baseline cell.
_, y_pred = crossval_predict(model, cv, X_train[subset], y_train, X_new=X_test[subset],
                             scoring=scoring, verbose=2)

# Hold-out score (negated MSE) shown as the cell output.
get_score(y_test, y_pred)

[20:26:53]  Ridge

[20:26:53]  FOLD  0:   -2.4788
[20:26:53]  FOLD  1:   -2.7648
[20:26:53]  FOLD  2:   -1.9524
[20:26:53]  FOLD  3:   -3.8839
[20:26:53]  FOLD  4:   -2.7053

[20:26:54]  AVERAGE:   [33m-2.7570[0m ± 0.6320



-3.7339182989611386