# Import

In [1]:
# Basic
import pandas as pd
import numpy as np
import warnings

warnings.simplefilter('ignore')

# ML Toolkit
from robusta.selector import *
from robusta.crossval import *
from robusta.pipeline import *
from robusta.preprocessing import *

from sklearn.model_selection import train_test_split
from sklearn.datasets import make_regression
from sklearn.metrics import *

# Model
from sklearn.linear_model import Ridge

Using TensorFlow backend.


# Data

In [2]:
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split

X, y = make_regression(n_samples=200, n_features=50, n_informative=5, 
                       random_state=666)

X = pd.DataFrame(X)
y = pd.Series(y)

X.rename(columns=lambda x: 'x{}'.format(x), inplace=True)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [3]:
y

0      -75.242067
1       24.025248
2     -147.517904
3      -37.842019
4      -14.627993
          ...    
195     35.853590
196     22.774910
197   -126.268509
198    130.487629
199     15.891230
Length: 200, dtype: float64

# Task & Model

In [4]:
get_score = lambda y_true, y_pred: -mean_squared_error(y_true, y_pred)
scoring = 'neg_mean_squared_error'

cv = 5

model = Ridge()

# Baseline

In [5]:
features = X_train.columns

_, y_pred = crossval_predict(model, cv, X_train[features], y_train, X_new=X_test[features],
                             scoring=scoring, verbose=2, n_jobs=None)

get_score(y_test, y_pred)

[21:10:21]  Ridge

[21:10:21]  FOLD  0:   -4.4332
[21:10:21]  FOLD  1:   -13.3185
[21:10:21]  FOLD  2:   -5.2113
[21:10:21]  FOLD  3:   -19.5226
[21:10:21]  FOLD  4:   -12.6811

[21:10:21]  AVERAGE:   [33m-11.0333[0m ± 5.6118



-11.124871666693128

# Selector

## RFE 
#### (Recursive Feature Elimination)

In [6]:
selector = RFE(model, cv, scoring, n_jobs=-1)
selector.fit(X_train, y_train)

features = selector.get_features()

[21:10:21] ITER: 1/26	SUBSET: 50/50	SCORE: [33m-11.0333[0m ± [36m5.6118[0m	ETA: 2 sec
[21:10:21] ITER: 2/26	SUBSET: 49/50	SCORE: [33m-10.9132[0m ± [36m5.6045[0m	ETA: 2 sec
[21:10:22] ITER: 3/26	SUBSET: 48/50	SCORE: [33m-10.8331[0m ± 5.6296	ETA: 2 sec
[21:10:22] ITER: 4/26	SUBSET: 47/50	SCORE: [33m-10.6639[0m ± 5.6079	ETA: 2 sec
[21:10:22] ITER: 5/26	SUBSET: 46/50	SCORE: [33m-10.4105[0m ± [36m5.2825[0m	ETA: 2 sec
[21:10:22] ITER: 6/26	SUBSET: 45/50	SCORE: [33m-10.2890[0m ± 5.2926	ETA: 2 sec
[21:10:22] ITER: 7/26	SUBSET: 44/50	SCORE: [33m-10.1547[0m ± 5.3142	ETA: 1 sec
[21:10:22] ITER: 8/26	SUBSET: 43/50	SCORE: [33m-9.9987[0m ± [36m5.1674[0m	ETA: 1 sec
[21:10:22] ITER: 9/26	SUBSET: 42/50	SCORE: [33m-9.8979[0m ± 5.2327	ETA: 1 sec
[21:10:22] ITER: 10/26	SUBSET: 41/50	SCORE: [33m-9.7868[0m ± 5.2371	ETA: 1 sec
[21:10:23] ITER: 11/26	SUBSET: 40/50	SCORE: [33m-9.7188[0m ± 5.1924	ETA: 1 sec
[21:10:23] ITER: 12/26	SUBSET: 39/50	SCORE: [33m-9.6912[0m ± 5.1754	ETA: 

AttributeError: 'RFE' object has no attribute 'get_features'

In [7]:
_, y_pred = crossval_predict(model, cv, X_train[features], y_train, X_new=X_test[features],
                             scoring=scoring, method='predict_proba',
                             verbose=2, n_jobs=None)

get_score(y_test, y_pred)

[02:54:19]  Ridge

[02:54:19]  FOLD  0:   -3.1975
[02:54:19]  FOLD  1:   -7.7803
[02:54:19]  FOLD  2:   -3.0137
[02:54:19]  FOLD  3:   -7.2204
[02:54:19]  FOLD  4:   -5.9828

[02:54:19]  AVERAGE:   [33m-5.4389[0m ± 1.9929



-8.4853214315755

## RFE via Permutation Importance

In [8]:
selector = PermutationRFE(model, cv, scoring, min_features=10, step=0.1, 
                          verbose=2, n_jobs=-1)
selector.fit(X_train, y_train)

features = selector.get_features()

[02:54:21] ITER: 1/20      SUBSET: 50/50      SCORE: [33m-11.0333[0m ± [36m5.6118[0m      ETA: 35 sec
[02:54:23] ITER: 2/20      SUBSET: 45/50      SCORE: [33m-7.9381[0m ± [36m3.1770[0m      ETA: 35 sec
[02:54:23] DROP: {'x7', 'x18', 'x17', 'x11', 'x1'}
[02:54:25] ITER: 3/20      SUBSET: 41/50      SCORE: [33m-6.9718[0m ± [36m2.9866[0m      ETA: 33 sec
[02:54:25] DROP: {'x41', 'x37', 'x43', 'x40'}
[02:54:27] ITER: 4/20      SUBSET: 37/50      SCORE: [33m-6.0186[0m ± [36m2.6229[0m      ETA: 30 sec
[02:54:27] DROP: {'x42', 'x23', 'x30', 'x20'}
[02:54:28] ITER: 5/20      SUBSET: 34/50      SCORE: [33m-5.4408[0m ± [36m2.2830[0m      ETA: 28 sec
[02:54:28] DROP: {'x25', 'x21', 'x16'}
[02:54:30] ITER: 6/20      SUBSET: 31/50      SCORE: [33m-4.9590[0m ± [36m2.0389[0m      ETA: 26 sec
[02:54:30] DROP: {'x22', 'x24', 'x32'}
[02:54:32] ITER: 7/20      SUBSET: 28/50      SCORE: [33m-4.6546[0m ± [36m1.8037[0m      ETA: 23 sec
[02:54:32] DROP: {'x33', 'x0', 'x8'}
[02:54

In [9]:
_, y_pred = crossval_predict(model, cv, X_train[features], y_train, X_new=X_test[features],
                             scoring=scoring, method='predict_proba', verbose=2)

get_score(y_test, y_pred)

[02:54:49]  Ridge

[02:54:49]  FOLD  0:   -2.4788
[02:54:49]  FOLD  1:   -2.7648
[02:54:49]  FOLD  2:   -1.9524
[02:54:49]  FOLD  3:   -3.8839
[02:54:49]  FOLD  4:   -2.7053

[02:54:49]  AVERAGE:   [33m-2.7570[0m ± 0.6320



-3.733918298961119