# Import

In [1]:
# Basic
import pandas as pd
import numpy as np
import warnings

warnings.simplefilter('ignore')

# ML Toolkit
from robusta.selector import *
from robusta.crossval import *
from robusta.pipeline import *
from robusta.preprocessing import *

from sklearn.model_selection import train_test_split
from sklearn.datasets import make_regression
from sklearn.metrics import *

# Model
from sklearn.linear_model import Ridge

Using TensorFlow backend.


# Data

In [2]:
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split

X, y = make_regression(n_samples=200, n_features=50, n_informative=5, 
                       random_state=666)

X = pd.DataFrame(X)
y = pd.Series(y)

X.rename(columns=lambda x: 'x{}'.format(x), inplace=True)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [3]:
y

0      -75.242067
1       24.025248
2     -147.517904
3      -37.842019
4      -14.627993
          ...    
195     35.853590
196     22.774910
197   -126.268509
198    130.487629
199     15.891230
Length: 200, dtype: float64

# Task & Model

In [4]:
get_score = lambda y_true, y_pred: -mean_squared_error(y_true, y_pred)
scoring = 'neg_mean_squared_error'

cv = 5

model = Ridge()

# Baseline

In [5]:
features = X_train.columns

_, y_pred = crossval_predict(model, cv, X_train[features], y_train, X_new=X_test[features],
                             scoring=scoring, verbose=2, n_jobs=None)

get_score(y_test, y_pred)

[20:23:17]  Ridge

[20:23:17]  FOLD  0:   -4.4332
[20:23:18]  FOLD  1:   -13.3185
[20:23:18]  FOLD  2:   -5.2113
[20:23:18]  FOLD  3:   -19.5226
[20:23:18]  FOLD  4:   -12.6811

[20:23:18]  AVERAGE:   [33m-11.0333[0m ± 5.6118



-11.124871666693128

# Selector

## RFE 
#### (Recursive Feature Elimination)

In [6]:
selector = RFE(model, cv, scoring, n_jobs=-1)
selector.fit(X_train, y_train)

UnboundLocalError: local variable 'time' referenced before assignment

In [7]:
subset = selector.get_subset()

_, y_pred = crossval_predict(model, cv, X_train[subset], y_train, X_new=X_test[subset],
                             scoring=scoring, method='predict_proba',
                             verbose=2, n_jobs=None)

get_score(y_test, y_pred)

NotFittedError: RFE is not fitted

## RFE via Permutation Importance

In [None]:
selector = PermutationRFE(model, cv, scoring, min_features=10, step=0.1, 
                          verbose=2, n_jobs=-1)
selector.fit(X_train, y_train)

In [9]:
subset = selector.get_subset()

_, y_pred = crossval_predict(model, cv, X_train[features], y_train, X_new=X_test[features],
                             scoring=scoring, method='predict_proba', verbose=2)

get_score(y_test, y_pred)

[02:54:49]  Ridge

[02:54:49]  FOLD  0:   -2.4788
[02:54:49]  FOLD  1:   -2.7648
[02:54:49]  FOLD  2:   -1.9524
[02:54:49]  FOLD  3:   -3.8839
[02:54:49]  FOLD  4:   -2.7053

[02:54:49]  AVERAGE:   [33m-2.7570[0m ± 0.6320



-3.733918298961119