In [1]:
import numpy as np
from sklearn.base import clone
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

from sklearnex import patch_sklearn
patch_sklearn()

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [2]:
# Fix/initiate randomness
seed = 42
rs = np.random.RandomState(np.random.MT19937(np.random.SeedSequence(seed)))

Create a completely random "classification" data set: 
* No relation between "features" and "labels"
* No (purposeful) relationship between the features
* Approx. balanced classes

In [3]:
n_samples, n_features, n_classes = 5_000, 500, 2

X = rs.standard_normal((n_samples, n_features))
y = rs.randint(0, n_classes, size=n_samples)

In [4]:
np.unique(y, return_counts=True)

(array([0, 1]), array([2490, 2510]))

In [5]:
# Various helper functions 

def get_models(random_state):
    return LogisticRegression(penalty="none"), RandomForestClassifier(random_state=random_state)


def get_auc(model, X, y_true):
    return roc_auc_score(y_true=y_true, y_score=model.predict_proba(X)[:, 1])

For the following experiments we will create two models:
1. A standard linear Logistic Regression (with no penalty)
2. A standard non-linear model: Random Forest classifier

In [6]:
linear_model, nonlinear_model = get_models(rs)

### 1. Experiment: Overfitting

#### 1.1 Wrong way: train + test on the same data

In [7]:
linear_model.fit(X, y)
nonlinear_model.fit(X, y)

auc_linear = get_auc(linear_model, X, y)
auc_nonlinear = get_auc(nonlinear_model, X, y)

print("Train+Test on same data = overfitting:")
print(f"Linear model: AUC={auc_linear:.4f}")
print(f"Non-Linear model: AUC={auc_nonlinear:.4f}")

Train+Test on same data = overfitting:
Linear model: AUC=0.6847
Non-Linear model: AUC=1.0000


#### 1.2 Right way: train + test using cross-validation

In [14]:
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)

auc_linear = cross_val_score(clone(linear_model), X, y, cv=cv, scoring="roc_auc", n_jobs=2)
auc_nonlinear = cross_val_score(clone(nonlinear_model), X, y, cv=cv, scoring="roc_auc", n_jobs=2)

print("Train+Test via cross-validation = good:")
print(f"Linear model: AUC={auc_linear.mean():.4f} ({auc_linear.std():.4f})")
print(f"Non-Linear model: AUC={auc_nonlinear.mean():.4f} ({auc_nonlinear.std():.4f})")

Train+Test via cross-validation = good:
Linear model: AUC=0.4991 (0.0202)
Non-Linear model: AUC=0.5014 (0.0224)


### 2. Experiment: Data Leakage during Feature Selection

In [9]:
from sklearn.feature_selection import SelectPercentile

#### 2.1 Wrong way: Perform feature selection on the whole data set and then use cross-validation to fit/validate the model

In [15]:
feature_selection = SelectPercentile(percentile=10)
X_sel = feature_selection.fit_transform(X, y)

auc_linear = cross_val_score(clone(linear_model), X_sel, y, cv=cv, scoring="roc_auc", n_jobs=2)
auc_nonlinear = cross_val_score(clone(nonlinear_model), X_sel, y, cv=cv, scoring="roc_auc", n_jobs=2)

print("Feature selection on all data + cross-validation afterwards = bias:")
print(f"Linear model: AUC={auc_linear.mean():.4f} ({auc_linear.std():.4f})")
print(f"Non-Linear model: AUC={auc_nonlinear.mean():.4f} ({auc_nonlinear.std():.4f})")

Feature selection on all data + cross-validation afterwards = bias:
Linear model: AUC=0.5953 (0.0173)
Non-Linear model: AUC=0.5428 (0.0196)


#### 2.2 Right way: Feature selection + model fit only on the training set, validation of the whole pipeline on the independent test set

In [16]:
from sklearn.pipeline import make_pipeline

linear_pipeline = make_pipeline(clone(feature_selection), clone(linear_model))
nonlinear_pipeline = make_pipeline(clone(feature_selection), clone(nonlinear_model))

auc_linear = cross_val_score(linear_pipeline, X, y, cv=cv, scoring="roc_auc", n_jobs=2)
auc_nonlinear = cross_val_score(nonlinear_pipeline, X, y, cv=cv, scoring="roc_auc", n_jobs=2)

print("Feature selection + modeling via cross-validation = good:")
print(f"Linear model: AUC={auc_linear.mean():.4f} ({auc_linear.std():.4f})")
print(f"Non-Linear model: AUC={auc_nonlinear.mean():.4f} ({auc_nonlinear.std():.4f})")

Feature selection + modeling via cross-validation = good:
Linear model: AUC=0.5129 (0.0080)
Non-Linear model: AUC=0.4998 (0.0228)


#### 2.3 More extreme bias: p >> n case

Create a new (random) data set which has more features (`n_features = 1_000`) than samples (`n_samples = 500`)

In [17]:
# p > n regime:
n_samples_new, n_features_new, n_classes_new = 500, 1_000, 2

X_new = rs.standard_normal((n_samples_new, n_features_new))
y_new = rs.randint(0, n_classes_new, size=n_samples_new)

#### 2.3.1 Wrong way

In [18]:
feature_selection = SelectPercentile(percentile=10)
X_new_sel = feature_selection.fit_transform(X_new, y_new)
auc_linear = cross_val_score(clone(linear_model), X_new_sel, y_new, cv=cv, scoring="roc_auc", n_jobs=2)
auc_nonlinear = cross_val_score(clone(nonlinear_model), X_new_sel, y_new, cv=cv, scoring="roc_auc", n_jobs=2)

print("p >> n:")
print("Feature selection on all data + cross-validation afterwards = bias:")
print(f"Linear model: AUC={auc_linear.mean():.4f} ({auc_linear.std():.4f})")
print(f"Non-Linear model: AUC={auc_nonlinear.mean():.4f} ({auc_nonlinear.std():.4f})")

p >> n:
Feature selection on all data + cross-validation afterwards = bias:
Linear model: AUC=0.7925 (0.0513)
Non-Linear model: AUC=0.7493 (0.0454)


#### 2.3.2 Right way

In [19]:
auc_linear = cross_val_score(clone(linear_pipeline), X_new, y_new, cv=cv, scoring="roc_auc", n_jobs=2)
auc_nonlinear = cross_val_score(clone(nonlinear_pipeline), X_new, y_new, cv=cv, scoring="roc_auc", n_jobs=2)

print("p >> n")
print("Feature selection + modeling via cross-validation = good:")
print(f"Linear model: AUC={auc_linear.mean():.4f} ({auc_linear.std():.4f})")
print(f"Non-Linear model: AUC={auc_nonlinear.mean():.4f} ({auc_nonlinear.std():.4f})")

p >> n
Feature selection + modeling via cross-validation = good:
Linear model: AUC=0.4682 (0.0321)
Non-Linear model: AUC=0.5111 (0.0388)


### 3. Experiment: Improper validation while using cross-validation: the case for ~~nasty~~ nested cross-validation 

In [20]:
from sklearn.feature_selection import RFECV

To reduce the computational demand I will select a random _subset_ of the original 1000 features to perform the following experiments. 

Important to remember our observation in 2.3 where all the bias seems to be much stronger when p >> n

In [23]:
features_selected = rs.permutation(n_features)[:200]
X_sel = X[:, features_selected]

#### 3.1 Wrong way: Performing a selection of optimal number of features via cross-validation __and__ using the obtained cross-validated values as reported performance

In [24]:
cv_inner = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)

In [25]:
rfe_linear = RFECV(estimator=clone(linear_model), cv=cv_inner, scoring="roc_auc", n_jobs=4)
rfe_nonlinear = RFECV(estimator=clone(nonlinear_model), cv=cv_inner, scoring="roc_auc", n_jobs=4)

rfe_linear.fit(X_sel, y)
rfe_nonlinear.fit(X_sel, y)

print("RFE-CV selection used as reporting = biased:")
print(f"Linear model: AUC={rfe_linear.cv_results_['mean_test_score'].max():.4f}")
print(f"Non-Linear model: AUC={rfe_nonlinear.cv_results_['mean_test_score'].max():.4f}")

#### 3.2 Right (but computational intensive way): nested cross-validation

![nested-CV](https://vitalflux.com/wp-content/uploads/2020/08/Screenshot-2020-08-30-at-6.33.47-PM.png)

In [36]:
from sklearn.pipeline import make_pipeline

auc_linear = cross_val_score(clone(rfe_linear), X, y, cv=cv, scoring="roc_auc")
auc_nonlinear = cross_val_score(clone(rfe_nonlinear), X, y, cv=cv, scoring="roc_auc")

print("Nested cross-validation = good:")
print(f"Linear model: AUC={auc_linear.mean():.4f} ({auc_linear.std():.4f})")
print(f"Non-Linear model: AUC={auc_nonlinear.mean():.4f} ({auc_nonlinear.std():.4f})")

KeyboardInterrupt: 

#### 3.3 Hyperameter tuning + performance reporting

In [38]:
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import GridSearchCV

In [None]:
gs_linear_model = LogisticRegressionCV(cv=cv_inner, scoring="roc_auc", n_jobs=4)
gs_nonlinear_model = GridSearchCV(estimator=clone(nonlinear_model), param_grid={"max_features": ["sqrt", "log2", 0.1, 0.3, 0.5, 0.7, 0.9]}, cv=cv_inner,
                                  scoring="roc_auc", n_jobs=4)

gs_linear_model.fit(X, y)
gs_nonlinear_model.fit(X, y)

KeyboardInterrupt: 