In [None]:
import sys
from pathlib import Path
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

%load_ext autoreload
%autoreload 2

import matplotlib.pyplot as plt
%config InlineBackend.figure_format='retina'
plt.rcParams["figure.dpi"] = 70

# Project root is taken to be the parent of the current working directory
# (resolved to an absolute path); the data folder lives directly under it.
base_dir = Path().resolve().parent
data_dir = base_dir / 'data'

# Expose the project root on the import path (presumably so that project-local
# packages can be imported later in the notebook — TODO confirm).
# The membership check keeps the cell idempotent: re-running it no longer
# appends a duplicate entry to sys.path each time.
if str(base_dir) not in sys.path:
    sys.path.append(str(base_dir))

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
# Name of the column to predict; every other remaining column is a feature.
target = 'Balance'

# Read the feature table, parse 'Date' into datetimes and use it as the index,
# then drop the 'Income' and 'Outcome' columns.
data = (
    pd.read_csv(data_dir / 'finalFeatures.csv', index_col=0)
    .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
    .set_index('Date')
    .drop(columns=['Income', 'Outcome'])
)

# Feature names = all columns except the target, in their original order.
features = data.columns.drop(target).tolist()
X, y = data[features], data[target]

### Built-In Feature Selection Methods: L1 and L2 Regularizations

In feature selection, automatic methods are commonly used to select relevant features during the training of a model. Two popular approaches for feature selection are L1 and L2 regularizations.

L1 regularization, also known as Lasso regularization, encourages sparsity in the feature space by penalizing the absolute magnitude of the coefficients. This promotes the selection of a subset of features that have the most impact on the model's performance.

L2 regularization, also called Ridge regularization, penalizes the squared magnitude of the coefficients. It shrinks the coefficients of less important features towards zero but, unlike L1 regularization, rarely makes them exactly zero, so it reduces the influence of weak features rather than producing a sparse feature subset.

#### When Choosing the Main Model

When selecting the main model for your task, it is important to consider the feature selection method that best suits your specific requirements. L1 and L2 regularizations can be integrated into various machine learning models, such as linear regression, logistic regression, and support vector machines, among others.



### SVM and L2 Regularization

The Support Vector Machine (SVM) is a popular machine learning algorithm that has L2 regularization built into its default implementation. The regularization strength is controlled by the parameter C and is inversely proportional to it: higher values of C mean weaker regularization.

L2 regularization in SVM helps in controlling the model's complexity by adding a penalty term to the objective function. This penalty discourages large coefficient values and promotes a smoother decision boundary. It helps to prevent overfitting and improve the generalization ability of the SVM model.

In [None]:
from sklearn.svm import SVR

# Baseline support-vector regressor with C=1, trained on every feature.
# `fit` returns the estimator itself, so construction and fitting are chained.
model = SVR(C=1).fit(X, y)
# Bare last expression so the fitted estimator is still shown as the cell output.
model

### Using SequentialFeatureSelector for Stability Enhancement

In order to improve the stability of our method, we will employ the Wrapper method known as SequentialFeatureSelector. This approach helps to select a subset of features by iteratively adding or removing them based on their performance with the chosen model.

To ensure reliable evaluation, we will incorporate cross-validation while utilizing the SequentialFeatureSelector. However, traditional cross-validation techniques may not be suitable for time series data due to the temporal nature of the data. Therefore, we will employ a specialized time series split to overcome this challenge.

Standard k-fold cross-validation, especially when the data are shuffled, lets the model train on observations that come after the ones it is validated on. With time series this leaks future information into training and produces over-optimistic performance estimates. To address this, we will use a time series cross-validation scheme ("walk-forward" validation): the data are split sequentially so that every validation set lies strictly after its training set, preserving the temporal order. scikit-learn's `TimeSeriesSplit`, used below, implements this with a training window that grows from fold to fold.

By incorporating the SequentialFeatureSelector and time series cross-validation, we aim to enhance the stability and reliability of our feature selection process in the context of time series analysis.

In [None]:
from sklearn.model_selection import TimeSeriesSplit

# Draw one horizontal bar per fold: the training part starts at 0 and the test
# part is stacked immediately after it, so the temporal layout is visible.
tscv = TimeSeriesSplit(n_splits=3)
for fold_no, (train_idx, test_idx) in enumerate(tscv.split(data)):
    fold_label = f'Fold {fold_no}'
    n_train, n_test = len(train_idx), len(test_idx)

    # (caption, bar width, left offset, colour) for each segment of this fold.
    segments = (
        ('train', n_train, 0, 'tab:orange'),
        ('test', n_test, n_train, 'tab:blue'),
    )
    for caption, width, offset, colour in segments:
        bar = plt.barh(fold_label, width, left=offset, color=colour)
        plt.bar_label(bar, labels=[caption], label_type='center')

plt.title('Folds with TimeSeriesSplit')
plt.xlabel('Time series observations indexes');

In [None]:
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.svm import SVR

model = SVR()
n_splits = 3
sfs = SequentialFeatureSelector(model, cv=TimeSeriesSplit(n_splits), n_features_to_select=15, n_jobs=-1)
%time sfs.fit(X, y)

In [None]:
# Boolean mask from the fitted selector, used to pick the retained columns of X.
selected_features_mask = sfs.get_support()
selected_features = X.columns[selected_features_mask]

# Heading typo fixed ('Seleted' -> 'Selected'); one feature name per line.
print('Selected Features:\n')
print(*selected_features, sep='\n')

### Correlation-based Feature Selection: A Filtering Method

In our feature selection process, we will utilize the filtering method known as Correlation-based Feature Selection. This method aims to identify and select relevant features based on their correlation with the target variable.

In [None]:
from tslib.feature_selection import get_best_cfs_features

# Show the helper's own docstring so the reader sees how it works.
print(get_best_cfs_features.__doc__)

# The helper returns a pair; only the first element (the chosen feature names)
# is used here, the second is discarded.
selected_features, _ = get_best_cfs_features(data, features, target)

print('Selected features:\n')
print('\n'.join(map(str, selected_features)))

### Compare stability of feature selection methods 

#### Assessing Stability through Metric Variability Across Folds

In order to evaluate the stability of our feature selection models, we will examine the variability of metrics when testing on different folds. This approach allows us to assess the consistency of our model's performance across different subsets of the data.

In [None]:
from collections import defaultdict

from tslib.scoring import get_score
from tslib.feature_selection import fit_default_model, fit_sfs_model, fit_cfs_model

# Explicit method-name -> fitting-function table. This replaces the previous
# `globals()[f'fit_{method}_model']` lookup, which silently depended on the
# imported names matching a string pattern and would break if this code were
# moved into a function or module.
fit_funcs = {
    'default': fit_default_model,
    'sfs': fit_sfs_model,
    'cfs': fit_cfs_model,
}

tscv = TimeSeriesSplit(n_splits=5)

# method name -> list of scores, one entry per cross-validation fold.
metric_dict = defaultdict(list)

for train_idx, test_idx in tqdm(list(tscv.split(data))):
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_test, y_test = X.iloc[test_idx], y.iloc[test_idx]

    for method, fit_func in fit_funcs.items():
        # Each fit function returns the fitted model together with the feature
        # names it was trained on; prediction must use that same subset.
        # NOTE(review): `model` is the estimator defined in an earlier cell and
        # is shared by every call — confirm the fit_* helpers do not rely on it
        # being unfitted.
        current_model, selected_features = fit_func(model, X_train, y_train)

        pred = current_model.predict(X_test[selected_features])
        score = get_score(y_test, pred)
        metric_dict[method].append(score)

In [None]:
# One vertical strip of points per method: the x position is the method's
# ordinal, the y values are its per-fold scores.
for position, method in enumerate(metric_dict):
    scores = metric_dict[method]
    plt.scatter([position] * len(scores), scores, label=method)

# The x positions carry no meaning beyond grouping, so hide the ticks and let
# the legend (placed under the axes) identify the methods.
plt.xticks([])
plt.legend(bbox_to_anchor=(0.5, 0), loc='upper center', ncol=3)
plt.title('Metrics on different folds with different feature selection models');

In [None]:
# Summary table: one column per method, rows are the mean and the standard
# deviation (np.std with its default settings) of the fold scores, both
# rounded to two decimals.
fs_summary = pd.DataFrame(
    {
        method: [np.mean(vals).round(2), np.std(vals).round(2)]
        for method, vals in metric_dict.items()
    },
    index=['mean', 'std'],
)

fs_summary

Unnamed: 0,default,sfs,cfs
mean,-0.7,-0.91,-1.03
std,1.06,1.15,0.82
