# Statistical foundation of machine learning

In [4]:
import numpy as np
from prep import TimeWindowTransformer, LabelWindowExtractor, TimeDomainTransformer

# adjust import if needed

## Loading raw data

In [5]:
# loading raw data
# PATH = f'data/
PATH = r'C:\Users\gianm\Documents\Uni\Big Data\F422\project\data\\'
DATASET = 'guided' # change this to guided/freemoves if needed

X = np.load(PATH + f'{DATASET}/{DATASET}_dataset_X.npy')
Y = np.load(PATH + f'{DATASET}/{DATASET}_dataset_Y.npy')

## (Optional) Signal filtering

if you plan to filter your sEMG signals, it is recommended to perform
this preprocessing step directly on the continuous raw data prior to window extraction or feature
computation. Note that this step is completely optional but may improve your results.

## (0.5 point) Dataset preparation and augmentation through overlapping windows

You should first segment your sEMG signals into smaller windows of fixed size k = 500. These windows should be created with a chosen degree of overlap, which you can adjust based on the computational and memory resources available to you. Keep in mind that a larger overlap results in a greater number of samples and thus a larger dataset to train your models but to the cost of increasing computational demands.

In [6]:
# Define parameters
size = 500
step = 250

# Initialize transformers
tw_transformer = TimeWindowTransformer(size=size, step=step)
label_extractor = LabelWindowExtractor(size=size, step=step)

# Apply transformations
X_windows = tw_transformer.transform(X)     # shape: (5, n_windows, 8, 500)
Y_labels = label_extractor.transform(Y)     # shape: (5, n_windows, 51)

# Inspect shapes
print("X_windows shape:", X_windows.shape)
print("Y_labels shape:", Y_labels.shape)


X_windows shape: (5, 919, 8, 500)
Y_labels shape: (5, 919, 51)


## (1 point) Cross-validation strategy

Determine and implement an adequate cross-validation strategy to validate your regression models, specifying how you organized your data partitions for training and validation. Provide a detailed justification showing that your validation sets remain completely independent from the training set. Include reasoning or evidence demonstrating explicitly that your chosen partitioning strategy prevents data leakage or bias, ensuring the reliability and generalizability of your model performance estimates.

In [12]:
X_train_val = X_windows[:4]
Y_train_val = Y_labels[:4]
X_test = X_windows[4]
Y_test = Y_labels[4]

## (3 points) Baseline approach

Create a custom class inheriting from scikit-learn’s `BaseEstimator`
and `TransformerMixin` that implements the extraction of common time-domain features described
in section 3.1. Note that the features described in Section 3.1 represent the minimal required set. We
encourage you to include additional features or preprocessing steps if you would like to further improve your model performances. Select at least two different regression models, compare their cross-validated performance, and evaluate their feature importances. For both models, perform feature selection to determine the optimal subset of features minimizing the Root Mean Squared Error (RMSE).
Clearly document this process in your notebook, discussing the outcomes in detail. Finally, create a
scikit-learn `Pipeline` that integrates your custom feature extraction class, the optimal feature selection step, and the best-performing regression model identified from your cross-validation results.
Using visualizations and tables to illustrate your findings, and employing formulas or pseudo-code
to explain the feature selection procedure, is strongly encouraged. Note that one-third of the score
will depend on the quality and clarity of your documentation.

In [9]:
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Lasso
from sklearn.kernel_ridge import KernelRidge
from sklearn.multioutput import MultiOutputRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor, ExtraTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from validation import *

import pyriemann
import pyriemann.regression

### Time domain pipelines

In [10]:
time_est0 = Pipeline(
    [
        ('time_domain_features', TimeDomainTransformer(0.3)),
        ('regressor', Lasso())
    ]
)

time_est1 = Pipeline(
    [
        ('time_domain_features', TimeDomainTransformer()),
        ('kernel_ridge', KernelRidge())
    ]
)

time_est2 = Pipeline(
    [
        ('time_domain_features', TimeDomainTransformer()),
        ('multioutput_svr', MultiOutputRegressor(SVR()))
    ]
)

time_est3 = Pipeline(
    [
        ('time_domain_features', TimeDomainTransformer()),
        ('decision_tree', DecisionTreeRegressor())
    ]
)

time_est4 = Pipeline(
    [
        ('time_domain_features', TimeDomainTransformer()),
        ('extra_tree', ExtraTreeRegressor())
    ]
)

time_est5 = Pipeline(
    [
        ('time_domain_features', TimeDomainTransformer()),
        ('random_forest', RandomForestRegressor(
            n_estimators = 30
        ))
    ]
)

In [None]:
# --- Evaluation ---
# X_train_val and Y_train_val must be loaded before calling this
metric_fns = {'RMSE': RMSE, 'NMSE': NMSE}

all_results = {}
for i, pipeline in enumerate([time_est0, time_est1, time_est2, time_est3, time_est4, time_est5]):
    print(f"\n--- Evaluating time_est{i} ---")
    result = cross_validate_pipeline(pipeline, X_train_val, Y_train_val, metric_fns)
    all_results[f'time_est{i}'] = result


--- Evaluating time_est0 ---


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(



Fold 1
RMSE: train=6.2674, val=8.0515
NMSE: train=0.1937, val=0.3344


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(



Fold 2
RMSE: train=6.2658, val=7.4550
NMSE: train=0.1943, val=0.2824


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(



Fold 3
RMSE: train=6.2199, val=7.7785
NMSE: train=0.1967, val=0.2843


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(



Fold 4
RMSE: train=6.4948, val=6.5881
NMSE: train=0.2088, val=0.2199

Mean Validation Scores:
mean_val_RMSE: 7.4683
mean_val_NMSE: 0.2802


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


NameError: name 'all_results' is not defined

### Riemannian geometry pipelines

In [None]:
# # ----- Riemannian geometry of covariance matrices ----- #
geom_est0 = Pipeline(
    [
        ('cov_matrices', pyriemann.estimation.Covariances()),
        ('svr', MultiOutputRegressor(pyriemann.regression.SVR()))
    ]
)

geom_est1 = Pipeline(
    [
        ('cov_matrices', pyriemann.estimation.Covariances()),
        ('projection', pyriemann.tangentspace.TangentSpace(
            metric = 'riemann',
            tsupdate = True)),
        ('kernel_ridge', KernelRidge(
            kernel='laplacian'))
    ]
)

geom_est2 = Pipeline(
    [
        ('cov_matrices', pyriemann.estimation.Covariances()),
        ('projection', pyriemann.tangentspace.TangentSpace(
            metric = 'riemann',
            tsupdate = True)),
        ('regressor', RandomForestRegressor())
    ]
)

geom_est3 = Pipeline(
    [
        ('cov_matrices', pyriemann.estimation.Covariances()),
        ('projection', pyriemann.tangentspace.TangentSpace(
            metric = 'riemann',
            tsupdate = True)),
        ('regressor', Lasso())
    ]
)

geom_est4 = Pipeline(
    [
        ('cov_matrices', pyriemann.estimation.Covariances()),
        ('projection', pyriemann.tangentspace.TangentSpace(
            metric = 'riemann',
            tsupdate = True)),
        ('regressor', MultiOutputRegressor(GradientBoostingRegressor()))
    ]
)

In [None]:
print('Kernels for ridge:')
display({'additive_chi2', 'polynomial', 'laplacian', 'poly', 'sigmoid', 'precomputed', 'cosine', 'rbf', 'linear', 'chi2'})
print('Kernels for SVR:')
display({'linear', 'poly', 'rbf', 'sigmoid'})

### Ensemble regressors

In [None]:
ens_est0 = VotingRegressor(
    estimators = [time_est5, geom_est1, geom_est4]
)

### Estimator validation

In [None]:
pipeline = geom_est0 # change this to validate another pipeline

results = {}
for fold in range(4):
    train_idx = [0,1,2,3]
    train_idx.remove(fold)
    val_idx = fold

    X_train = X_train_val[train_idx].reshape(-1, *X_train_val.shape[2:])
    Y_train = Y_train_val[train_idx].reshape(-1, *Y_train_val.shape[2:])
    X_val = X_train_val[val_idx]
    Y_val = Y_train_val[val_idx]
    
    pipeline.fit(X_train, Y_train)
    Y_train_pred = pipeline.predict(X_train)
    Y_val_pred = pipeline.predict(X_val)
    print(f"Fold {fold+1}\n.\
          train RMSE:\t{RMSE(Y_train_pred, Y_train):.4f}\ttrain NMSE:\t{NMSE(Y_train_pred, Y_train):.4f}\n.\
            val RMSE:\t{RMSE(Y_val_pred, Y_val):.4f}\tval NMSE:\t{NMSE(Y_val_pred, Y_val):.4f}")
    
    results[fold] = {
        'train_RMSE': RMSE(Y_train_pred, Y_train),
        'train_NMSE': NMSE(Y_train_pred, Y_train),
        'val_RMSE': RMSE(Y_val_pred, Y_val),
        'val_NMSE': NMSE(Y_val_pred, Y_val),
    }

mean_val_RMSE = np.mean([dic['val_RMSE'] for dic in results.values()])
mean_val_NMSE = np.mean([dic['val_NMSE'] for dic in results.values()])
print('Mean val RMSE:', mean_val_RMSE)
print('Mean val NMSE:', mean_val_NMSE)

### Visualizing predictions to unseen data

In [None]:
X_train_val_flat = X_train_val.reshape(-1, *X_train_val.shape[2:])
Y_train_val_flat = Y_train_val.reshape(-1, *Y_train_val.shape[2:])

pipeline.fit(X_train_val_flat, Y_train_val_flat)
Y_train_pred = pipeline.predict(X_train_val_flat)
Y_test_pred = pipeline.predict(X_test)

print(f"train RMSE:\t{RMSE(Y_train_pred, Y_train_val_flat):.4f}\ttrain NMSE:\t{NMSE(Y_train_pred, Y_train_val_flat):.4f}\ntest RMSE:\t{RMSE(Y_test_pred, Y_test):.4f}\ttest NMSE:\t{NMSE(Y_test_pred, Y_test):.4f}")

In [None]:
from visualization_tools import scatter_3d_points
import matplotlib.pyplot as plt
%matplotlib ipympl

bone = 6
Y_true_bone = Y_test[:,3*bone:3*(bone+1)]
Y_pred_bone = Y_test_pred[:,3*bone:3*(bone+1)]

print(f'RMSE for bone {bone}:', RMSE(Y_pred_bone, Y_true_bone))
print(f'NMSE for bone {bone}:', NMSE(Y_pred_bone, Y_true_bone))

ax = scatter_3d_points(Y_true_bone, color = 'b')
scatter_3d_points(Y_pred_bone, color = 'r', ax = ax)
# ax.set_xlim3d(-50, 50)
# ax.set_ylim3d(-50, 50)
# ax.set_zlim3d(-50, 50)
plt.show()