### Training Jobs Tracking
- The goal of this notebook is to experiment with how to track training jobs — specifically, to understand what needs to be tracked and how to track it. It also aims to work out the multitude of hyperparameters and how to log them for every training run.

### Imports

In [1]:
import random

import numpy as np
import pandas as pd
import pycatch22
import seaborn as sns
from imblearn.over_sampling import SMOTE

# Scipy related imports
from scipy.fft import rfft, rfftfreq
from scipy.signal import find_peaks
from scipy.stats import kurtosis, skew, zscore

# Sklearn related imports
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

### Constants

In [2]:
# Default sample rate, used as the default `fs` argument of the FFT helpers.
FS = 125
# Column of the loaded dataframe that is read as the label (`df[LABEL_COL]`).
LABEL_COL = 187
# NOTE(review): presumably the seed for random operations — confirm where it is used.
RANDOM_SEED = 42
# NOTE(review): not referenced in the visible cells; purpose to be confirmed.
CLASSIFICATION_COL = 186

### Helper Functions

In [3]:
def calculate_band_power(
    signals: np.ndarray, low_freq: int, high_freq: int, fs: int = FS
) -> np.ndarray:
    """Sum the FFT magnitudes of each signal inside a frequency band.

    Every signal is transformed with a real FFT along its last axis. The
    magnitudes of the bins whose frequency ``f`` satisfies
    ``low_freq <= f < high_freq`` are then added together. The result is a
    sum of magnitudes, not of squared magnitudes.

    Args:
      signals : np.ndarray
        2D array of raw signals; the last axis is the sample axis.
      low_freq : int
        The lower bound of the frequency band (inclusive).
      high_freq : int
        The upper bound of the frequency band (exclusive).
      fs : int
        Sample rate.

    Returns:
      One summed magnitude per signal (the last axis is reduced away).

    """
    n_samples = signals.shape[-1]
    # Frequency of every rfft bin for a sample spacing of 1 / fs.
    bin_freqs = rfftfreq(n_samples, 1 / fs)
    # Half-open band: the bin at exactly `high_freq` is left out.
    in_band = np.logical_and(bin_freqs >= low_freq, bin_freqs < high_freq)
    spectrum = np.abs(rfft(signals, axis=-1))
    return spectrum[..., in_band].sum(axis=-1)

In [4]:
def spectral_centroid(signals: np.ndarray, fs: int = FS) -> np.ndarray:
    """Compute the spectral centroid of each signal.

    The spectral centroid is the magnitude-weighted mean of the FFT bin
    frequencies, ``sum(mag * freq) / sum(mag)``. The more of a signal's
    magnitude sits at high frequencies, the higher its centroid.

    The FFT is taken along the last axis, so both a 2D array of signals
    and a single 1D signal are accepted.

    Args:
      signals : np.ndarray
        Array of raw signals; the last axis is the sample axis.
      fs : int
        Sample rate.

    Returns:
      The spectral centroid of every signal: a 1D array with one entry per
      row for 2D input, or a scalar for a single 1D signal. A signal whose
      spectrum is entirely zero gets a centroid of 0 rather than NaN.

    """
    # NOTE(s1perera) Aug 19, 2025: possible use welch instead because we reduce the noise of the signal
    period = 1 / fs
    num_of_samples = signals.shape[-1]
    xf = rfftfreq(num_of_samples, period)
    # FFT to get the magnitudes of the signals
    mag = np.abs(rfft(signals, axis=-1))
    # https://stackoverflow.com/questions/54032515/spectral-centroid-of-numpy-array
    top = np.sum(mag * xf, axis=-1)
    bottom = np.sum(mag, axis=-1)
    # Guard against division by zero. np.where is used instead of a masked
    # in-place assignment because, for a single 1D signal, `bottom` is a
    # NumPy scalar that does not support item assignment.
    bottom = np.where(bottom == 0, 1e-10, bottom)
    return top / bottom

In [5]:
def extract_features(signals: np.ndarray) -> pd.DataFrame:
    """Summarise every signal as a small set of hand-crafted features.

    This is a dimensionality reduction step: each row of timesteps is
    replaced by ten statistical, time-domain and frequency-domain values.

    Args:
      signals : np.ndarray
        2D array of signals, where each row is a signal and each column
        is a timestep.

    Returns:
      A dataframe with one row per signal and one column per feature.
    """
    # Bounds of the frequency band passed to calculate_band_power.
    low, high = 0, 10

    row_means = signals.mean(axis=1)

    # Flag, per sample, whether it sits at or above its own signal's mean.
    above_mean = np.sign(signals - row_means[:, np.newaxis]) >= 0
    # Differencing a boolean array marks every position where neighbouring
    # flags differ, i.e. every crossing of the mean.
    crossings = np.diff(above_mean, axis=1).sum(axis=1)

    features = {
        # Statistical moments
        "mean": row_means,
        "stdDev": signals.std(axis=1),
        "median": np.median(signals, axis=1),
        # Amplitude-based values
        "energy": np.square(signals).sum(axis=1),
        "rPeak": signals.max(axis=1),
        "minValue": signals.min(axis=1),
        # Average first difference between consecutive timesteps
        "slope_avg": np.diff(signals).mean(axis=1),
        "zeroCrossing": crossings,
        # Frequency-domain values
        "spectralCentroid": spectral_centroid(signals=signals),
        f"band{low}to{high}Freq": calculate_band_power(
            signals=signals, low_freq=low, high_freq=high
        ),
    }
    return pd.DataFrame(features)

In [6]:
def extract_features_catch22(signals: np.ndarray) -> pd.DataFrame:
    """Extract the manual features plus the Catch22 features of each signal.

    This is a dimensionality reduction step: each row of timesteps is
    replaced by the hand-crafted features from `extract_features`
    followed by the 22 Catch22 features computed by `pycatch22`.

    Args:
      signals : np.ndarray
        2D array of signals, where each row is a signal and each column
        is a timestep.

    Returns:
      A dataframe with one row per signal: the manual feature columns
      first, then the Catch22 columns (short names). If `signals` has no
      rows, the Catch22 column names cannot be determined and an empty
      dataframe with only the manual feature columns is returned.
    """
    # Reuse the shared manual feature extraction so the two extractors
    # cannot drift apart.
    manual_features = extract_features(signals=signals)

    # catch22 works on one series at a time, so iterate over the rows.
    c22_rows = []
    c22_colnames = []
    for row in signals:
        # catch24 is left off: the mean and standard deviation it would add
        # are already part of the manual features.
        res = pycatch22.catch22_all(row, short_names=True)
        if not c22_colnames:
            # Take the names from the first result; with no rows the list
            # stays empty instead of reading an unbound variable.
            c22_colnames = res["short_names"]
        c22_rows.append(res["values"])

    c22_df = pd.DataFrame(c22_rows, columns=c22_colnames)

    # Both frames carry a default positional index, so they align row by row.
    return pd.concat([manual_features, c22_df], axis=1)

### Training
1. First we load the data.
2. From our EDA, we can deduce that we don't need to check for duplicates or missing values, so that step is skipped.
3. Preprocessing will be considered the following:
   * Using the `catch22` import, features will be extracted in addition to the features that have been manually extracted.
4. Then the training will begin with 20% of the data being the test.
5. The training data will be normalized using scaler.
6. Best feature extraction will be done with `RFC` and logged as a hyperparameter.
7. A training job will be done using data augmented through SMOTE to reduce the imbalance between the normal and abnormal classifications.


In [10]:
# Locations of the two CSV files, one per class.
normal_train_path = "./.data/ptbdb_normal.csv"
abnormal_train_path = "./.data/ptbdb_abnormal.csv"

# The files have no header row, so the columns are numbered positionally.
normal_df, abnormal_df = (
    pd.read_csv(csv_path, header=None)
    for csv_path in (normal_train_path, abnormal_train_path)
)

# Stack both classes row-wise and store every column as float32.
df = pd.concat([normal_df, abnormal_df], axis=0).astype("float32")

0      float32
1      float32
2      float32
3      float32
4      float32
        ...   
183    float32
184    float32
185    float32
186    float32
187    float32
Length: 188, dtype: object

In [None]:
# Every column except the last one is signal data; column LABEL_COL is the label.
X = df.iloc[:, :-1].values
y = df[LABEL_COL]

# Replace the raw timesteps with the manual + Catch22 feature set.
new_feature_df = extract_features_catch22(signals=X)

# Split the extracted features (not the raw signals) 80/20. The previous
# code passed an undefined `X_new` here, which raised a NameError.
# RANDOM_SEED keeps the split reproducible and consistent with the constants.
X_train, X_test, y_train, y_test = train_test_split(
    new_feature_df, y, test_size=0.2, random_state=RANDOM_SEED
)