# Apnea Event Classifier Training Notebook

This notebook guides you through training a machine learning model to detect apnea events from audio features.

In [37]:
# Diagnostic cell: measure how much of the master dataset is (near-)silent.
import pandas as pd

df = pd.read_csv("../master_apnea_dataset.csv")

# --- DIAGNOSTIC STEP: INVESTIGATE SILENT FRAMES ---
# A frame is treated as silent when its RMS energy falls below this cutoff.
# 1e-5 (0.00001) is only a starting point and can be tuned.
SILENCE_THRESHOLD = 1e-5

# Boolean mask first, then the selection, so both counts come from one filter.
is_silent = df['rms'] < SILENCE_THRESHOLD
silent_frames = df.loc[is_silent]
n_silent, n_total = len(silent_frames), len(df)
print(f"Found {n_silent} silent frames out of {n_total} total frames.")
print(f"This is {n_silent / n_total * 100:.2f}% of the dataset.\n")

Found 4724 silent frames out of 147575 total frames.
This is 3.20% of the dataset.



In [38]:
# Break the silent frames down by label; skipped entirely when there are none.
if not silent_frames.empty:
    print("Label distribution of silent frames:")
    silent_label_counts = silent_frames['label'].value_counts()
    print(silent_label_counts)

Label distribution of silent frames:
label
0    4626
1      98
Name: count, dtype: int64


In [None]:
# === IMPORTS ===
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import joblib  # For saving the model and scaler

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report, ConfusionMatrixDisplay
from imblearn.over_sampling import SMOTE

# === FUNCTION DEFINITIONS ===

def create_windows(data, labels, window_size=30, step_size=1):
    """Slice a frame sequence into fixed-length, possibly overlapping windows.

    Args:
        data: Sequence of per-frame feature rows, indexable by slice
            (the callers in this notebook pass a 2-D NumPy array).
        labels: Per-frame labels aligned with ``data``.
        window_size: Number of consecutive frames in each window.
        step_size: Offset, in frames, between the starts of successive windows.

    Returns:
        Tuple ``(X, y)`` of NumPy arrays. ``X`` stacks the windows; ``y`` holds
        one label per window, 1 if any frame in the window is labelled 1 and
        0 otherwise. Both are empty when ``data`` is shorter than
        ``window_size``.
    """
    print(f"\n--- Creating Windows (Window Size={window_size}) ---")
    X, y = [], []
    # The last valid start index is len(data) - window_size, so the range bound
    # needs the +1; without it the final full window (and its label) is lost.
    for start in range(0, len(data) - window_size + 1, step_size):
        stop = start + window_size
        X.append(data[start:stop])
        y.append(1 if 1 in labels[start:stop] else 0)

    X, y = np.array(X), np.array(y)
    print(f"Created {X.shape[0]} windows.")
    print(f"Shape of windowed data (X): {X.shape}")
    return X, y

def extract_statistical_features(windowed_data):
    """Summarise every window by per-feature mean, std, min and max.

    Args:
        windowed_data: Array of shape ``(num_windows, window_size, num_features)``.

    Returns:
        float64 array of shape ``(num_windows, num_features * 4)``. Each row is
        laid out as all means, then all standard deviations, then all minima,
        then all maxima, with features in their original order inside each group.

    Raises:
        ValueError: If ``windowed_data`` is not 3-dimensional, e.g. the empty
            1-D array produced when no window fits into the input.
    """
    print("Extracting statistical features from windows...")
    windowed_data = np.asarray(windowed_data)
    if windowed_data.ndim != 3:
        raise ValueError(
            "windowed_data must have shape (num_windows, window_size, num_features); "
            f"got shape {windowed_data.shape}"
        )

    # Reduce over the time axis (axis=1) for all windows at once instead of
    # looping over windows in Python.
    statistical_features = np.hstack((
        windowed_data.mean(axis=1),
        windowed_data.std(axis=1),
        windowed_data.min(axis=1),
        windowed_data.max(axis=1),
    )).astype(np.float64, copy=False)

    print(f"New feature shape: {statistical_features.shape}")
    return statistical_features

# === MAIN SCRIPT EXECUTION ===

# --- 1. Load and Clean Data ---
print("--- Step 1: Loading and Cleaning Data ---")
try:
    df = pd.read_csv("master_apnea_dataset.csv")
except FileNotFoundError:
    print("❌ ERROR: master_apnea_dataset.csv not found. Please run the data preparation notebook first.")
    # Re-raise rather than calling exit(): inside a notebook exit() asks the
    # kernel to shut down, which throws away all state instead of simply
    # stopping this cell with a visible error.
    raise

print(f"Original dataset size: {len(df)} frames")

# Frames whose RMS energy is below this cutoff are treated as silence.
SILENCE_THRESHOLD = 1e-5

# Keep only non-silent frames; .copy() gives an independent frame so later
# edits cannot write back into (or warn about) the original `df`.
df_cleaned = df[df['rms'] >= SILENCE_THRESHOLD].copy()
print(f"Cleaned dataset size: {len(df_cleaned)} frames")
print(f"Removed {len(df) - len(df_cleaned)} silent frames.")


# --- 2. Define Features and Split Data Chronologically ---
print("\n--- Step 2: Defining Features and Splitting Data ---")
# Every column except the identifier, frame-position and label columns is used
# as a model input.
features = [col for col in df_cleaned.columns if col not in ['patient_id', 'frame_start', 'frame_end', 'label']]
X = df_cleaned[features].values
y = df_cleaned['label'].values

# CRITICAL: split by row position, not randomly, so the test rows all come after
# the training rows (assumes the CSV rows are in chronological order — TODO confirm).
# scikit-learn's train_test_split shuffles by default; a manual positional split
# keeps the order explicit.
test_size = int(len(df_cleaned) * 0.20)  # 20% for the final, unseen test set

# Use an explicit boundary index: the `X[:-test_size]` form returns an EMPTY
# training set (and puts every row in the test set) when test_size is 0.
split_idx = len(df_cleaned) - test_size
X_train_full = X[:split_idx]
y_train_full = y[:split_idx]
X_test = X[split_idx:]
y_test = y[split_idx:]

print(f"Training set size: {len(X_train_full)}")
print(f"Test set size: {len(X_test)}")


# --- 3. Scale Features ---
print("\n--- Step 3: Scaling Features ---")
# Learn the scaling statistics from the training rows only, then reuse that one
# fitted scaler for both splits so the test rows never influence the statistics.
scaler = StandardScaler().fit(X_train_full)
X_train_full_scaled = scaler.transform(X_train_full)
X_test_scaled = scaler.transform(X_test)
print("Features have been scaled using StandardScaler.")


# --- 4. Create Windows and Extract Statistical Features ---
# Window each split on its own so no window mixes training and test frames.
X_train_win, y_train_win = create_windows(data=X_train_full_scaled, labels=y_train_full)
X_test_win, y_test_win = create_windows(data=X_test_scaled, labels=y_test)

# Collapse every window into a single row of per-feature summary statistics.
X_train_featured = extract_statistical_features(windowed_data=X_train_win)
X_test_featured = extract_statistical_features(windowed_data=X_test_win)


# --- 5. Handle Class Imbalance using SMOTE ---
print("\n--- Step 5: Handling Class Imbalance with SMOTE ---")
class_counts_before = np.bincount(y_train_win)
print(f"Class distribution before SMOTE: {class_counts_before}")

# Oversample the training windows only. The test windows are left untouched so
# evaluation still reflects the real, imbalanced class mix.
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_featured, y_train_win)

class_counts_after = np.bincount(y_train_resampled)
print(f"Class distribution after SMOTE: {class_counts_after}")


# --- 6. Train the RandomForest Model ---
print("\n--- Step 6: Training the RandomForest Model ---")
# Hyper-parameters gathered in one place; random_state fixes the forest's
# randomness and n_jobs=-1 uses every available core.
rf_settings = {
    'n_estimators': 150,
    'random_state': 42,
    'n_jobs': -1,
    'class_weight': 'balanced',
}
model = RandomForestClassifier(**rf_settings)

# Fit on the SMOTE-resampled training windows.
model.fit(X_train_resampled, y_train_resampled)
print("✅ Model training complete.")


# --- 7. Evaluate the Model on the UNSEEN Test Set ---
print("\n--- Step 7: Evaluating Model Performance on the Test Set ---")
# Predict on the held-out windows; this split was not used for fitting the
# scaler, for SMOTE, or for training.
y_pred = model.predict(X_test_featured)

# Confusion matrix, rendered as a labelled heat map.
print("\nConfusion Matrix:")
cm = confusion_matrix(y_test_win, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['Normal', 'Apnea'])
disp.plot(cmap=plt.cm.Blues)
plt.title("Confusion Matrix - Test Set Performance")
plt.show()

# Per-class precision / recall / F1, followed by a short reading guide.
print("\nClassification Report:")
report_text = classification_report(y_test_win, y_pred, target_names=['Normal (0)', 'Apnea (1)'])
print(report_text)
for guide_line in (
    "---",
    "Key Metrics to Watch:",
    "Recall (Apnea): Of all actual apnea events, how many did we find? (Higher is better)",
    "Precision (Apnea): When we predicted apnea, how often were we right? (Higher is better)",
    "F1-Score (Apnea): A balanced measure of Precision and Recall.",
):
    print(guide_line)


# --- 8. Save the Final Model and Scaler ---
print("\n--- Step 8: Saving Model and Scaler for Future Use ---")
# Persist both artifacts: new data has to go through the same fitted scaler
# before it can be windowed and passed to the model.
for artifact, target_path in (
    (model, 'apnea_detection_model.pkl'),
    (scaler, 'feature_scaler.pkl'),
):
    joblib.dump(artifact, target_path)
print("✅ Model and scaler have been saved to disk.")

Original dataset size: 147575 frames
Cleaned dataset size: 142851 frames
Removed 4724 silent frames.


: 

In [31]:
# Train the Model
# NOTE(review): `clf`, `X_train` and `y_train` are not defined by any cell that
# appears before this one, so this cell relies on existing kernel state —
# TODO confirm where they come from so Restart & Run All works.
# The fitted estimator is the cell's last expression, so its repr is displayed.
clf.fit(X_train, y_train)

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [32]:
# Evaluate Model Performance
# NOTE(review): `clf`, `X_test`, `y_test` and `accuracy_score` are not defined or
# imported by any cell that appears before this one — TODO confirm their origin.
y_pred = clf.predict(X_test)

# Overall accuracy first, then the per-class breakdown.
test_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", test_accuracy)
test_report = classification_report(y_test, y_pred)
print(test_report)

Accuracy: 0.8688832717609696
              precision    recall  f1-score   support

           0       0.87      1.00      0.93     13424
           1       0.28      0.01      0.01      2005

    accuracy                           0.87     15429
   macro avg       0.57      0.50      0.47     15429
weighted avg       0.79      0.87      0.81     15429



In [33]:
# Save the Trained Model
# Serialises `clf` with joblib into the current working directory.
# NOTE(review): the file name says "rf", but the parameter table shown after
# fitting lists `objective='binary:logistic'` and `booster`, which does not look
# like a random forest — confirm the model type and rename the file if needed.
joblib.dump(clf, 'apnea_rf_model.joblib')
print("Model saved as apnea_rf_model.joblib")

Model saved as apnea_rf_model.joblib


In [36]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report, ConfusionMatrixDisplay
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt

# --- (Optional) Step 0: Create a Sample Dataset ---
# This function generates a dummy CSV file to make the script runnable.
# In your actual use case, you will skip this and use your own CSV file.
def create_dummy_dataset(filename="apnea_dataset.csv", num_seconds=77000, seed=None):
    """Generate a synthetic, imbalanced apnea dataset and write it to CSV.

    Args:
        filename: Path of the CSV file to write.
        num_seconds: Number of rows to generate (one row per simulated second).
        seed: Optional integer seed. When given, a dedicated random generator is
            used so the output is reproducible; when None, NumPy's global random
            state is used, exactly as before this parameter existed.

    Returns:
        The generated DataFrame with columns ``energy``, ``zcr``, ``centroid``
        and ``label`` (the same content that is written to ``filename``).
    """
    print(f"Creating a dummy dataset with {num_seconds} seconds of data...")
    # The np.random module and RandomState expose the same rand/randint API, so
    # either can serve as the random source below.
    rng = np.random if seed is None else np.random.RandomState(seed)

    # Simulate features
    energy = rng.rand(num_seconds) * 0.02 + 0.01
    zcr = rng.rand(num_seconds) * 0.005
    centroid = rng.rand(num_seconds) * 1000 + 1500

    # Simulate labels (highly imbalanced)
    labels = np.zeros(num_seconds, dtype=int)
    # One 20-second event per 500 seconds of data, i.e. at most ~4% positive
    # rows (fewer if randomly placed events overlap).
    for _ in range(int(num_seconds / 500)):
        # Start is kept at least 60 rows before the end so the event fits.
        start = rng.randint(0, num_seconds - 60)
        # Apnea events usually show a drop in energy
        energy[start:start+20] *= 0.1
        labels[start:start+20] = 1  # Event lasts for 20 seconds

    df = pd.DataFrame({
        'energy': energy,
        'zcr': zcr,
        'centroid': centroid,
        'label': labels
    })
    df.to_csv(filename, index=False)
    print(f"Dummy dataset saved as '{filename}'.")
    return df

# --- Step 1: Load and Split the Data ---

def load_and_split_data(filename):
    """Load the CSV and split it by row position into train / validation / test.

    The last 15% of rows form the test set, the 15% before them the validation
    set, and everything earlier the training set. Rows are never shuffled, so
    the split is chronological provided the file itself is in time order.

    Args:
        filename: Path to a CSV containing the columns ``energy``, ``zcr``,
            ``centroid`` and ``label``.

    Returns:
        Tuple ``(X_train, y_train, X_val, y_val, X_test, y_test, features)``
        where the X values are DataFrames, the y values are Series and
        ``features`` is the list of feature column names.
    """
    print("\n--- Step 1: Loading and Splitting Data ---")
    df = pd.read_csv(filename)

    # Define the features (X) and the target (y)
    features = ['energy', 'zcr', 'centroid']
    X = df[features]
    y = df['label']

    # CRITICAL: For time-series data, split chronologically, not randomly.
    # This prevents the model from training on future data and testing on past data.
    n_rows = len(df)
    val_size = int(n_rows * 0.15)   # 15% for validation
    test_size = int(n_rows * 0.15)  # 15% for testing

    # Explicit boundary positions instead of negative offsets: with a size of 0
    # a slice such as `[:-0]` is empty and `[-0:]` is the whole frame, which
    # would leave training empty and put every row in the test set.
    train_end = n_rows - val_size - test_size
    val_end = n_rows - test_size

    # The first ~70% is for training
    X_train = X.iloc[:train_end]
    y_train = y.iloc[:train_end]

    # The next 15% is for validation
    X_val = X.iloc[train_end:val_end]
    y_val = y.iloc[train_end:val_end]

    # The final 15% is for testing
    X_test = X.iloc[val_end:]
    y_test = y.iloc[val_end:]

    print(f"Data split chronologically:")
    print(f"Training set shape: {X_train.shape}")
    print(f"Validation set shape: {X_val.shape}")
    print(f"Test set shape: {X_test.shape}")

    return X_train, y_train, X_val, y_val, X_test, y_test, features

# --- Step 2: Scale Features ---

def scale_features(X_train, X_val, X_test):
    """Standardise the three splits with statistics learned from training only.

    Args:
        X_train: Training features; the scaler is fitted on these.
        X_val: Validation features, transformed with the fitted scaler.
        X_test: Test features, transformed with the fitted scaler.

    Returns:
        Tuple ``(X_train_scaled, X_val_scaled, X_test_scaled)``.
    """
    print("\n--- Step 2: Scaling Features ---")
    # Fit once on the training split so validation/test rows cannot leak into
    # the learned statistics.
    fitted_scaler = StandardScaler()
    fitted_scaler.fit(X_train)

    # Apply the same learned transformation to every split, in order.
    train_scaled, val_scaled, test_scaled = (
        fitted_scaler.transform(split) for split in (X_train, X_val, X_test)
    )

    print("Features have been scaled using StandardScaler.")
    return train_scaled, val_scaled, test_scaled

# --- Step 3: Create Time Windows ---

def create_windows(data, labels, window_size=30, step_size=1):
    """Slice a frame sequence into fixed-length, possibly overlapping windows.

    Args:
        data: Sequence of per-frame feature rows, indexable by slice
            (the caller in this notebook passes a 2-D NumPy array).
        labels: Per-frame labels aligned with ``data``.
        window_size: Number of consecutive rows in each window.
        step_size: Offset, in rows, between the starts of successive windows.

    Returns:
        Tuple ``(X, y)`` of NumPy arrays. ``X`` stacks the windows as
        (samples, timesteps, features); ``y`` holds one label per window, 1 if
        ANY row in the window is labelled 1 and 0 otherwise. Both are empty
        when ``data`` is shorter than ``window_size``.
    """
    print(f"\n--- Step 3: Creating Windows (Window Size={window_size}) ---")
    X, y = [], []
    # The last valid start index is len(data) - window_size, so the range bound
    # needs the +1; without it the final full window (and its label) is lost.
    for start in range(0, len(data) - window_size + 1, step_size):
        stop = start + window_size

        # Extract a window of features
        X.append(data[start:stop])

        # The label for the window is 1 if ANY second in it is an apnea event
        y.append(1 if 1 in labels[start:stop] else 0)

    X, y = np.array(X), np.array(y)
    print(f"Created {X.shape[0]} windows.")
    print(f"Shape of windowed data (X): {X.shape}") # (samples, timesteps, features)
    print(f"Shape of windowed labels (y): {y.shape}")
    return X, y

# --- Main Execution ---

if __name__ == "__main__":
    # End-to-end run: load/split -> scale -> window -> SMOTE -> train -> evaluate.
    import os  # local import: only needed for the existence check below

    # Use your own dataset file here
    dataset_filename = "../master_apnea.csv"

    # Create the dummy dataset if it doesn't exist. A plain existence check is
    # used instead of parsing the whole CSV with pd.read_csv just to discard it;
    # the real load happens once, in load_and_split_data.
    if not os.path.exists(dataset_filename):
        create_dummy_dataset(dataset_filename)

    # Step 1: Load and split the data
    X_train, y_train, X_val, y_val, X_test, y_test, features = load_and_split_data(dataset_filename)

    # Step 2: Scale the features
    X_train_scaled, X_val_scaled, X_test_scaled = scale_features(X_train, X_val, X_test)

    # Step 3: Create windows for all datasets
    # Features are passed in scaled form; labels are passed as plain arrays
    # (.values) because they are never scaled.
    X_train_win, y_train_win = create_windows(X_train_scaled, y_train.values)
    X_val_win, y_val_win = create_windows(X_val_scaled, y_val.values)
    X_test_win, y_test_win = create_windows(X_test_scaled, y_test.values)

    # --- Step 4: Handle Class Imbalance with SMOTE ---
    print("\n--- Step 4: Handling Class Imbalance with SMOTE ---")
    print(f"Class distribution before SMOTE: {np.bincount(y_train_win)}")

    # SMOTE works on 2D data, so we must reshape the windows first.
    # Reshape from (samples, timesteps, features) to (samples, timesteps * features)
    nsamples, nsteps, nfeatures = X_train_win.shape
    X_train_reshaped = X_train_win.reshape((nsamples, nsteps * nfeatures))

    smote = SMOTE(random_state=42)
    # Apply SMOTE ONLY to the training data
    X_train_smote, y_train_smote = smote.fit_resample(X_train_reshaped, y_train_win)

    print(f"Class distribution after SMOTE: {np.bincount(y_train_smote)}")

    # --- Step 5: Train the Model ---
    print("\n--- Step 5: Training a Random Forest Model ---")
    # Initialize the classifier.
    # class_weight is left at its default here because SMOTE has already
    # balanced the training classes; class_weight='balanced' would be the
    # alternative if SMOTE were skipped.
    model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)

    # Train the model on the balanced (SMOTE'd) data
    model.fit(X_train_smote, y_train_smote)
    print("Model training complete.")

    # --- Step 6: Evaluate the Model on the Unseen Test Set ---
    print("\n--- Step 6: Evaluating the Model on the Test Set ---")

    # Reshape the test data windows to match the model's expected input shape (2D)
    nsamples_test, nsteps_test, nfeatures_test = X_test_win.shape
    X_test_reshaped = X_test_win.reshape((nsamples_test, nsteps_test * nfeatures_test))

    # Make predictions on the test data
    y_pred = model.predict(X_test_reshaped)

    # Display the Confusion Matrix
    print("\nConfusion Matrix:")
    cm = confusion_matrix(y_test_win, y_pred)
    # Plotting the matrix for better visualization
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['Normal', 'Apnea'])
    disp.plot(cmap=plt.cm.Blues)
    plt.title("Confusion Matrix - Test Set")
    plt.show()

    print("\nClassification Report:")
    # Print the classification report with key metrics
    # Note: 'support' is the number of actual occurrences of each class in y_test_win
    print(classification_report(y_test_win, y_pred, target_names=['Normal (0)', 'Apnea (1)']))

    print("\n--- Interpretation of Results ---")
    print("Recall (Sensitivity) for 'Apnea (1)': This is the most important metric here.")
    print("It tells you: Of all the real apnea events in the test set, what percentage did your model successfully identify?")
    print("A high recall means you are missing fewer apnea events (low false negatives).")
    print("\nPrecision for 'Apnea (1)': This metric tells you:")
    print("Of all the events the model labeled as apnea, what percentage were actually apnea?")
    print("A high precision means the model isn't raising too many false alarms (low false positives).")

ImportError: cannot import name '_deprecate_Xt_in_inverse_transform' from 'sklearn.utils.deprecation' (c:\Users\solom\Documents\Github Projects\Evaluating-Noise-Reduction-Techniques\venv\Lib\site-packages\sklearn\utils\deprecation.py)