# Apnea Event Classifier Training Notebook

This notebook guides you through training a machine learning model to detect apnea events from audio features.

In [37]:
# Cell 1: Imports and Setup
# All third-party dependencies used by the notebook are imported here so a fresh
# kernel can run the cells top to bottom.
# NOTE(review): the output recorded for this cell is an ImportError raised from
# inside sklearn.utils.fixes — presumably the installed scikit-learn and
# imbalanced-learn versions are incompatible; verify the environment before re-running.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import joblib # For saving/loading models

from sklearn.model_selection import train_test_split, GroupKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier # Or import xgb.XGBClassifier if you prefer XGBoost
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score,
    confusion_matrix, classification_report, ConfusionMatrixDisplay, RocCurveDisplay, PrecisionRecallDisplay
)
from imblearn.over_sampling import SMOTE # For handling class imbalance
from imblearn.pipeline import Pipeline as ImbPipeline # Use imblearn's pipeline for SMOTE

# Set a random seed for reproducibility.
# This seeds NumPy's global random generator; the patient split, SMOTE and the
# classifier in later cells are additionally given an explicit random_state=42.
np.random.seed(42)


ImportError: cannot import name 'line_search_wolfe1' from 'sklearn.utils.fixes' (c:\Users\solom\Documents\Evaluating-Noise-Reduction-Techniques\venv\Lib\site-packages\sklearn\utils\fixes.py)

In [None]:
# Cell 2: Load Data & Initial Inspection

# Path to the master dataset, relative to the notebook's working directory.
# Adjust this if the CSV lives somewhere else.
#
# Forward slashes are used on purpose: the previous literal '..\master...' only
# worked because '\m' is not a recognised escape sequence, which Python reports
# as an invalid-escape warning (and documents as a future SyntaxError).
# pandas accepts forward slashes on Windows as well as on POSIX systems.
data_path = '../master_apnea_dataset (1).csv'
df = pd.read_csv(data_path)

# Display initial data info: first rows, column dtypes / non-null counts,
# and the absolute and relative frequency of each label.
print("--- Initial Data Information ---")
print(df.head())
print("\nData Info:")
df.info()
print("\nInitial Label Distribution:")
print(df['label'].value_counts())
print("\nInitial Label Distribution (Normalized):")
print(df['label'].value_counts(normalize=True))

  df = pd.read_csv('..\master_apnea_dataset (1).csv')


In [None]:
# Cell 3: Data Preprocessing & Cleaning
#
# Produces, from the raw frame `df`:
#   df_cleaned  - `df` without rows whose key audio features are all ~0
#   features    - list of feature column names
#   X           - feature matrix (an independent copy, free of inf/NaN rows)
#   y           - labels aligned with X
#   patient_ids - patient identifiers aligned with X (used for patient-wise splits)

print("\n--- Data Preprocessing & Cleaning ---")

# Check for explicit missing values (NaNs)
print("Missing values before cleaning:\n", df.isnull().sum())

# Feature columns: everything except identifiers, frame boundaries and the target.
features = [col for col in df.columns if col not in ['patient_id', 'frame_start', 'frame_end', 'label']]

# --- Step 1: remove rows where every key audio feature is (near) zero ---------
# A small epsilon is used instead of an exact == 0 comparison on floats.
epsilon = 1e-6
# Columns inspected by the zero check: the scalar descriptors plus MFCC 1..13.
key_audio_features_for_zero_check = ['energy', 'zcr', 'rms', 'bandwidth', 'rolloff'] + [f'mfcc_{i}' for i in range(1, 14)]

# True for rows in which *all* inspected columns have absolute value < epsilon.
all_key_features_near_zero = (df[key_audio_features_for_zero_check].abs() < epsilon).all(axis=1)
problematic_zero_rows_indices = df.index[all_key_features_near_zero]

if not problematic_zero_rows_indices.empty:
    print(f"\nIdentified {len(problematic_zero_rows_indices)} rows where all key audio features are near zero.")
    print("These rows might represent true silence or corrupted data. Deciding to remove them.")
    df_cleaned = df.drop(problematic_zero_rows_indices).reset_index(drop=True)
    print(f"Removed {len(problematic_zero_rows_indices)} rows. New data shape: {df_cleaned[features].shape}")
else:
    df_cleaned = df.copy()
    print("\nNo rows found with all key audio features near zero. Proceeding with original data.")

# Derive X / y / patient_ids once, from whichever frame was produced above.
# X is an explicit copy so the in-place column replacement below cannot write
# into (or warn about writing into) a view of df_cleaned.
X = df_cleaned[features].copy()
y = df_cleaned['label']
patient_ids = df_cleaned['patient_id']  # Keep patient_ids separate for splitting

# --- Step 2: turn +/-inf into NaN so they are dropped together with real NaNs --
print("\nChecking for infinite values...")
initial_shape = df_cleaned.shape  # shape before the inf/NaN handling, kept for reference
numeric_columns = X.select_dtypes(include=np.number).columns
columns_with_inf = [col for col in numeric_columns if np.isinf(X[col]).any()]
for col in columns_with_inf:
    print(f"  Found infinite values in column: {col}")
if columns_with_inf:
    X[columns_with_inf] = X[columns_with_inf].replace([np.inf, -np.inf], np.nan)

# --- Step 3: drop rows with NaN features and keep y / patient_ids aligned ------
initial_rows = X.shape[0]
X = X.dropna()                           # computed once; its index drives the alignment
y = y.loc[X.index]
patient_ids = patient_ids.loc[X.index]

# Guard against silent misalignment between features, labels and groups.
assert X.index.equals(y.index) and X.index.equals(patient_ids.index), \
    "X, y and patient_ids must share the same index after cleaning"

rows_after_nan_drop = X.shape[0]
if initial_rows != rows_after_nan_drop:
    print(f"Dropped {initial_rows - rows_after_nan_drop} rows due to NaN values (including those from infinite replacement).")

print("Data shape after all cleaning steps:", X.shape)
print("Label distribution after cleaning:\n", y.value_counts(normalize=True))



Kept 306,140 frames after removing silent ones.


In [None]:
# Cell 4: Data Splitting (Patient-Wise)

print("\n--- Data Splitting (Patient-Wise) ---")

# The split is made over patients, not frames: every frame of a given patient
# ends up either in the training set or in the test set, never in both.
unique_patient_ids = patient_ids.unique()

# Plain random 80/20 split of the patient IDs (no stratification: frame-level
# labels cannot be used directly to stratify patient-level IDs).
train_patient_ids, test_patient_ids = train_test_split(
    unique_patient_ids,
    test_size=0.2,
    random_state=42,
)

# Row-level membership masks, computed once and reused for X, y and the groups.
in_train = patient_ids.isin(train_patient_ids)
in_test = patient_ids.isin(test_patient_ids)

X_train, y_train = X[in_train].copy(), y[in_train].copy()
X_test, y_test = X[in_test].copy(), y[in_test].copy()

# Group labels for the training rows, consumed by GroupKFold during tuning.
patient_ids_train = patient_ids[in_train].copy()

print(f"Total unique patients: {len(unique_patient_ids)}")
print(f"Patients in training set: {len(train_patient_ids)}")
print(f"Patients in test set: {len(test_patient_ids)}")
print(f"Train set shape: {X_train.shape}, Test set shape: {X_test.shape}")
print(f"Train label distribution:\n{y_train.value_counts(normalize=True)}")
print(f"Test label distribution:\n{y_test.value_counts(normalize=True)}")

In [None]:
# Cell 5: Model Definition and Pipeline Construction

print("\n--- Model Definition and Pipeline Construction ---")

# Base estimator: a random forest using every available core (n_jobs=-1).
# Class imbalance is addressed by SMOTE in the pipeline below; setting
# class_weight="balanced" on the forest would be an alternative approach.
classifier = RandomForestClassifier(n_jobs=-1, random_state=42)

# XGBoost alternative (uncomment to use instead of the random forest):
# import xgboost as xgb
# classifier = xgb.XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss')
# Without SMOTE, 'scale_pos_weight' can compensate for the imbalance:
# classifier = xgb.XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss',
#                                scale_pos_weight=sum(y_train == 0) / sum(y_train == 1))

# Ordered pipeline stages. Scaling comes first so SMOTE interpolates in the
# standardized feature space; the imblearn pipeline applies the resampling
# step while fitting only, so held-out folds are never oversampled.
pipeline_steps = [
    ('scaler', StandardScaler()),        # 1) standardize features
    ('smote', SMOTE(random_state=42)),   # 2) oversample the minority class
    ('classifier', classifier),          # 3) final estimator
]
pipeline = ImbPipeline(steps=pipeline_steps)

print("ML Pipeline created:")
print(pipeline)

In [None]:
# Cell 6: Hyperparameter Tuning

print("\n--- Hyperparameter Tuning (GridSearchCV with GroupKFold) ---")

# Search space. Keys use the '<pipeline step>__<parameter>' convention so the
# values are routed to the matching step of `pipeline`.
param_grid = {
    'classifier__n_estimators': [100, 200],          # trees in the forest
    'classifier__max_depth': [10, 20, None],         # None = grow until leaves are pure
    'classifier__min_samples_split': [2, 5],         # min samples needed to split a node
    'smote__sampling_strategy': ['auto', 0.5, 0.75]  # target minority/majority ratio after resampling
}

# Example search space when the classifier step is XGBoost:
# param_grid = {
#     'classifier__n_estimators': [100, 200],
#     'classifier__max_depth': [3, 5],
#     'classifier__learning_rate': [0.05, 0.1],
#     'smote__sampling_strategy': ['auto', 0.5, 0.75]
# }

# Group-aware folds: all frames of one patient stay in the same fold.
# Three folds keep the initial tuning run short.
cv = GroupKFold(n_splits=3)

# Exhaustive search over param_grid, ranked by F1 (balances precision and
# recall on the imbalanced labels). A failing combination aborts the search
# instead of being scored silently.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='f1',
    cv=cv,
    n_jobs=-1,
    verbose=2,
    error_score='raise',
)

print(f"Starting Grid Search with {cv.n_splits} folds...")
# `groups` carries the per-row patient IDs that GroupKFold splits on.
grid_search.fit(X_train, y_train, groups=patient_ids_train)

# Pipeline refit on the whole training set with the winning parameters.
best_model = grid_search.best_estimator_

print("\n--- Grid Search Results ---")
print(f"Best parameters found: {grid_search.best_params_}")
print(f"Best cross-validation F1-score: {grid_search.best_score_:.4f}")

[I 2025-07-22 00:23:14,763] A new study created in memory with name: no-name-b4d63349-4189-48bf-a256-fc96b047763f
Best trial: 0. Best value: 0.293928:   2%|▎         | 1/40 [00:40<26:17, 40.46s/it]

[I 2025-07-22 00:23:55,219] Trial 0 finished with value: 0.29392843574622235 and parameters: {'scale_pos_weight': 37.316043526313265, 'max_depth': 5, 'eta': 0.11345812640212863, 'subsample': 0.8865984302649417, 'colsample_bytree': 0.5242531138721043, 'min_child_weight': 2}. Best is trial 0 with value: 0.29392843574622235.


Best trial: 0. Best value: 0.293928:   5%|▌         | 2/40 [01:45<34:57, 55.20s/it]

[I 2025-07-22 00:25:00,727] Trial 1 finished with value: 0.28871640775818014 and parameters: {'scale_pos_weight': 47.6579369823112, 'max_depth': 6, 'eta': 0.019663366179587125, 'subsample': 0.7971145374669981, 'colsample_bytree': 0.9878839848656579, 'min_child_weight': 8}. Best is trial 0 with value: 0.29392843574622235.


Best trial: 0. Best value: 0.293928:   8%|▊         | 3/40 [02:23<28:59, 47.02s/it]

[I 2025-07-22 00:25:38,031] Trial 2 finished with value: 0.2934980484626551 and parameters: {'scale_pos_weight': 41.41799042187072, 'max_depth': 3, 'eta': 0.07129260599476135, 'subsample': 0.9703980909164757, 'colsample_bytree': 0.6454468366752837, 'min_child_weight': 7}. Best is trial 0 with value: 0.29392843574622235.


Best trial: 0. Best value: 0.293928:  10%|█         | 4/40 [03:09<27:55, 46.54s/it]

[I 2025-07-22 00:26:23,841] Trial 3 finished with value: 0.28962419913120263 and parameters: {'scale_pos_weight': 25.513499688879033, 'max_depth': 5, 'eta': 0.10747710809720526, 'subsample': 0.9001281906944782, 'colsample_bytree': 0.6395379476415537, 'min_child_weight': 7}. Best is trial 0 with value: 0.29392843574622235.


Best trial: 0. Best value: 0.293928:  12%|█▎        | 5/40 [06:26<58:58, 101.10s/it]

[I 2025-07-22 00:29:41,680] Trial 4 finished with value: 0.2842070118625289 and parameters: {'scale_pos_weight': 43.20817685268065, 'max_depth': 10, 'eta': 0.039382778433431465, 'subsample': 0.931002887806756, 'colsample_bytree': 0.7252052403590041, 'min_child_weight': 3}. Best is trial 0 with value: 0.29392843574622235.


Best trial: 0. Best value: 0.293928:  15%|█▌        | 6/40 [07:37<51:20, 90.59s/it] 

[I 2025-07-22 00:30:51,866] Trial 5 finished with value: 0.28504932608885397 and parameters: {'scale_pos_weight': 20.957247363524345, 'max_depth': 9, 'eta': 0.04140724911125586, 'subsample': 0.6824311326214038, 'colsample_bytree': 0.9242313396760727, 'min_child_weight': 1}. Best is trial 0 with value: 0.29392843574622235.


Best trial: 6. Best value: 0.298913:  18%|█▊        | 7/40 [08:11<39:39, 72.09s/it]

[I 2025-07-22 00:31:25,881] Trial 6 finished with value: 0.29891268809853677 and parameters: {'scale_pos_weight': 48.81425369074565, 'max_depth': 3, 'eta': 0.1675126773903939, 'subsample': 0.5437569466354994, 'colsample_bytree': 0.936693049263768, 'min_child_weight': 7}. Best is trial 6 with value: 0.29891268809853677.


Best trial: 6. Best value: 0.298913:  20%|██        | 8/40 [09:40<41:26, 77.69s/it]

[I 2025-07-22 00:32:55,555] Trial 7 finished with value: 0.27808606431199867 and parameters: {'scale_pos_weight': 48.376470065364366, 'max_depth': 8, 'eta': 0.20468447568762757, 'subsample': 0.7598972190844977, 'colsample_bytree': 0.620445864706811, 'min_child_weight': 2}. Best is trial 6 with value: 0.29891268809853677.


Best trial: 6. Best value: 0.298913:  22%|██▎       | 9/40 [10:31<35:46, 69.24s/it]

[I 2025-07-22 00:33:46,199] Trial 8 finished with value: 0.290550837901102 and parameters: {'scale_pos_weight': 20.93062400000148, 'max_depth': 7, 'eta': 0.03235419737886406, 'subsample': 0.8517514802230737, 'colsample_bytree': 0.6898898923675932, 'min_child_weight': 4}. Best is trial 6 with value: 0.29891268809853677.


Best trial: 6. Best value: 0.298913:  25%|██▌       | 10/40 [12:00<37:37, 75.27s/it]

[I 2025-07-22 00:35:14,973] Trial 9 finished with value: 0.2571869299904314 and parameters: {'scale_pos_weight': 30.205720265248928, 'max_depth': 10, 'eta': 0.27867289966335274, 'subsample': 0.99861303904932, 'colsample_bytree': 0.6407136040769723, 'min_child_weight': 10}. Best is trial 6 with value: 0.29891268809853677.


Best trial: 6. Best value: 0.298913:  28%|██▊       | 11/40 [12:59<34:00, 70.36s/it]

[I 2025-07-22 00:36:14,194] Trial 10 finished with value: 0.23430614111038356 and parameters: {'scale_pos_weight': 4.6191618049101955, 'max_depth': 3, 'eta': 0.013483331261453341, 'subsample': 0.5077540622098804, 'colsample_bytree': 0.8491279326528898, 'min_child_weight': 5}. Best is trial 6 with value: 0.29891268809853677.


Best trial: 6. Best value: 0.298913:  30%|███       | 12/40 [13:28<27:00, 57.87s/it]

[I 2025-07-22 00:36:43,506] Trial 11 finished with value: 0.29631383598534045 and parameters: {'scale_pos_weight': 35.43981686676376, 'max_depth': 4, 'eta': 0.13181009642401656, 'subsample': 0.6322374841838233, 'colsample_bytree': 0.5165192444269078, 'min_child_weight': 6}. Best is trial 6 with value: 0.29891268809853677.


Best trial: 6. Best value: 0.298913:  32%|███▎      | 13/40 [13:58<22:11, 49.30s/it]

[I 2025-07-22 00:37:13,096] Trial 12 finished with value: 0.29426658714826237 and parameters: {'scale_pos_weight': 34.2829180133661, 'max_depth': 4, 'eta': 0.15479022701297399, 'subsample': 0.595677182850372, 'colsample_bytree': 0.8218452669963616, 'min_child_weight': 6}. Best is trial 6 with value: 0.29891268809853677.


Best trial: 6. Best value: 0.298913:  35%|███▌      | 14/40 [14:32<19:19, 44.59s/it]

[I 2025-07-22 00:37:46,782] Trial 13 finished with value: 0.24035039333525457 and parameters: {'scale_pos_weight': 4.252995243111339, 'max_depth': 4, 'eta': 0.0792958559140901, 'subsample': 0.6360525674402145, 'colsample_bytree': 0.5111939197925697, 'min_child_weight': 9}. Best is trial 6 with value: 0.29891268809853677.


Best trial: 14. Best value: 0.299192:  38%|███▊      | 15/40 [15:21<19:15, 46.21s/it]

[I 2025-07-22 00:38:36,768] Trial 14 finished with value: 0.2991920678544472 and parameters: {'scale_pos_weight': 49.85534140389504, 'max_depth': 3, 'eta': 0.2783359128178407, 'subsample': 0.5435846143131988, 'colsample_bytree': 0.8089374562689096, 'min_child_weight': 5}. Best is trial 14 with value: 0.2991920678544472.


Best trial: 15. Best value: 0.299647:  40%|████      | 16/40 [15:58<17:18, 43.28s/it]

[I 2025-07-22 00:39:13,241] Trial 15 finished with value: 0.299646823197514 and parameters: {'scale_pos_weight': 48.14797251749909, 'max_depth': 3, 'eta': 0.25734597864960634, 'subsample': 0.5072986188820164, 'colsample_bytree': 0.8273830171588957, 'min_child_weight': 4}. Best is trial 15 with value: 0.299646823197514.


Best trial: 15. Best value: 0.299647:  42%|████▎     | 17/40 [16:58<18:31, 48.31s/it]

[I 2025-07-22 00:40:13,224] Trial 16 finished with value: 0.26527748309514204 and parameters: {'scale_pos_weight': 11.829358500049295, 'max_depth': 6, 'eta': 0.24601650014029658, 'subsample': 0.5581531887845101, 'colsample_bytree': 0.8172309922866862, 'min_child_weight': 4}. Best is trial 15 with value: 0.299646823197514.


Best trial: 15. Best value: 0.299647:  45%|████▌     | 18/40 [17:59<19:08, 52.20s/it]

[I 2025-07-22 00:41:14,492] Trial 17 finished with value: 0.28411539660608776 and parameters: {'scale_pos_weight': 43.484881406218356, 'max_depth': 5, 'eta': 0.29861243769254037, 'subsample': 0.6925948098190836, 'colsample_bytree': 0.771566552042394, 'min_child_weight': 4}. Best is trial 15 with value: 0.299646823197514.


Best trial: 15. Best value: 0.299647:  48%|████▊     | 19/40 [19:43<23:41, 67.71s/it]

[I 2025-07-22 00:42:58,334] Trial 18 finished with value: 0.2840827673407183 and parameters: {'scale_pos_weight': 39.565778881976435, 'max_depth': 7, 'eta': 0.2017159276082497, 'subsample': 0.5086236496313316, 'colsample_bytree': 0.8829045933236586, 'min_child_weight': 3}. Best is trial 15 with value: 0.299646823197514.


Best trial: 15. Best value: 0.299647:  50%|█████     | 20/40 [20:18<19:16, 57.84s/it]

[I 2025-07-22 00:43:33,167] Trial 19 finished with value: 0.29871892549850576 and parameters: {'scale_pos_weight': 28.66890462141376, 'max_depth': 3, 'eta': 0.08013297912714991, 'subsample': 0.601002158944683, 'colsample_bytree': 0.7867696488350012, 'min_child_weight': 5}. Best is trial 15 with value: 0.299646823197514.


Best trial: 15. Best value: 0.299647:  52%|█████▎    | 21/40 [20:51<15:55, 50.31s/it]

[I 2025-07-22 00:44:05,911] Trial 20 finished with value: 0.29004766153389105 and parameters: {'scale_pos_weight': 49.94114727797212, 'max_depth': 4, 'eta': 0.0260211527669267, 'subsample': 0.697600144383237, 'colsample_bytree': 0.8825328826297223, 'min_child_weight': 3}. Best is trial 15 with value: 0.299646823197514.


Best trial: 15. Best value: 0.299647:  55%|█████▌    | 22/40 [21:22<13:23, 44.65s/it]

[I 2025-07-22 00:44:37,368] Trial 21 finished with value: 0.2991909187774192 and parameters: {'scale_pos_weight': 44.682611268074744, 'max_depth': 3, 'eta': 0.16537870613012753, 'subsample': 0.5632678964733728, 'colsample_bytree': 0.9671717105199087, 'min_child_weight': 7}. Best is trial 15 with value: 0.299646823197514.


Best trial: 22. Best value: 0.301175:  57%|█████▊    | 23/40 [22:07<12:38, 44.63s/it]

[I 2025-07-22 00:45:21,964] Trial 22 finished with value: 0.3011752291012574 and parameters: {'scale_pos_weight': 44.57868314849694, 'max_depth': 3, 'eta': 0.19521292025116885, 'subsample': 0.5488350749975534, 'colsample_bytree': 0.9845107513730289, 'min_child_weight': 5}. Best is trial 22 with value: 0.3011752291012574.


Best trial: 22. Best value: 0.301175:  60%|██████    | 24/40 [23:03<12:49, 48.09s/it]

[I 2025-07-22 00:46:18,128] Trial 23 finished with value: 0.2968131644666836 and parameters: {'scale_pos_weight': 44.40479608572019, 'max_depth': 4, 'eta': 0.21386798705501306, 'subsample': 0.5016942144682524, 'colsample_bytree': 0.7309272406419908, 'min_child_weight': 5}. Best is trial 22 with value: 0.3011752291012574.


Best trial: 22. Best value: 0.301175:  62%|██████▎   | 25/40 [24:19<14:07, 56.47s/it]

[I 2025-07-22 00:47:34,149] Trial 24 finished with value: 0.2871936299226331 and parameters: {'scale_pos_weight': 40.26382424018128, 'max_depth': 5, 'eta': 0.2702009671288433, 'subsample': 0.5975782460875507, 'colsample_bytree': 0.8998604772176367, 'min_child_weight': 4}. Best is trial 22 with value: 0.3011752291012574.


Best trial: 22. Best value: 0.301175:  65%|██████▌   | 26/40 [24:55<11:44, 50.34s/it]

[I 2025-07-22 00:48:10,177] Trial 25 finished with value: 0.2996441554758285 and parameters: {'scale_pos_weight': 32.44502724569912, 'max_depth': 3, 'eta': 0.09838952514321327, 'subsample': 0.5448205272018889, 'colsample_bytree': 0.9974305995557149, 'min_child_weight': 6}. Best is trial 22 with value: 0.3011752291012574.


Best trial: 22. Best value: 0.301175:  68%|██████▊   | 27/40 [25:37<10:22, 47.90s/it]

[I 2025-07-22 00:48:52,402] Trial 26 finished with value: 0.2949727118277109 and parameters: {'scale_pos_weight': 32.716171940295766, 'max_depth': 4, 'eta': 0.058834305929473564, 'subsample': 0.64336216573727, 'colsample_bytree': 0.9931262224096704, 'min_child_weight': 6}. Best is trial 22 with value: 0.3011752291012574.


Best trial: 22. Best value: 0.301175:  70%|███████   | 28/40 [26:11<08:43, 43.66s/it]

[I 2025-07-22 00:49:26,170] Trial 27 finished with value: 0.29431114915405865 and parameters: {'scale_pos_weight': 19.65946109260358, 'max_depth': 3, 'eta': 0.10141453576028396, 'subsample': 0.5726655574111773, 'colsample_bytree': 0.9510152980055648, 'min_child_weight': 8}. Best is trial 22 with value: 0.3011752291012574.


Best trial: 22. Best value: 0.301175:  72%|███████▎  | 29/40 [28:20<12:41, 69.21s/it]

[I 2025-07-22 00:51:34,981] Trial 28 finished with value: 0.28662910131407254 and parameters: {'scale_pos_weight': 36.768496975220216, 'max_depth': 6, 'eta': 0.14512314167452925, 'subsample': 0.523045343989151, 'colsample_bytree': 0.9148622700670116, 'min_child_weight': 6}. Best is trial 22 with value: 0.3011752291012574.


Best trial: 22. Best value: 0.301175:  75%|███████▌  | 30/40 [28:46<09:24, 56.42s/it]

[I 2025-07-22 00:52:01,577] Trial 29 finished with value: 0.29476735345418187 and parameters: {'scale_pos_weight': 37.82052576502719, 'max_depth': 5, 'eta': 0.11473866531210569, 'subsample': 0.6636918779253974, 'colsample_bytree': 0.5732245653230701, 'min_child_weight': 2}. Best is trial 22 with value: 0.3011752291012574.


Best trial: 22. Best value: 0.301175:  78%|███████▊  | 31/40 [29:56<09:04, 60.54s/it]

[I 2025-07-22 00:53:11,708] Trial 30 finished with value: 0.2900006780615845 and parameters: {'scale_pos_weight': 30.91086334812799, 'max_depth': 5, 'eta': 0.09226629776246963, 'subsample': 0.7148387248652666, 'colsample_bytree': 0.8602749973045593, 'min_child_weight': 1}. Best is trial 22 with value: 0.3011752291012574.


Best trial: 22. Best value: 0.301175:  80%|████████  | 32/40 [30:41<07:25, 55.64s/it]

[I 2025-07-22 00:53:55,918] Trial 31 finished with value: 0.29858959863042656 and parameters: {'scale_pos_weight': 45.58791904767853, 'max_depth': 3, 'eta': 0.20186186782026005, 'subsample': 0.5460288243787728, 'colsample_bytree': 0.8104418792864693, 'min_child_weight': 5}. Best is trial 22 with value: 0.3011752291012574.


Best trial: 22. Best value: 0.301175:  82%|████████▎ | 33/40 [31:07<05:27, 46.80s/it]

[I 2025-07-22 00:54:22,082] Trial 32 finished with value: 0.29918091412216474 and parameters: {'scale_pos_weight': 47.243887079622404, 'max_depth': 3, 'eta': 0.19099356512903543, 'subsample': 0.5819320004338889, 'colsample_bytree': 0.9992705772464742, 'min_child_weight': 5}. Best is trial 22 with value: 0.3011752291012574.


Best trial: 22. Best value: 0.301175:  85%|████████▌ | 34/40 [31:32<04:01, 40.28s/it]

[I 2025-07-22 00:54:47,152] Trial 33 finished with value: 0.2942248594105611 and parameters: {'scale_pos_weight': 40.99482357101445, 'max_depth': 4, 'eta': 0.23969137947547117, 'subsample': 0.537095434177936, 'colsample_bytree': 0.9564168888450519, 'min_child_weight': 4}. Best is trial 22 with value: 0.3011752291012574.


Best trial: 22. Best value: 0.301175:  88%|████████▊ | 35/40 [32:00<03:02, 36.52s/it]

[I 2025-07-22 00:55:14,908] Trial 34 finished with value: 0.2965807772199296 and parameters: {'scale_pos_weight': 46.35989179388703, 'max_depth': 3, 'eta': 0.12368497988631018, 'subsample': 0.8052618407842497, 'colsample_bytree': 0.6914581829104212, 'min_child_weight': 8}. Best is trial 22 with value: 0.3011752291012574.


Best trial: 22. Best value: 0.301175:  90%|█████████ | 36/40 [32:40<02:31, 37.77s/it]

[I 2025-07-22 00:55:55,596] Trial 35 finished with value: 0.2915502405720267 and parameters: {'scale_pos_weight': 26.274887571052783, 'max_depth': 4, 'eta': 0.23075389317285638, 'subsample': 0.6168343014503747, 'colsample_bytree': 0.8498661426121848, 'min_child_weight': 6}. Best is trial 22 with value: 0.3011752291012574.


Best trial: 22. Best value: 0.301175:  92%|█████████▎| 37/40 [33:07<01:43, 34.43s/it]

[I 2025-07-22 00:56:22,217] Trial 36 finished with value: 0.29428418898195796 and parameters: {'scale_pos_weight': 39.330318323443954, 'max_depth': 3, 'eta': 0.0582966264350371, 'subsample': 0.5335890792467257, 'colsample_bytree': 0.9675648691228884, 'min_child_weight': 3}. Best is trial 22 with value: 0.3011752291012574.


Best trial: 22. Best value: 0.301175:  95%|█████████▌| 38/40 [34:51<01:50, 55.18s/it]

[I 2025-07-22 00:58:05,828] Trial 37 finished with value: 0.2762595553187789 and parameters: {'scale_pos_weight': 42.24709865718262, 'max_depth': 9, 'eta': 0.17344495128449441, 'subsample': 0.5808240592109569, 'colsample_bytree': 0.7805197882152313, 'min_child_weight': 5}. Best is trial 22 with value: 0.3011752291012574.


Best trial: 22. Best value: 0.301175:  98%|█████████▊| 39/40 [35:15<00:46, 46.07s/it]

[I 2025-07-22 00:58:30,637] Trial 38 finished with value: 0.2829158910170808 and parameters: {'scale_pos_weight': 18.178006166193654, 'max_depth': 3, 'eta': 0.29746596222636523, 'subsample': 0.7513749735914217, 'colsample_bytree': 0.9274503269680552, 'min_child_weight': 7}. Best is trial 22 with value: 0.3011752291012574.


Best trial: 22. Best value: 0.301175: 100%|██████████| 40/40 [35:59<00:00, 53.99s/it]

[I 2025-07-22 00:59:14,291] Trial 39 finished with value: 0.2939673689698681 and parameters: {'scale_pos_weight': 49.82492906211808, 'max_depth': 4, 'eta': 0.04576358714677485, 'subsample': 0.661907028730905, 'colsample_bytree': 0.5677429357675111, 'min_child_weight': 4}. Best is trial 22 with value: 0.3011752291012574.
Best XGB params: {'scale_pos_weight': 44.57868314849694, 'max_depth': 3, 'eta': 0.19521292025116885, 'subsample': 0.5488350749975534, 'colsample_bytree': 0.9845107513730289, 'min_child_weight': 5}





In [None]:
# Cell 7: Model Evaluation
#
# Scores the tuned pipeline on the held-out patients. Predictions are computed
# once and reused for every metric and plot, so the figures are guaranteed to
# describe exactly the same predictions as the printed numbers (the previous
# from_estimator calls re-ran the whole pipeline on X_test for each plot).

print("\n--- Model Evaluation on Test Set ---")

# Hard predictions and the probability of the second class in best_model.classes_
# (treated as the positive / apnea class).
y_pred = best_model.predict(X_test)
y_proba = best_model.predict_proba(X_test)[:, 1]

# Print classification metrics
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"Precision: {precision_score(y_test, y_pred):.4f}")
print(f"Recall: {recall_score(y_test, y_pred):.4f}")
print(f"F1-Score: {f1_score(y_test, y_pred):.4f}")
print(f"ROC AUC: {roc_auc_score(y_test, y_proba):.4f}")

# Print detailed classification report
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Keep the class order, positive label and legend name that from_estimator
# would have derived from the fitted model.
class_labels = best_model.classes_
positive_label = class_labels[1]      # matches the [:, 1] probability column above
curve_name = type(best_model).__name__

# Plot Confusion Matrix
print("\nConfusion Matrix:")
fig, ax = plt.subplots(figsize=(6, 6))
cm_display = ConfusionMatrixDisplay.from_predictions(
    y_test, y_pred, labels=class_labels, ax=ax, cmap='Blues'
)
ax.set_title("Confusion Matrix")
plt.show()

# Plot ROC Curve
print("\nROC Curve:")
fig, ax = plt.subplots(figsize=(6, 6))
roc_display = RocCurveDisplay.from_predictions(
    y_test, y_proba, pos_label=positive_label, name=curve_name, ax=ax
)
ax.set_title("ROC Curve")
plt.show()

# Plot Precision-Recall Curve (often more informative for imbalanced data)
print("\nPrecision-Recall Curve:")
fig, ax = plt.subplots(figsize=(6, 6))
pr_display = PrecisionRecallDisplay.from_predictions(
    y_test, y_proba, pos_label=positive_label, name=curve_name, ax=ax
)
ax.set_title("Precision-Recall Curve")
plt.show()

In [None]:
# Cell 8: Model Saving (Optional)

# Target file for the serialized pipeline, relative to the working directory.
model_filename = 'apnea_detection_model.pkl'

# Saving is best-effort: a failure is reported but does not stop the notebook.
try:
    joblib.dump(best_model, model_filename)
    print(f"\nModel saved successfully as '{model_filename}'")
except Exception as save_error:
    print(f"Error saving model: {save_error}")

# To restore the model in a later session (only load files you trust —
# joblib files are pickle-based):
# loaded_model = joblib.load(model_filename)
# print(f"Model loaded successfully from '{model_filename}'")


ZeroDivisionError: float division by zero