In [1]:
# Cell 1: Imports and Setup
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import joblib # For saving/loading models

from sklearn.model_selection import train_test_split, GroupKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier # Or import xgb.XGBClassifier if you prefer XGBoost
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score,
    confusion_matrix, classification_report, ConfusionMatrixDisplay, RocCurveDisplay, PrecisionRecallDisplay
)
from imblearn.over_sampling import SMOTE # For handling class imbalance
from imblearn.pipeline import Pipeline as ImbPipeline # Use imblearn's pipeline for SMOTE

# Set a random seed for reproducibility
np.random.seed(42)


In [None]:
# Cell 2: Load Data & Initial Inspection

# --- IMPORTANT: Replace 'your_apnea_dataset.csv' with the actual path to your data ---
data_path = '..\master_apnea_dataset (1).csv'
df = pd.read_csv(data_path)

# Display initial data info
print("--- Initial Data Information ---")
print(df.head())
print("\nData Info:")
df.info()
print("\nInitial Label Distribution:")
print(df['label'].value_counts())
print("\nInitial Label Distribution (Normalized):")
print(df['label'].value_counts(normalize=True))

FileNotFoundError: [Errno 2] No such file or directory: '..master_apnea_dataset (1).csv'

In [None]:
# Cell 3: Data Preprocessing & Cleaning

print("\n--- Data Preprocessing & Cleaning ---")

# Check for explicit missing values (NaNs)
print("Missing values before cleaning:\n", df.isnull().sum())

# Define features (X) and target (y)
# Exclude 'patient_id', 'frame_start', 'frame_end' and 'label' from features
features = [col for col in df.columns if col not in ['patient_id', 'frame_start', 'frame_end', 'label']]
X = df[features]
y = df['label']
patient_ids = df['patient_id'] # Keep patient_ids separate for splitting

# Identify potential 'zero-filled' rows that might indicate problematic silent segments
# Using a small epsilon for float comparison
epsilon = 1e-6
# Assuming 'energy', 'zcr', 'rms', 'bandwidth', 'rolloff' are typically non-zero for meaningful audio
# MFCCs can also be problematic if all zeros (or a very large negative number indicating silence)
key_audio_features_for_zero_check = ['energy', 'zcr', 'rms', 'bandwidth', 'rolloff'] + [f'mfcc_{i}' for i in range(1, 14)]

# Find rows where all key audio features are near zero (or an extremely small value)
problematic_zero_rows_indices = df[
    (df[key_audio_features_for_zero_check].abs() < epsilon).all(axis=1)
].index

if not problematic_zero_rows_indices.empty:
    print(f"\nIdentified {len(problematic_zero_rows_indices)} rows where all key audio features are near zero.")
    print("These rows might represent true silence or corrupted data. Deciding to remove them.")
    df_cleaned = df.drop(problematic_zero_rows_indices).reset_index(drop=True)

    # Update X, y, and patient_ids with the cleaned data
    X = df_cleaned[features]
    y = df_cleaned['label']
    patient_ids = df_cleaned['patient_id']
    print(f"Removed {len(problematic_zero_rows_indices)} rows. New data shape: {X.shape}")
else:
    df_cleaned = df.copy()
    print("\nNo rows found with all key audio features near zero. Proceeding with original data.")


# Check for infinite values (e.g., from division by zero or log of zero)
# Replace infinities with NaN, then drop rows with NaN. This is a robust way to handle them.
# Apply this to the numerical columns only.
print("\nChecking for infinite values...")
initial_shape = df_cleaned.shape
for col in X.select_dtypes(include=np.number).columns:
    if np.isinf(X[col]).any():
        print(f"  Found infinite values in column: {col}")
        X[col] = X[col].replace([np.inf, -np.inf], np.nan)

# Drop rows where NaNs were introduced by cleaning (or were already present)
initial_rows = X.shape[0]
X, y, patient_ids = X.dropna(), y[X.dropna().index], patient_ids[X.dropna().index] # Ensure y and patient_ids are also aligned

rows_after_nan_drop = X.shape[0]
if initial_rows != rows_after_nan_drop:
    print(f"Dropped {initial_rows - rows_after_nan_drop} rows due to NaN values (including those from infinite replacement).")

print("Data shape after all cleaning steps:", X.shape)
print("Label distribution after cleaning:\n", y.value_counts(normalize=True))



In [None]:
# Cell 4: Data Splitting (Patient-Wise)

print("\n--- Data Splitting (Patient-Wise) ---")

# Get unique patient IDs
unique_patient_ids = patient_ids.unique()

# Split patient IDs into training and testing sets
# Stratify by patient's overall label prevalence if possible, otherwise just random.
# For simplicity, we'll stratify based on the label for the *entire* patient_id group.
# A more robust stratification might consider the proportion of apnea events per patient.
# Here, we'll assign patients to train/test based on a random split.
train_patient_ids, test_patient_ids = train_test_split(
    unique_patient_ids,
    test_size=0.2, # 20% of patients for testing
    random_state=42
    # stratify= # Can't directly stratify patient IDs by their *frame* labels easily.
    # The current approach is to ensure a patient's data is either fully in train or fully in test.
)

# Filter the main DataFrame based on patient IDs
X_train = X[patient_ids.isin(train_patient_ids)].copy()
y_train = y[patient_ids.isin(train_patient_ids)].copy()
X_test = X[patient_ids.isin(test_patient_ids)].copy()
y_test = y[patient_ids.isin(test_patient_ids)].copy()

# Keep patient IDs aligned for GroupKFold in hyperparameter tuning
patient_ids_train = patient_ids[patient_ids.isin(train_patient_ids)].copy()

print(f"Total unique patients: {len(unique_patient_ids)}")
print(f"Patients in training set: {len(train_patient_ids)}")
print(f"Patients in test set: {len(test_patient_ids)}")
print(f"Train set shape: {X_train.shape}, Test set shape: {X_test.shape}")
print(f"Train label distribution:\n{y_train.value_counts(normalize=True)}")
print(f"Test label distribution:\n{y_test.value_counts(normalize=True)}")

In [None]:
# Cell 5: Model Definition and Pipeline Construction

print("\n--- Model Definition and Pipeline Construction ---")

# Define your classifier. RandomForestClassifier is a good choice for a start.
# For imbalanced datasets, 'class_weight="balanced"' can be an alternative to SMOTE,
# but SMOTE often works well by generating synthetic samples.
classifier = RandomForestClassifier(random_state=42, n_jobs=-1) # n_jobs=-1 uses all available cores

# Alternatively, if you want to use XGBoost (uncomment the lines below):
# import xgboost as xgb
# classifier = xgb.XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss')
# If using XGBoost without SMOTE, consider 'scale_pos_weight' for imbalance:
# classifier = xgb.XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss',
#                                scale_pos_weight=sum(y_train == 0) / sum(y_train == 1))


# Create an ImbPipeline to chain preprocessing (scaling, SMOTE) with the classifier.
# The StandardScaler ensures data is scaled BEFORE SMOTE creates synthetic samples.
# SMOTE is only applied to the training data *within* each cross-validation fold during fitting.
pipeline = ImbPipeline([
    ('scaler', StandardScaler()),        # Step 1: Standardize features
    ('smote', SMOTE(random_state=42)),   # Step 2: Handle imbalance via oversampling
    ('classifier', classifier)           # Step 3: The machine learning model
])

print("ML Pipeline created:")
print(pipeline)

In [None]:
# Cell 6: Hyperparameter Tuning

print("\n--- Hyperparameter Tuning (GridSearchCV with GroupKFold) ---")

# Define parameter grid for the Random Forest Classifier
param_grid = {
    'classifier__n_estimators': [100, 200], # Number of trees in the forest
    'classifier__max_depth': [10, 20, None], # Max depth of trees (None means unlimited)
    'classifier__min_samples_split': [2, 5], # Min number of samples required to split an internal node
    'smote__sampling_strategy': ['auto', 0.5, 0.75] # SMOTE's sampling strategy
}

# For XGBoost, an example param_grid might look like:
# param_grid = {
#     'classifier__n_estimators': [100, 200],
#     'classifier__max_depth': [3, 5],
#     'classifier__learning_rate': [0.05, 0.1],
#     'smote__sampling_strategy': ['auto', 0.5, 0.75]
# }


# Use GroupKFold for cross-validation to ensure patient data stays together
# We use patient_ids_train to define groups within the training set.
cv = GroupKFold(n_splits=3) # Use 3 or 5 splits for quicker initial tuning

# Perform Grid Search
# 'scoring' is crucial for imbalanced datasets; 'f1' or 'roc_auc' are good choices for binary classification.
grid_search = GridSearchCV(
    pipeline,
    param_grid,
    cv=cv,
    scoring='f1', # Optimize for F1-score, which balances precision and recall
    n_jobs=-1,    # Use all available cores
    verbose=2,    # Show progress
    error_score='raise' # Raise an error if any parameter combination fails
)

print(f"Starting Grid Search with {cv.n_splits} folds...")
grid_search.fit(X_train, y_train, groups=patient_ids_train) # Pass groups for GroupKFold

# Get the best model
best_model = grid_search.best_estimator_

print("\n--- Grid Search Results ---")
print(f"Best parameters found: {grid_search.best_params_}")
print(f"Best cross-validation F1-score: {grid_search.best_score_:.4f}")


--- Hyperparameter Tuning (GridSearchCV with GroupKFold) ---
Starting Grid Search with 3 folds...
Fitting 3 folds for each of 36 candidates, totalling 108 fits


In [None]:
# Cell 7: Model Evaluation

print("\n--- Model Evaluation on Test Set ---")

# Make predictions on the test set using the best model
y_pred = best_model.predict(X_test)
y_proba = best_model.predict_proba(X_test)[:, 1] # Probability of the positive class (apnea)

# Print classification metrics
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"Precision: {precision_score(y_test, y_pred):.4f}")
print(f"Recall: {recall_score(y_test, y_pred):.4f}")
print(f"F1-Score: {f1_score(y_test, y_pred):.4f}")
print(f"ROC AUC: {roc_auc_score(y_test, y_proba):.4f}")

# Print detailed classification report
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Plot Confusion Matrix
print("\nConfusion Matrix:")
fig, ax = plt.subplots(figsize=(6, 6))
cm_display = ConfusionMatrixDisplay.from_estimator(best_model, X_test, y_test, ax=ax, cmap='Blues')
plt.title("Confusion Matrix")
plt.show()

# Plot ROC Curve
print("\nROC Curve:")
fig, ax = plt.subplots(figsize=(6, 6))
roc_display = RocCurveDisplay.from_estimator(best_model, X_test, y_test, ax=ax)
plt.title("ROC Curve")
plt.show()

# Plot Precision-Recall Curve (often more informative for imbalanced data)
print("\nPrecision-Recall Curve:")
fig, ax = plt.subplots(figsize=(6, 6))
pr_display = PrecisionRecallDisplay.from_estimator(best_model, X_test, y_test, ax=ax)
plt.title("Precision-Recall Curve")
plt.show()

In [None]:
# Cell 8: Model Saving (Optional)

model_filename = 'apnea_detection_model.pkl'
try:
    joblib.dump(best_model, model_filename)
    print(f"\nModel saved successfully as '{model_filename}'")
except Exception as e:
    print(f"Error saving model: {e}")

# Example of how to load the model later:
# loaded_model = joblib.load(model_filename)
# print(f"Model loaded successfully from '{model_filename}'")
