In [4]:
# Cell 1: Imports
import os
import pandas as pd
import numpy as np
import joblib
from tqdm.notebook import tqdm

# --- Core ML Libraries ---
from sklearn.model_selection import GridSearchCV, GroupKFold, ParameterGrid, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# --- Models to Compare ---
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import lightgbm as lgb # LightGBM for Gradient Boosting

# --- Metrics ---
from sklearn.metrics import classification_report, f1_score, confusion_matrix, ConfusionMatrixDisplay

print("All libraries imported successfully.")

All libraries imported successfully.


In [5]:
# Cell 2: Load Final Dataset

# --- IMPORTANT: Update this path if your dataset has a different name ---
DATASET_PATH = "..\poc_dataset_2.csv"

if os.path.exists(DATASET_PATH):
    df = pd.read_csv(DATASET_PATH)
    print(f"Successfully loaded '{DATASET_PATH}' with {len(df)} frames from {len(df['patient_id'].unique())} patients.")
else:
    print(f"FATAL ERROR: Dataset '{DATASET_PATH}' not found. Please create it first.")
    df = pd.DataFrame() # Create empty dataframe to prevent further errors

if not df.empty:
  # Prepare data for the models
  X = df.drop(columns=['patient_id', 'label'])
  y = df['label']
  groups = df['patient_id'] # This is crucial for GroupKFold

  # Display data info as a final check
  print("\nLabel Distribution:")
  print(y.value_counts(normalize=True))

  DATASET_PATH = "..\poc_dataset_2.csv"


Successfully loaded '..\poc_dataset_2.csv' with 342722 frames from 20 patients.

Label Distribution:
label
0    0.743935
1    0.256065
Name: proportion, dtype: float64


In [6]:
# Cell 3: Setup for Training and Evaluation

# --- 1. Define the Cross-Validation Strategy ---
# We use GroupKFold to ensure that data from the same patient is never in both
# the training and validation fold within the grid search. This gives a realistic performance estimate.
cv_strategy = GroupKFold(n_splits=5) # 5-fold cross-validation is a robust standard


# --- 2. Define the Models and their Hyperparameter Grids for Tuning ---

# We will store our models and their settings in a dictionary for easy iteration
models_to_train = {}

# Model A: Logistic Regression (Fast Baseline)
# It's a simple linear model, so tuning is minimal.
lr_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', LogisticRegression(random_state=42, class_weight='balanced', max_iter=1000, solver='liblinear'))
])
lr_param_grid = {
    'classifier__C': [0.1, 1.0, 10.0] # Regularization strength
}
models_to_train['LogisticRegression'] = (lr_pipeline, lr_param_grid)


# Model B: Random Forest (Robust Workhorse)
rf_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', RandomForestClassifier(random_state=42, class_weight='balanced', n_jobs=-1))
])
rf_param_grid = {
    'classifier__n_estimators': [150, 250],
    'classifier__max_depth': [15, 25, None],
    'classifier__min_samples_leaf': [1, 3]
}
models_to_train['RandomForest'] = (rf_pipeline, rf_param_grid)


# Model C: LightGBM (Fast & Powerful Gradient Boosting)
lgbm_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', lgb.LGBMClassifier(random_state=42, objective='binary'))
])
# For imbalanced data, LightGBM often works well by adjusting 'scale_pos_weight'
pos_weight = y.value_counts()[0] / y.value_counts()[1]
lgbm_param_grid = {
    'classifier__n_estimators': [150, 250],
    'classifier__learning_rate': [0.05, 0.1],
    'classifier__num_leaves': [31, 50],
    'classifier__scale_pos_weight': [pos_weight] # Key parameter for imbalance
}
models_to_train['LightGBM'] = (lgbm_pipeline, lgbm_param_grid)

print(f"Setup complete. Will train and evaluate {len(models_to_train)} models:")
for name in models_to_train:
    print(f"- {name}")

Setup complete. Will train and evaluate 3 models:
- LogisticRegression
- RandomForest
- LightGBM


In [7]:
# Cell 4: The Training Loop

# This dictionary will hold the results for each model
model_results = {}

# Loop through each model defined in Cell 3
for model_name, (pipeline, param_grid) in models_to_train.items():
    
    print(f"\n--- Training Model: {model_name} ---")
    
    # Set up GridSearchCV for the current model
    grid_search = GridSearchCV(
        pipeline,
        param_grid,
        cv=cv_strategy,
        scoring='f1_weighted', # Use weighted F1-score for overall performance
        n_jobs=-1,
        verbose=0 # Using TQDM for progress bar
    )
    
    # Calculate total fits for the progress bar
    num_fits = len(ParameterGrid(param_grid)) * cv_strategy.get_n_splits(X, y, groups)
    print(f"Performing {num_fits} fits for {model_name}...")

    # Run the grid search with a progress bar
    with tqdm(total=num_fits) as pbar:
        with joblib.parallel_backend('threading'):
            grid_search.fit(X, y, groups=groups)

    # Store the best score and best parameters found
    model_results[model_name] = {
        'best_score': grid_search.best_score_,
        'best_params': grid_search.best_params_,
        'best_estimator': grid_search.best_estimator_ # Save the trained best model
    }
    
    print(f"Finished training {model_name}. Best F1-Score (weighted): {grid_search.best_score_:.4f}")

print("\n--- All models have been trained. ---")


--- Training Model: LogisticRegression ---
Performing 15 fits for LogisticRegression...


  0%|          | 0/15 [00:00<?, ?it/s]



Finished training LogisticRegression. Best F1-Score (weighted): 0.5108

--- Training Model: RandomForest ---
Performing 60 fits for RandomForest...


  0%|          | 0/60 [00:00<?, ?it/s]

Finished training RandomForest. Best F1-Score (weighted): 0.6438

--- Training Model: LightGBM ---
Performing 40 fits for LightGBM...


  0%|          | 0/40 [00:00<?, ?it/s]



[LightGBM] [Info] Number of positive: 87759, number of negative: 254963
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.041428 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4080
[LightGBM] [Info] Number of data points in the train set: 342722, number of used features: 16
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.256065 -> initscore=-1.066524
[LightGBM] [Info] Start training from score -1.066524
Finished training LightGBM. Best F1-Score (weighted): 0.4923

--- All models have been trained. ---


In [8]:
# Cell 5: Results Summary and Comparison

print("--- Model Performance Summary ---")
print("-----------------------------------")

# Convert results to a DataFrame for easy viewing and sorting
results_df = pd.DataFrame(
    [(model_name, info['best_score']) for model_name, info in model_results.items()],
    columns=['Model', 'Best F1-Score (Weighted)']
).sort_values(by='Best F1-Score (Weighted)', ascending=False)

print(results_df)

# Identify the champion model
champion_model_name = results_df.iloc[0]['Model']
champion_model_info = model_results[champion_model_name]
best_f1_score = champion_model_info['best_score']
best_model = champion_model_info['best_estimator']

print(f"\n--- Champion Model: {champion_model_name} with F1-Score: {best_f1_score:.4f} ---")
print("\nBest Parameters Found:")
print(champion_model_info['best_params'])


# --- Final Evaluation of the Champion Model on a Hold-Out Test Set ---
# For a final, unbiased evaluation, we should test the champion on data it has NEVER seen.
# We'll split the data one more time, retrain the best model on the full training part,
# and evaluate on the held-out test part.

print("\n--- Final Validation of Champion Model on a Hold-Out Test Set ---")
# Stratify by patient to try and get a mix of patients in train/test
# This is a simple split; for rigorous results, one might do leave-one-patient-out CV
train_indices, test_indices = next(GroupKFold(n_splits=5).split(X, y, groups=groups))

X_train, X_test = X.iloc[train_indices], X.iloc[test_indices]
y_train, y_test = y.iloc[train_indices], y.iloc[test_indices]


# Retrain the champion model on the larger training set
print("Retraining champion model on the training split...")
best_model.fit(X_train, y_train)
print("Retraining complete.")

# Evaluate on the unseen test set
y_pred = best_model.predict(X_test)

print("\n--- Test Set Performance ---")
print(classification_report(y_test, target_names=['No Apnea', 'Apnea']))

# Display the confusion matrix for the champion model
ConfusionMatrixDisplay.from_estimator(best_model, X_test, y_test, display_labels=['No Apnea', 'Apnea'], cmap='Blues')
plt.title(f'Confusion Matrix for Champion Model ({champion_model_name})')
plt.show()

--- Model Performance Summary ---
-----------------------------------
                Model  Best F1-Score (Weighted)
1        RandomForest                  0.643846
0  LogisticRegression                  0.510807
2            LightGBM                  0.492325

--- Champion Model: RandomForest with F1-Score: 0.6438 ---

Best Parameters Found:
{'classifier__max_depth': None, 'classifier__min_samples_leaf': 1, 'classifier__n_estimators': 150}

--- Final Validation of Champion Model on a Hold-Out Test Set ---
Retraining champion model on the training split...
Retraining complete.

--- Test Set Performance ---


TypeError: missing a required argument: 'y_pred'