In [5]:
# =============================================================================
# 4_model_training_test.ipynb (Smoke Test)
#
# This script is a "smoke test" to verify that our model training pipeline
# works end-to-end on a small sample of the data before launching the
# full, time-consuming training run.
#
# Key Features:
# - Automatically clears previous test runs for easy iteration.
# - Uses a small, stratified sample of data to ensure class representation.
# - Aligns with AutoGluon documentation by renaming the label column to 'label'.
# - Fine-tunes a pre-trained image backbone and optimizes for ROC AUC.
#   NOTE: no class weights are passed to the loss in this cell, so class
#   imbalance is NOT yet compensated for during training.
# - Uses a robust pre-trained checkpoint.
# - Sets a 'medium_quality' preset for robust default hyperparameters.
# - Includes performance optimizations for modern GPUs.
# =============================================================================

# --- Standard Library Imports ---
import os
import pandas as pd
import logging
import shutil # Import shutil for directory removal

# --- Third-Party Imports ---
import torch
from autogluon.multimodal import MultiModalPredictor
from sklearn.model_selection import train_test_split

# --- Configuration Block ---

# 1. Filesystem locations (all relative paths)
# Output directory for the smoke-test model; main() deletes and recreates it on every run.
MODEL_OUTPUT_PATH = '../models/autogluon_smoketest'
# Directory holding the CSV manifests read by main().
PROCESSED_DATA_DIR = '../data/processed/'
TEST_MANIFEST = os.path.join(PROCESSED_DATA_DIR, 'test_data.csv')
TRAIN_MANIFEST = os.path.join(PROCESSED_DATA_DIR, 'train_data.csv')

# 2. Smoke-test knobs
PRESET_QUALITY = 'medium_quality'   # passed as `presets` to MultiModalPredictor
MODEL_TO_TUNE = 'convnext_tiny'     # passed as 'model.timm_image.checkpoint_name'
EVAL_METRIC = 'roc_auc'             # used for both training and the final evaluation
TIME_LIMIT_SECONDS = 600            # passed as `time_limit` to predictor.fit()
SAMPLE_SIZE = 400                   # number of training rows drawn for the test run


# --- Main Execution Block ---

def main():
    """Run the model training smoke test end-to-end on a small data sample.

    Steps:
        1. Configure torch matmul precision and logging.
        2. Delete and recreate ``MODEL_OUTPUT_PATH``.
        3. Load ``TRAIN_MANIFEST`` and draw a stratified sample of
           ``SAMPLE_SIZE`` rows (or use all rows if there are fewer).
        4. Rename the ``aki_label`` column to ``label`` and fit a
           ``MultiModalPredictor`` within ``TIME_LIMIT_SECONDS``.
        5. Score the model on a random sample of up to 100 rows from
           ``TEST_MANIFEST``.

    Returns:
        None. A missing training manifest or a missing label column is logged
        and aborts the run early. A missing test manifest only skips the
        evaluation step. Evaluation errors are logged with their traceback
        and reported in the final summary instead of being raised.
    """
    label_column = 'aki_label'

    # 1. Setup and Optimization
    # 'medium' lets float32 matmuls use lower-precision kernels (speed over precision).
    torch.set_float32_matmul_precision('medium')
    logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
    logging.info("--- Starting Model Training Smoke Test ---")

    # Start from a clean output directory so artifacts of an earlier run cannot leak in.
    if os.path.exists(MODEL_OUTPUT_PATH):
        logging.info(f"Removing previous smoke test directory: {MODEL_OUTPUT_PATH}")
        shutil.rmtree(MODEL_OUTPUT_PATH)
    os.makedirs(MODEL_OUTPUT_PATH, exist_ok=True)

    # 2. Load Data
    logging.info(f"Loading full training data from: {TRAIN_MANIFEST}")
    try:
        full_train_df = pd.read_csv(TRAIN_MANIFEST)
    except FileNotFoundError:
        logging.error(f"CRITICAL: Training manifest not found at {TRAIN_MANIFEST}. Please run data generation first.")
        return

    # Fail with a clear message instead of an opaque KeyError from the stratify argument.
    if label_column not in full_train_df.columns:
        logging.error(f"CRITICAL: Column '{label_column}' not found in {TRAIN_MANIFEST}. Cannot stratify or train.")
        return

    # 3. Create a Stratified Sample
    if len(full_train_df) > SAMPLE_SIZE:
        # The "test" half of the split is the SAMPLE_SIZE-row sample we keep;
        # the remainder is discarded.
        _, sample_train_df = train_test_split(
            full_train_df,
            test_size=SAMPLE_SIZE,
            stratify=full_train_df[label_column],
            random_state=42
        )
        logging.info(f"Created a stratified sample of {SAMPLE_SIZE} records for the test run.")
    else:
        sample_train_df = full_train_df
        logging.info(f"Training data has only {len(full_train_df)} records; using all of them for the test run.")

    # 4. Align with AutoGluon Conventions
    logging.info("Renaming 'aki_label' column to 'label' to align with documentation.")
    sample_train_df = sample_train_df.rename(columns={label_column: 'label'})

    # 5. Initialize and Fit Predictor
    logging.info(f"Initializing MultiModalPredictor for a single model: {MODEL_TO_TUNE}")
    predictor = MultiModalPredictor(
        label='label',
        problem_type='binary',
        path=MODEL_OUTPUT_PATH,
        eval_metric=EVAL_METRIC,
        presets=PRESET_QUALITY
    )

    logging.info(f"Starting model training with a time limit of {TIME_LIMIT_SECONDS} seconds.")
    predictor.fit(
        train_data=sample_train_df,
        hyperparameters={
            'model.timm_image.checkpoint_name': MODEL_TO_TUNE,
            # NOTE(review): no class weights are supplied here; confirm that
            # AutoGluon actually applies this loss for problem_type='binary'.
            'optim.loss_func': 'BCEWithLogitsLoss',
            'env.per_gpu_batch_size': 32,
        },
        time_limit=TIME_LIMIT_SECONDS
    )

    # 6. Quick Evaluation to Confirm Pipeline Works
    logging.info("--- Smoke Test Evaluation ---")
    evaluation_failed = False
    if not os.path.isfile(TEST_MANIFEST):
        # Checked up front so that a file missing *during* evaluation is not
        # misreported as a missing manifest.
        logging.warning(f"Test manifest not found at {TEST_MANIFEST}. Skipping evaluation step.")
    else:
        try:
            test_df = pd.read_csv(TEST_MANIFEST)
            # NOTE(review): this sample is random, not stratified; a
            # single-class sample would presumably make ROC AUC undefined.
            test_sample = test_df.sample(n=min(100, len(test_df)), random_state=42)
            test_sample = test_sample.rename(columns={label_column: 'label'})

            scores = predictor.evaluate(test_sample, metrics=[EVAL_METRIC])
            logging.info(f"Smoke test evaluation scores: {scores}")
        except Exception as e:
            # Best-effort step: keep going, but record the full traceback and
            # remember the failure so the summary below stays honest.
            logging.exception(f"An error occurred during smoke test evaluation: {e}")
            evaluation_failed = True

    logging.info("--- Smoke Test Complete ---")
    logging.info(f"Test model and logs saved to: {MODEL_OUTPUT_PATH}")
    if evaluation_failed:
        logging.warning("Evaluation failed (see traceback above). Fix it before running the full training.")
    else:
        logging.info("If this script completed without errors, you are ready to run the full training.")

# Run the smoke test only when this code is executed as the main module,
# not when it is imported from elsewhere.
if __name__ == '__main__':
    main()


2025-06-23 23:10:30,156 - INFO - --- Starting Model Training Smoke Test ---
2025-06-23 23:10:30,157 - INFO - Removing previous smoke test directory: ../models/autogluon_smoketest
2025-06-23 23:10:30,174 - INFO - Loading full training data from: ../data/processed/train_data.csv
2025-06-23 23:10:32,639 - INFO - Created a stratified sample of 400 records for the test run.
2025-06-23 23:10:32,639 - INFO - Renaming 'aki_label' column to 'label' to align with documentation.
2025-06-23 23:10:32,640 - INFO - Initializing MultiModalPredictor for a single model: convnext_tiny
2025-06-23 23:10:32,642 - INFO - Starting model training with a time limit of 600 seconds.
AutoGluon Version:  1.3.1
Python Version:     3.9.23
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.19045
CPU Count:          12
Pytorch Version:    2.5.1
CUDA Version:       11.8
Memory Avail:       6.76 GB / 31.35 GB (21.6%)
Disk Space Avail:   331.62 GB / 1863.00 GB (17.8%)

AutoMM starts to create y

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 0, global step 1: 'val_roc_auc' reached 0.46933 (best 0.46933), saving model to 'D:\\Projects\\aki_prediction_project\\models\\autogluon_smoketest\\epoch=0-step=1.ckpt' as top 3


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 0, global step 3: 'val_roc_auc' reached 0.58667 (best 0.58667), saving model to 'D:\\Projects\\aki_prediction_project\\models\\autogluon_smoketest\\epoch=0-step=3.ckpt' as top 3


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 1, global step 4: 'val_roc_auc' reached 0.58667 (best 0.58667), saving model to 'D:\\Projects\\aki_prediction_project\\models\\autogluon_smoketest\\epoch=1-step=4.ckpt' as top 3


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 1, global step 6: 'val_roc_auc' was not in top 3


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 2, global step 7: 'val_roc_auc' reached 0.72133 (best 0.72133), saving model to 'D:\\Projects\\aki_prediction_project\\models\\autogluon_smoketest\\epoch=2-step=7.ckpt' as top 3


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 2, global step 9: 'val_roc_auc' was not in top 3


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 3, global step 10: 'val_roc_auc' reached 0.63200 (best 0.72133), saving model to 'D:\\Projects\\aki_prediction_project\\models\\autogluon_smoketest\\epoch=3-step=10.ckpt' as top 3


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 3, global step 12: 'val_roc_auc' was not in top 3


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 4, global step 13: 'val_roc_auc' was not in top 3


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 4, global step 15: 'val_roc_auc' was not in top 3


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 5, global step 16: 'val_roc_auc' was not in top 3


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 5, global step 18: 'val_roc_auc' was not in top 3


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 6, global step 19: 'val_roc_auc' was not in top 3


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 6, global step 21: 'val_roc_auc' was not in top 3


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 7, global step 22: 'val_roc_auc' was not in top 3
Start to fuse 3 checkpoints via the greedy soup algorithm.
💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.


Predicting: |          | 0/? [00:00<?, ?it/s]

💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.


Predicting: |          | 0/? [00:00<?, ?it/s]

💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.


Predicting: |          | 0/? [00:00<?, ?it/s]

AutoMM has created your model. 🎉🎉🎉

To load the model, use the code below:
    ```python
    from autogluon.multimodal import MultiModalPredictor
    predictor = MultiModalPredictor.load("d:\Projects\aki_prediction_project\models\autogluon_smoketest")
    ```

If you are not satisfied with the model, try to increase the training time, 
adjust the hyperparameters (https://auto.gluon.ai/stable/tutorials/multimodal/advanced_topics/customization.html),
or post issues on GitHub (https://github.com/autogluon/autogluon/issues).


2025-06-23 23:12:43,736 - INFO - --- Smoke Test Evaluation ---
💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.


Predicting: |          | 0/? [00:00<?, ?it/s]

2025-06-23 23:13:00,794 - INFO - Smoke test evaluation scores: {'roc_auc': np.float64(0.5778947368421052)}
2025-06-23 23:13:00,795 - INFO - --- Smoke Test Complete ---
2025-06-23 23:13:00,795 - INFO - Test model and logs saved to: ../models/autogluon_smoketest
2025-06-23 23:13:00,795 - INFO - If this script completed without errors, you are ready to run the full training.
