In [2]:
# ============================================================
# CELL 1: GPU Check + Drive Mount
# Runtime > Change runtime type > T4 GPU (REQUIRED)
# ============================================================
# Prints the PyTorch build and CUDA availability, verifies that the GPU
# (when one is present) exposes more than 8 GB of memory, then mounts
# Google Drive at /content/drive.
import torch

print(f"PyTorch: {torch.__version__}")
print(f"CUDA: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    # total_memory is reported in bytes; dividing by 1e9 gives decimal GB.
    vram = torch.cuda.get_device_properties(0).total_memory / 1e9
    print(f"VRAM: {vram:.1f} GB")
    assert vram > 8, "‚ö†Ô∏è Need at least 8GB VRAM"
else:
    # The header marks a GPU as required; say so loudly instead of passing
    # silently, while still allowing a deliberate CPU-only run to continue.
    print(
        "WARNING: no CUDA GPU detected - everything below will run on CPU. "
        "Use Runtime > Change runtime type > T4 GPU and re-run this cell."
    )

from google.colab import drive
drive.mount('/content/drive')


PyTorch: 2.10.0+cpu
CUDA: False
Mounted at /content/drive


In [2]:
# Change directory to where your 'raw' folder is
%cd /content/drive/MyDrive/Skin_triage_data/ml_core/data

print("Zipping the raw data folder. This will take ~15 minutes ONE TIME...")
# Create a zip file named 'raw.zip' containing the 'raw' folder
!zip -r -q raw.zip raw/

print("‚úÖ Zip file created permanently in your Google Drive!")

/content/drive/MyDrive/Skin_triage_data/ml_core/data
Zipping the raw data folder. This will take ~15 minutes ONE TIME...
‚úÖ Zip file created permanently in your Google Drive!


In [3]:
# ============================================================
# CELL 2: Install dependencies + set working directory
# ============================================================
!pip install -q albumentations timm torchmetrics

import sys
import os

# ‚Üê UPDATE THIS if your path changes
DRIVE_ROOT = "/content/drive/MyDrive/Skin_triage_data/ml_core"

os.chdir(DRIVE_ROOT)
sys.path.insert(0, DRIVE_ROOT)

print(f"Working directory: {os.getcwd()}")
print(f"Files: {os.listdir('.')}")


[?25l   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m0.0/983.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[90m‚ï∫[0m[90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m307.2/983.2 kB[0m [31m10.1 MB/s[0m eta [36m0:00:01[0m[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m983.2/983.2 kB[0m [31m14.3 MB/s[0m eta [36m0:00:00[0m
[?25hWorking directory: /content/drive/MyDrive/Skin_triage_data/ml_core
Files: ['requirements.txt', 'tests', 'test_pipeline.py', 'logs', 'src', 'notebooks', 'config', 'data', 'checkpoints', 'results']


In [4]:
# ============================================================
# NEW CELL 2.5: Fast 60-Second Data Transfer (The Zip Method)
# ============================================================
print("Extracting 25k images from Drive to Colab SSD...")

# 1. Create the local folder on the new VM
!mkdir -p /content/local_data

# 2. Unzip the single file directly to the fast SSD
!unzip -q /content/drive/MyDrive/Skin_triage_data/ml_core/data/raw.zip -d /content/local_data/

print("‚úÖ Data successfully extracted to local high-speed storage!")

Extracting 25k images from Drive to Colab SSD...
‚úÖ Data successfully extracted to local high-speed storage!


In [5]:
# ============================================================
# CELL 3: Load config + build dataloaders + class weights
# ============================================================
# Loads the YAML config, redirects the raw image directory to the local
# copy, selects the compute device, builds the train/val dataloaders and
# computes per-class weights on that device.
import os
import torch
import logging
logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')

from src.utils.config import load_config
from src.datasets.dataloader import create_dataloaders_from_config, compute_class_weights

# 1. Load configuration
config = load_config("config/config.yaml")

# Speed-up: read images from the local SSD copy instead of the Drive mount.
LOCAL_RAW_DIR = "/content/local_data/raw"
if not os.path.isdir(LOCAL_RAW_DIR):
    # Fail fast here rather than part-way through an epoch.
    raise FileNotFoundError(
        f"{LOCAL_RAW_DIR} does not exist - run the unzip cell (CELL 2.5) first."
    )
config.paths.raw_dir = LOCAL_RAW_DIR
config.training.num_workers = 2
# NOTE(review): a training log recorded further down in this notebook shows
# images being looked up under /content/drive/.../data/raw, so this override
# may not reach the dataset - confirm how create_dataloaders_from_config
# resolves image paths (e.g. absolute paths stored in the CSVs).

# Device selection: CUDA when available, otherwise CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("\n" + "="*40)
print(f"üî• HARDWARE CHECK: {device.type.upper()} üî•")
if device.type == 'cuda':
    print(f"GPU Model: {torch.cuda.get_device_name(0)}")
print("="*40 + "\n")

# 2. Create dataloaders
dataloaders, datasets = create_dataloaders_from_config(config)

train_loader = dataloaders['train']
val_loader   = dataloaders['val']

print(f"‚úì Train batches: {len(train_loader)}")

# 3. Compute class weights and move them to the training device
class_weights = compute_class_weights(
    csv_path=config.paths.train_csv,
    class_to_idx=config.dataset.class_to_idx
)
class_weights = class_weights.to(device)

print("\n‚úÖ Fast Dataloaders Ready!")


üî• HARDWARE CHECK: CPU üî•



  original_init(self, **validated_kwargs)


‚úì Train batches: 555

‚úÖ Fast Dataloaders Ready!


In [None]:
# Picks up on-disk edits to src/training/trainer.py without restarting the
# kernel, and rebinds Trainer to the class from the reloaded module.
import importlib
import sys

import src.training.trainer

# importlib.reload() re-executes the module and returns it, so Trainer can be
# taken straight from the returned module object.
Trainer = importlib.reload(src.training.trainer).Trainer

print("‚úÖ Trainer module successfully reloaded from disk!")

‚úÖ Trainer module successfully reloaded from disk!


In [None]:
# Rewrites trainer.py on Drive so that any loop unpacking three items per
# batch unpacks two instead, then reloads the module so the kernel uses it.
import importlib
import re

file_path = '/content/drive/MyDrive/Skin_triage_data/ml_core/src/training/trainer.py'

# Load the current source text
with open(file_path, 'r') as f:
    code = f.read()

# Substitute every 3-variable unpack; subn also reports how many were changed
code, replaced_count = re.subn(
    r'for images, labels, _ in pbar:',
    r'for images, labels in pbar:',
    code,
)

if replaced_count:
    # Something was rewritten: persist the patched source back to Drive
    with open(file_path, 'w') as f:
        f.write(code)
    print("‚úÖ Validation loop patched successfully!")
else:
    print("‚úÖ No more bugs found in the loops!")

# Make the kernel use the (possibly patched) file; reload() returns the module
import src.training.trainer
Trainer = importlib.reload(src.training.trainer).Trainer

print("‚úÖ Trainer reloaded and ready for action.")

‚úÖ No more bugs found in the loops!
‚úÖ Trainer reloaded and ready for action.


In [None]:
# ============================================================
# CELL 4: Train Model 1 - EfficientNet-B0
# Expected time: ~60-90 min on T4
# Expected balanced acc: 75-82%
# ============================================================
# Two-phase training: first a short run with the backbone frozen, then a
# full run with the backbone unfrozen. Requires config, device,
# train_loader, val_loader and class_weights from the dataloader cell.

# Imported here so the cell does not depend on a later cell having run
# (build_model was otherwise undefined at this point in the notebook).
from src.models.classifier import build_model
from src.training.trainer import Trainer

MODEL_NAME = "efficientnet_b0"

model = build_model(
    backbone_name=MODEL_NAME,
    num_classes=config.dataset.num_classes,
    dropout=0.3,
    pretrained=True,
    device=device
)

trainer = Trainer(
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    class_weights=class_weights,
    config=config,
    device=device,
    model_name=MODEL_NAME,
    checkpoint_dir=config.paths.checkpoints_dir,
)

# Phase 1: Head-only warmup (5 epochs)
print("Phase 1: Warming up classification head (5 epochs)...")
model.freeze_backbone()
trainer.num_epochs = 5
trainer.train()

# Phase 2: Full fine-tuning
print("\nPhase 2: Full fine-tuning...")
model.unfreeze_backbone()
trainer.num_epochs = config.training.num_epochs
trainer.early_stopping.counter = 0  # Reset early stopping
history_efficientnet = trainer.train()

print("\n‚úÖ EfficientNet-B0 training complete!")
print(f"   Best Balanced Acc: {trainer.best_val_balanced_acc:.4f}")


In [6]:
# ============================================================
# CELL 4: RESUME Train Model 1 - EfficientNet-B0 (CPU Mode)
# ============================================================
# Rebuilds the model and Trainer, restores model and optimizer state from
# "<MODEL_NAME>_latest.pth" in the checkpoints directory, and trains for
# the epochs that remain out of config.training.num_epochs.
import torch
from pathlib import Path

# Imported in this cell so it works without relying on earlier cells.
from src.models.classifier import build_model
from src.training.trainer import Trainer
# =========================================

MODEL_NAME = "efficientnet_b0"

# 1. Build the model architecture
# (pretrained=False because we are going to load your saved weights instead)
model = build_model(
    backbone_name=MODEL_NAME,
    num_classes=config.dataset.num_classes,
    dropout=0.3,
    pretrained=False,
    device=device
)

# 2. Setup the Trainer
trainer = Trainer(
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    class_weights=class_weights,
    config=config,
    device=device,
    model_name=MODEL_NAME,
    checkpoint_dir=config.paths.checkpoints_dir,
)

# 3. Look for the saved checkpoint in Google Drive
checkpoint_path = Path(config.paths.checkpoints_dir) / f"{MODEL_NAME}_latest.pth"

if checkpoint_path.exists():
    print(f"‚úÖ Found checkpoint: {checkpoint_path}")

    # Load the checkpoint onto the active device (CPU or GPU).
    # SECURITY: weights_only=False unpickles arbitrary Python objects, which
    # can execute code - only use it on checkpoints you produced yourself.
    checkpoint = torch.load(checkpoint_path, map_location=device, weights_only=False)

    # Restore the model weights and the optimizer state
    model.load_state_dict(checkpoint['model_state_dict'])
    trainer.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    # NOTE(review): this is the metric stored in the *latest* checkpoint,
    # which is not necessarily the best value seen so far - confirm.
    trainer.best_val_balanced_acc = checkpoint.get('val_balanced_acc', 0.0)

    # Calculate where we left off
    # NOTE(review): assumes checkpoint['epoch'] is the 0-based index of the
    # last finished epoch - confirm against the Trainer's checkpoint format.
    epochs_completed = checkpoint.get('epoch', 0) + 1
    print(f"üöÄ Successfully loaded weights! Resuming after {epochs_completed} epochs...")

    # 4. Resume Phase 2 (Full fine-tuning)
    print(f"\nPhase 2: Resuming full fine-tuning on {device}...")
    model.unfreeze_backbone()

    epochs_total = config.training.num_epochs
    epochs_remaining = epochs_total - epochs_completed

    if epochs_remaining > 0:
        trainer.num_epochs = epochs_remaining
        trainer.early_stopping.counter = 0  # Reset early stopping
        history_efficientnet = trainer.train()
        print("\n‚úÖ Resumed training complete!")
        print(f"   Best Balanced Acc: {trainer.best_val_balanced_acc:.4f}")
    else:
        # Report the configured epoch budget rather than a hard-coded number.
        print(f"\n‚úÖ Model has already completed all {epochs_total} epochs!")

else:
    print(f"‚ùå ERROR: No checkpoint found at {checkpoint_path}!")
    print("Check your Drive connection. Did you mount Google Drive?")

  self.scaler = GradScaler(enabled=self.use_amp)


‚úÖ Found checkpoint: checkpoints/efficientnet_b0_latest.pth
üöÄ Successfully loaded weights! Resuming after 5 epochs...

Phase 2: Resuming full fine-tuning on cpu...


  super().__init__(loader)
  with autocast(enabled=self.use_amp):
Epoch 1 [Train]:  46%|‚ñà‚ñà‚ñà‚ñà‚ñå     | 255/555 [1:00:59<58:51, 11.77s/it, loss=0.2017, acc=86.9%]  ERROR:src.datasets.skin_lesion_dataset:Error loading image ISIC_0060467: Image not found: /content/drive/MyDrive/Skin_triage_data/ml_core/data/raw/ISIC_2019/ISIC_2019_Training_Input/ISIC_0060467.jpg
ERROR:src.datasets.skin_lesion_dataset:Error loading image ISIC_0071143: Image not found: /content/drive/MyDrive/Skin_triage_data/ml_core/data/raw/ISIC_2019/ISIC_2019_Training_Input/ISIC_0071143.jpg
ERROR:src.datasets.skin_lesion_dataset:Error loading image ISIC_0059467: Image not found: /content/drive/MyDrive/Skin_triage_data/ml_core/data/raw/ISIC_2019/ISIC_2019_Training_Input/ISIC_0059467.jpg
ERROR:src.datasets.skin_lesion_dataset:Error loading image ISIC_0063583: Image not found: /content/drive/MyDrive/Skin_triage_data/ml_core/data/raw/ISIC_2019/ISIC_2019_Training_Input/ISIC_0063583.jpg
ERROR:src.datasets.skin_lesion_dat

KeyboardInterrupt: 

In [None]:
# ============================================================
# CELL 5: Train Model 2 - ResNet50
# Expected time: ~90-120 min on T4
# Expected balanced acc: 73-80%
# ============================================================
# Same two-phase schedule as Model 1: 5 epochs with the backbone frozen,
# then a full run with the backbone unfrozen. Requires config, device,
# train_loader, val_loader and class_weights from the dataloader cell.

# Imported here so the cell runs on its own after the dataloader cell.
from src.models.classifier import build_model
from src.training.trainer import Trainer

MODEL_NAME = "resnet50"

model = build_model(
    backbone_name=MODEL_NAME,
    num_classes=config.dataset.num_classes,
    dropout=0.3,
    pretrained=True,
    device=device
)

trainer = Trainer(
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    class_weights=class_weights,
    config=config,
    device=device,
    model_name=MODEL_NAME,
    checkpoint_dir=config.paths.checkpoints_dir,
)

# Phase 1: head-only warmup
model.freeze_backbone()
trainer.num_epochs = 5
trainer.train()

# Phase 2: full fine-tuning, with the early-stopping counter reset
model.unfreeze_backbone()
trainer.num_epochs = config.training.num_epochs
trainer.early_stopping.counter = 0
history_resnet = trainer.train()

print("\n‚úÖ ResNet50 training complete!")
print(f"   Best Balanced Acc: {trainer.best_val_balanced_acc:.4f}")


In [None]:
# ============================================================
# CELL 6: Train Model 3 - ConvNeXt-Tiny (SOTA candidate)
# Expected time: ~90-120 min on T4
# Expected balanced acc: 78-85% (likely WINNER)
# ============================================================
# Same two-phase schedule as the other models: 5 epochs with the backbone
# frozen, then a full run with the backbone unfrozen. Requires config,
# device, train_loader, val_loader and class_weights from the dataloader cell.

# Imported here so the cell runs on its own after the dataloader cell.
from src.models.classifier import build_model
from src.training.trainer import Trainer

MODEL_NAME = "convnext_tiny"

model = build_model(
    backbone_name=MODEL_NAME,
    num_classes=config.dataset.num_classes,
    dropout=0.3,
    pretrained=True,
    device=device
)

trainer = Trainer(
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    class_weights=class_weights,
    config=config,
    device=device,
    model_name=MODEL_NAME,
    checkpoint_dir=config.paths.checkpoints_dir,
)

# Phase 1: head-only warmup
model.freeze_backbone()
trainer.num_epochs = 5
trainer.train()

# Phase 2: full fine-tuning, with the early-stopping counter reset
model.unfreeze_backbone()
trainer.num_epochs = config.training.num_epochs
trainer.early_stopping.counter = 0
history_convnext = trainer.train()

print("\n‚úÖ ConvNeXt-Tiny training complete!")
print(f"   Best Balanced Acc: {trainer.best_val_balanced_acc:.4f}")


In [None]:
# ============================================================
# CELL 7: Evaluate all 3 models on HELD-OUT test set
# Only run this ONCE - after all training is done.
# ============================================================
# For each trained backbone: loads its "<name>_best.pth" checkpoint, runs
# inference over the test loader, computes and saves metrics, and finally
# prints a comparison table and the model with the highest balanced accuracy.
import numpy as np
import pandas as pd
import torch

from src.models.classifier import build_model

# NOTE(review): MetricsCalculator is not imported anywhere in the visible
# notebook - TODO add the import from its project module before running.

# test_loader was never defined by the dataloader cell (only train/val were).
# assumes create_dataloaders_from_config also returns a 'test' split - TODO confirm
test_loader = dataloaders['test']

metrics_calc = MetricsCalculator(
    class_names=config.dataset.class_names,
    results_dir=config.paths.results_dir
)

results = []

for model_name in ['efficientnet_b0', 'resnet50', 'convnext_tiny']:
    checkpoint_path = f"{config.paths.checkpoints_dir}/{model_name}_best.pth"

    model = build_model(
        backbone_name=model_name,
        num_classes=config.dataset.num_classes,
        dropout=0.3,
        pretrained=False,  # Will load from checkpoint
        checkpoint_path=checkpoint_path,
        device=device
    )
    model.eval()

    all_preds, all_labels, all_probs = [], [], []

    with torch.no_grad():
        for batch in test_loader:
            # Take only the first two items so this works whether batches are
            # (images, labels) - as the patched trainer loops expect - or
            # (images, labels, extra).
            images, labels = batch[0], batch[1]
            images = images.to(device)
            logits = model(images)
            probs = torch.softmax(logits, dim=1)
            preds = probs.argmax(dim=1)
            all_preds.append(preds.cpu().numpy())
            all_labels.append(labels.cpu().numpy())
            all_probs.append(probs.cpu().numpy())

    # Merge the per-batch arrays into one array per quantity
    all_preds = np.concatenate(all_preds)
    all_labels = np.concatenate(all_labels)
    all_probs = np.concatenate(all_probs)

    # Compute metrics
    metrics = metrics_calc.compute(all_preds, all_labels, all_probs, split='test')
    metrics_calc.plot_confusion_matrix(all_preds, all_labels, model_name, 'test')
    metrics_calc.save_metrics_csv(metrics, model_name, 'test')

    results.append({
        'model': model_name,
        'balanced_acc': metrics['balanced_accuracy'],
        'mel_sensitivity': metrics['mel_sensitivity'],
        'mel_auc': metrics['auc_roc_melanoma'],
        'f1_macro': metrics['f1_macro'],
        'auc_macro': metrics['auc_roc_macro'],
    })
    print(f"\n[{model_name}] Balanced Acc: {metrics['balanced_accuracy']:.4f} | "
          f"Mel Sensitivity: {metrics['mel_sensitivity']:.4f}")

# Final comparison table
print("\n" + "="*70)
print("MODEL COMPARISON ‚Äî FINAL TEST SET RESULTS")
print("="*70)
df = pd.DataFrame(results).set_index('model')
print(df.to_string())

# The winner is the row with the highest balanced accuracy
winner = df['balanced_acc'].idxmax()
print(f"\nüèÜ BEST MODEL: {winner}")
print(f"   ‚Üí This model proceeds to Phase 3 (Uncertainty + OOD)")
