In [4]:
# Cell 1: Setup
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
import torchvision.transforms as transforms
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, average_precision_score
import numpy as np
import pandas as pd
from pathlib import Path
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

# Create models folder
Path("../models").mkdir(exist_ok=True)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")
print(f"GPU: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'CPU'}")

Using device: cuda
GPU: NVIDIA GeForce RTX 3050 6GB Laptop GPU


In [5]:
# Cell 2: Data loading and stratified split (FIXED)
from pathlib import Path
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

DATA_BASE = Path("../data/processed")
datasets = ['ptbxl', 'sami_trop', 'code15']

df_list = []
for ds in datasets:
    csv_path = DATA_BASE / "metadata" / f"{ds}_metadata.csv"
    if csv_path.exists():
        print(f"Loading {csv_path}...")
        df = pd.read_csv(csv_path)
        df['dataset'] = ds
        df_list.append(df)
    else:
        print(f"Warning: {csv_path} not found!")

df_all = pd.concat(df_list, ignore_index=True)
print(f"\nTotal records loaded: {len(df_all)}")

# Soft label → bin for stratification
df_all['label_bin'] = np.where(df_all['label'] < 0.3, 0,
                               np.where(df_all['label'] > 0.7, 1, 0.5))

# Start small for fast training
subset_frac = 0.1  # Change to 0.5 or 1.0 later
df_all = df_all.sample(frac=subset_frac, random_state=42).reset_index(drop=True)

# Create a proper stratification column (as a real column in the DataFrame)
df_all['stratify_group'] = df_all['label_bin'].astype(str) + "_" + df_all['dataset']

# First split: train vs (val+test)
train_df, temp_df = train_test_split(
    df_all,
    test_size=0.2,
    stratify=df_all['stratify_group'],
    random_state=42
)

# Second split: val vs test
val_df, test_df = train_test_split(
    temp_df,
    test_size=0.5,
    stratify=temp_df['stratify_group'],  # Now using a real column
    random_state=42
)

# Clean up temporary column (optional)
df_all = df_all.drop(columns=['stratify_group'])
train_df = train_df.drop(columns=['stratify_group'])
val_df = val_df.drop(columns=['stratify_group'])
test_df = test_df.drop(columns=['stratify_group'])

print(f"Train: {len(train_df)} | Val: {len(val_df)} | Test: {len(test_df)}")
print(f"Positive ratio (confident positives, label > 0.7): {(df_all['label'] > 0.7).mean():.4f}")
print(f"Dataset distribution in full subset:")
print(df_all['dataset'].value_counts())

Loading ..\data\processed\metadata\ptbxl_metadata.csv...
Loading ..\data\processed\metadata\sami_trop_metadata.csv...
Loading ..\data\processed\metadata\code15_metadata.csv...

Total records loaded: 63228
Train: 5058 | Val: 632 | Test: 633
Positive ratio (confident positives, label > 0.7): 0.0364
Dataset distribution in full subset:
dataset
code15       3985
ptbxl        2183
sami_trop     155
Name: count, dtype: int64


In [6]:
# Cell 3: Dataset and DataLoaders + TIMING (Final Working Version)
import time
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
import torchvision.transforms as transforms

start_time = time.time()

class ECGImageDataset(Dataset):
    def __init__(self, df, augment=True):
        self.df = df.reset_index(drop=True)
        self.augment = augment
        if augment:
            self.transform = transforms.Compose([
                transforms.ToTensor(),
                transforms.RandomHorizontalFlip(p=0.5),
                transforms.RandomRotation(degrees=8),
                transforms.RandomAffine(degrees=0, translate=(0.05, 0.05)),
            ])
        else:
            self.transform = transforms.ToTensor()
    
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        img_path = row['img_path']
        img = np.load(img_path).astype(np.float32)  # (3, 24, 2048)
        label = torch.tensor(row['label'], dtype=torch.float32)
        if self.transform:
            img = self.transform(img)
        return img, label

# Batch size (RTX 3050 safe)
batch_size = 32

# Datasets
train_ds = ECGImageDataset(train_df, augment=True)
val_ds   = ECGImageDataset(val_df,   augment=False)
test_ds  = ECGImageDataset(test_df,  augment=False)

# Strong oversampling of confident positives
sample_weights = train_df['label'].apply(lambda x: 10.0 if x > 0.7 else 1.0).values
sampler = WeightedRandomSampler(sample_weights, len(sample_weights), replacement=True)

# DataLoaders (num_workers=0 for Windows)
train_loader = DataLoader(train_ds, batch_size=batch_size, sampler=sampler, num_workers=0, pin_memory=True)
val_loader   = DataLoader(val_ds,   batch_size=batch_size, shuffle=False, num_workers=0, pin_memory=True)
test_loader  = DataLoader(test_ds,  batch_size=batch_size, shuffle=False, num_workers=0, pin_memory=True)

# Test batch
print("Testing one batch from train_loader...")
img_batch, lbl_batch = next(iter(train_loader))
print(f"✓ Batch image shape: {img_batch.shape}")      # Should be [32, 3, 24, 2048]
print(f"✓ Batch label shape: {lbl_batch.shape}")
print(f"✓ Sample labels: {lbl_batch.tolist()[:10]}")
print(f"✓ Image value range: [{img_batch.min().item():.3f}, {img_batch.max().item():.3f}]")

end_time = time.time()
print(f"\n⏱ Cell 3 execution time: {end_time - start_time:.2f} seconds")

Testing one batch from train_loader...


FileNotFoundError: [Errno 2] No such file or directory: 'data\\processed\\2d_images\\code15\\517868_img.npy'

In [None]:
# Cell 4: ViT Model from Scratch + TIMING
import time
import torch
import torch.nn as nn

start_time = time.time()

class PatchEmbedding(nn.Module):
    def __init__(self, patch_size=16, in_ch=3, embed_dim=768):
        super().__init__()
        self.proj = nn.Conv2d(in_ch, embed_dim, kernel_size=patch_size, stride=patch_size)
        self.num_patches = (24 // patch_size) * (2048 // patch_size)
    
    def forward(self, x):
        x = self.proj(x)                      # (B, embed_dim, H/p, W/p)
        x = x.flatten(2).transpose(1, 2)      # (B, num_patches, embed_dim)
        return x

class TransformerBlock(nn.Module):
    def __init__(self, embed_dim=768, heads=12, mlp_ratio=4, dropout=0.1):
        super().__init__()
        self.norm1 = nn.LayerNorm(embed_dim)
        self.attn  = nn.MultiheadAttention(embed_dim, heads, dropout=dropout, batch_first=True)
        self.norm2 = nn.LayerNorm(embed_dim)
        mlp_dim = int(embed_dim * mlp_ratio)
        self.mlp   = nn.Sequential(
            nn.Linear(embed_dim, mlp_dim),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(mlp_dim, embed_dim),
            nn.Dropout(dropout)
        )
    
    def forward(self, x):
        y, _ = self.attn(self.norm1(x), self.norm1(x), self.norm1(x))
        x = x + y
        x = x + self.mlp(self.norm2(x))
        return x

class ViTClassifier(nn.Module):
    def __init__(self, patch_size=16, embed_dim=768, depth=12, heads=12, mlp_ratio=4, dropout=0.1):
        super().__init__()
        self.patch_embed = PatchEmbedding(patch_size, in_ch=3, embed_dim=embed_dim)
        self.cls_token   = nn.Parameter(torch.zeros(1, 1, embed_dim))
        num_patches      = self.patch_embed.num_patches
        self.pos_embed   = nn.Parameter(torch.zeros(1, num_patches + 1, embed_dim))
        self.dropout     = nn.Dropout(dropout)
        
        self.blocks = nn.ModuleList([
            TransformerBlock(embed_dim, heads, mlp_ratio, dropout)
            for _ in range(depth)
        ])
        
        self.norm = nn.LayerNorm(embed_dim)
        self.head = nn.Linear(embed_dim, 1)  # Logits
    
    def forward(self, x):
        B = x.shape[0]
        x = self.patch_embed(x)
        
        cls_tokens = self.cls_token.expand(B, -1, -1)
        x = torch.cat((cls_tokens, x), dim=1)
        x = x + self.pos_embed
        x = self.dropout(x)
        
        for block in self.blocks:
            x = block(x)
        
        x = self.norm(x)
        return self.head(x[:, 0]).squeeze(-1)  # (B,)

# Instantiate and move to GPU
model = ViTClassifier(
    patch_size=16,
    embed_dim=768,
    depth=12,
    heads=12,
    dropout=0.1
).to(device)

total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Model: ViT-B/16-like | Trainable parameters: {total_params:,}")
print(f"Model device: {next(model.parameters()).device}")

end_time = time.time()
print(f"\n⏱ Cell 4 execution time: {end_time - start_time:.2f} seconds")

In [None]:
# Cell 5: Training Loop (Manual 20 epochs) + TIMING
import time
from tqdm.notebook import tqdm
import torch.optim as optim

start_time = time.time()

num_epochs = 20
criterion = nn.BCEWithLogitsLoss()  # Stable for soft labels
optimizer = optim.AdamW(model.parameters(), lr=3e-4, weight_decay=0.05)
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=num_epochs)

train_losses = []
val_aurocs = []
best_val_auroc = 0.0
best_model_path = "../models/vit_chagas_best.pth"

print("Starting training...\n")

for epoch in range(num_epochs):
    epoch_start = time.time()
    
    # ----------------- Training -----------------
    model.train()
    epoch_loss = 0.0
    for images, labels in tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs} [Train]"):
        images, labels = images.to(device), labels.to(device)
        
        optimizer.zero_grad()
        logits = model(images)
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
    
    avg_train_loss = epoch_loss / len(train_loader)
    train_losses.append(avg_train_loss)
    
    # ----------------- Validation -----------------
    model.eval()
    val_preds = []
    val_trues = []
    with torch.no_grad():
        for images, labels in tqdm(val_loader, desc=f"Epoch {epoch+1}/{num_epochs} [Val]"):
            images = images.to(device)
            logits = model(images)
            probs = torch.sigmoid(logits)
            val_preds.extend(probs.cpu().numpy())
            val_trues.extend(labels.numpy())
    
    val_auroc = roc_auc_score(val_trues, val_preds)
    val_auprc = average_precision_score(val_trues, val_preds)
    val_aurocs.append(val_auroc)
    
    # Save best model
    if val_auroc > best_val_auroc:
        best_val_auroc = val_auroc
        torch.save(model.state_dict(), best_model_path)
        print(f"  → NEW BEST MODEL saved! Val AUROC: {val_auroc:.4f}")
    
    scheduler.step()
    
    epoch_time = time.time() - epoch_start
    print(f"Epoch {epoch+1:02d} | Train Loss: {avg_train_loss:.4f} | Val AUROC: {val_auroc:.4f} | Val AUPRC: {val_auprc:.4f} | Time: {epoch_time:.1f}s\n")

total_time = time.time() - start_time
print(f"\nTraining completed! Total time: {total_time/60:.1f} minutes")
print(f"Best Validation AUROC: {best_val_auroc:.4f}")

In [None]:
# Cell 6: Final Test Evaluation + TIMING
import time
from sklearn.metrics import roc_auc_score, average_precision_score
import numpy as np
import matplotlib.pyplot as plt

start_time = time.time()

# Load best model
model.load_state_dict(torch.load(best_model_path))
model.eval()

test_preds = []
test_trues = []
with torch.no_grad():
    for images, labels in tqdm(test_loader, desc="Test Evaluation"):
        images = images.to(device)
        logits = model(images)
        probs = torch.sigmoid(logits)
        test_preds.extend(probs.cpu().numpy())
        test_trues.extend(labels.numpy())

test_auroc = roc_auc_score(test_trues, test_preds)
test_auprc = average_precision_score(test_trues, test_preds)

# Approximate Challenge metric (fraction of true positives in top 5%)
test_preds_np = np.array(test_preds)
test_trues_np = np.array(test_trues)
sorted_idx = np.argsort(-test_preds_np)  # descending
top_k = max(1, int(0.05 * len(test_preds_np)))
top_trues = test_trues_np[sorted_idx[:top_k]]
challenge_score = np.mean(top_trues > 0.7)  # Confident positives

print("\n=== FINAL TEST RESULTS ===")
print(f"Test AUROC:     {test_auroc:.4f}")
print(f"Test AUPRC:     {test_auprc:.4f}")
print(f"Challenge Score (top 5% confident positives): {challenge_score:.4f}")

# Plot training curves
plt.figure(figsize=(12, 5))
plt.subplot(1, 2, 1)
plt.plot(train_losses, label="Train Loss", marker='o')
plt.title("Training Loss")
plt.xlabel("Epoch")
plt.grid()
plt.legend()

plt.subplot(1, 2, 2)
plt.plot(val_aurocs, label="Val AUROC", color='orange', marker='o')
plt.title("Validation AUROC")
plt.xlabel("Epoch")
plt.grid()
plt.legend()

plt.tight_layout()
plt.show()

end_time = time.time()
print(f"\n⏱ Cell 6 execution time: {end_time - start_time:.2f} seconds")