# =============================================================================
# BASELINE DENSENET121 K-FOLD CROSS-VALIDATION (GENUINE VS FORGED)
# =============================================================================
Load pretrained baseline weights and perform 5-fold cross-validation for binary classification.
Uses 110 evaluation users split into 5 folds (same as meta-learning).
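Requires: baseline_pretrain.pth and the fold split files (bhsig_meta_split_fold_0.json ... bhsig_meta_split_fold_4.json), both produced by running baseline_pretraining.ipynb first.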

# =============================================================================
# STEP 1: SETUP, IMPORTS, AND REPRODUCIBILITY
# =============================================================================

In [None]:
import os
import sys
import json
import random
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torchvision import transforms
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
from PIL import Image

# Resolve the repository root as the parent of the current working directory
# and make it importable so the project-local packages can be found.
current_dir = os.path.abspath(os.curdir)
REPO_ROOT = os.path.abspath(os.path.join(current_dir, os.pardir))
if REPO_ROOT not in sys.path:
    sys.path.append(REPO_ROOT)

# Import Custom Modules
from models.feature_extractor import DenseNetFeatureExtractor
from utils.model_evaluation import compute_metrics

# Deterministic Seeding for Reproducible Research
def seed_everything(seed=42):
    """
    Seed every random number generator used by this notebook.

    Seeds Python's `random`, NumPy and PyTorch (CPU and all CUDA devices),
    exports PYTHONHASHSEED, and switches cuDNN to deterministic mode with
    autotuning (benchmark) disabled.

    Args:
        seed: Integer seed applied to all generators (default 42).
    """
    os.environ['PYTHONHASHSEED'] = str(seed)

    # Apply the same seed to each generator in turn.
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    # Deterministic cuDNN kernels; benchmark mode is turned off alongside it.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

    print(f" > [System] Seed set to: {seed}")

# Fix all RNG seeds before any model or data object is created.
seed_everything(seed=42)

# Device Configuration: use the GPU when CUDA is available, otherwise the CPU.
DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f" > [System] Computation Device: {DEVICE}")
print(f" > [System] CUDA Available: {torch.cuda.is_available()}")

# =============================================================================
# STEP 2: HYPERPARAMETER CONFIGURATION
# =============================================================================

In [None]:
# --- Input Configuration ---
IMG_SIZE = 224
INPUT_SHAPE = (IMG_SIZE,) * 2  # square (height, width) passed to Resize

# --- Training Hyperparameters ---
BATCH_SIZE = 32
LEARNING_RATE = 1e-5  # Lower learning rate for fine-tuning pretrained model
EPOCHS = 50
N_FOLDS = 5

# --- Paths (all relative to the repository root) ---
# Fold split JSON files (110 evaluation users split into 5 folds, same as meta-learning)
SPLIT_DIR = os.path.join(REPO_ROOT, 'data', 'splits')
# Weights produced by the pretraining notebook
PRETRAINED_WEIGHTS_PATH = os.path.join(REPO_ROOT, 'baseline_pretrain.pth')
# Per-fold best checkpoints, results JSON and plots are written here
CHECKPOINT_DIR = os.path.join(REPO_ROOT, 'checkpoints', 'baseline_kfold')
os.makedirs(CHECKPOINT_DIR, exist_ok=True)

# Echo the effective configuration, one "[Config] <name>: <value>" line each.
for _label, _value in (
    ("Image Size", INPUT_SHAPE),
    ("Batch Size", BATCH_SIZE),
    ("Learning Rate", LEARNING_RATE),
    ("Epochs", EPOCHS),
    ("K-Folds", N_FOLDS),
    ("Pretrained Weights", PRETRAINED_WEIGHTS_PATH),
    ("Split Directory", SPLIT_DIR),
    ("Checkpoints", CHECKPOINT_DIR),
):
    print(f"[Config] {_label}: {_value}")

# Verify pretrained weights exist (only a warning: later cells fall back to
# the model's default initialization when the file is missing).
if os.path.exists(PRETRAINED_WEIGHTS_PATH):
    print(f"\n > Pretrained weights found!")
else:
    print(f"\nWARNING: Pretrained weights not found at {PRETRAINED_WEIGHTS_PATH}")
    print(f"Please run baseline_pretraining.ipynb first!")

# =============================================================================
# STEP 3: VERIFY FOLD FILES EXIST
# =============================================================================

In [None]:
# Verify fold files exist (generated by restructure_bhsig.py in baseline_pretraining.ipynb)
print(f" > [Info] Checking for fold split files in: {SPLIT_DIR}")

# Collect every expected fold file that is absent, in fold order.
expected_fold_files = [
    os.path.join(SPLIT_DIR, f'bhsig_meta_split_fold_{fold_idx}.json')
    for fold_idx in range(N_FOLDS)
]
missing_fold_files = [path for path in expected_fold_files if not os.path.exists(path)]
for path in missing_fold_files:
    print(f"ERROR: Fold file not found: {path}")

# True only when all N_FOLDS split files are present.
fold_files_found = not missing_fold_files

if not fold_files_found:
    print(f"\nERROR: Some fold files are missing!")
    print(f"Please run baseline_pretraining.ipynb first to generate the fold splits.")
else:
    print(f" > [Info] All {N_FOLDS} fold files found!")
    print(f" > [Info] Using 110 evaluation users (same as meta-training)")

In [None]:
# =============================================================================
# DATASET CLASS FOR GENUINE VS FORGED CLASSIFICATION
# =============================================================================

class BHSigDataset(Dataset):
    """
    Binary signature dataset: genuine (label 0) vs forged (label 1).

    Every item is an ``(image, label)`` pair. ``self.samples`` holds the
    ``(image_path, label)`` tuples in user order, genuine paths before forged
    paths for each user.
    """

    def __init__(self, user_dict, user_list, transform=None):
        """
        Args:
            user_dict: Mapping of user ID -> dict with 'genuine' and 'forged'
                lists of image paths.
            user_list: User IDs to include; IDs absent from user_dict are
                silently skipped.
            transform: Optional callable applied to each loaded RGB image.
        """
        self.samples = []

        for uid in user_list:
            if uid not in user_dict:
                continue
            record = user_dict[uid]
            # Genuine first (label 0), then forged (label 1).
            self.samples.extend((path, 0) for path in record['genuine'])
            self.samples.extend((path, 1) for path in record['forged'])

        self.transform = transform

        # Per-class totals for the summary printout.
        labels = [label for _, label in self.samples]
        num_genuine = labels.count(0)
        num_forged = labels.count(1)

        print(f"   Dataset initialized: {len(self.samples)} total samples")
        print(f"   - Genuine: {num_genuine} samples (label=0)")
        print(f"   - Forged: {num_forged} samples (label=1)")
        # Reports the number of requested users, including any that were skipped.
        print(f"   - From {len(user_list)} users")

    def __len__(self):
        """Return the number of (image_path, label) samples."""
        return len(self.samples)

    def __getitem__(self, idx):
        """
        Load sample ``idx`` and return ``(image, label)``.

        Any failure while opening or transforming the image is reported and
        replaced by an all-zero 3 x IMG_SIZE x IMG_SIZE tensor; the original
        label is still returned.
        """
        path, target = self.samples[idx]

        try:
            image = Image.open(path).convert('RGB')
            image = self.transform(image) if self.transform else image
        except Exception as err:
            print(f"Error loading image {path}: {err}")
            # Return a blank tensor on error
            image = torch.zeros(3, IMG_SIZE, IMG_SIZE)

        return image, target

# =============================================================================
# DATA TRANSFORMATIONS
# =============================================================================

# Augmentations applied only while training: random flip, small rotation,
# and a random shift/scale with zero fill.
_train_augmentations = [
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.RandomRotation(degrees=10),
    transforms.RandomAffine(degrees=0, translate=(0.1, 0.1), scale=(0.9, 1.1), fill=0),
]

# Training pipeline: resize -> augment -> tensor -> per-channel normalization
train_transform = transforms.Compose([
    transforms.Resize(INPUT_SHAPE),
    *_train_augmentations,
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Validation pipeline: same resize and normalization, no augmentation
val_transform = transforms.Compose([
    transforms.Resize(INPUT_SHAPE),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

print(" > [Info] Dataset class and transformations defined for binary classification")

# =============================================================================
# STEP 4: MODEL INITIALIZATION WITH PRETRAINED WEIGHTS
# =============================================================================

In [None]:
def initialize_model_with_pretrained(pretrained_path, device):
    """
    Build the 2-class DenseNet121 baseline and load pretrained weights into it.

    The checkpoint may be a bare state dict or a dict wrapping one under
    'model_state_dict'. A strict load is tried first; if it raises
    RuntimeError, only the entries whose name and shape match the model are
    copied. If the file does not exist, the freshly constructed model is
    returned unchanged (a warning is printed).

    Args:
        pretrained_path: Path to the pretrained weights file.
        device: PyTorch device the model is moved to (also used as map_location).

    Returns:
        The initialized model, on `device`.
    """
    # Binary classification head: 0=Genuine, 1=Forged
    n_outputs = 2

    net = DenseNetFeatureExtractor(
        backbone_name='densenet121',
        output_dim=n_outputs,
        pretrained=True,
        baseline=True,
    ).to(device)

    if not os.path.exists(pretrained_path):
        print(f"   WARNING: Pretrained weights not found. Using ImageNet initialization.")
    else:
        print(f"   Loading pretrained weights from {os.path.basename(pretrained_path)}...")
        # NOTE(review): weights_only=False unpickles arbitrary objects, so only
        # load checkpoint files from a trusted source.
        ckpt = torch.load(pretrained_path, map_location=device, weights_only=False)

        # Unwrap the state dict when the checkpoint stores extra bookkeeping.
        is_wrapped = isinstance(ckpt, dict) and 'model_state_dict' in ckpt
        source_state = ckpt['model_state_dict'] if is_wrapped else ckpt

        try:
            # Preferred path: every key and shape matches.
            net.load_state_dict(source_state, strict=True)
            print(f"   Successfully loaded all pretrained weights!")
        except RuntimeError:
            # Fallback: copy only entries present in the model with equal shape.
            print(f"   Note: Loading backbone only (classifier may differ)")
            target_state = net.state_dict()
            compatible = {}
            for key, tensor in source_state.items():
                if key in target_state and target_state[key].shape == tensor.shape:
                    compatible[key] = tensor
            target_state.update(compatible)
            net.load_state_dict(target_state)
            print(f"   Loaded {len(compatible)} layers")

    # Parameter statistics in a single pass.
    total_params = 0
    trainable_params = 0
    for param in net.parameters():
        count = param.numel()
        total_params += count
        if param.requires_grad:
            trainable_params += count

    print(f"   Model: DenseNet121 Baseline (Binary Classification)")
    print(f"   Total Parameters: {total_params:,}")
    print(f"   Trainable Parameters: {trainable_params:,}")
    print(f"   Output Classes: {n_outputs} (0=Genuine, 1=Forged)")

    return net

print(" > [Info] Model initialization function defined")

# =============================================================================
# STEP 5: TRAINING UTILITIES
# =============================================================================

In [None]:
def train_epoch(model, dataloader, optimizer, criterion, device):
    """
    Train the model for one full pass over `dataloader`.

    Args:
        model: PyTorch model; put into train mode by this function.
        dataloader: Iterable yielding (images, labels) batches.
        optimizer: Optimizer stepped once per batch.
        criterion: Loss function called as criterion(outputs, labels).
        device: PyTorch device the batches are moved to.

    Returns:
        avg_loss: Mean of the per-batch losses (0.0 if no batch was seen).
        avg_acc: Fraction of correctly classified samples (0.0 if no sample
            was seen).
    """
    model.train()
    running_loss = 0.0
    num_batches = 0
    correct = 0
    total = 0

    pbar = tqdm(dataloader, desc="Training", leave=False)

    for images, labels in pbar:
        images, labels = images.to(device), labels.to(device)

        optimizer.zero_grad()

        # Forward pass
        outputs = model(images)
        loss = criterion(outputs, labels)

        # Backward pass
        loss.backward()
        optimizer.step()

        batch_loss = loss.item()
        running_loss += batch_loss
        num_batches += 1

        # Predicted class = index of the largest logit
        _, predicted = torch.max(outputs, 1)
        correct += (predicted == labels).sum().item()
        total += labels.size(0)

        pbar.set_postfix({'loss': batch_loss})

    # Count batches actually processed instead of calling len(dataloader):
    # this avoids a ZeroDivisionError on an empty loader (matching the guard
    # already used for accuracy) and works for loaders without __len__.
    avg_loss = running_loss / num_batches if num_batches > 0 else 0.0
    avg_acc = correct / total if total > 0 else 0.0

    return avg_loss, avg_acc

def validate_epoch(model, dataloader, criterion, device):
    """
    Evaluate the model on `dataloader` without gradient tracking.

    The softmax probability of class 1 (forged) is used as the soft score
    passed to the project helper `compute_metrics`; the mean per-batch loss
    is added to its result under the 'loss' key.

    Args:
        model: PyTorch model; put into eval mode by this function.
        dataloader: Validation data loader yielding (images, labels).
        criterion: Loss function called as criterion(outputs, labels).
        device: PyTorch device the batches are moved to.

    Returns:
        metrics: Dict returned by compute_metrics(labels, scores) plus 'loss'.
        all_preds: Hard 0/1 predictions (1 when the forged score > 0.5).
        all_labels: Ground-truth labels in loader order.
    """
    model.eval()
    loss_total = 0.0
    forged_scores = []  # soft scores, one per sample
    targets = []

    with torch.no_grad():
        for batch_images, batch_labels in dataloader:
            batch_images = batch_images.to(device)
            batch_labels = batch_labels.to(device)

            logits = model(batch_images)
            loss_total += criterion(logits, batch_labels).item()

            # Column 1 of the softmax output = probability of "forged".
            forged_prob = torch.softmax(logits, dim=1)[:, 1]

            forged_scores.extend(forged_prob.cpu().numpy())
            targets.extend(batch_labels.cpu().numpy())

    mean_loss = loss_total / len(dataloader)

    # Threshold-free metrics come from the shared project helper.
    metrics = compute_metrics(targets, forged_scores)
    metrics['loss'] = mean_loss

    # Fixed 0.5 cut-off for callers that want hard decisions.
    hard_preds = [int(score > 0.5) for score in forged_scores]

    return metrics, hard_preds, targets

print(" > [Info] Training functions defined with EER support")

# =============================================================================
# STEP 6: K-FOLD CROSS-VALIDATION EXECUTION
# =============================================================================

In [None]:
print(f"\n{'='*60}")
print(f"Starting {N_FOLDS}-Fold Cross-Validation with Pretrained Weights")
print(f"Binary Classification: Genuine (0) vs Forged (1)")
print(f"Pretrained Weights: {os.path.basename(PRETRAINED_WEIGHTS_PATH)}")
print(f"Using 110 Evaluation Users")
print(f"Checkpoint Directory: {CHECKPOINT_DIR}")
print(f"{'='*60}\n")

# Best validation metrics of each fold, in fold order (read by the results cell).
fold_results = []

# Iterate through the N_FOLDS pre-generated fold files
for fold in range(N_FOLDS):
    print(f"\n{'-'*60}")
    print(f"FOLD {fold + 1}/{N_FOLDS}")
    print(f"{'-'*60}")

    # 1. LOAD FOLD DATA
    fold_file = os.path.join(SPLIT_DIR, f'bhsig_meta_split_fold_{fold}.json')
    print(f" > Loading fold file: {os.path.basename(fold_file)}")

    with open(fold_file, 'r') as f:
        fold_data = json.load(f)

    # meta-train: users for training in this fold
    # meta-test: users for validation in this fold
    train_users = list(fold_data['meta-train'].keys())
    val_users = list(fold_data['meta-test'].keys())

    print(f"   Train users: {len(train_users)}")
    print(f"   Val users: {len(val_users)}")

    # 2. DATALOADER SETUP
    print(" > Loading training dataset...")
    train_dataset = BHSigDataset(fold_data['meta-train'], train_users, transform=train_transform)
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=4, pin_memory=True)

    print(" > Loading validation dataset...")
    val_dataset = BHSigDataset(fold_data['meta-test'], val_users, transform=val_transform)
    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=4, pin_memory=True)

    # 3. MODEL INITIALIZATION WITH PRETRAINED WEIGHTS
    # A fresh model per fold so no fold sees weights tuned on another fold.
    model = initialize_model_with_pretrained(PRETRAINED_WEIGHTS_PATH, DEVICE)

    # Optimizer and loss
    optimizer = optim.AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=1e-4)
    criterion = nn.CrossEntropyLoss()
    # `verbose` is deprecated in recent PyTorch releases; omitting it keeps
    # the same silent behaviour without the deprecation warning.
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=5)

    # 4. TRAINING LOOP (MINIMIZE EER)
    best_eer = 1.0
    best_acc = 0.0
    best_metrics = {}
    best_epoch = 0
    ckpt_path = os.path.join(CHECKPOINT_DIR, f"best_model_fold_{fold}.pth")

    for epoch in range(EPOCHS):
        # Train
        train_loss, train_acc = train_epoch(model, train_loader, optimizer, criterion, DEVICE)

        # Validation (Calculate EER and Accuracy)
        val_metrics, _, _ = validate_epoch(model, val_loader, criterion, DEVICE)
        val_eer = val_metrics['eer']
        val_acc = val_metrics['accuracy']

        # Logging
        print(f"Epoch {epoch+1:03d} | "
              f"Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.2%} | "
              f"Val EER: {val_eer:.2%} | Val Acc: {val_acc:.2%}")

        # The LR schedule follows validation EER (mode='min').
        scheduler.step(val_eer)

        # Model selection: lower EER wins; on an exact EER tie, higher accuracy
        # wins. The first evaluated epoch is always accepted so that
        # best_metrics is populated even when no epoch beats the initial
        # best_eer/best_acc values (previously this left best_metrics empty
        # and the fold summary raised KeyError).
        is_first_epoch = not best_metrics
        lower_eer = val_eer < best_eer
        tie_with_better_acc = val_eer == best_eer and val_acc > best_acc

        if is_first_epoch or lower_eer or tie_with_better_acc:
            best_eer = val_eer
            best_acc = val_acc
            best_metrics = val_metrics
            best_epoch = epoch

            # Save Checkpoint (overwrites the previous best of this fold)
            torch.save({
                'fold': fold,
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'num_classes': 2,  # Binary classification
                'val_eer': val_eer,
                'val_acc': val_acc,
                'metrics': best_metrics
            }, ckpt_path)
            print(f"   >>> Best Model Saved! (EER: {val_eer:.2%} | ACC: {val_acc:.2%})")

    # Only reachable with no evaluated epoch (e.g. EPOCHS == 0): fail with a
    # clear message instead of a KeyError in the summary below.
    if not best_metrics:
        raise RuntimeError(
            f"Fold {fold + 1}: no epoch was evaluated (EPOCHS={EPOCHS}); nothing to summarise."
        )

    # 5. FOLD SUMMARY
    print(f"\nFold {fold+1} Summary:")
    print(f"   Best Epoch: {best_epoch+1}")
    print(f"   Best Val EER: {best_eer:.2%}")
    print(f"   Best Val Accuracy: {best_acc:.2%}")
    print(f"   Best Val AUC: {best_metrics['auc']:.4f}")
    print(f"   Best Val Precision: {best_metrics['precision']:.4f}")
    print(f"   Best Val Recall: {best_metrics['recall']:.4f}")
    print(f"   Best Val F1: {best_metrics['f1']:.4f}")

    print(f"{'='*60}")

    fold_results.append(best_metrics)

print(f"All Folds Training Complete")
print(f"\n{'='*60}")

# =============================================================================
# STEP 7: FINAL CROSS-VALIDATION RESULTS
# =============================================================================

In [None]:
print(f"\n{'='*60}")
print(f"{'FINAL CROSS-VALIDATION RESULTS':^60}")
print(f"{'='*60}")

# Report layout: (display label, metric key, also show as percentage?)
_report_rows = (
    ('EER', 'eer', True),
    ('Accuracy', 'accuracy', True),
    ('AUC', 'auc', False),
    ('Precision', 'precision', False),
    ('Recall', 'recall', False),
    ('F1-Score', 'f1', False),
    ('Val Loss', 'loss', False),
)

# Per-metric list of fold values (one entry per fold, in fold order)
avg_metrics = {key: [] for _, key, _ in _report_rows}

for fold_number, result in enumerate(fold_results, start=1):
    print(f"\nFold {fold_number}:")
    for label, key, as_percent in _report_rows:
        value = result[key]
        head = f"{label}:"
        percent_part = f" ({value*100:.2f}%)" if as_percent else ""
        # Labels are left-padded to a common width so the values line up.
        print(f"   {head:<10} {value:.4f}{percent_part}")

    for key, collected in avg_metrics.items():
        collected.append(result[key])

print("\n" + "-" * 60)

# Mean and (population) standard deviation of every metric across folds
_fold_stats = {key: (np.mean(values), np.std(values)) for key, values in avg_metrics.items()}

# Individual names are kept because later cells read them directly.
mean_eer, std_eer = _fold_stats['eer']
mean_acc, std_acc = _fold_stats['accuracy']
mean_auc, std_auc = _fold_stats['auc']
mean_prec, std_prec = _fold_stats['precision']
mean_rec, std_rec = _fold_stats['recall']
mean_f1, std_f1 = _fold_stats['f1']
mean_loss, std_loss = _fold_stats['loss']

print(f"\nCROSS-VALIDATION SUMMARY (Mean ± Std):")
for label, key, as_percent in _report_rows:
    mean_value, std_value = _fold_stats[key]
    head = f"{label}:"
    percent_part = f"  ({mean_value*100:.2f}% ± {std_value*100:.2f}%)" if as_percent else ""
    print(f"   {head:<10} {mean_value:.4f} ± {std_value:.4f}{percent_part}")

print(f"{'='*60}")


def _to_builtin(value):
    """Convert NumPy/Python floats to plain float; pass everything else through."""
    if isinstance(value, (np.floating, float)):
        return float(value)
    return value


# Save results to JSON
results_file = os.path.join(CHECKPOINT_DIR, 'cross_validation_results.json')
cv_results = {
    'model': 'DenseNet121_Baseline_Binary_Classification',
    'task': 'Genuine vs Forged Detection',
    'num_classes': 2,
    'pretrained_weights': os.path.basename(PRETRAINED_WEIGHTS_PATH),
    'config': {
        'batch_size': BATCH_SIZE,
        'learning_rate': LEARNING_RATE,
        'epochs': EPOCHS,
        'folds': N_FOLDS,
        'img_size': INPUT_SHAPE,
    },
    'fold_results': [
        {name: _to_builtin(value) for name, value in result.items()}
        for result in fold_results
    ],
    'summary': {
        key: {'mean': float(mean_value), 'std': float(std_value)}
        for key, (mean_value, std_value) in _fold_stats.items()
    },
}

with open(results_file, 'w') as f:
    json.dump(cv_results, f, indent=4)

print(f"\n > [Info] Results saved to: {results_file}")

# =============================================================================
# STEP 8: VISUALIZATION
# =============================================================================

In [None]:
# Plot cross-validation metrics: one bar chart per metric in a 2x2 grid
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
fig.suptitle('Baseline DenseNet121 - Genuine vs Forged Classification Results', fontsize=16, fontweight='bold')

folds = list(range(1, N_FOLDS + 1))
accuracies = avg_metrics['accuracy']
precisions = avg_metrics['precision']
recalls = avg_metrics['recall']
f1_scores = avg_metrics['f1']

# (axis, metric label, per-fold values, mean across folds, bar colour)
_panels = [
    (axes[0, 0], 'Accuracy', accuracies, mean_acc, 'steelblue'),
    (axes[0, 1], 'Precision', precisions, mean_prec, 'green'),
    (axes[1, 0], 'Recall', recalls, mean_rec, 'orange'),
    (axes[1, 1], 'F1-Score', f1_scores, mean_f1, 'purple'),
]

for ax, metric_label, fold_values, mean_value, bar_color in _panels:
    ax.bar(folds, fold_values, color=bar_color, alpha=0.7, edgecolor='black')
    # Dashed red line marks the cross-fold mean.
    ax.axhline(y=mean_value, color='red', linestyle='--', linewidth=2, label=f'Mean: {mean_value:.4f}')
    ax.set_xlabel('Fold', fontsize=12)
    ax.set_ylabel(metric_label, fontsize=12)
    ax.set_title(f'{metric_label} per Fold', fontsize=13, fontweight='bold')
    ax.set_ylim([0, 1])
    ax.set_xticks(folds)
    ax.legend()
    ax.grid(alpha=0.3)

plt.tight_layout()
plot_path = os.path.join(CHECKPOINT_DIR, 'cv_metrics.png')
plt.savefig(plot_path, dpi=150, bbox_inches='tight')
plt.show()

print(f" > [Info] Cross-validation metrics plot saved to: {plot_path}")

# =============================================================================
# VALIDATION SUMMARY
# =============================================================================

In [None]:
# Final recap of configuration, dataset sizes, mean metrics and saved artifacts.
print("\n" + "="*60)
print("CROSS-VALIDATION SUMMARY")
print("="*60)
print(f"\nModel: Baseline DenseNet121 (Binary Classification)")
print(f"Task: Genuine vs Forged Signature Detection")
print(f"\nConfiguration:")
print(f"   - Image Size: {INPUT_SHAPE}")
print(f"   - Batch Size: {BATCH_SIZE}")
print(f"   - Learning Rate: {LEARNING_RATE}")
print(f"   - Epochs: {EPOCHS}")
print(f"   - K-Folds: {N_FOLDS}")
print(f"   - Classes: 2 (0=Genuine, 1=Forged)")
print(f"   - Pretrained Weights: {os.path.basename(PRETRAINED_WEIGHTS_PATH)}")
print(f"\nDataset:")
# train_users / val_users still hold the values of the last fold processed by
# the training loop. The held-out users are named `val_users` there; the
# previous reference to an undefined `test_users` raised NameError.
print(f"   - Training Users: {len(train_users)}")
print(f"   - Test Users: {len(val_users)}")
print(f"\nResults:")
print(f"   - Mean Accuracy: {mean_acc:.4f} ({mean_acc*100:.2f}%)")
print(f"   - Mean F1-Score: {mean_f1:.4f}")
print(f"   - Mean Precision: {mean_prec:.4f}")
print(f"   - Mean Recall: {mean_rec:.4f}")
print(f"\nSaved Artifacts:")
print(f"   - Checkpoints: {CHECKPOINT_DIR}")
print(f"   - Results JSON: {results_file}")
print(f"   - Metrics Plot: {plot_path}")
print(f"\nBest Models per Fold:")
# List only the fold checkpoints that actually exist on disk.
for fold in range(N_FOLDS):
    model_path = os.path.join(CHECKPOINT_DIR, f"best_model_fold_{fold}.pth")
    if os.path.exists(model_path):
        print(f"   - Fold {fold+1}: {os.path.basename(model_path)}")
print("="*60)
print("\nCross-Validation Complete! ✓")
print("="*60)