# Model Validation on Holdout Set

This notebook evaluates a trained ResNet model on a holdout dataset that wasn't used during training.

In [None]:
# Cell 1: Import Libraries
import torch
import torch.nn as nn
import torchvision.models as models
import os
import numpy as np
from PIL import Image
import torchvision.transforms as T
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import matplotlib.pyplot as plt
import re
from tqdm import tqdm

print("Libraries imported successfully!")

In [None]:
# Cell 2: Settings and Configuration
# Locations: the folder holding the saved weights and the folder of holdout images.
OUTPUT_FOLDER = "output_balanced"  # Where your model is saved
HOLDOUT_FOLDER = "path/to/your/holdout/images"  # CHANGE THIS to your holdout folder

# Inference parameters: side length of the square model input and images per batch.
IMAGE_SIZE = 224
BATCH_SIZE = 16

# Use the GPU when PyTorch can see one, otherwise run on the CPU.
_device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
DEVICE = torch.device(_device_name)

# Echo the active configuration so it is recorded in the cell output.
for _config_line in (
    f"Device: {DEVICE}",
    f"Model folder: {OUTPUT_FOLDER}",
    f"Holdout folder: {HOLDOUT_FOLDER}",
    f"Image size: {IMAGE_SIZE}x{IMAGE_SIZE}",
    f"Batch size: {BATCH_SIZE}",
):
    print(_config_line)

In [None]:
# Cell 3: Utility Functions
def get_numeric_label(filename):
    """Parse the regression target encoded in an image filename.

    The label is taken from the leftmost occurrence of one or more digits
    immediately followed by ``p`` or ``P`` (e.g. ``"sample_25p.png"`` -> ``25.0``).

    Args:
        filename: Name of the image file to parse.

    Returns:
        The matched digits as a float, or ``0.0`` when the filename contains
        no ``<digits>p`` pattern. Callers cannot distinguish a genuine ``0p``
        label from a filename that carries no label at all.
    """
    found = re.search(r'(\d+)[pP]', filename)
    return float(found.group(1)) if found else 0.0

def make_square(image, size):
    """Fit an image inside a ``size`` x ``size`` white RGB canvas.

    The image is converted to RGB if necessary, scaled so that its longer
    side equals ``size`` (aspect ratio preserved), and pasted centred on a
    white square.

    Args:
        image: PIL image to letterbox.
        size: Side length, in pixels, of the square output.

    Returns:
        A new ``size`` x ``size`` RGB PIL image.
    """
    if image.mode != 'RGB':
        image = image.convert('RGB')

    w, h = image.size
    scale = size / max(w, h)
    # Clamp to at least 1 px: for extreme aspect ratios int() truncates the
    # short side to 0, and resize() rejects zero-sized targets.
    new_w = max(1, int(w * scale))
    new_h = max(1, int(h * scale))

    resized = image.resize((new_w, new_h), Image.BILINEAR)
    square = Image.new('RGB', (size, size), (255, 255, 255))

    # Centre the resized image; the remaining border stays white.
    x = (size - new_w) // 2
    y = (size - new_h) // 2
    square.paste(resized, (x, y))

    return square

def round_to_interval(values, interval=5):
    """Snap each value to the closest multiple of ``interval``.

    Args:
        values: Number or array-like of numbers.
        interval: Spacing between allowed output values (default 5).

    Returns:
        ``np.round(values / interval) * interval`` computed element-wise.
    """
    steps = np.array(values) / interval
    return np.round(steps) * interval

print("Utility functions defined successfully!")

In [None]:
# Cell 4: Image Processing Function
def process_single_image(image_path, filename):
    """Load one image and prepare it for the model.

    Args:
        image_path: Path of the image file to open.
        filename: File name used to derive the label (and returned unchanged).

    Returns:
        A ``(tensor, label, filename)`` tuple: the normalized image tensor,
        the numeric label parsed from ``filename``, and ``filename`` itself.

    Raises:
        Whatever ``Image.open`` or the resize raise for unreadable/invalid files.
    """
    # Open via a context manager so the file handle is released immediately;
    # make_square returns a freshly built image, so nothing below needs the
    # file to stay open.
    with Image.open(image_path) as image:
        square = make_square(image, IMAGE_SIZE)

    # Convert to a tensor and apply ImageNet mean/std normalization.
    normalize = T.Compose([
        T.ToTensor(),
        T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])
    tensor = normalize(square)

    # Get label from filename
    label = get_numeric_label(filename)

    return tensor, label, filename

print("Image processing function defined!")

In [None]:
# Cell 5: Load Trained Model
print("Loading trained model...")

# Create model architecture (must match training setup).
# The checkpoint loaded below replaces every parameter and buffer (strict
# load_state_dict), so ImageNet weights are not needed: build the network
# without downloading them. `pretrained=` is also deprecated in recent
# torchvision releases in favour of `weights=`.
try:
    model = models.resnet18(weights=None)
except TypeError:
    # Older torchvision releases only understand the legacy `pretrained` flag.
    model = models.resnet18(pretrained=False)

# Regression head: dropout followed by a single-output linear layer.
model.fc = nn.Sequential(
    nn.Dropout(0.5),
    nn.Linear(model.fc.in_features, 1)
)

# Load the saved weights
model_path = f"{OUTPUT_FOLDER}/trained_model.pth"
if not os.path.exists(model_path):
    # Stop here: continuing would leave an untrained model in scope and the
    # following cells would report meaningless metrics.
    raise FileNotFoundError(
        f"Model file not found at {model_path}. "
        "Please check the OUTPUT_FOLDER path and ensure the model was trained and saved."
    )

# NOTE(security): torch.load unpickles the file, which can execute arbitrary
# code. Only load checkpoints you trust; consider passing weights_only=True
# if your PyTorch version supports it.
state_dict = torch.load(model_path, map_location=DEVICE)
model.load_state_dict(state_dict)
model = model.to(DEVICE)
model.eval()  # disable dropout for inference
print(f"✓ Model loaded successfully from {model_path}")
print(f"✓ Model moved to {DEVICE}")
print(f"✓ Model set to evaluation mode")

In [None]:
# Cell 6: Scan Holdout Dataset
print(f"Scanning holdout folder: {HOLDOUT_FOLDER}")

# File extensions (lower-case) treated as images.
IMAGE_EXTENSIONS = ('.png', '.jpg', '.jpeg', '.tif', '.tiff', '.bmp')

# Reset first so a list left over from a previous run can never be evaluated
# by mistake if this cell fails.
image_files = []

if not os.path.isdir(HOLDOUT_FOLDER):
    # Stop here instead of printing and continuing: later cells cannot do
    # anything meaningful without the holdout images.
    raise FileNotFoundError(
        f"Holdout folder does not exist or is not a directory: {HOLDOUT_FOLDER}. "
        "Please update the HOLDOUT_FOLDER path in the settings cell."
    )

# Get all image files from holdout folder. os.listdir returns entries in
# arbitrary order, so sort to make batches and reports reproducible.
image_files = sorted(
    f for f in os.listdir(HOLDOUT_FOLDER)
    if f.lower().endswith(IMAGE_EXTENSIONS)
)

print(f"✓ Found {len(image_files)} images in holdout set")

if not image_files:
    print("⚠️ No images found! Please check the HOLDOUT_FOLDER path.")
else:
    # Show first few filenames as examples, with the label parsed from each.
    print("Sample filenames:")
    for i, filename in enumerate(image_files[:5], start=1):
        label = get_numeric_label(filename)
        print(f"  {i}. {filename} (label: {label})")

    if len(image_files) > 5:
        print(f"  ... and {len(image_files) - 5} more files")

    print(f"✓ Ready to evaluate on {len(image_files)} images")

In [None]:
# Cell 7: Make Predictions on Holdout Set
print("Processing images and making predictions...")

# Initialize lists to store results
all_predictions = []
all_targets = []
all_filenames = []
failed_files = []  # images that could not be loaded/preprocessed

# The model head contains Dropout, so make sure inference runs in eval mode
# even if this cell is executed on its own. Calling eval() twice is harmless.
model.eval()

# Process images in batches for memory efficiency
for i in tqdm(range(0, len(image_files), BATCH_SIZE), desc="Processing batches"):
    batch_files = image_files[i:i+BATCH_SIZE]

    batch_tensors = []
    batch_labels = []
    batch_names = []

    for filename in batch_files:
        image_path = os.path.join(HOLDOUT_FOLDER, filename)
        try:
            tensor, label, fname = process_single_image(image_path, filename)
        except Exception as e:
            # Best effort: report the unreadable image and keep going.
            print(f"Error processing {filename}: {e}")
            failed_files.append(filename)
            continue
        batch_tensors.append(tensor)
        batch_labels.append(label)
        batch_names.append(fname)

    # Skip empty batches
    if not batch_tensors:
        continue

    # Stack tensors into batch
    batch_data = torch.stack(batch_tensors).to(DEVICE)

    # Make predictions for this batch
    with torch.no_grad():
        batch_predictions = model(batch_data).view(-1)

    # Store results. Labels never need to visit the device, so they are
    # collected directly from the Python list.
    all_predictions.extend(batch_predictions.cpu().numpy())
    all_targets.extend(batch_labels)
    all_filenames.extend(batch_names)

if not all_predictions:
    # Fail with a clear message instead of letting the metric cells crash
    # on empty arrays.
    raise RuntimeError(
        "No images were processed. Check HOLDOUT_FOLDER and the errors reported above."
    )

# Convert to numpy arrays (targets kept as float32, matching the predictions)
predictions = np.array(all_predictions)
targets = np.array(all_targets, dtype=np.float32)
predictions_rounded = round_to_interval(predictions, 5)

print(f"✓ Successfully processed {len(predictions)} images")
print(f"✓ Generated {len(predictions)} predictions")
if failed_files:
    print(f"⚠️ Skipped {len(failed_files)} images that could not be processed")

In [None]:
# Cell 8: Calculate Comprehensive Metrics
def _regression_scores(y_true, y_pred):
    """Return ``(MSE, RMSE, MAE, R²)`` of ``y_pred`` measured against ``y_true``."""
    mse_value = mean_squared_error(y_true, y_pred)
    return (
        mse_value,
        np.sqrt(mse_value),
        mean_absolute_error(y_true, y_pred),
        r2_score(y_true, y_pred),
    )

print(f"{'='*60}")
print(f"HOLDOUT SET EVALUATION RESULTS")
print(f"{'='*60}")
print(f"Dataset size: {len(predictions)} images")

# Scores for the raw model outputs ...
mse, rmse, mae, r2 = _regression_scores(targets, predictions)

# ... and for the outputs snapped to the nearest multiple of 5.
mse_rounded, rmse_rounded, mae_rounded, r2_rounded = _regression_scores(
    targets, predictions_rounded
)

print(f"\nRAW PREDICTIONS:")
print(f"  MSE:  {mse:.3f}")
print(f"  RMSE: {rmse:.3f}")
print(f"  MAE:  {mae:.3f}")
print(f"  R²:   {r2:.3f}")

print(f"\nROUNDED PREDICTIONS (to nearest 5):")
print(f"  MSE:  {mse_rounded:.3f}")
print(f"  RMSE: {rmse_rounded:.3f}")
print(f"  MAE:  {mae_rounded:.3f}")
print(f"  R²:   {r2_rounded:.3f}")

print(f"{'='*60}")

In [None]:
# Cell 9: Detailed Error Analysis
# Absolute error of the rounded predictions against the true labels.
abs_errors = np.abs(predictions_rounded - targets)

# Percentage error relative to the true value. A target of 0 has no meaningful
# percentage, so the absolute error is reported for those samples instead.
# Only the non-zero entries are divided: np.where would evaluate the division
# for every element and emit a divide-by-zero RuntimeWarning for zero targets.
nonzero_target = targets != 0
pct_errors = abs_errors.copy()
pct_errors[nonzero_target] = abs_errors[nonzero_target] / targets[nonzero_target] * 100

perfect_predictions = np.sum(abs_errors == 0)
within_5 = np.sum(abs_errors <= 5)
within_10 = np.sum(abs_errors <= 10)

print(f"ERROR ANALYSIS:")
print(f"  Perfect predictions: {perfect_predictions}/{len(targets)} ({perfect_predictions/len(targets)*100:.1f}%)")
print(f"  Within ±5:          {within_5}/{len(targets)} ({within_5/len(targets)*100:.1f}%)")
print(f"  Within ±10:         {within_10}/{len(targets)} ({within_10/len(targets)*100:.1f}%)")
print(f"  Mean error:         {np.mean(pct_errors):.1f}%")
print(f"  Median error:       {np.median(pct_errors):.1f}%")
print(f"  Max error:          {np.max(abs_errors):.1f} ({np.max(pct_errors):.1f}%)")

# Per-class analysis: one row per distinct true label, in ascending order.
# Every value comes from `targets` itself, so each mask selects >= 1 sample.
unique_targets = np.unique(targets)
print(f"\nPER-CLASS ANALYSIS:")
for target_val in unique_targets:
    mask = targets == target_val
    class_count = np.count_nonzero(mask)
    class_mae = mean_absolute_error(targets[mask], predictions_rounded[mask])
    class_perfect = np.sum(abs_errors[mask] == 0)
    print(f"  {target_val:3.0f}p: {class_count:2d} samples, MAE: {class_mae:5.2f}, Perfect: {class_perfect:2d}/{class_count} ({class_perfect/class_count*100:4.1f}%)")

print(f"{'='*60}")

In [None]:
# Cell 10: Show Worst Predictions
def _format_prediction_row(idx):
    """Render one result line: filename, true value, rounded prediction, absolute error."""
    filename = all_filenames[idx]
    return f"  {filename:35s} | Actual: {targets[idx]:5.1f} | Predicted: {predictions_rounded[idx]:5.1f} | Error: {abs_errors[idx]:5.1f}"

# Largest absolute errors first; at most ten rows are listed.
print(f"WORST PREDICTIONS (Top 10):")
error_indices = np.argsort(abs_errors)[::-1]
for idx in error_indices[:10]:
    print(_format_prediction_row(idx))

# Show some good predictions too: up to ten samples with zero error.
print(f"\nBEST PREDICTIONS (Perfect matches):")
perfect_indices = np.where(abs_errors == 0)[0]
if len(perfect_indices) == 0:
    print("  No perfect predictions found.")
else:
    for idx in perfect_indices[:10]:
        print(_format_prediction_row(idx))

In [None]:
# Cell 11: Comprehensive Visualization
# 2x2 figure: scatter of predictions, two error histograms, per-class MAE bars.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
ax_scatter, ax_pct = axes[0]
ax_abs, ax_class = axes[1]

# 1. Predictions vs Actual, with the identity line as the ideal reference
min_val, max_val = targets.min(), targets.max()
ax_scatter.scatter(targets, predictions_rounded, alpha=0.6, s=50)
ax_scatter.plot([min_val, max_val], [min_val, max_val], 'r--', linewidth=2, label='Perfect Prediction')
ax_scatter.set_xlabel('Actual Values')
ax_scatter.set_ylabel('Predicted Values')
ax_scatter.set_title(f'Predictions vs Actual (R² = {r2_rounded:.3f})')
ax_scatter.legend()
ax_scatter.grid(True, alpha=0.3)

# 2. Error distribution (percentage errors)
ax_pct.hist(pct_errors, bins=20, alpha=0.7, color='skyblue', edgecolor='black')
ax_pct.set_xlabel('Percentage Error (%)')
ax_pct.set_ylabel('Frequency')
ax_pct.set_title(f'Error Distribution (Mean: {np.mean(pct_errors):.1f}%)')
ax_pct.grid(True, alpha=0.3)

# 3. Absolute errors
ax_abs.hist(abs_errors, bins=15, alpha=0.7, color='lightcoral', edgecolor='black')
ax_abs.set_xlabel('Absolute Error')
ax_abs.set_ylabel('Frequency')
ax_abs.set_title(f'Absolute Error Distribution (MAE: {mae_rounded:.2f})')
ax_abs.grid(True, alpha=0.3)

# 4. Per-class MAE: one bar per true label that has at least one sample
class_masks = [(val, targets == val) for val in unique_targets]
class_masks = [(val, sel) for val, sel in class_masks if np.sum(sel) > 0]
class_maes = [
    mean_absolute_error(targets[sel], predictions_rounded[sel])
    for _, sel in class_masks
]
class_labels = [f'{val:.0f}p' for val, _ in class_masks]

bars = ax_class.bar(class_labels, class_maes, alpha=0.7, color='lightgreen')
ax_class.set_xlabel('Class')
ax_class.set_ylabel('Mean Absolute Error')
ax_class.set_title('MAE by Class')
ax_class.tick_params(axis='x', rotation=45)
ax_class.grid(True, alpha=0.3)

# Add values on bars, slightly above each bar top
for bar, val in zip(bars, class_maes):
    label_x = bar.get_x() + bar.get_width()/2
    label_y = bar.get_height() + 0.1
    ax_class.text(label_x, label_y, f'{val:.1f}', ha='center', va='bottom')

plt.tight_layout()
plt.show()

In [None]:
# Cell 12: Final Summary
# Share of samples in each accuracy bucket, computed once and reused below.
perfect_pct = perfect_predictions/len(targets)*100
within_5_pct = within_5/len(targets)*100
within_10_pct = within_10/len(targets)*100

print(f"\n{'='*60}")
print(f"VALIDATION COMPLETE - FINAL SUMMARY")
print(f"{'='*60}")
print(f"✓ Model performance on {len(predictions)} holdout images:")
print(f"  • R² Score: {r2_rounded:.3f}")
print(f"  • RMSE: {rmse_rounded:.2f}")
print(f"  • MAE: {mae_rounded:.2f}")
print(f"  • Perfect predictions: {perfect_pct:.1f}%")
print(f"  • Within ±5 error: {within_5_pct:.1f}%")
print(f"  • Within ±10 error: {within_10_pct:.1f}%")

# Model performance assessment: the first threshold that the rounded R²
# strictly exceeds decides the rating; anything else "Needs Improvement".
rating_thresholds = (
    (0.9, "Excellent"),
    (0.8, "Very Good"),
    (0.7, "Good"),
    (0.5, "Fair"),
)
performance = next(
    (rating for cutoff, rating in rating_thresholds if r2_rounded > cutoff),
    "Needs Improvement",
)

print(f"\n📊 Overall Performance: {performance}")
print(f"{'='*60}")

# Collect the headline numbers in a plain dict (kept in memory only).
results_summary = {
    'dataset_size': len(predictions),
    'r2_score': float(r2_rounded),
    'rmse': float(rmse_rounded),
    'mae': float(mae_rounded),
    'perfect_predictions_pct': float(perfect_pct),
    'within_5_pct': float(within_5_pct),
    'within_10_pct': float(within_10_pct),
    'performance_rating': performance
}

print(f"Results summary saved to memory. Consider saving detailed results to a file if needed.")