In [None]:
# Cell 1: Import libraries
import os
import numpy as np
from PIL import Image
import torch
import torchvision.transforms as T
import random
import re
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_absolute_error, r2_score
from tqdm import tqdm

In [None]:
# Cell 2: Settings
# Everything a reader is expected to tune lives in this cell.
INPUT_FOLDER = "path/to/your/images"  # CHANGE THIS
OUTPUT_FOLDER = "output_balanced"     # CHANGE THIS
IMAGE_SIZE = 224
TARGET_COUNT = 30  # Target images per class
SEED = 42

# Seed every random number generator the notebook relies on.
# torch has its own generator: without seeding it, the new regression head's
# initial weights, dropout masks and DataLoader shuffling change on every run
# even though SEED is fixed.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

In [None]:
# Cell: Clear output folders
import shutil

# Working directories that must be empty at the start of every run
folders_to_clear = [
    f"{OUTPUT_FOLDER}/augmented",
    f"{OUTPUT_FOLDER}/resized",
    f"{OUTPUT_FOLDER}/final",
]

print("Clearing output folders...")
for folder in folders_to_clear:
    existed_before = os.path.exists(folder)
    if existed_before:
        # Drop the directory together with everything inside it
        shutil.rmtree(folder)
    # In both cases the folder ends up present and empty
    os.makedirs(folder)
    if existed_before:
        print(f"  ✓ Cleared: {folder}")
    else:
        print(f"  ✓ Created: {folder}")

print("\nAll output folders cleared and ready!")

In [None]:
# Cell 3: Create folders
# exist_ok=True makes this a no-op when the directories are already there.
for stage_name in ("augmented", "resized", "final"):
    os.makedirs(f"{OUTPUT_FOLDER}/{stage_name}", exist_ok=True)

In [None]:
# Cell 4: Basic functions
def get_class_label(filename):
    """Return the class tag embedded in a filename (e.g. '0p', '5p', '100p').

    The tag is the first run of digits that is immediately followed by 'p' or
    'P'. It is always returned with a lowercase 'p'. Filenames that contain no
    such tag map to "unknown".
    """
    found = re.search(r'(\d+)[pP]', filename)
    return f"{found.group(1)}p" if found else "unknown"

def get_numeric_label(filename):
    """Return the regression target encoded in a filename as a float.

    The value is the first run of digits that is immediately followed by 'p'
    or 'P' ('sample_25p.jpg' -> 25.0).

    Filenames without such a tag yield 0.0. That is the same value a genuine
    '0p' file produces, so callers cannot tell the two cases apart.
    """
    found = re.search(r'(\d+)[pP]', filename)
    if found is None:
        return 0.0
    return float(found.group(1))

def make_square(image, size):
    """Fit an image inside a size x size white canvas, preserving aspect ratio.

    The image is converted to RGB if necessary, scaled so that its longer side
    equals `size`, and pasted in the centre of a white square.

    Args:
        image: PIL image to process. It is not modified.
        size: Side length of the square output, in pixels.

    Returns:
        A new RGB PIL image of exactly (size, size) pixels.
    """
    # Convert to RGB if not already
    if image.mode != 'RGB':
        image = image.convert('RGB')

    # Scale factor that maps the longer side onto `size`
    w, h = image.size
    scale = size / max(w, h)

    # int() truncates, so a very elongated input (e.g. 448x1 at size 224)
    # would get a 0-pixel short side, which is not a valid resize target.
    # Clamp both sides to at least one pixel.
    new_w = max(1, int(w * scale))
    new_h = max(1, int(h * scale))

    resized = image.resize((new_w, new_h), Image.BILINEAR)

    # White background; the resized image is centred on it
    square = Image.new('RGB', (size, size), (255, 255, 255))
    x = (size - new_w) // 2
    y = (size - new_h) // 2
    square.paste(resized, (x, y))

    return square

In [None]:
# Cell 5: Define augmentations
def aug1(img):
    """Rotate the image by 90 degrees, letting the canvas grow to fit."""
    return img.rotate(90, expand=True)


def aug2(img):
    """Rotate the image by 180 degrees, letting the canvas grow to fit."""
    return img.rotate(180, expand=True)


def aug3(img):
    """Rotate the image by 270 degrees, letting the canvas grow to fit."""
    return img.rotate(270, expand=True)


def aug4(img):
    """Mirror the image left-to-right."""
    return img.transpose(Image.FLIP_LEFT_RIGHT)


def aug5(img):
    """Mirror the image top-to-bottom."""
    return img.transpose(Image.FLIP_TOP_BOTTOM)


# Pool the augmentation cell samples from
augmentation_list = [aug1, aug2, aug3, aug4, aug5]

In [None]:
# Cell 6: Group images by class
# os.listdir returns entries in arbitrary, filesystem-dependent order. The
# augmentation step later draws from these lists with the seeded `random`
# module, so the listing is sorted to make those draws repeatable.
image_files = sorted(
    f for f in os.listdir(INPUT_FOLDER)
    if f.lower().endswith(('.png', '.jpg', '.jpeg', '.tiff', '.bmp'))
)

# Group by class: maps a class tag (e.g. "10p", or "unknown") to its filenames
classes = {}
for filename in image_files:
    classes.setdefault(get_class_label(filename), []).append(filename)

# Show counts
print("Images per class:")
for label, images in sorted(classes.items()):
    print(f"  {label}: {len(images)} images")

In [None]:
# Cell 7: Copy originals and create augmentations
print("\nProcessing images...")

# Upper bound on re-draws per augmented image. It keeps the loop finite for
# images that look the same under every flip/rotation (e.g. a blank frame).
MAX_AUG_ATTEMPTS = 10


def _load_rgb(path):
    """Open the image at `path` and return it as an RGB image (no binary conversion)."""
    with Image.open(path) as opened:
        # convert() returns a new, fully loaded image, so the result stays
        # usable after the file has been closed.
        return opened.convert('RGB')


def _same_pixels(first, second):
    """Return True when two images have the same size and identical pixel data."""
    return first.size == second.size and first.tobytes() == second.tobytes()


for class_label, filenames in sorted(classes.items()):
    print(f"\nClass {class_label}:")

    # Copy all originals first
    for filename in filenames:
        img = _load_rgb(os.path.join(INPUT_FOLDER, filename))
        base = os.path.splitext(filename)[0]
        img.save(f"{OUTPUT_FOLDER}/augmented/{base}_original.jpg")

    # Top the class up to TARGET_COUNT with augmented copies if it is short
    current_count = len(filenames)
    if current_count < TARGET_COUNT:
        needed = TARGET_COUNT - current_count
        print(f"  Creating {needed} augmentations...")

        for i in range(needed):
            # Pick random original
            source_file = random.choice(filenames)
            img = _load_rgb(os.path.join(INPUT_FOLDER, source_file))

            # Apply 3-4 random augmentations. Flips and 90-degree rotations
            # can cancel each other out (a 180-degree rotation followed by
            # both flips returns the input unchanged), which would store an
            # exact copy of the original as an "augmentation". Re-draw until
            # the result actually differs.
            for _ in range(MAX_AUG_ATTEMPTS):
                num_augs = random.randint(3, 4)
                selected_augs = random.sample(augmentation_list, num_augs)

                augmented = img
                for aug_func in selected_augs:
                    augmented = aug_func(augmented)

                if not _same_pixels(augmented, img):
                    break

            # Save; `i` is unique within the class, so names cannot collide
            base = os.path.splitext(source_file)[0]
            augmented.save(f"{OUTPUT_FOLDER}/augmented/{base}_aug{i}.jpg")

In [None]:
# Cell 8: Resize all images
print("\nResizing all images...")

# Every image file produced by the augmentation step
augmented_files = []
for entry in os.listdir(f"{OUTPUT_FOLDER}/augmented"):
    if entry.lower().endswith(('.png', '.jpg', '.jpeg')):
        augmented_files.append(entry)

for i, filename in enumerate(augmented_files):
    # Progress line every 50 images
    if i % 50 == 0:
        print(f"  {i}/{len(augmented_files)}")

    base = os.path.splitext(filename)[0]
    img = Image.open(f"{OUTPUT_FOLDER}/augmented/{filename}")

    # Pad to an IMAGE_SIZE x IMAGE_SIZE square and store it as JPG (RGB)
    resized = make_square(img, IMAGE_SIZE)
    resized.save(f"{OUTPUT_FOLDER}/resized/{base}.jpg")

In [None]:
# Cell 9: Normalize for ResNet (ImageNet normalization)
print("\nNormalizing images for ResNet...")

resized_files = []
for entry in os.listdir(f"{OUTPUT_FOLDER}/resized"):
    if entry.lower().endswith(('.jpg', '.jpeg', '.png')):
        resized_files.append(entry)

# ImageNet per-channel mean / std used for ResNet inputs
imagenet_mean = [0.485, 0.456, 0.406]
imagenet_std = [0.229, 0.224, 0.225]
normalize = T.Compose([
    T.ToTensor(),
    T.Normalize(mean=imagenet_mean, std=imagenet_std),
])

for i, filename in enumerate(resized_files):
    # Progress line every 50 images
    if i % 50 == 0:
        print(f"  {i}/{len(resized_files)}")

    base = os.path.splitext(filename)[0]

    # Load the RGB image and turn it into a normalized tensor
    img = Image.open(f"{OUTPUT_FOLDER}/resized/{filename}")
    tensor = normalize(img)

    # The .npy file is what the dataset loads; the .jpg is kept for viewing
    np.save(f"{OUTPUT_FOLDER}/final/{base}.npy", tensor.numpy())
    img.save(f"{OUTPUT_FOLDER}/final/{base}.jpg")

In [None]:
# Cell 10: Check final counts
print("\nFinal image count per class:")

# Class tag -> number of viewable images in the final folder
final_counts = {}
for filename in os.listdir(f"{OUTPUT_FOLDER}/final"):
    # The .npy tensors share their names with the images; count images only
    if not filename.lower().endswith(('.jpg', '.jpeg', '.png')):
        continue
    label = get_class_label(filename)
    if label in final_counts:
        final_counts[label] += 1
    else:
        final_counts[label] = 1

for label in sorted(final_counts):
    print(f"  {label}: {final_counts[label]} images")

In [None]:
# Cell 11: Simple Model Setup with Dropout
import torch.nn as nn
import torchvision.models as models

# ResNet-18 backbone with pretrained weights.
# NOTE(review): newer torchvision releases deprecate `pretrained=` in favour of
# `weights=` - confirm against the installed version before changing it.
model = models.resnet18(pretrained=True)

# Swap the classifier for a single-output regression head, with dropout
# applied to the pooled features first.
num_features = model.fc.in_features
regression_head = nn.Sequential(
    nn.Dropout(0.5),
    nn.Linear(num_features, 1),
)
model.fc = regression_head

# Use the GPU when one is available
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
model = model.to(device)

# Mean-squared-error loss; Adam with weight decay as extra regularisation
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=0.01)

print(f"Model ready on {device} with dropout and weight decay")

In [None]:
# Cell 12: Dataset with Original/Augmented Split
from torch.utils.data import Dataset, DataLoader

class ImageDataset(Dataset):
    """Dataset over pre-normalized image tensors stored as .npy files.

    Each item is a `(tensor, label)` pair. The tensor is the float32 content
    of one .npy file, and the label is the float32 value that
    `get_numeric_label` extracts from that file's name.
    """

    def __init__(self, folder, file_list=None):
        """Remember `folder` and which of its files belong to this dataset.

        Args:
            folder: Directory that holds the .npy files.
            file_list: File names (relative to `folder`) to use. When omitted,
                every entry of `folder` ending in '.npy' is used.
        """
        self.folder = folder
        if file_list is not None:
            self.files = file_list
        else:
            self.files = [name for name in os.listdir(folder) if name.endswith('.npy')]

    def __len__(self):
        """Number of files in the dataset."""
        return len(self.files)

    def __getitem__(self, idx):
        """Load item `idx` and return it as `(tensor, label)`."""
        name = self.files[idx]
        array = np.load(f"{self.folder}/{name}")
        tensor = torch.from_numpy(array).float()
        label = torch.tensor(get_numeric_label(name), dtype=torch.float32)
        return tensor, label

# Get all .npy files.
# Sorted because os.listdir returns entries in arbitrary order, and the seeded
# split below is only repeatable when its input order is stable.
all_files = sorted(f for f in os.listdir(f"{OUTPUT_FOLDER}/final") if f.endswith('.npy'))

# Separate original and augmented files.
# The augmentation step names its outputs "<base>_original" and "<base>_aug<N>",
# so the role of a file is decided by that suffix only. A plain substring test
# would misclassify any source image whose own name contains "_aug" or
# "_original".
ORIGINAL_SUFFIX = '_original.npy'
AUG_SUFFIX_PATTERN = re.compile(r'_aug\d+\.npy$')

original_files = [f for f in all_files if f.endswith(ORIGINAL_SUFFIX)]
augmented_files = [f for f in all_files if AUG_SUFFIX_PATTERN.search(f)]

print(f"Found {len(original_files)} original files, {len(augmented_files)} augmented files")

# Split original files for validation (20% of originals)
orig_train_files, orig_val_files = train_test_split(original_files, test_size=0.2, random_state=SEED)

print(f"Validation will use {len(orig_val_files)} original images")
print(f"Training will start with {len(orig_train_files)} original images")

# Remove augmentations of validation originals from training.
# Base name = file name with the trailing "_original.npy" cut off. Only the
# suffix is removed, so the result matches the base name computed for the
# augmented files below.
val_base_names = {val_file[:-len(ORIGINAL_SUFFIX)] for val_file in orig_val_files}

print(f"Will remove augmentations for {len(val_base_names)} base images from training")

# Filter out augmentations that correspond to validation originals
filtered_aug_files = []
removed_count = 0
for aug_file in augmented_files:
    # Base name = file name with the trailing "_aug<N>.npy" cut off
    base_name = AUG_SUFFIX_PATTERN.sub('', aug_file)

    if base_name not in val_base_names:
        filtered_aug_files.append(aug_file)
    else:
        removed_count += 1

print(f"Removed {removed_count} augmented files to prevent data leakage")
print(f"Training will use {len(orig_train_files)} originals + {len(filtered_aug_files)} augmentations = {len(orig_train_files) + len(filtered_aug_files)} total")

# Create final train and validation file lists
train_files = orig_train_files + filtered_aug_files
val_files = orig_val_files

# Create datasets
train_dataset = ImageDataset(f"{OUTPUT_FOLDER}/final", train_files)
val_dataset = ImageDataset(f"{OUTPUT_FOLDER}/final", val_files)

# Create data loaders (only the training data is shuffled)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)

print(f"\nFinal dataset split:")
print(f"  Training: {len(train_dataset)} samples (originals + clean augmentations)")
print(f"  Validation: {len(val_dataset)} samples (originals only)")
print(f"  No data leakage: validation originals and their augmentations are separate")

In [None]:
# Cell 13: Simple Training Loop with Early Stopping
print("Starting training...")

# Early stopping parameters
best_val_loss = float('inf')
patience = 5            # epochs without improvement tolerated before stopping
patience_counter = 0
best_model_state = None

# Training loop
num_epochs = 30
train_losses = []
val_losses = []

for epoch in range(num_epochs):
    # Training phase
    model.train()
    train_loss = 0.0

    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)

        optimizer.zero_grad()
        output = model(data).view(-1)
        target = target.view(-1)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()

        # criterion returns the mean over the batch. Weight it by the batch
        # size so a short final batch does not count as much as a full one.
        train_loss += loss.item() * data.size(0)

    # Validation phase
    model.eval()
    val_loss = 0.0
    val_preds = []
    val_targets = []

    with torch.no_grad():
        for data, target in val_loader:
            data, target = data.to(device), target.to(device)
            output = model(data).view(-1)
            target = target.view(-1)
            loss = criterion(output, target)
            val_loss += loss.item() * data.size(0)

            val_preds.extend(output.cpu().numpy())
            val_targets.extend(target.cpu().numpy())

    # Per-sample average losses (mean squared error over the whole split)
    avg_train_loss = train_loss / len(train_loader.dataset)
    avg_val_loss = val_loss / len(val_loader.dataset)

    train_losses.append(avg_train_loss)
    val_losses.append(avg_val_loss)

    # Calculate metrics
    val_mae = mean_absolute_error(val_targets, val_preds)
    val_r2 = r2_score(val_targets, val_preds)

    print(f'Epoch {epoch+1:2d}: Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}, Val MAE: {val_mae:.2f}, Val R²: {val_r2:.3f}')

    # Early stopping check
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        patience_counter = 0
        # state_dict() hands out references to the live parameter tensors, and
        # dict.copy() is shallow, so a plain copy would keep changing as
        # training continues. Clone every tensor to freeze a real snapshot.
        best_model_state = {
            name: tensor.detach().clone()
            for name, tensor in model.state_dict().items()
        }
        print(f'  → New best validation loss: {best_val_loss:.4f}')
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print(f'  → Early stopping after {epoch+1} epochs')
            break

# Load best model
if best_model_state is not None:
    model.load_state_dict(best_model_state)
    print(f'\nLoaded best model with validation loss: {best_val_loss:.4f}')

# Save the trained model (the best snapshot, restored just above)
torch.save(model.state_dict(), f'{OUTPUT_FOLDER}/trained_model.pth')
print(f'Model saved to {OUTPUT_FOLDER}/trained_model.pth')

print('\nTraining completed!')

In [None]:
# Cell 14: Simple Model Evaluation
print("Evaluating model on validation set...")

# Inference only: eval mode for dropout/batch-norm, no gradient tracking
model.eval()
all_preds, all_targets = [], []

with torch.no_grad():
    for data, target in val_loader:
        data = data.to(device)
        target = target.to(device).view(-1)
        output = model(data).view(-1)

        # Accumulate per-sample values on the CPU
        all_preds.extend(output.cpu().numpy())
        all_targets.extend(target.cpu().numpy())

# Convert the accumulated values to numpy arrays
all_preds, all_targets = np.array(all_preds), np.array(all_targets)

# Round predictions to nearest 5
def round_to_interval(values, interval=5):
    """Snap each value to the nearest multiple of `interval`.

    Rounding is done by `np.round`, so exact halves go to the even multiple
    (12.5 -> 10 for interval 5). Returns a numpy array.
    """
    steps = np.array(values) / interval
    return np.round(steps) * interval

all_preds_rounded = round_to_interval(all_preds, 5)

# Metrics on the raw predictions, plus MAE after snapping to multiples of 5
mae = mean_absolute_error(all_targets, all_preds)
r2 = r2_score(all_targets, all_preds)
mae_rounded = mean_absolute_error(all_targets, all_preds_rounded)

print("="*50)
print("VALIDATION SET RESULTS")
print("="*50)
print(f"MAE (Raw):      {mae:.3f}")
print(f"R²:             {r2:.3f}")
print(f"MAE (Rounded):  {mae_rounded:.3f}")
print("="*50)

# Show the first few predictions next to their targets
print("\nSample Predictions:")
for i in range(min(10, len(all_preds))):
    print(f"Raw: {all_preds[i]:.1f} → Rounded: {all_preds_rounded[i]:.0f}, Actual: {all_targets[i]:.0f}")

# One figure, two panels: loss curves (left), predictions vs actual (right)
fig, (ax_loss, ax_pred) = plt.subplots(1, 2, figsize=(12, 4))

ax_loss.plot(train_losses, label='Training Loss')
ax_loss.plot(val_losses, label='Validation Loss')
ax_loss.set_title('Training and Validation Loss')
ax_loss.set_xlabel('Epoch')
ax_loss.set_ylabel('Loss')
ax_loss.legend()
ax_loss.grid(True)

ax_pred.scatter(all_targets, all_preds_rounded, alpha=0.6)
# Diagonal on which a prediction equals its target
min_val, max_val = min(all_targets), max(all_targets)
ax_pred.plot([min_val, max_val], [min_val, max_val], 'r--', label='Perfect')
ax_pred.set_xlabel('Actual')
ax_pred.set_ylabel('Predicted')
ax_pred.set_title('Predictions vs Actual')
ax_pred.legend()
ax_pred.grid(True)

plt.tight_layout()
plt.show()

In [None]:
# Cell 15: Comprehensive Error Analysis
import matplotlib.pyplot as plt
import numpy as np

# Collect all validation predictions
print("Collecting validation predictions...")
model.eval()
all_preds, all_targets = [], []

with torch.no_grad():
    for data, target in val_loader:
        data, target = data.to(device), target.to(device)
        pred = model(data).view(-1)
        target = target.view(-1)

        all_preds.extend(pred.cpu().numpy())
        all_targets.extend(target.cpu().numpy())

# Convert to arrays and round predictions
all_preds = np.array(all_preds)
all_targets = np.array(all_targets)
all_preds_rounded = round_to_interval(all_preds, 5)

print(f"Analyzing {len(all_preds)} predictions...")

# Calculate errors.
# Percentage error is only defined for non-zero targets; samples whose target
# is 0 keep their absolute error instead. The division is restricted to the
# non-zero entries: np.where would evaluate it for every element and emit
# divide-by-zero warnings for the zero targets.
abs_errors = np.abs(all_preds_rounded - all_targets)
nonzero_target = all_targets != 0
pct_errors = abs_errors.copy()
pct_errors[nonzero_target] = abs_errors[nonzero_target] / all_targets[nonzero_target] * 100

# Create plots
fig, axes = plt.subplots(2, 2, figsize=(12, 8))

# Error distribution
axes[0,0].hist(pct_errors, bins=20, alpha=0.7, color='skyblue', edgecolor='black')
axes[0,0].set_title('Prediction Errors (%)')
axes[0,0].set_xlabel('Percentage Error')
axes[0,0].grid(True, alpha=0.3)

# Predictions vs actual
axes[0,1].scatter(all_targets, all_preds_rounded, alpha=0.6, color='coral')
min_val, max_val = min(all_targets), max(all_targets)
axes[0,1].plot([min_val, max_val], [min_val, max_val], 'r--', label='Perfect')
axes[0,1].set_xlabel('Actual')
axes[0,1].set_ylabel('Predicted')
axes[0,1].set_title('Predictions vs Actual')
axes[0,1].legend()
axes[0,1].grid(True, alpha=0.3)

# Absolute errors
axes[1,0].hist(abs_errors, bins=15, alpha=0.7, color='lightgreen', edgecolor='black')
axes[1,0].set_title('Absolute Errors')
axes[1,0].set_xlabel('Error Magnitude')
axes[1,0].grid(True, alpha=0.3)

# Error stats ('Perfect' is a sample count; the other entries are percentages)
stats = {
    'Mean %': np.mean(pct_errors),
    'Median %': np.median(pct_errors),
    '90th %tile': np.percentile(pct_errors, 90),
    'Perfect': np.sum(abs_errors == 0),
    'Max %': np.max(pct_errors)
}

colors = ['lightcoral', 'lightblue', 'gold', 'plum', 'lightgreen']
bars = axes[1,1].bar(stats.keys(), stats.values(), color=colors, alpha=0.7)
axes[1,1].set_title('Error Statistics')
axes[1,1].tick_params(axis='x', rotation=45)

# Add values on bars
for bar, val in zip(bars, stats.values()):
    axes[1,1].text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.5,
                   f'{val:.1f}', ha='center', va='bottom')

plt.tight_layout()
plt.show()

# Summary
print(f"\n{'='*40}")
print(f"ERROR ANALYSIS SUMMARY")
print(f"{'='*40}")
print(f"Total samples: {len(all_preds)}")
print(f"Mean error: {np.mean(pct_errors):.1f}%")
print(f"Median error: {np.median(pct_errors):.1f}%")
print(f"90% within: {np.percentile(pct_errors, 90):.1f}% error")
print(f"Perfect: {np.sum(abs_errors == 0)}/{len(abs_errors)} ({np.sum(abs_errors == 0)/len(abs_errors)*100:.1f}%)")
print(f"Worst error: {np.max(pct_errors):.1f}%")
print(f"{'='*40}")