# In [None]:  (notebook cell marker left over from the export)
# Import all necessary libraries
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
import tensorflow as tf
import torch.nn.functional as F
from torchvision import transforms
from PIL import Image
from torch.utils.data import Dataset, DataLoader, random_split, Subset
from torch.optim.lr_scheduler import ReduceLROnPlateau
#import zipfile # for unzipping the dataset - not needed in the end, pics unzipped locally in folder spr-x-ray
import os
import time
# ===================================================================================================================================
# Final Project: Part 2
# Design and implement a single hidden layer shallow fully connected network for performing both the regression (age) and 
# classification (male/female) tasks, and report on their regression/classification performance.
# ===================================================================================================================================
# Part 0: Select the compute device
# Use the CUDA GPU whenever PyTorch can see one (developed on a GeForce GTX 1660 Ti);
# otherwise fall back to the CPU.
USE_CUDA = torch.cuda.is_available()
if USE_CUDA:
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Part 0: Using device: {device}")
print("=========================================\n")
# ===================================================================================================================================
# Part 1: Load and preprocess data
# Folder holding the PNG images; the two label CSVs are read from the working directory.
image_dir = "./spr-x-ray/train"

# Read the age and gender label tables
age_df = pd.read_csv("train_age.csv")
gender_df = pd.read_csv("train_gender.csv")

# Join both tables on their shared imageId key, then derive the on-disk file
# name: imageId left-padded with zeros to six digits plus ".png" (e.g. 000000.png).
merged_labels = age_df.merge(gender_df, on='imageId')
merged_labels = merged_labels.assign(
    filename=lambda frame: frame['imageId'].astype(str).str.zfill(6) + ".png"
)
# Only these three columns are read further down
labels_ag = merged_labels[['filename', 'age', 'gender']]

# Sanity check of the merge: shape, first 5 rows, last 10 rows
print("Merged and processed labels:", labels_ag.shape)
print(labels_ag.head())
print(labels_ag.tail(10))
print("=========================================\n")

# Preprocessing applied to every grayscale X-ray before it is fed to the model
preprocessing_steps = [
    transforms.Resize(size=(128, 128)),           # downscale to 128x128 (224x224 was too heavy for the CPU)
    transforms.ToTensor(),                        # PIL image -> tensor
    transforms.Normalize(mean=[0.5], std=[0.5]),  # one mean/std entry: single grayscale channel
]
png_transform = transforms.Compose(preprocessing_steps)
# ===================================================================================================================================
# Part 2: Create custom class to load images and labels, read and transform the image files, and
# utilizes DataLoader (via pytorch) to batch/shuffle/load data during training

# Define custom dataset class as stated previously
class ChestXrayDataset(Dataset):
    """Pairs each chest X-ray image file with its age and gender labels.

    Args:
        dataframe: Table with one row per image. ``__getitem__`` reads the
            columns ``filename``, ``age`` and ``gender`` from it.
        image_dir: Directory containing the files named in ``filename``.
        transform: Optional callable applied to the grayscale PIL image
            (for example a resize + normalize pipeline).
    """

    def __init__(self, dataframe, image_dir, transform=None):
        self.data = dataframe        # image filenames and labels
        self.image_dir = image_dir   # image folder directory
        self.transform = transform   # image transform to apply, or None

    def __len__(self):
        """Return the number of rows (samples) in the label table."""
        return len(self.data)

    def __getitem__(self, idx):
        """Load the sample at position ``idx``.

        Returns:
            A tuple ``(image, age, gender)``. ``image`` is the grayscale image
            after ``transform`` (or the PIL image itself when no transform was
            given); ``age`` and ``gender`` are scalar float32 tensors.
        """
        row = self.data.iloc[idx]  # positional lookup, independent of the index labels
        img_path = os.path.join(self.image_dir, row['filename'])

        # Open inside a context manager so the file handle is released right
        # away instead of whenever the garbage collector gets to it.
        # convert('L') returns an independent single-channel copy, so it
        # stays valid after the file is closed.
        with Image.open(img_path) as raw_image:
            image = raw_image.convert('L')

        if self.transform is not None:
            image = self.transform(image)

        # Convert the labels to PyTorch tensors
        age = torch.tensor(row['age'], dtype=torch.float32)
        gender = torch.tensor(row['gender'], dtype=torch.float32)  # BCEWithLogitsLoss expects float
        return image, age, gender
# end class

# Create the dataset object over all labelled images
dataset = ChestXrayDataset(labels_ag, image_dir=image_dir, transform=png_transform)
# For quicker experiments, restrict it to a prefix instead (500, 1k or 2k samples), e.g.:
#   dataset = Subset(dataset, range(2000))

# Split into training (80%) and validation (remaining 20%) sets
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

# Create DataLoaders.
# num_workers=0 loads batches in the main process; the multi-worker
# configurations (2 or 4 workers, persistent_workers=True) caused a very slow
# startup on the development machine.
# Pinned host memory only helps host-to-GPU copies, so request it only when a
# CUDA device is actually present.
pin_host_memory = torch.cuda.is_available()
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True,
                          num_workers=0, pin_memory=pin_host_memory)
# Validation metrics are sums over the whole split, so batch order is
# irrelevant; keep the order fixed instead of reshuffling every epoch.
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False,
                        num_workers=0, pin_memory=pin_host_memory)

# Confirm structure
print(f"Total samples in dataset: {len(dataset)}")
print(f"Training samples: {len(train_dataset)}")
print(f"Validation samples: {len(val_dataset)}")
print("=========================================\n")
# Pulling a single test batch with next(iter(train_loader)) was tried here and
# took over 15 minutes, so it is intentionally not done.
# ====================================================================================================================================
# Part 3: Define and train the model 
class CNNModel(nn.Module):
    """Small CNN with a shared trunk and two heads: age regression and gender logit.

    The layer sizes fix the expected input to ``(batch, 1, 128, 128)``: the
    first convolution takes one channel, and three 2x max-poolings reduce
    128 -> 64 -> 32 -> 16, which matches the ``64 * 16 * 16`` inputs of ``fc1``.
    """

    def __init__(self):
        super().__init__()
        # Three identical stages: 3x3 conv (padding keeps the size) ->
        # batch norm -> ReLU -> 2x max-pool (halves height and width).
        self.features = nn.Sequential(
            nn.Conv2d(1, 16, kernel_size=3, padding=1),   # 1 input channel (grayscale), 16 output channels
            nn.BatchNorm2d(16),
            nn.ReLU(),
            nn.MaxPool2d(2),  # 128 -> 64

            nn.Conv2d(16, 32, kernel_size=3, padding=1),
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.MaxPool2d(2),  # 64 -> 32

            nn.Conv2d(32, 64, kernel_size=3, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d(2),  # 32 -> 16
        )

        self.flatten = nn.Flatten()                  # (batch, 64, 16, 16) -> (batch, 64*16*16)
        self.fc1 = nn.Linear(64 * 16 * 16, 256)      # shared hidden layer
        self.dropout = nn.Dropout(0.3)               # regularization on the hidden features
        self.age_output = nn.Linear(256, 1)          # age regression head
        self.gender_output = nn.Linear(256, 1)       # gender head, outputs one logit

    def forward(self, x):
        """Return ``(age, gender_logit)``, each of shape ``(batch,)``.

        The gender output is a raw logit (no sigmoid applied here).
        """
        x = self.features(x)
        x = self.flatten(x)
        # The hidden layer needs a non-linearity: without it fc1 followed by a
        # linear head is just a single affine map of the flattened features.
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        age = self.age_output(x).squeeze(1)          # (batch, 1) -> (batch,)
        gender = self.gender_output(x).squeeze(1)    # (batch, 1) -> (batch,)
        return age, gender
#end class

# Instantiate the model on the selected device
model = CNNModel().to(device)

# Loss functions
age_loss_fn = nn.MSELoss()               # mean squared error, optimized for age regression
gender_loss_fn = nn.BCEWithLogitsLoss()  # binary cross-entropy on raw logits
mae_loss_fn = nn.L1Loss()                # mean absolute error, tracked as an age metric only

# Optimizer (weight decay for regularization; 1e-4 and 1e-6 were also tried)
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-5)
# Halve the learning rate after 2 epochs without improvement of the monitored value.
# The `verbose` flag is deliberately not passed: it is deprecated in recent
# PyTorch releases. Read optimizer.param_groups[0]['lr'] to report the rate.
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=2)

num_epochs = 10  # other values were tested; 10 worked best for all 3 models

# Early-stopping state
patience = 5                  # stop after 5 epochs without improvement
best_val_loss = float('inf')  # best validation loss seen so far
epochs_no_improve = 0         # consecutive epochs without improvement

# Per-epoch performance history
train_age_losses = []
train_gender_losses = []
train_gender_accuracies = []
val_age_losses = []
val_gender_losses = []
val_gender_accuracies = []
train_age_mae = []
val_age_mae = []
#looper = 0
#start_time = time.time() # for debugging

# Training loop: one training pass and one validation pass per epoch.
# All per-epoch metrics are sample-weighted averages (batch mean * batch size,
# summed, divided by the split size) so a smaller last batch is not over-weighted.
for epoch in range(num_epochs):
    # ---- Training phase ----
    model.train()  # enable dropout and batch-norm statistic updates
    running_age_loss = 0.0
    running_gender_loss = 0.0
    running_age_mae = 0.0
    correct_gender = 0
    total_gender = 0

    print(f"Epoch {epoch+1}/{num_epochs} training...")  # progress marker

    for images, ages, genders in train_loader:
        # Move the batch to the device (GPU or CPU)
        images = images.to(device)
        ages = ages.to(device)
        genders = genders.to(device)

        predicted_ages, predicted_genders = model(images)

        # Losses; the MAE is only tracked, it is not part of the optimized loss
        loss_age = age_loss_fn(predicted_ages, ages)
        loss_gender = gender_loss_fn(predicted_genders, genders)
        mae_age = mae_loss_fn(predicted_ages, ages)

        loss = loss_age + loss_gender  # unweighted sum of both task losses

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Accumulate sample-weighted sums
        batch_size = images.size(0)
        running_age_loss += loss_age.item() * batch_size
        running_gender_loss += loss_gender.item() * batch_size
        running_age_mae += mae_age.item() * batch_size

        # Gender accuracy: threshold the sigmoid of the logit at 0.5
        preds = (torch.sigmoid(predicted_genders) > 0.5).float()
        correct_gender += (preds == genders).sum().item()
        total_gender += genders.size(0)

    # Average training metrics for this epoch
    train_age_losses.append(running_age_loss / len(train_dataset))
    train_gender_losses.append(running_gender_loss / len(train_dataset))
    train_gender_accuracies.append(correct_gender / total_gender)
    train_age_mae.append(running_age_mae / len(train_dataset))

    # ---- Validation phase ----
    model.eval()  # disable dropout, use batch-norm running statistics
    val_age_loss = 0.0
    val_gender_loss = 0.0
    val_age_mae_total = 0.0  # must be reset here; it was previously used without being initialised
    correct_gender_val = 0
    total_gender_val = 0

    with torch.no_grad():  # no gradients needed for validation
        for images, ages, genders in val_loader:
            images = images.to(device)
            ages = ages.to(device)
            genders = genders.to(device)

            predicted_ages, predicted_genders = model(images)

            loss_age = age_loss_fn(predicted_ages, ages)
            loss_gender = gender_loss_fn(predicted_genders, genders)
            mae_age = mae_loss_fn(predicted_ages, ages)

            batch_size = images.size(0)
            val_age_loss += loss_age.item() * batch_size
            val_gender_loss += loss_gender.item() * batch_size
            val_age_mae_total += mae_age.item() * batch_size

            preds = (torch.sigmoid(predicted_genders) > 0.5).float()
            correct_gender_val += (preds == genders).sum().item()
            total_gender_val += genders.size(0)

    # Average validation metrics for this epoch
    val_age_losses.append(val_age_loss / len(val_dataset))
    val_gender_losses.append(val_gender_loss / len(val_dataset))
    val_gender_accuracies.append(correct_gender_val / total_gender_val)
    val_age_mae.append(val_age_mae_total / len(val_dataset))

    # The scheduler and early stopping both monitor the validation age MSE only
    scheduler.step(val_age_losses[-1])

    # Early stopping
    if val_age_losses[-1] < best_val_loss:
        best_val_loss = val_age_losses[-1]
        epochs_no_improve = 0
    else:
        epochs_no_improve += 1
        if epochs_no_improve >= patience:
            break
# end loop

# Summary of the metrics recorded for the last completed epoch.
# The lines are collected first and emitted with a single print call.
report_separator = "=========================================\n"
final_report_lines = [
    report_separator,
    "Final Performance:",
    f"Train Age Loss       : {train_age_losses[-1]:.4f}",
    f"Train Age MAE        : {train_age_mae[-1]:.4f}",
    f"Train Gender Loss    : {train_gender_losses[-1]:.4f}",
    f"Train Gender Accuracy: {train_gender_accuracies[-1]*100:.2f}%",
    f"Val Age Loss         : {val_age_losses[-1]:.4f}",
    f"Val Age MAE          : {val_age_mae[-1]:.4f}",
    f"Val Gender Loss      : {val_gender_losses[-1]:.4f}",
    f"Val Gender Accuracy  : {val_gender_accuracies[-1]*100:.2f}%",
    report_separator,
]
print("\n".join(final_report_lines))
# ====================================================================================================================================
# Part 4A: Plot the age loss and the gender accuracy per epoch (train vs. validation)
fig_4a, (age_loss_ax, gender_acc_ax) = plt.subplots(1, 2, figsize=(12, 5))

# Left panel: age regression loss
age_loss_ax.plot(train_age_losses, label="Train Age Loss")
age_loss_ax.plot(val_age_losses, label="Val Age Loss")
age_loss_ax.set_title("Age Regression Loss")
age_loss_ax.set_xlabel("Epoch")
age_loss_ax.set_ylabel("MSE Loss")
age_loss_ax.legend()

# Right panel: gender classification accuracy
gender_acc_ax.plot(train_gender_accuracies, label="Train Accuracy")
gender_acc_ax.plot(val_gender_accuracies, label="Val Accuracy")
gender_acc_ax.set_title("Gender Classification Accuracy")
gender_acc_ax.set_xlabel("Epoch")
gender_acc_ax.set_ylabel("Accuracy")
gender_acc_ax.legend()

fig_4a.tight_layout()
plt.show()
# =====================================================================================================================================
# Part 4B: Plot the age regression MSE and MAE per epoch (train vs. validation)
fig_4b, (mse_ax, mae_ax) = plt.subplots(1, 2, figsize=(14, 6))

# Left panel: mean squared error
mse_ax.plot(train_age_losses, label="Train MSE")
mse_ax.plot(val_age_losses, label="Val MSE")
mse_ax.set_title("Age Regression - MSE Loss")
mse_ax.set_xlabel("Epoch")
mse_ax.set_ylabel("Loss")
mse_ax.legend()

# Right panel: mean absolute error
mae_ax.plot(train_age_mae, label="Train MAE")
mae_ax.plot(val_age_mae, label="Val MAE")
mae_ax.set_title("Age Regression - MAE")
mae_ax.set_xlabel("Epoch")
mae_ax.set_ylabel("Absolute Error")
mae_ax.legend()

fig_4b.tight_layout()
plt.show()
# ====================================================================================================================================
# Part 5: Inference on the full dataset and export predictions to CSV

def _dataset_filenames(dataset):
    """Return the ``filename`` column of *dataset* in the order it yields samples."""
    if isinstance(dataset, Subset):
        # A Subset (which is also what random_split returns) only stores
        # indices into its parent: resolve the parent's filenames first, then
        # pick them in subset order. Recursion covers nested subsets.
        parent_names = _dataset_filenames(dataset.dataset)
        return parent_names.iloc[[int(i) for i in dataset.indices]]
    return dataset.data['filename']


def export_predictions_to_csv(model, dataset, output_file="model_predictions.csv", batch_size=64,
                              target_device=None):
    """Run inference over *dataset* and save the predictions as a CSV file.

    The CSV has the columns ``imageId``, ``age`` and ``gender``, one row per
    sample, in dataset order.

    Args:
        model: Network whose forward pass returns ``(age_predictions, gender_logits)``.
            It is switched to eval mode and moved to the inference device.
        dataset: Dataset yielding ``(image, age, gender)`` tuples. It must expose
            a ``data`` DataFrame with a ``filename`` column, or be a ``Subset``
            (possibly nested) of such a dataset.
        output_file: Path of the CSV file to write.
        batch_size: Number of images per inference batch.
        target_device: Device to run on. ``None`` (default) uses the
            module-level ``device``.

    Returns:
        None. The result is written to *output_file* and the path is printed.
    """
    run_device = device if target_device is None else target_device

    model.eval()           # inference mode for dropout / batch norm
    model.to(run_device)

    # shuffle=False keeps predictions aligned with the filename order below
    loader = DataLoader(dataset, batch_size=batch_size, shuffle=False)
    predicted_ages = []
    predicted_genders = []

    with torch.no_grad():  # no gradients needed for inference
        for images, _, _ in loader:
            images = images.to(run_device)
            age_preds, gender_logits = model(images)

            predicted_ages.extend(age_preds.cpu().tolist())
            # Threshold the sigmoid of the logit at 0.5 -> binary 0/1 label
            predicted_genders.extend((torch.sigmoid(gender_logits) > 0.5).int().cpu().tolist())

    # imageId is the filename with '.png' removed
    image_ids = _dataset_filenames(dataset).str.replace('.png', '', regex=False).tolist()

    # Build the DataFrame in the required format
    output_df = pd.DataFrame({
        "imageId": image_ids,
        "age": [max(0, round(age)) for age in predicted_ages],  # rounded to an integer, clamped at 0
        "gender": predicted_genders,                             # binary 0/1
    })

    output_df.to_csv(output_file, index=False)
    print(f"Predictions saved to: {output_file}")

# Run inference with the trained model over the whole dataset and write the CSV
export_predictions_to_csv(
    model=model,
    dataset=dataset,
    output_file="model3_predictions.csv",
)
# end part 3 of Final Project
# ====================================================================================================================================