## Setup and Imports


In [10]:
import sys
from pathlib import Path
import json
import torch
import torch.nn as nn
from torch.utils.data import DataLoader

# Resolve the project layout from the current working directory.
# When the kernel is started inside a directory named 'notebooks', the method
# root is its parent and the project root is three levels above it; otherwise
# the working directory itself is treated as the project root.
current_dir = Path.cwd()
if current_dir.name == 'notebooks':
    project_root = current_dir.parent.parent.parent
    method_root = current_dir.parent
else:
    project_root = current_dir
    method_root = project_root / 'methods' / 'deep_learning'

# Add src directory to path. The membership test keeps this cell idempotent:
# re-running it does not append a duplicate entry on every execution.
src_dir = method_root / 'src'
if not src_dir.is_dir():
    # Point at the likely cause (wrong working directory) before the imports
    # below fail with a less specific error.
    print(f"Warning: source directory not found: {src_dir}")
if str(src_dir) not in sys.path:
    sys.path.append(str(src_dir))

# Import custom modules (resolved through the src directory added above)
from data_loader import get_data_loaders
from model1_unet import create_unet_model
from trainer import ModelTrainer
from evaluator import ModelEvaluator

# Report the resolved locations and the available compute backends.
print(f"Project root: {project_root}")
print(f"Method root: {method_root}")
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
# Check for MPS (Apple Silicon GPU); the hasattr guard covers PyTorch builds
# whose torch.backends has no 'mps' attribute.
if hasattr(torch.backends, 'mps'):
    print(f"MPS (Apple Silicon) available: {torch.backends.mps.is_available()}")
else:
    print("MPS not available in this PyTorch version")


Project root: /Users/osamahshamsan/Desktop/Master/CV/TRACE
Method root: /Users/osamahshamsan/Desktop/Master/CV/TRACE/methods/deep_learning
PyTorch version: 2.8.0
CUDA available: False
MPS (Apple Silicon) available: True


## Load Configuration and Validate Dataset


In [6]:
# Check for corrupted files, verify dataset integrity, and report any existing
# training checkpoint.
print("Validating dataset files...")
print("=" * 60)

# Load config if not already loaded (in case this cell is run before the config cell)
if 'config' not in globals():
    config_path = method_root / 'configs' / 'dl_config.json'
    with open(config_path, 'r') as f:
        config = json.load(f)
    # Re-root relative data paths at the project root. Only *leading* '..'
    # components are dropped, so a '..' that appears later in a path is left
    # alone instead of being counted as an extra level to strip.
    for key, raw_path in list(config['data_paths'].items()):
        if Path(raw_path).is_absolute():
            continue
        parts = list(Path(raw_path).parts)
        while parts and parts[0] == '..':
            parts.pop(0)
        config['data_paths'][key] = str(project_root.joinpath(*parts))
    # Keep this fallback consistent with the main configuration cell, which
    # also records the method root in the config.
    config['method_root'] = str(method_root)

# Recreate data loaders to see if any files are corrupted
# (per the original note, the data loader is expected to skip corrupted files)
train_loader, val_loader, test_loader = get_data_loaders(config)

# Check dataset sizes
print(f"\nDataset sizes after validation:")
print(f" Train: {len(train_loader.dataset)} pairs")
print(f" Validation: {len(val_loader.dataset)} pairs")
print(f" Test: {len(test_loader.dataset)} pairs")

# Check every split for corrupted files, not just the training set.
# A dataset without a 'corrupted_files' attribute is treated as having none.
corrupted_total = 0
for split_label, loader in (
    ('training', train_loader),
    ('validation', val_loader),
    ('test', test_loader),
):
    corrupted = getattr(loader.dataset, 'corrupted_files', None)
    if corrupted:
        corrupted_total += len(corrupted)
        print(f"\nFound {len(corrupted)} corrupted file pairs in {split_label} set")

if corrupted_total:
    print("These files will be skipped by the data loader.")
else:
    print("\nAll dataset files are valid!")

# Check for existing checkpoints
# Check both possible locations (notebooks/outputs and method_root/outputs)
checkpoint_latest = method_root / 'outputs' / 'models' / 'unet_model1_latest.pth'
checkpoint_best = method_root / 'outputs' / 'models' / 'unet_model1_best.pth'

# Also check in notebooks directory (where checkpoints might have been saved)
notebooks_checkpoint = method_root / 'notebooks' / 'outputs' / 'models' / 'unet_model1_latest.pth'
if not checkpoint_latest.exists() and notebooks_checkpoint.exists():
    checkpoint_latest = notebooks_checkpoint
    checkpoint_best = method_root / 'notebooks' / 'outputs' / 'models' / 'unet_model1_best.pth'

if checkpoint_latest.exists():
    # Only scalar metadata is read here, so keep the tensors on the CPU rather
    # than copying the whole checkpoint onto the accelerator.
    checkpoint = torch.load(checkpoint_latest, map_location='cpu')
    print(f"\nFound checkpoint from epoch {checkpoint['epoch']}")
    print(f" Training will automatically resume from epoch {checkpoint['epoch'] + 1}")
    print(f" Best validation loss so far: {checkpoint['best_val_loss']:.4f}")
    print(f" Best validation IoU so far: {checkpoint['best_val_iou']:.4f}")
else:
    print("\nNo checkpoint found - training will start from epoch 1")


Validating dataset files...

Dataset sizes after validation:
 Train: 5858 pairs
 Validation: 1256 pairs
 Test: 1256 pairs

All training files are valid!

Found checkpoint from epoch 10
 Training will automatically resume from epoch 11
 Best validation loss so far: 0.6896
 Best validation IoU so far: 0.2811


In [12]:
# Load configuration
config_path = method_root / 'configs' / 'dl_config.json'
with open(config_path, 'r') as f:
    config = json.load(f)

# Resolve data paths relative to project root (fix relative path issues).
# Config paths look like "../../../data/processed/..."; the leading '..'
# components are dropped and the remainder is anchored at project_root.
# Only *leading* '..' components are removed, so a '..' later in a path is
# not miscounted as an extra level to strip. Absolute paths are left untouched.
for key, raw_path in list(config['data_paths'].items()):
    if Path(raw_path).is_absolute():
        continue
    parts = list(Path(raw_path).parts)
    while parts and parts[0] == '..':
        parts.pop(0)
    # e.g. "../../../data/processed/train/images" -> <project_root>/data/processed/train/images
    config['data_paths'][key] = str(project_root.joinpath(*parts))

# Add method_root to config so trainer can resolve output paths correctly
config['method_root'] = str(method_root)

print("Configuration loaded:")
print(json.dumps(config, indent=2))


Configuration loaded:
{
  "model_settings": {
    "image_size": [
      512,
      512
    ],
    "num_channels": 3,
    "num_classes": 2,
    "batch_size": 8,
    "learning_rate": 0.0001,
    "num_epochs": 50,
    "device": "cuda"
  },
  "model1_unet": {
    "name": "U-Net Segmentation Model",
    "encoder": "resnet34",
    "encoder_weights": "imagenet",
    "activation": "sigmoid",
    "loss": "bce_with_logits",
    "optimizer": "adam",
    "save_best_only": true,
    "patience": 5
  },
  "model2_resnet": {
    "name": "ResNet Encoder-Decoder Model",
    "backbone": "resnet50",
    "backbone_weights": "imagenet",
    "decoder_channels": [
      256,
      128,
      64,
      32,
      16
    ],
    "activation": "sigmoid",
    "loss": "dice",
    "optimizer": "adam",
    "save_best_only": true,
    "patience": 5
  },
  "data_paths": {
    "train_images": "/Users/osamahshamsan/Desktop/Master/CV/TRACE/data/processed/train/images",
    "train_masks": "/Users/osamahshamsan/Desktop/Maste

## Setup Device and Data Loaders


In [13]:
# Setup device - check for CUDA, MPS (Apple Silicon), or CPU.
# NOTE(review): a configured device other than 'cuda' (including 'cpu') still
# selects MPS when MPS is available - confirm whether that is intended.
if torch.cuda.is_available() and config['model_settings']['device'] == 'cuda':
    device = torch.device('cuda')
elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
    device = torch.device('mps')
    print("Using MPS (Apple Silicon GPU)")
else:
    device = torch.device('cpu')
    print("Using CPU (no GPU acceleration available)")

print(f"Using device: {device}")

# Verify data paths exist and count the PNG files in each split directory.
print("\nVerifying data paths:")
for split in ['train', 'val', 'test']:
    img_path = Path(config['data_paths'][f'{split}_images'])
    mask_path = Path(config['data_paths'][f'{split}_masks'])
    img_exists = img_path.exists()
    mask_exists = mask_path.exists()
    img_count = sum(1 for _ in img_path.glob('*.png')) if img_exists else 0
    mask_count = sum(1 for _ in mask_path.glob('*.png')) if mask_exists else 0
    # The existence markers were empty strings in both branches, so the
    # report could not distinguish a present directory from a missing one.
    img_status = 'OK' if img_exists else 'MISSING'
    mask_status = 'OK' if mask_exists else 'MISSING'
    print(f" {split}: Images={img_count} ({img_status}), Masks={mask_count} ({mask_status})")
    if img_count != mask_count:
        # Unequal counts mean some images or masks have no partner file.
        print(f" Warning: {split} has {img_count} images but {mask_count} masks")

# Create data loaders
print("\nLoading datasets...")
train_loader, val_loader, test_loader = get_data_loaders(config)

print(f"\nData loaders created:")
print(f" Train batches: {len(train_loader)}")
print(f" Validation batches: {len(val_loader)}")
print(f" Test batches: {len(test_loader)}")

# Check a sample batch; sample_images is reused by the model-creation cell
# for its test forward pass.
sample_images, sample_masks = next(iter(train_loader))
print(f"\nSample batch shape:")
print(f" Images: {sample_images.shape}")
print(f" Masks: {sample_masks.shape}")
print(f" Image value range: [{sample_images.min():.3f}, {sample_images.max():.3f}]")
print(f" Mask value range: [{sample_masks.min():.3f}, {sample_masks.max():.3f}]")


Using MPS (Apple Silicon GPU)
Using device: mps

Verifying data paths:
 train: Images=5858 (), Masks=5858 ()
 val: Images=1256 (), Masks=1256 ()
 test: Images=1256 (), Masks=1256 ()

Loading datasets...

Data loaders created:
 Train batches: 733
 Validation batches: 157
 Test batches: 157

Sample batch shape:
 Images: torch.Size([8, 3, 512, 512])
 Masks: torch.Size([8, 1, 512, 512])
 Image value range: [0.000, 1.000]
 Mask value range: [0.000, 1.000]


## Create U-Net Model


In [16]:
# Reload module to pick up any code changes made to model1_unet since the
# kernel first imported it.
import importlib
import model1_unet
importlib.reload(model1_unet)
from model1_unet import create_unet_model

# Create U-Net model and move it to the selected device
model = create_unet_model(config)
model = model.to(device)

# Count parameters
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f"Model: {config['model1_unet']['name']}")
print(f"Encoder: {config['model1_unet']['encoder']}")
print(f"Total parameters: {total_params:,}")
print(f"Trainable parameters: {trainable_params:,}")

# Test forward pass on the first two images of the sample batch.
# The expected output shape is derived from the probe input (same batch size
# and spatial size, one output channel) instead of a hard-coded 512x512, so
# the check stays valid if the configured image size changes.
model.eval()
probe_images = sample_images[:2]
probe_height, probe_width = probe_images.shape[2], probe_images.shape[3]
expected_shape = (probe_images.shape[0], 1, probe_height, probe_width)
with torch.no_grad():
    test_output = model(probe_images.to(device))

print(f"\nTest forward pass:")
print(f" Input shape: {probe_images.shape}")
print(f" Output shape: {test_output.shape}")
print(f" Expected output: [batch, 1, {probe_height}, {probe_width}]")
# Comparing whole tuples also covers the batch dimension and an output with an
# unexpected number of dimensions.
if tuple(test_output.shape) == expected_shape:
    print(" Output shape is correct!")
else:
    print(f" Output shape mismatch! Expected [batch, 1, {probe_height}, {probe_width}]")


Model: U-Net Segmentation Model
Encoder: resnet34
Total parameters: 24,522,785
Trainable parameters: 24,522,785

Test forward pass:
 Input shape: torch.Size([2, 3, 512, 512])
 Output shape: torch.Size([2, 1, 512, 512])
 Expected output: [batch, 1, 512, 512]
 Output shape is correct!


## Train Model


In [None]:
# Build the trainer around the model created earlier; 'unet_model1' is the
# model name handed to the trainer.
trainer = ModelTrainer(model=model, config=config, device=device, model_name='unet_model1')

# The epoch budget is read from the shared model settings.
num_epochs = config['model_settings']['num_epochs']

# Run training.
# With resume_from_checkpoint=True the trainer picks up from the latest
# checkpoint; pass False instead to start training from scratch.
trainer.train(
    train_loader=train_loader,
    val_loader=val_loader,
    num_epochs=num_epochs,
    resume_from_checkpoint=True,
)


Loading checkpoint from outputs/models/unet_model1_latest.pth
Resuming from epoch 10
Best validation loss so far: 0.6896
Best validation IoU so far: 0.2811
Starting training for unet_model1
Device: mps
Number of epochs: 50
Starting from epoch: 11
------------------------------------------------------------
Epoch 11/50
  Train Loss: 0.6890, Train IoU: 0.3275
  Val Loss: 0.6893, Val IoU: 0.2962
Saved best model at epoch 11 with val_loss=0.6893, val_iou=0.2962
Epoch 12/50
  Train Loss: 0.6888, Train IoU: 0.3368
  Val Loss: 0.6893, Val IoU: 0.2804
Epoch 13/50
  Train Loss: 0.6887, Train IoU: 0.3306
  Val Loss: 0.6889, Val IoU: 0.3027
Saved best model at epoch 13 with val_loss=0.6889, val_iou=0.3027
Epoch 14/50
  Train Loss: 0.6885, Train IoU: 0.3426
  Val Loss: 0.6889, Val IoU: 0.3031
Saved best model at epoch 14 with val_loss=0.6889, val_iou=0.3031
Epoch 15/50
  Train Loss: 0.6886, Train IoU: 0.3291
  Val Loss: 0.6890, Val IoU: 0.2909
Epoch 16/50
  Train Loss: 0.6886, Train IoU: 0.3366
  

## Evaluate Model on Test Set


In [None]:
# Load best model.
# Look in method_root/outputs first, then in notebooks/outputs, mirroring the
# checkpoint lookup in the dataset-validation cell. Without the second
# location, a checkpoint saved under the notebooks directory would be missed
# and the evaluation would run on whatever weights the model currently holds.
# NOTE(review): if both files exist the method_root copy wins - confirm that
# it is the up-to-date one.
best_checkpoint_name = 'unet_model1_best.pth'
checkpoint_candidates = [
    method_root / 'outputs' / 'models' / best_checkpoint_name,
    method_root / 'notebooks' / 'outputs' / 'models' / best_checkpoint_name,
]
checkpoint_path = next(
    (candidate for candidate in checkpoint_candidates if candidate.exists()),
    checkpoint_candidates[0],
)
if checkpoint_path.exists():
    checkpoint = torch.load(checkpoint_path, map_location=device)
    model.load_state_dict(checkpoint['model_state_dict'])
    print(f"Loaded best model from epoch {checkpoint['epoch']}")
    print(f"Best validation loss: {checkpoint['best_val_loss']:.4f}")
    print(f"Best validation IoU: {checkpoint['best_val_iou']:.4f}")
else:
    print("Best model checkpoint not found, using current model state")

# Create evaluator; both directories are taken from the config and resolved
# against the method root.
output_dir = method_root / config['output_paths']['predictions']
results_dir = method_root / config['results_paths']['metrics']

evaluator = ModelEvaluator(
    model=model,
    device=device,
    output_dir=output_dir,
    results_dir=results_dir
)

# Evaluate on test set, using the evaluation options from the config
test_metrics = evaluator.evaluate(
    test_loader=test_loader,
    save_predictions=config['evaluation']['save_predictions'],
    num_visualizations=config['evaluation']['num_visualizations']
)

# Print metrics
evaluator.print_metrics(test_metrics)


## Training History Visualization


In [None]:
import matplotlib.pyplot as plt

# Load training history.
# Look in method_root/outputs first, then in notebooks/outputs, the same two
# locations the dataset-validation cell checks for checkpoints.
# NOTE(review): assumes the history file is written next to the checkpoints -
# confirm against the trainer.
history_name = 'unet_model1_history.json'
history_candidates = [
    method_root / 'outputs' / 'models' / history_name,
    method_root / 'notebooks' / 'outputs' / 'models' / history_name,
]
history_path = next(
    (candidate for candidate in history_candidates if candidate.exists()),
    history_candidates[0],
)

if history_path.exists():
    with open(history_path, 'r') as f:
        history = json.load(f)

    # Plot training curves: loss on the left, IoU on the right
    fig, axes = plt.subplots(1, 2, figsize=(15, 5))

    # Loss plot
    axes[0].plot(history['train_loss'], label='Train Loss')
    axes[0].plot(history['val_loss'], label='Validation Loss')
    axes[0].set_xlabel('Epoch')
    axes[0].set_ylabel('Loss')
    axes[0].set_title('Training and Validation Loss')
    axes[0].legend()
    axes[0].grid(True)

    # IoU plot
    axes[1].plot(history['train_iou'], label='Train IoU')
    axes[1].plot(history['val_iou'], label='Validation IoU')
    axes[1].set_xlabel('Epoch')
    axes[1].set_ylabel('IoU')
    axes[1].set_title('Training and Validation IoU')
    axes[1].legend()
    axes[1].grid(True)

    plt.tight_layout()

    # Create the target directory first: savefig does not create missing
    # parent directories and would otherwise raise on a fresh checkout.
    visualizations_dir = method_root / 'outputs' / 'visualizations'
    visualizations_dir.mkdir(parents=True, exist_ok=True)
    fig.savefig(visualizations_dir / 'training_history.png', dpi=150, bbox_inches='tight')
    plt.show()

    print(f"Training completed in {len(history['train_loss'])} epochs")
else:
    print("Training history not found")


## Summary

U-Net model training and evaluation completed. Results saved to:
- **Models**: `outputs/models/`
- **Predictions**: `outputs/predictions/`
- **Visualizations**: `outputs/visualizations/`
- **Metrics**: `results/metrics/`
