In [3]:
import os
import torch
import torch.nn as nn
import torch.optim as optim

def get_next_version(base_dir):
    """
    Get the next version number based on the existing directories.

    Only sub-directories whose name is exactly ``version_<N>``, where ``<N>``
    consists solely of decimal digits, are taken into account. Names such as
    ``version_3_backup`` or ``version_x`` are ignored, as are regular files.

    If ``base_dir`` does not exist yet, it is created and 0 is returned.

    Parameters:
    base_dir (str): The base directory where versions are stored.

    Returns:
    int: The next version number (0 when no version directory exists yet,
    otherwise the highest existing number plus one).
    """
    if not os.path.exists(base_dir):
        os.makedirs(base_dir)
        return 0

    version_numbers = []
    for entry in os.listdir(base_dir):
        # Split on the FIRST underscore only, so that everything after
        # "version_" must be numeric ("version_3_backup" -> "3_backup").
        prefix, _, suffix = entry.partition('_')
        if prefix != 'version':
            continue
        # isdecimal() (unlike isdigit()) only accepts characters that int()
        # can actually parse; it is also False for the empty string.
        if not suffix.isdecimal():
            continue
        if not os.path.isdir(os.path.join(base_dir, entry)):
            continue
        version_numbers.append(int(suffix))

    # default=-1 makes an empty listing yield version 0.
    return max(version_numbers, default=-1) + 1

def create_version_dir(base_dir):
    """
    Create the directory for the next version under ``base_dir``.

    The version number is obtained from ``get_next_version(base_dir)`` and
    the directory is named ``version_<number>``. A message with the resulting
    path is printed.

    Parameters:
    base_dir (str): The base directory where versions are stored.

    Returns:
    str: The path to the new version directory.
    """
    version_dir = os.path.join(
        base_dir,
        f'version_{get_next_version(base_dir)}',
    )
    # exist_ok=True: do not fail if the directory is already present.
    os.makedirs(version_dir, exist_ok=True)

    print(f"Created version directory: {version_dir}")
    return version_dir

def save_model_checkpoint(model, optimizer, epoch, version_dir):
    """
    Save a training checkpoint for the given epoch inside ``version_dir``.

    The checkpoint is written with ``torch.save`` to
    ``<version_dir>/checkpoint_epoch_<epoch>.pth`` and contains the keys
    ``'epoch'``, ``'model_state_dict'`` and ``'optimizer_state_dict'``.
    The target directory is created if it does not exist. A message with
    the checkpoint path is printed.

    Parameters:
    model (nn.Module): The model to save.
    optimizer (optim.Optimizer): The optimizer state to save.
    epoch (int): The current epoch number.
    version_dir (str): The version directory where checkpoints will be saved.
        An empty string means the current working directory.
    """
    checkpoint_path = os.path.join(version_dir, f'checkpoint_epoch_{epoch}.pth')

    # os.makedirs('') raises FileNotFoundError, so only create the parent
    # when the path actually has a directory component.
    parent_dir = os.path.dirname(checkpoint_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    torch.save({
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
    }, checkpoint_path)

    print(f"Model checkpoint saved to: {checkpoint_path}")

# Demonstration of the checkpoint helpers inside a (placeholder) training loop.
# A small feed-forward network, an Adam optimizer and a loss are set up purely
# so that there is something to checkpoint.
model = nn.Sequential(nn.Linear(10, 50), nn.ReLU(), nn.Linear(50, 10))
optimizer = optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss()

# Every run writes into its own freshly created version directory
base_checkpoint_dir = './checkpoints'
version_dir = create_version_dir(base_checkpoint_dir)

# Placeholder training loop that periodically writes a checkpoint
num_epochs = 50

for epoch in range(1, num_epochs + 1):
    # Your training code here
    # ...

    # Checkpoints are only written on every 10th epoch; skip the others
    if epoch % 10 != 0:
        continue
    save_model_checkpoint(model, optimizer, epoch, version_dir)


Created version directory: ./checkpoints/version_1
Model checkpoint saved to: ./checkpoints/version_1/checkpoint_epoch_10.pth
Model checkpoint saved to: ./checkpoints/version_1/checkpoint_epoch_20.pth
Model checkpoint saved to: ./checkpoints/version_1/checkpoint_epoch_30.pth
Model checkpoint saved to: ./checkpoints/version_1/checkpoint_epoch_40.pth
Model checkpoint saved to: ./checkpoints/version_1/checkpoint_epoch_50.pth
