In [9]:
from torch.utils.tensorboard import SummaryWriter
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
import torch.nn as nn
import torch

from tqdm import tqdm
import os


In [10]:
from Models import HomographyRegressor, HomographyClassifier
from Models import HomographyPairDataset, FixedSrcRandomDispDataset

from Models import save_checkpoint, load_latest_checkpoint
from Models import offsets_to_class_indices, classes_to_offsets
from Models import classification_loss

from Generator import get_images_from_names, get_random_images, get_all_images

In [11]:
def nn_train_classify(model, dataloader, num_epochs, model_file_name, optimizer, criterion,
                      checkpoint_dir="checkpoints", num_classes=21, disp_range=(-16, 16)):
    """Train a classification-style offset model with checkpointing and TensorBoard logging.

    Training resumes from whatever epoch ``load_latest_checkpoint`` returns for
    ``checkpoint_dir`` and runs until ``num_epochs``. Per epoch, the sample-weighted
    mean loss and two RMSE figures (from hard and soft decoding of the logits via
    ``classes_to_offsets``) are shown on the progress bar and written to TensorBoard
    under ``<checkpoint_dir>/runs``.

    Args:
        model: Network called as ``model(pairs)``; the device of its first parameter
            is the device every batch is moved to.
        dataloader: Iterable yielding ``(pairs, offsets)`` tensor batches.
        num_epochs: Exclusive upper bound of the epoch range.
        model_file_name: Path the final ``state_dict`` is written to once the epoch
            loop finishes normally.
        optimizer: Optimizer stepped once per batch; also passed to the checkpoint helpers.
        criterion: Loss object forwarded to ``classification_loss``.
        checkpoint_dir: Directory for checkpoints and the TensorBoard ``runs`` folder;
            created if missing.
        num_classes: Forwarded to ``classification_loss``.
        disp_range: Forwarded to ``classification_loss`` and ``classes_to_offsets``.

    Raises:
        ValueError: If an epoch processes zero samples (empty ``dataloader``).

    A ``KeyboardInterrupt`` is caught: a checkpoint is saved and the function returns
    without writing ``model_file_name``.
    """
    os.makedirs(checkpoint_dir, exist_ok=True)
    device = next(model.parameters()).device
    start_epoch = load_latest_checkpoint(checkpoint_dir, model, optimizer, device)
    writer = SummaryWriter(log_dir=os.path.join(checkpoint_dir, "runs"))

    epoch_pbar = tqdm(
        range(start_epoch, num_epochs),
        desc="Training",
        ncols=120,
        miniters=1,
        smoothing=0,
        dynamic_ncols=True,
        initial=start_epoch,
        total=num_epochs
    )

    # Pre-bind `epoch` so the KeyboardInterrupt handler below never hits an unbound
    # name when the loop body has not executed (e.g. start_epoch >= num_epochs).
    epoch = start_epoch - 1

    try:
        for epoch in epoch_pbar:
            model.train()
            running_loss = 0.0
            running_rmse_hard = 0.0
            running_rmse_soft = 0.0
            count = 0

            for pairs, offsets in dataloader:
                pairs = pairs.to(device)
                offsets = offsets.to(device)
                B = pairs.shape[0]

                # --- Forward ---
                # Original author's note gives the logits shape as (B, 8, 21) — confirm against the model.
                logits = model(pairs)

                # --- Compute loss ---
                loss = classification_loss(criterion, logits, offsets,
                                           disp_range=disp_range, num_classes=num_classes)

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                # --- Metrics ---
                with torch.no_grad():
                    pred_hard = classes_to_offsets(logits, disp_range, soft=False)
                    pred_soft = classes_to_offsets(logits, disp_range, soft=True)

                    # mean(dim=-1) only reduces the last axis, so the result still has
                    # more than one element for B > 1; reduce to a scalar with .mean()
                    # before .item(), which only accepts single-element tensors.
                    rmse_hard_val = torch.sqrt(((pred_hard - offsets) ** 2).mean(dim=-1)).mean().item()
                    rmse_soft_val = torch.sqrt(((pred_soft - offsets) ** 2).mean(dim=-1)).mean().item()

                # --- Logging ---
                # Weight by batch size so a short final batch does not skew the epoch mean.
                running_loss += loss.item() * B
                running_rmse_hard += rmse_hard_val * B
                running_rmse_soft += rmse_soft_val * B
                count += B

            if count == 0:
                raise ValueError("dataloader yielded no samples; cannot compute epoch averages")

            # Average metrics per epoch
            avg_loss = running_loss / count
            avg_rmse_hard = running_rmse_hard / count
            avg_rmse_soft = running_rmse_soft / count

            epoch_pbar.set_postfix({
                "loss": f"{avg_loss:.4f}",
                "rmse_hard": f"{avg_rmse_hard:.3f}px",
                "rmse_soft": f"{avg_rmse_soft:.3f}px"
            })

            # TensorBoard (steps are 1-based: epoch + 1)
            writer.add_scalar("Loss/train", avg_loss, epoch + 1)
            writer.add_scalar("RMSE/hard", avg_rmse_hard, epoch + 1)
            writer.add_scalar("RMSE/soft", avg_rmse_soft, epoch + 1)

            # --- Checkpoint every 1000 epochs and on the last epoch ---
            if (epoch + 1) % 1000 == 0 or (epoch + 1) == num_epochs:
                save_checkpoint(checkpoint_dir, epoch + 1, model, optimizer)

        # Save final model
        torch.save(model.state_dict(), model_file_name)
        print(f"✅ Final model saved: {model_file_name}")

    except KeyboardInterrupt:
        epoch_pbar.close()
        print(f"\n⚠️ Interrupted at epoch {epoch + 1}")
        # NOTE: the checkpoint is labelled epoch + 1 even though that epoch may have
        # been only partially trained when the interrupt arrived.
        save_checkpoint(checkpoint_dir, epoch + 1, model, optimizer)
        print("✅ Checkpoint saved")

    finally:
        # Runs on success, interrupt and error alike; closing the bar twice is harmless.
        epoch_pbar.close()
        writer.close()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()



In [12]:

def nn_train_regressor(model, num_epochs, batch_size, samples_per_epoch, model_file_name,
                       images, optimizer, criterion,
                       checkpoint_dir="checkpoints"):
    """Train a regression-style offset model with checkpointing and TensorBoard logging.

    Builds a ``HomographyPairDataset`` from ``images`` and ``samples_per_epoch``, wraps
    it in a shuffling ``DataLoader``, resumes from whatever epoch
    ``load_latest_checkpoint`` returns for ``checkpoint_dir`` and trains until
    ``num_epochs``. Per epoch, the sample-weighted mean loss, MAE and RMSE are shown on
    the progress bar and written to TensorBoard under ``<checkpoint_dir>/runs``.

    Args:
        model: Network called as ``model(pairs)``; the device of its first parameter
            is the device every batch is moved to.
        num_epochs: Exclusive upper bound of the epoch range.
        batch_size: Batch size given to the ``DataLoader``.
        samples_per_epoch: Second argument of ``HomographyPairDataset``
            (presumably the dataset length — verify against its definition).
        model_file_name: Path the final ``state_dict`` is written to once the epoch
            loop finishes normally.
        images: First argument of ``HomographyPairDataset``.
        optimizer: Optimizer stepped once per batch; also passed to the checkpoint helpers.
        criterion: Called as ``criterion(preds, offsets)``; the epoch average assumes
            it returns a per-batch mean (e.g. ``nn.MSELoss()`` defaults) — TODO confirm
            if a sum-reduced loss is ever passed.
        checkpoint_dir: Directory for checkpoints and the TensorBoard ``runs`` folder;
            created if missing.

    Raises:
        ValueError: If an epoch processes zero samples (empty dataset).

    A ``KeyboardInterrupt`` is caught: a checkpoint is saved and the function returns
    without writing ``model_file_name``.
    """
    os.makedirs(checkpoint_dir, exist_ok=True)
    device = next(model.parameters()).device
    start_epoch = load_latest_checkpoint(checkpoint_dir, model, optimizer, device)
    writer = SummaryWriter(log_dir=os.path.join(checkpoint_dir, "runs"))

    dataset = HomographyPairDataset(images, samples_per_epoch)
    # Pinned host memory only helps host-to-GPU copies; requesting it on a CPU-only
    # run just triggers a warning, so enable it only when the model lives on CUDA.
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True,
                            num_workers=0, pin_memory=(device.type == "cuda"))

    epoch_pbar = tqdm(
        range(start_epoch, num_epochs),
        desc="Training",
        ncols=120,
        miniters=1,
        smoothing=0,
        dynamic_ncols=True,
        initial=start_epoch,
        total=num_epochs
    )

    # Pre-bind `epoch` so the KeyboardInterrupt handler below never hits an unbound
    # name when the loop body has not executed (e.g. start_epoch >= num_epochs).
    epoch = start_epoch - 1

    try:
        for epoch in epoch_pbar:
            model.train()
            epoch_loss, epoch_mae, epoch_rmse = 0.0, 0.0, 0.0
            count = 0

            for pairs, offsets in dataloader:
                pairs = pairs.to(device)
                offsets = offsets.to(device)
                B = pairs.shape[0]

                preds = model(pairs)

                # === Compute loss ===
                loss = criterion(preds, offsets)

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                # === Metrics ===
                with torch.no_grad():
                    mae = torch.mean(torch.abs(preds - offsets)).item()
                    # The small epsilon keeps sqrt away from an exact zero argument.
                    rmse = torch.sqrt(torch.mean((preds - offsets) ** 2) + 1e-8).item()

                # Weight each per-batch mean by its batch size so a short final batch
                # (samples_per_epoch not divisible by batch_size) is not over-weighted.
                epoch_loss += loss.item() * B
                epoch_mae += mae * B
                epoch_rmse += rmse * B
                count += B

            if count == 0:
                raise ValueError("dataloader yielded no samples; cannot compute epoch averages")

            # === Epoch summary ===
            avg_loss = epoch_loss / count
            avg_mae = epoch_mae / count
            avg_rmse = epoch_rmse / count

            epoch_pbar.set_postfix({
                "loss": f"{avg_loss:.4f}",
                "mae": f"{avg_mae:.3f}px",
                "rmse": f"{avg_rmse:.3f}px"
            })

            # Scalars are logged against the 0-based epoch index.
            writer.add_scalar("Loss/MSE", avg_loss, epoch)
            writer.add_scalar("Error/MAE", avg_mae, epoch)
            writer.add_scalar("Error/RMSE", avg_rmse, epoch)

            # === Checkpoint every 1000 epochs and on the last epoch ===
            if (epoch + 1) % 1000 == 0 or (epoch + 1) == num_epochs:
                save_checkpoint(checkpoint_dir, epoch + 1, model, optimizer)

        # Save final model
        torch.save(model.state_dict(), model_file_name)
        print(f"✅ Final model saved: {model_file_name}")

    except KeyboardInterrupt:
        epoch_pbar.close()
        print(f"\n⚠️ Interrupted at epoch {epoch + 1}")
        # NOTE: the checkpoint is labelled epoch + 1 even though that epoch may have
        # been only partially trained when the interrupt arrived.
        save_checkpoint(checkpoint_dir, epoch + 1, model, optimizer)
        print("✅ Checkpoint saved")

    finally:
        # Runs on success, interrupt and error alike; closing the bar twice is harmless.
        epoch_pbar.close()
        writer.close()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

In [17]:
# # TRAIN Regressor
#
# PREPROCESSED_DIR = "datasets/val2017_preprocessed"
# num_epochs = 30000
# batch_size = 32
# learning_rate = 1e-4
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# print(f"Using device: {device}")
#
# model = HomographyRegressor(dropout_rate=0.1).to(device)
# criterion = nn.MSELoss()
#
# optimizer = optim.Adam(model.parameters(), lr=learning_rate)
#
# image_names = [
#     "000000002299.jpg",
#     #     # "000000000285.jpg",
#     #     # "000000000632.jpg",
# ]
# # images = get_images_from_names(image_names, PREPROCESSED_DIR)
# # images = get_random_images(1, image_dir=PREPROCESSED_DIR)
# images = get_all_images(PREPROCESSED_DIR)
#
# print(f"📷 Loaded {len(images)} image(s) for training")
#
# nn_train_regressor(
#     model=model,
#     num_epochs=num_epochs,
#     batch_size=batch_size,
#     samples_per_epoch=64,
#     model_file_name=f"h_regressor_all.pth",
#     images=images,
#     optimizer=optimizer,
#     criterion=criterion,
#     checkpoint_dir="checkpoints_homography_regressor_all"
# )

🚀 Starting training from scratch.


Training:  79%|███████▉  | 39555/50000 [07:09<01:53, 92.05it/s, loss=81.4260, rmse=9.024px]



⚠️ Interrupted at epoch 39556
✅ Checkpoint saved


In [13]:
# # TRAIN Classifier
#
# PREPROCESSED_DIR = "datasets/val2017_preprocessed"
# num_epochs = 30000
# samples_per_epoch = 128
# batch_size = 384
# learning_rate = 1e-3
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# print(f"Using device: {device}")
#
# model = HomographyClassifier(num_classes=21, class_dim=8, dropout_rate=0.1).to(device)
# criterion = nn.CrossEntropyLoss()
#
# optimizer = optim.Adam(model.parameters(), lr=learning_rate)
#
# # image_names = [
# #     "000000002299.jpg",
# #     #     # "000000000285.jpg",
# #     #     # "000000000632.jpg",
# # ]
# # images = get_images_from_names(image_names, PREPROCESSED_DIR)
# # images = get_random_images(image_dir=PREPROCESSED_DIR, num_images=16)
# images = get_all_images(image_dir=PREPROCESSED_DIR)
#
# print(f"📷 Loaded {len(images)} image(s) for training")
#
# dataset = HomographyPairDataset(images, samples_per_epoch=batch_size)
# dataloader = DataLoader(dataset, batch_size=batch_size, num_workers=0, pin_memory=True, shuffle=True)
#
# nn_train_classify(
#     model=model,
#     dataloader=dataloader,
#     num_epochs=num_epochs,
#     model_file_name=f"h_classify.pth",
#     optimizer=optimizer,
#     criterion=criterion,
#     checkpoint_dir="checkpoints_homography_classify_all"
# )

Using device: cuda
📷 Loaded 5000 image(s) for training
