In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from datetime import datetime
from pathlib import Path

import hydra
import torch
import wandb
from hydra.utils import instantiate
from omegaconf import DictConfig, OmegaConf

from opr.datasets.dataloader_factory import make_dataloaders
from opr.testing import test
from opr.training import epoch_loop
from opr.utils import flatten_dict, set_seed



In [3]:
from hydra import initialize, compose
from hydra.utils import instantiate

with initialize(version_base=None, config_path="configs"):
    # cfg = compose(config_name='config_nclt_text.yaml')  # only_text
    cfg = compose(config_name='nclt_text5_cam5.yaml')  # lidat + cam
    # cfg = compose(config_name='nclt_lidar_cam5.yaml')  # lidat + cam
    # cfg = compose(config_name='config.yaml')  # lidar + cam

In [4]:
cfg.dataset

{'dataset': {'_target_': 'opr.datasets.nclt.NCLTDataset', 'dataset_root': '/home/docker_opr/Datasets/NCLT_preprocessed', 'modalities': '${general.modalities}', 'images_subdir': 'lb3_small/Cam5', 'text_embs_dir': 'clip-vit-base-patch32', 'mink_quantization_size': 0.5, 'coords_limit': [-100, 100]}, 'sampler': {'_target_': 'opr.datasets.samplers.batch_sampler.BatchSampler', 'batch_size': 8, 'batch_size_limit': 160, 'batch_expansion_rate': 1.4, 'positives_per_group': 2, 'seed': '${general.seed}'}, 'num_workers': '${general.num_workers}'}

In [5]:
cfg.model

{'_target_': 'opr.models.base_models.ComposedModel', 'cloud_module': None, 'text_module': {'_target_': 'opr.models.base_models.MultiTextModule', 'text_module': {'_target_': 'opr.models.base_models.TextModule', 'text_emb_size': 512, 'hidden_size': 128}, 'fusion_module': {'_target_': 'opr.models.fusion.Add'}}, 'image_module': {'_target_': 'opr.models.base_models.MultiImageModule', 'image_module': {'_target_': 'opr.models.base_models.ImageModule', 'backbone': {'_target_': 'opr.models.resnet.ResNet18FPNExtractor', 'lateral_dim': 256, 'fh_num_bottom_up': 4, 'fh_num_top_down': 0, 'pretrained': True}, 'head': {'_target_': 'opr.models.layers.gem.GeM', 'p': 3, 'eps': 1e-06}}, 'fusion_module': {'_target_': 'opr.models.fusion.Add'}}, 'fusion_module': {'_target_': 'opr.models.fusion.Concat'}}

In [6]:
cfg.general

{'debug': False, 'seed': 31299, 'checkpoints_dir': 'checkpoints/', 'device': 'cuda', 'num_workers': 4, 'batch_expansion_th': 0.7, 'modalities': ['image_cam5', 'text_cam5'], 'test_modality': 'fusion', 'epochs': 120}

In [7]:
print("=> Instantiating model...")
model = instantiate(cfg.model)

print("=> Instantiating loss...")
loss_fn = instantiate(cfg.loss)

print("=> Making dataloaders...")
dataloaders = make_dataloaders(
    dataset_cfg=cfg.dataset.dataset,
    batch_sampler_cfg=cfg.dataset.sampler,
    num_workers=cfg.dataset.num_workers,
)

print("=> Instantiating optimizer...")
params_list = []
modalities = list(set([m.split("_")[0] for m in cfg.general.modalities]))
for modality in modalities:
    params_list.append(
        {
            "params": getattr(model, f"{modality}_module").parameters(),
            "lr": cfg.optimizer.learning_rates[f"{modality}_lr"],
        }
    )
optimizer = instantiate(cfg.optimizer.fn, params=params_list)
print("Instantiating scheduler...")
scheduler = instantiate(cfg.scheduler, optimizer=optimizer)

model = model.to(cfg.general.device)

=> Instantiating model...
=> Instantiating loss...
=> Making dataloaders...
=> Instantiating optimizer...
Instantiating scheduler...


In [8]:
batch, _, _ = next(iter(dataloaders["train"]))
# batch["text_emb_cam1"]

In [9]:
batch.keys()

dict_keys(['images_cam5', 'text_emb_cam5', 'utms'])

In [10]:
# dataloaders["train"].dataset[0].keys()

In [11]:
model, optimizer, modalities

(ComposedModel(
   (image_module): MultiImageModule(
     (image_module): ImageModule(
       (backbone): ResNet18FPNExtractor(
         (resnet_fe): ModuleList(
           (0): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
           (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
           (2): ReLU(inplace=True)
           (3): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
           (4): Sequential(
             (0): BasicBlock(
               (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
               (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
               (relu): ReLU(inplace=True)
               (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
               (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
             )


In [12]:
# if not cfg.general.debug and not cfg.wandb.disabled:
#     config_dict = OmegaConf.to_container(cfg, resolve=True, throw_on_missing=True)
#     wandb.init(
#         name=cfg.wandb.run_name,
#         project=cfg.wandb.project,
#         settings=wandb.Settings(start_method="thread"),
#         config=config_dict,
#     )
#     wandb.save(f"configs/{wandb.run.name}.yaml")
#     run_name = wandb.run.name
# else:
#     run_name = "debug"

In [14]:
# checkpoints_dir = (
#     Path(cfg.general.checkpoints_dir) / f"{datetime.now().strftime('%Y-%m-%d-%H-%M-%S')}_{run_name}"
# )
# if not checkpoints_dir.exists():
#     checkpoints_dir.mkdir(parents=True)

In [15]:
# best_recall_at_1 = 0.0

for epoch in range(60, 120):
# for epoch in range(cfg.general.epochs):
    print(f"\n\n=====> Epoch {epoch+1}:")
    # TODO: resolve mypy typing here
    train_batch_size = dataloaders["train"].batch_sampler.batch_size  # type: ignore
    val_batch_size = dataloaders["val"].batch_sampler.batch_size  # type: ignore

    print("\n=> Training:\n")

    train_stats, train_rate_non_zero = epoch_loop(
        dataloader=dataloaders["train"],
        model=model,
        loss_fn=loss_fn,
        optimizer=optimizer,
        scheduler=scheduler,
        phase="train",
        device=cfg.general.device,
    )

    print(f"\ntrain_rate_non_zero = {train_rate_non_zero}")

    batch_expansion_th = cfg.general.batch_expansion_th
    if batch_expansion_th is not None:
        if batch_expansion_th == 1.0:
            print("Batch expansion rate is set to every epoch. Increasing batch size.")
            # TODO: resolve mypy typing here
            dataloaders["train"].batch_sampler.expand_batch()  # type: ignore
        elif train_rate_non_zero is None:
            print(
                "\nWARNING: 'batch_expansion_th' was set, but 'train_rate_non_zero' is None. ",
                "The batch size was not expanded.",
            )
        elif train_rate_non_zero < batch_expansion_th:
            print(
                "Average non-zero triplet ratio is less than threshold: ",
                f"{train_rate_non_zero} < {batch_expansion_th}",
            )
            # TODO: resolve mypy typing here
            dataloaders["train"].batch_sampler.expand_batch()  # type: ignore

    print("\n=> Validating:\n")

    val_stats, val_rate_non_zero = epoch_loop(
        dataloader=dataloaders["val"],
        model=model,
        loss_fn=loss_fn,
        optimizer=optimizer,
        phase="val",
        device=cfg.general.device,
    )

    print(f"\nval_rate_non_zero = {val_rate_non_zero}")

    print("\n=> Testing:\n")

    recall_at_n, recall_at_one_percent, mean_top1_distance = test(
        model=model,
        descriptor_key=cfg.general.test_modality,
        dataloader=dataloaders["test"],
        device=cfg.general.device,
    )

    stats_dict = {}
    stats_dict["test"] = {
        "mean_top1_distance": mean_top1_distance,
        "recall_at_1%": recall_at_one_percent,
        "recall_at_1": recall_at_n[0],
        "recall_at_3": recall_at_n[2],
        "recall_at_5": recall_at_n[4],
        "recall_at_10": recall_at_n[9],
    }
    stats_dict["train"] = train_stats
    stats_dict["train"]["batch_size"] = train_batch_size
    stats_dict["val"] = val_stats
    stats_dict["val"]["batch_size"] = val_batch_size

    # saving checkpoints
    checkpoint_dict = {
        "epoch": epoch + 1,
        "config": cfg,
        "stats_dict": stats_dict,
        "model_state_dict": model.state_dict(),
        "optimizer_state_dict": optimizer.state_dict(),
    }
    torch.save(checkpoint_dict, checkpoints_dir / f"epoch_{epoch+1}.pth")
    # wandb logging
    if not cfg.general.debug and not cfg.wandb.disabled:
        wandb.log(flatten_dict(stats_dict))
        wandb.save(str((checkpoints_dir / f"epoch_{epoch+1}.pth").relative_to(".")))
    if recall_at_n[0] > best_recall_at_1:
        print("Recall@1 improved!")
        torch.save(checkpoint_dict, checkpoints_dir / "best.pth")
        best_recall_at_1 = recall_at_n[0]
        if not cfg.general.debug and not cfg.wandb.disabled:
            wandb.save(str((checkpoints_dir / "best.pth").relative_to(".")))



=====> Epoch 61:

=> Training:



train: 100%|██████████| 377/377 [00:22<00:00, 17.08it/s]


train_rate_non_zero = 0.9526256789187824

=> Validating:




val: 100%|██████████| 18/18 [00:07<00:00,  2.51it/s]



val_rate_non_zero = 0.9982374195102705

=> Testing:



Calculating test set descriptors: 100%|██████████| 36/36 [00:13<00:00,  2.68it/s]
Calculating metrics: 100%|██████████| 90/90 [00:08<00:00, 10.84it/s]

Mean Recall@N:
[0.364847   0.47737934 0.54198757 0.58595289 0.62002509 0.64851242
 0.67238183 0.69198659 0.71037164 0.72473293 0.73892659 0.75063084
 0.76093891 0.77060654 0.77845038 0.7872167  0.79509037 0.80268662
 0.80853372 0.81456342 0.82032649 0.82621596 0.83094287 0.83575654
 0.84083118]
Mean Recall@1% = 0.6395850355605354
Mean top-1 distance = 0.3866297509901749





NameError: name 'checkpoints_dir' is not defined

In [16]:
%debug    

> [0;32m/home/docker_opr/OpenPlaceRecognition/opr/models/fusion.py[0m(20)[0;36mforward[0;34m()[0m
[0;32m     18 [0;31m        [0;31m# assert "image" in data[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     19 [0;31m        [0;31m# assert "cloud" in data[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m---> 20 [0;31m        [0mfusion_global_descriptor[0m [0;34m=[0m [0mtorch[0m[0;34m.[0m[0mconcat[0m[0;34m([0m[0mlist[0m[0;34m([0m[0mdata[0m[0;34m.[0m[0mvalues[0m[0;34m([0m[0;34m)[0m[0;34m)[0m[0;34m,[0m [0mdim[0m[0;34m=[0m[0;36m1[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     21 [0;31m        [0;32mreturn[0m [0mfusion_global_descriptor[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     22 [0;31m[0;34m[0m[0m
[0m
None
tensor([[-0.0014,  0.0217, -0.0136,  ..., -0.0067, -0.1070,  0.0102],
        [ 0.0535, -0.0068, -0.0186,  ...,  0.0521, -0.0939,  0.2553],
        [-0.0908, -0.0039, -0.0207,  ...,  0.0003, -0.1779,  0.0565]

In [20]:
# data = {"text" : 1, 
#         "image" : 1, 
#         "cloud" : None}

data = {"text" : torch.Tensor(1, 1), 
        "image" : torch.Tensor(1, 1),
        "cloud" : None}

In [21]:
{key: value for key, value in data.items() if value is not None} 

{'text': tensor([[-3.8658e-22]]), 'image': tensor([[1.5766e-19]])}

In [14]:
import torch.nn as nn

class TextModule(nn.Module):
    def __init__(self, text_emb_size=100, hidden_size=100):
        super().__init__()
        self.fc1 = nn.Linear(text_emb_size, hidden_size)
        self.relu = nn.ReLU() 
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        
    def forward(self, embedding):
        x = self.fc1(embedding)
        x = self.relu(x)
        x = self.fc2(x)
        return x

In [15]:
model = TextModule()

In [18]:
model(batch["text_emb_cam1"])

tensor([[-9.6135e-03, -1.0577e-01,  8.7988e-02, -2.3182e-02,  1.1544e-01,
          1.1407e-04, -6.5322e-02,  4.3350e-03,  2.6404e-03, -1.9650e-02,
         -4.0838e-02,  6.1159e-02, -1.6879e-01,  9.8357e-02, -4.2016e-03,
          2.9686e-03,  1.2739e-03, -7.9826e-02, -5.5356e-02, -9.6956e-02,
          3.2541e-02,  8.7272e-02,  6.0897e-02, -1.0772e-01,  7.4620e-02,
         -5.5626e-02,  8.2666e-02, -2.2510e-02,  9.8700e-02,  7.0989e-02,
         -1.1105e-01,  7.0539e-02,  6.2379e-02, -1.8779e-02,  2.2416e-02,
         -9.5671e-02, -8.8208e-03, -2.8558e-02, -7.1731e-02, -1.8757e-02,
         -4.6004e-02,  5.4501e-02,  6.0142e-03, -1.6162e-02, -6.1365e-03,
         -5.0851e-02, -1.0993e-01, -1.0689e-01,  2.7982e-02,  2.3153e-02,
         -1.7139e-02, -2.7945e-02,  3.5807e-02, -5.4931e-02, -9.9132e-02,
         -1.0072e-01, -7.1896e-02, -1.9962e-02, -4.1751e-02,  2.4051e-02,
          7.1095e-03, -5.3357e-02, -1.1899e-01, -8.3624e-03,  7.1089e-02,
          1.4980e-02,  4.2275e-02, -4.

In [20]:
t = torch.Tensor(1, 100)
t

tensor([[ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          1.4013e-45,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          3.1389e-43,  0.0000e+00,  1.4013e-45,  0.0000e+00,  9.1844e-41,
          1.1551e-40,  1.1481e-41,  2.0739e-43,  1.0712e+29,  0.0000e+00,
          4.4982e-43,  0.0000e+00,  7.7751e+28,  0.0000e+00,  1.1743e+29,
          0.0000e+00,  1.4013e-45,  0.0000e+00,  0.0000e+00,  4.5849e-41,
          6.7262e-44,  0.0000e+00,  2.9147e-43,  0.0000e+00,  2.4687e+15,
          4.5850e-41,  0.0000e+00,  0.0000e+00,  1.4013e-45,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  7.7868e+28,  0.0000e+00,  2.0792e-36,  0.0000e+00,
         -7.8279e-28,  4.5849e-41,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.

In [21]:
t.requires_grad = True

In [22]:
t

tensor([[ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          1.4013e-45,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          3.1389e-43,  0.0000e+00,  1.4013e-45,  0.0000e+00,  9.1844e-41,
          1.1551e-40,  1.1481e-41,  2.0739e-43,  1.0712e+29,  0.0000e+00,
          4.4982e-43,  0.0000e+00,  7.7751e+28,  0.0000e+00,  1.1743e+29,
          0.0000e+00,  1.4013e-45,  0.0000e+00,  0.0000e+00,  4.5849e-41,
          6.7262e-44,  0.0000e+00,  2.9147e-43,  0.0000e+00,  2.4687e+15,
          4.5850e-41,  0.0000e+00,  0.0000e+00,  1.4013e-45,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  7.7868e+28,  0.0000e+00,  2.0792e-36,  0.0000e+00,
         -7.8279e-28,  4.5849e-41,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.