In [1]:
%load_ext autoreload
%autoreload 2

# Module for generating global vector representations of multimodal outdoor data

A module that implements an algorithm for generating global vector representations of multimodal data invariant to the change of seasons. Invariance is achieved through the use of LiDAR and modalities that rely on semantic information.

In [2]:
from time import time

import faiss

from tqdm import tqdm

from hydra.utils import instantiate
import numpy as np
from omegaconf import OmegaConf
from scipy.spatial.transform import Rotation
import torch
from torch.utils.data import DataLoader

from opr.datasets.itlp import ITLPCampus
from opr.pipelines.place_recognition import PlaceRecognitionPipeline

In [3]:
DATABASE_TRACK_DIR = "/home/docker_opr/Datasets/ITLP-Campus-data/subsampled_data/indoor/00_2023-10-25-night"
QUERY_TRACK_DIR = "/home/docker_opr/Datasets/ITLP-Campus-data/subsampled_data/indoor/01_2023-11-09-twilight"

SENSOR_SUITE = ["front_cam", "back_cam", "lidar"]

BATCH_SIZE = 64
NUM_WORKERS = 4
DEVICE = "cuda"

SOC_WEIGHTS_PATH = "../../weights/place_recognition/soc.pth"
MODEL_CONFIG_PATH = "../../configs/model/place_recognition/multi-image_multi-semantic_lidar_late-fusion.yaml"
WEIGHTS_PATH = "../../weights/place_recognition/multi-image_multi-semantic_lidar_late-fusion_nclt.pth"

DATASET_CONFIG_PATH = "../../configs/dataset/itlp.yaml"

In [4]:
def pose_to_matrix(pose):
    """From the 6D poses in the [tx ty tz qx qy qz qw] format to 4x4 pose matrices."""
    position = pose[:3]
    orientation_quat = pose[3:]
    rotation = Rotation.from_quat(orientation_quat)
    pose_matrix = np.eye(4)
    pose_matrix[:3,:3] = rotation.as_matrix()
    pose_matrix[:3,3] = position
    return pose_matrix


def compute_error(estimated_pose, gt_pose):
    """For the 6D poses in the [tx ty tz qx qy qz qw] format."""
    estimated_pose = pose_to_matrix(estimated_pose)
    gt_pose = pose_to_matrix(gt_pose)
    error_pose = np.linalg.inv(estimated_pose) @ gt_pose
    dist_error = np.sum(error_pose[:3, 3]**2) ** 0.5
    r = Rotation.from_matrix(error_pose[:3, :3])
    rotvec = r.as_rotvec()
    angle_error = (np.sum(rotvec**2)**0.5) * 180 / np.pi
    angle_error = abs(90 - abs(angle_error-90))
    return dist_error, angle_error

## Prepare faiss database index

In [5]:
from hydra import compose, initialize
from omegaconf import OmegaConf

with initialize(version_base=None, config_path="../../configs"):
    cfg = compose(config_name="train_soc")

db_dataset_cfg = cfg.dataset
db_dataset_cfg.dataset_root = DATABASE_TRACK_DIR
db_dataset_cfg.csv_file = "track.csv"
db_dataset_cfg.sensors = SENSOR_SUITE
db_dataset_cfg.load_semantics = True
db_dataset = instantiate(db_dataset_cfg, subset="test")
# ITLPCampus(
#     dataset_root=DATABASE_TRACK_DIR,
#     sensors=["front_cam", "back_cam", "lidar"],
#     mink_quantization_size=0.5,
#     load_semantics=True,
#     subset="test",
#     test_split=[2, 4],
# )

In [39]:
db_dataset.dataset_df

Unnamed: 0,index,floor,timestamp,front_cam_ts,back_cam_ts,lidar_ts,tx,ty,tz,qx,qy,qz,qw,in_query
0,241,2,1698264504041122048,1698264504002182709,1698264504047208484,1698264504047677052,0.936002,-0.005618,99.978601,-0.002076,0.001280,-0.068510,0.997647,True
1,242,2,1698264507070846464,1698264507119317501,1698264507057790564,1698264507073838801,1.912760,0.557575,100.056236,-0.002415,0.004182,-0.037406,0.999288,True
2,243,2,1698264508581344000,1698264508585594131,1698264508578256338,1698264508589850480,2.830138,0.239738,100.065229,-0.003375,-0.008576,-0.475835,0.879486,True
3,244,2,1698264509993314304,1698264510033376489,1698264510019705506,1698264509997810962,3.150359,-0.712376,100.126280,0.000956,-0.018858,-0.707919,0.706042,True
4,245,2,1698264511203823360,1698264511217856467,1698264511206572579,1698264511206429225,3.112751,-1.740502,100.169964,0.008021,-0.017280,-0.785810,0.618174,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
520,1001,4,1698264217389181184,1698264217449162733,1698264217367434022,1698264217393970056,2.521885,3.035713,299.942319,-0.012613,-0.006326,0.697654,-0.716296,True
521,1002,4,1698264218601385472,1698264218566726750,1698264218595017170,1698264218607669071,2.589945,1.982781,299.957112,-0.015227,-0.004615,0.801606,-0.597641,True
522,1003,4,1698264220012621312,1698264219980302808,1698264219999798153,1698264220019656143,2.090019,1.050351,299.872502,-0.022825,0.007833,0.942468,-0.333424,True
523,1004,4,1698264222131331840,1698264222180504333,1698264222108331975,1698264222134351042,1.265012,0.510816,299.879513,-0.019465,0.008826,0.984409,-0.174590,True


In [7]:
db_dataloader = DataLoader(
    db_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=NUM_WORKERS,
    collate_fn=db_dataset.collate_fn,
)


In [9]:
soc_model_config = cfg.model
soc_model = instantiate(soc_model_config)
soc_model_weights = torch.load(SOC_WEIGHTS_PATH)["model_state_dict"]
soc_model.load_state_dict(soc_model_weights, strict=True)
soc_model = soc_model.to(DEVICE)
soc_model.eval();
soc_model

SOCMLPMixer(
  (mlp_mixer): Sequential(
    (0): Rearrange('b c (h p1) (w p2) -> b (h w) (p1 p2 c)', p1=1, p2=1)
    (1): Linear(in_features=30, out_features=64, bias=True)
    (2): Sequential(
      (0): PreNormResidual(
        (fn): Sequential(
          (0): Conv1d(72, 288, kernel_size=(1,), stride=(1,))
          (1): GELU(approximate=none)
          (2): Dropout(p=0.0, inplace=False)
          (3): Conv1d(288, 72, kernel_size=(1,), stride=(1,))
          (4): Dropout(p=0.0, inplace=False)
        )
        (norm): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
      )
      (1): PreNormResidual(
        (fn): Sequential(
          (0): Linear(in_features=64, out_features=32, bias=True)
          (1): GELU(approximate=none)
          (2): Dropout(p=0.0, inplace=False)
          (3): Linear(in_features=32, out_features=64, bias=True)
          (4): Dropout(p=0.0, inplace=False)
        )
        (norm): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
      )
    )
    (3

In [11]:
model_config = OmegaConf.load(MODEL_CONFIG_PATH)
model = instantiate(model_config)
model.load_state_dict(torch.load(WEIGHTS_PATH), strict=True)
model.soc_module = soc_model
model = model.to(DEVICE)
model.eval();

In [12]:
model.fusion_module

Concat()

In [13]:
descriptors = []
with torch.no_grad():
    for batch in tqdm(db_dataloader):
        batch = {k: v.to(DEVICE) for k, v in batch.items()}
        print(batch.keys())
        final_descriptor = model(batch)["final_descriptor"]
        print(final_descriptor.shape)
        descriptors.append(final_descriptor.detach().cpu().numpy())

descriptors = np.concatenate(descriptors, axis=0)

  pc = torch.tensor(pc, dtype=torch.float32)
  pc = torch.tensor(pc, dtype=torch.float32)
  pc = torch.tensor(pc, dtype=torch.float32)
  pc = torch.tensor(pc, dtype=torch.float32)


  pc = torch.tensor(pc, dtype=torch.float32)
  pc = torch.tensor(pc, dtype=torch.float32)
  pc = torch.tensor(pc, dtype=torch.float32)
  pc = torch.tensor(pc, dtype=torch.float32)
  pc = torch.tensor(pc, dtype=torch.float32)
  pc = torch.tensor(pc, dtype=torch.float32)
  pc = torch.tensor(pc, dtype=torch.float32)
  pc = torch.tensor(pc, dtype=torch.float32)
  pc = torch.tensor(pc, dtype=torch.float32)
  pc = torch.tensor(pc, dtype=torch.float32)
  pc = torch.tensor(pc, dtype=torch.float32)
  pc = torch.tensor(pc, dtype=torch.float32)
  pc = torch.tensor(pc, dtype=torch.float32)
  pc = torch.tensor(pc, dtype=torch.float32)
  pc = torch.tensor(pc, dtype=torch.float32)
  pc = torch.tensor(pc, dtype=torch.float32)
  pc = torch.tensor(pc, dtype=torch.float32)
  pc = torch.tensor(pc, dtype=torch.float32)
  pc = torch.tensor(pc, dtype=torch.float32)
  pc = torch.tensor(pc, dtype=torch.float32)
  pc = torch.tensor(pc, dtype=torch.float32)
  pc = torch.tensor(pc, dtype=torch.float32)
  pc = tor

dict_keys(['idxs', 'poses', 'images_front_cam', 'masks_front_cam', 'images_back_cam', 'masks_back_cam', 'pointclouds_lidar_coords', 'pointclouds_lidar_feats', 'soc'])


  pc = torch.tensor(pc, dtype=torch.float32)
  pc = torch.tensor(pc, dtype=torch.float32)
  pc = torch.tensor(pc, dtype=torch.float32)
  pc = torch.tensor(pc, dtype=torch.float32)
  pc = torch.tensor(pc, dtype=torch.float32)
  pc = torch.tensor(pc, dtype=torch.float32)
  pc = torch.tensor(pc, dtype=torch.float32)
  pc = torch.tensor(pc, dtype=torch.float32)
  pc = torch.tensor(pc, dtype=torch.float32)
  pc = torch.tensor(pc, dtype=torch.float32)
  pc = torch.tensor(pc, dtype=torch.float32)
  pc = torch.tensor(pc, dtype=torch.float32)
  pc = torch.tensor(pc, dtype=torch.float32)
  pc = torch.tensor(pc, dtype=torch.float32)
  pc = torch.tensor(pc, dtype=torch.float32)
  pc = torch.tensor(pc, dtype=torch.float32)
  pc = torch.tensor(pc, dtype=torch.float32)
  pc = torch.tensor(pc, dtype=torch.float32)
  pc = torch.tensor(pc, dtype=torch.float32)
  pc = torch.tensor(pc, dtype=torch.float32)
  pc = torch.tensor(pc, dtype=torch.float32)
  pc = torch.tensor(pc, dtype=torch.float32)
  pc = tor

torch.Size([64, 1024])
dict_keys(['idxs', 'poses', 'images_front_cam', 'masks_front_cam', 'images_back_cam', 'masks_back_cam', 'pointclouds_lidar_coords', 'pointclouds_lidar_feats', 'soc'])


  pc = torch.tensor(pc, dtype=torch.float32)
  pc = torch.tensor(pc, dtype=torch.float32)
  pc = torch.tensor(pc, dtype=torch.float32)


torch.Size([64, 1024])
dict_keys(['idxs', 'poses', 'images_front_cam', 'masks_front_cam', 'images_back_cam', 'masks_back_cam', 'pointclouds_lidar_coords', 'pointclouds_lidar_feats', 'soc'])


  pc = torch.tensor(pc, dtype=torch.float32)
  pc = torch.tensor(pc, dtype=torch.float32)


torch.Size([64, 1024])
dict_keys(['idxs', 'poses', 'images_front_cam', 'masks_front_cam', 'images_back_cam', 'masks_back_cam', 'pointclouds_lidar_coords', 'pointclouds_lidar_feats', 'soc'])


  pc = torch.tensor(pc, dtype=torch.float32)
  pc = torch.tensor(pc, dtype=torch.float32)
  pc = torch.tensor(pc, dtype=torch.float32)
  pc = torch.tensor(pc, dtype=torch.float32)


torch.Size([64, 1024])


  pc = torch.tensor(pc, dtype=torch.float32)
  pc = torch.tensor(pc, dtype=torch.float32)
  pc = torch.tensor(pc, dtype=torch.float32)
  pc = torch.tensor(pc, dtype=torch.float32)
  pc = torch.tensor(pc, dtype=torch.float32)
  pc = torch.tensor(pc, dtype=torch.float32)
  pc = torch.tensor(pc, dtype=torch.float32)
  pc = torch.tensor(pc, dtype=torch.float32)
  pc = torch.tensor(pc, dtype=torch.float32)
  pc = torch.tensor(pc, dtype=torch.float32)
  pc = torch.tensor(pc, dtype=torch.float32)
  pc = torch.tensor(pc, dtype=torch.float32)
  pc = torch.tensor(pc, dtype=torch.float32)
  pc = torch.tensor(pc, dtype=torch.float32)
  pc = torch.tensor(pc, dtype=torch.float32)
  pc = torch.tensor(pc, dtype=torch.float32)
  pc = torch.tensor(pc, dtype=torch.float32)
  pc = torch.tensor(pc, dtype=torch.float32)
  pc = torch.tensor(pc, dtype=torch.float32)
  pc = torch.tensor(pc, dtype=torch.float32)
  pc = torch.tensor(pc, dtype=torch.float32)
  pc = torch.tensor(pc, dtype=torch.float32)
  pc = tor

dict_keys(['idxs', 'poses', 'images_front_cam', 'masks_front_cam', 'images_back_cam', 'masks_back_cam', 'pointclouds_lidar_coords', 'pointclouds_lidar_feats', 'soc'])
torch.Size([64, 1024])


  pc = torch.tensor(pc, dtype=torch.float32)
  pc = torch.tensor(pc, dtype=torch.float32)
  pc = torch.tensor(pc, dtype=torch.float32)
 67%|██████▋   | 6/9 [00:56<00:22,  7.53s/it]

dict_keys(['idxs', 'poses', 'images_front_cam', 'masks_front_cam', 'images_back_cam', 'masks_back_cam', 'pointclouds_lidar_coords', 'pointclouds_lidar_feats', 'soc'])
torch.Size([64, 1024])


  pc = torch.tensor(pc, dtype=torch.float32)
 78%|███████▊  | 7/9 [00:56<00:10,  5.14s/it]

dict_keys(['idxs', 'poses', 'images_front_cam', 'masks_front_cam', 'images_back_cam', 'masks_back_cam', 'pointclouds_lidar_coords', 'pointclouds_lidar_feats', 'soc'])
torch.Size([64, 1024])


  pc = torch.tensor(pc, dtype=torch.float32)
  pc = torch.tensor(pc, dtype=torch.float32)


dict_keys(['idxs', 'poses', 'images_front_cam', 'masks_front_cam', 'images_back_cam', 'masks_back_cam', 'pointclouds_lidar_coords', 'pointclouds_lidar_feats', 'soc'])
torch.Size([64, 1024])


  pc = torch.tensor(pc, dtype=torch.float32)
  pc = torch.tensor(pc, dtype=torch.float32)
  pc = torch.tensor(pc, dtype=torch.float32)
  pc = torch.tensor(pc, dtype=torch.float32)
  pc = torch.tensor(pc, dtype=torch.float32)
  pc = torch.tensor(pc, dtype=torch.float32)
100%|██████████| 9/9 [00:58<00:00,  6.46s/it]

dict_keys(['idxs', 'poses', 'images_front_cam', 'masks_front_cam', 'images_back_cam', 'masks_back_cam', 'pointclouds_lidar_coords', 'pointclouds_lidar_feats', 'soc'])
torch.Size([13, 1024])





In [14]:
print(descriptors.shape)

(525, 1024)


In [15]:
index = faiss.IndexFlatL2(descriptors.shape[1])
index.add(descriptors)

In [16]:
faiss.write_index(index, DATABASE_TRACK_DIR + "/index.faiss")


## Test

In [40]:
pipe = PlaceRecognitionPipeline(
    database_dir=DATABASE_TRACK_DIR,
    model=model,
    model_weights_path=None,
    device=DEVICE,
)

pipe.database_df = db_dataset.dataset_df  # !!!!!

In [41]:
query_dataset_cfg = cfg.dataset
query_dataset_cfg.dataset_root = QUERY_TRACK_DIR
query_dataset_cfg.csv_file = "track.csv"
query_dataset_cfg.sensors = SENSOR_SUITE
query_dataset_cfg.load_semantics = True
query_dataset = instantiate(query_dataset_cfg, subset="test")

# query_dataset = ITLPCampus(
#     dataset_root=QUERY_TRACK_DIR,
#     sensors=SENSOR_SUITE,
#     mink_quantization_size=0.5,
#     load_semantics=True,
# )


In [42]:
query_dataset.dataset_df


Unnamed: 0,index,floor,timestamp,front_cam_ts,back_cam_ts,lidar_ts,tx,ty,tz,qx,qy,qz,qw,in_query
0,239,2,1699538667594320384,1699538667595033775,1699538667576960360,1699538667597536130,0.178382,0.067620,100.008046,0.001788,0.002456,0.056518,0.998397,True
1,240,2,1699538671628715008,1699538671601782558,1699538671650855565,1699538671631259615,1.178769,0.125520,100.219715,0.006708,0.076623,0.006842,0.997014,True
2,241,2,1699538675763979776,1699538675794472055,1699538675788658434,1699538675766731756,2.128260,0.192668,100.084871,0.009729,0.004442,0.004434,0.999933,True
3,242,2,1699538676974509824,1699538676993606473,1699538676990979660,1699538676978077620,3.114585,0.151316,100.129671,0.013395,0.006937,-0.371526,0.928300,True
4,243,2,1699538678386366208,1699538678393822673,1699538678385266571,1699538678388709906,3.391258,-0.801008,100.086859,0.017389,-0.006268,-0.712724,0.701201,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
509,986,4,1699540505373902336,1699540505424862712,1699540505399149612,1699540505377875178,2.993672,1.640322,300.304113,0.004784,-0.001289,0.783418,-0.621475,True
510,987,4,1699540506785989376,1699540506798236499,1699540506802697189,1699540506789051673,2.515123,0.767124,300.452379,0.016930,0.011591,0.945305,-0.325542,True
511,988,4,1699540509711051008,1699540509693246831,1699540509741109741,1699540509713791844,1.589661,0.461763,300.494753,0.015372,0.017912,0.998429,-0.050819,True
512,989,4,1699540512434140160,1699540512393776793,1699540512416448390,1699540512436590628,0.584536,0.380240,300.509881,0.004482,0.020040,0.999061,-0.038159,True


In [None]:
PR_THRESHOLD = 25.0

pr_matches = []
dist_errors = []
angle_errors = []
times = []

ns1 = []
ns2 = []
ns3 = []
ns4 = []

i = 0
for query in tqdm(query_dataset):
    t = time()
    output = pipe.infer(query)
    times.append(time() - t)
    print(f"{i} : {output['idx']}")
    i += 1
    n1, n2, n3, n4 = np.linalg.norm(output["descriptor"][:256]), np.linalg.norm(output["descriptor"][256:512]), np.linalg.norm(output["descriptor"][512:768]), np.linalg.norm(output["descriptor"][768:])
    ns1.append(n1)
    ns2.append(n2)
    ns3.append(n3)
    ns4.append(n4)
    dist_error, angle_error = compute_error(output["pose"], query["pose"])
    print(f"dist_error: {dist_error}, angle_error: {angle_error}")
    pr_matches.append(dist_error < PR_THRESHOLD)
    dist_errors.append(dist_error)
    angle_errors.append(angle_error)

times = times[1:]  # the first query is always slower

In [44]:
print(np.mean(ns1), np.mean(ns2), np.mean(ns3), np.mean(ns4))

4.543165 3.3033552 9.375395 0.036797244


In [45]:
print(f"Recall@1: {(np.mean(pr_matches))*100:.2f}")
print(f"Mean distance error: {np.mean(dist_errors):.2f}, mean angle error: {np.mean(angle_errors):.2f}")
print(f"Median distance error: {np.median(dist_errors):.2f}, median angle error: {np.median(angle_errors):.2f}")

Recall@1: 91.25
Mean distance error: 14.54, mean angle error: 8.69
Median distance error: 0.92, median angle error: 4.73


In [46]:
print(f"Mean inference time: {np.mean(times)*1000:.2f} ms, median inference time: {np.median(times)*1000:.2f} ms")

Mean inference time: 28.00 ms, median inference time: 26.34 ms
