## Create KittiDatasetCustom 

In [None]:
from pcdet.datasets.kitti.kitti_dataset_custom import *
from pcdet.datasets.dataset import *
import yaml
from easydict import EasyDict
from pathlib import Path
from pcdet.utils import common_utils

# Load the dataset YAML inside a context manager so the file handle is closed
# deterministically instead of being left open until garbage collection.
with open('/home/rlab10/OpenPCDet/tools/cfgs/dataset_configs/kitti_dataset_custom.yaml') as cfg_file:
    dataset_cfg = EasyDict(yaml.safe_load(cfg_file))
class_names = ['Car', 'Pedestrian', 'Cyclist']
file_path = '/home/rlab10/OpenPCDet/pcdet/datasets/kitti/kitti_dataset_custom.py'
# Three directory levels above the folder that contains kitti_dataset_custom.py.
ROOT_DIR = (Path(file_path).resolve().parent / '../../../').resolve()
data_path = ROOT_DIR / 'data' / 'kitti' # raw data path
save_path = ROOT_DIR / 'data' / 'kitti'
kitti_infos = []
# Number of entries in the config's source feature list.
num_features = len(dataset_cfg.POINT_FEATURE_ENCODING.src_feature_list)

def create_kitti_infos(dataset_cfg, class_names, data_path, save_path, workers=4):
    """Write the KITTI info pickles and build the ground-truth database.

    In this order the function writes the train infos, the val dataset, the
    concatenation of both and the test infos below ``save_path``, pausing three
    seconds after each file, and finally calls ``create_groundtruth_database``
    on the train info file.

    Args:
        dataset_cfg: Dataset config; ``POINT_FEATURE_ENCODING.src_feature_list``
            and ``WITH_BEAM_LABEL`` are read from it.
        class_names: Class names passed to the dataset constructor, to
            ``get_infos_val`` and, as ``used_classes``, to the database builder.
        data_path: Root path handed to ``KittiDatasetCustom``.
        save_path: Path object of the directory that receives the ``.pkl`` files.
        workers: Worker count forwarded to the info collection calls.
    """
    from time import sleep

    kitti_set = KittiDatasetCustom(
        dataset_cfg=dataset_cfg,
        class_names=class_names,
        root_path=data_path,
        training=False,
        logger=common_utils.create_logger(),
    )

    train_split, val_split, test_split = 'train', 'val', 'test'
    feature_count = len(dataset_cfg.POINT_FEATURE_ENCODING.src_feature_list)

    # Output files, all located directly below save_path.
    train_pkl = save_path / ('kitti_infos_%s.pkl' % train_split)
    val_pkl = save_path / ('kitti_%s_dataset.pkl' % val_split)
    trainval_pkl = save_path / ('kitti_infos_%s%s.pkl' % (train_split, val_split))
    test_pkl = save_path / ('kitti_infos_%s.pkl' % test_split)

    print('\n' + '-' * 36 + 'Start to generate data infos' + '-' * 37)
    print('---------------CAUTION: Custom code is configured to serve as Augmentor NOT training-----------------')

    # --- train infos ---
    # NOTE (from the original author): get_infos() collects infos about all
    # classes except 'DontCare'; unwanted classes are filtered later through
    # `used_classes` in create_groundtruth_database.
    kitti_set.set_split(train_split)
    train_infos = kitti_set.get_infos(
        num_workers=workers,
        has_label=True,
        count_inside_pts=True,
        num_features=feature_count,
    )
    with open(train_pkl, 'wb') as out_file:
        pickle.dump(train_infos, out_file)
    print('Kitti info train file is saved to %s\n' % train_pkl)
    sleep(3)

    # --- val dataset ---
    kitti_set.set_split(val_split)
    kitti_set.training = False
    val_dataset = kitti_set.get_infos_val(
        num_workers=workers,
        has_label=True,
        count_inside_pts=True,
        num_features=feature_count,
        class_names=class_names,
        with_beam_label=dataset_cfg.WITH_BEAM_LABEL,
    )
    with open(val_pkl, 'wb') as out_file:
        pickle.dump(val_dataset, out_file)
    print('Kitti info val dataset is saved to %s\n' % val_pkl)
    sleep(3)

    # --- train + val ---
    with open(trainval_pkl, 'wb') as out_file:
        pickle.dump(train_infos + val_dataset, out_file)
    print('Kitti info trainval file is saved to %s\n' % trainval_pkl)
    sleep(3)

    # --- test infos (no labels, no point counting) ---
    kitti_set.set_split(test_split)
    test_infos = kitti_set.get_infos(num_workers=workers, has_label=False, count_inside_pts=False)
    with open(test_pkl, 'wb') as out_file:
        pickle.dump(test_infos, out_file)
    print('Kitti info test file is saved to %s\n' % test_pkl)
    sleep(3)

    print('\n---------------Start creating groundtruth database for later data augmentation-------------------------')
    print('---------------CAUTION: Custom code is configured to serve as Augmentor NOT training-------------------')
    print('---------------No DataProcessor and PointFeatureEncoder required, handled by training data creation----')

    # The train info file written above is the input of the gt database.
    kitti_set.set_split(train_split)
    kitti_set.create_groundtruth_database(
        info_path=train_pkl,
        used_classes=class_names,
        split=train_split,
        with_beam_labels=dataset_cfg.WITH_BEAM_LABEL,
    )
    print(f'---------------These groundtruth {train_split} objects are randomly inserted into samples (augmentation)-------')
    print('-' * 41 + 'Data preparation Done' + '-' * 41)

def save_data_list_kitti(data_list=None, save_path=None, root_path=None, sample_id_list=None, augmentors=None):
    """Pickle the (original + augmented) KITTI training samples.

    The object is written to ``<save_path>/kitti_train_dataset.pkl`` with the
    highest available pickle protocol.

    Args:
        data_list: Object to serialise.
        save_path: Target directory, given as ``str`` or path object. It is
            created (including parents) when it does not exist yet.
        root_path: Unused; kept for backward compatibility.
        sample_id_list: Unused; kept for backward compatibility.
        augmentors: Unused; kept for backward compatibility.

    Raises:
        TypeError: If ``save_path`` is ``None``.
    """
    # Earlier versions derived root_path / split / split_dir from the global
    # dataset_cfg without ever using them; that dead code (and the hidden
    # dependency on the global) has been removed.
    target_dir = Path(save_path)
    target_dir.mkdir(parents=True, exist_ok=True)

    train_split = 'train'
    train_filename = target_dir / ('kitti_%s_dataset.pkl' % train_split)

    print('\n' + '-' * 35 + 'Start to save data infos(original+augmented)' + '-' * 37)

    with open(train_filename, 'wb') as f:
        pickle.dump(data_list, f, protocol=pickle.HIGHEST_PROTOCOL)

    print('Kitti info train/aug file is saved to %s' % train_filename)
    print('-' * 49 + 'Data saving Done' + '-' * 51 + '\n')


# Step 1: Create the data_infos; only the validation data_infos and the gt_database are important.
# The val data gets post-processed through DataProcessor, PointFeatureEncoder, also includes points (w FoV).
# The gt_database is necessary for successfully creating augmented training samples.
# NOTE: this call writes the .pkl files below save_path and sleeps after each file.
create_kitti_infos(dataset_cfg, class_names, data_path, save_path, workers=32)

# Step 2: Create the training set with data augmentation (a fresh dataset instance with training=True)
dataset = KittiDatasetCustom(dataset_cfg=dataset_cfg, class_names=class_names, root_path=data_path, training=True, logger=common_utils.create_logger()) # the training flag allows data augmentation before training

# Step 3: Call the member method to collect the infos and attach them to the dataset object
dataset.dataset_w_all_infos = dataset.get_infos(num_workers=32, has_label=True, count_inside_pts=True, num_features=num_features)

## Save KittiDatasetCustom

In [None]:
import gc
from IPython import get_ipython

# Step 4: materialise every sample of the dataset and save the result.
dataset_as_list = []
# Pre-bind both names so the save call and the cleanup below do not raise
# NameError when the dataset is empty and the loop body never runs.
data = applied_augmentors = None

for idx in range(len(dataset)):
    # Each item is unpacked as (sample data, applied augmentors).
    data, applied_augmentors = dataset[idx]
    dataset_as_list.append(data)

gc.collect()

save_data_list_kitti(data_list=dataset_as_list, save_path=save_path, root_path=None, sample_id_list=None, augmentors=applied_augmentors)

# clean up variables after saving
del dataset, dataset_as_list
del data, applied_augmentors
gc.collect()

# Clear the interactive namespace when running under IPython.
# NOTE: `%reset -sf` only wipes the variables; it does not restart the kernel.
ipython_shell = get_ipython()
if ipython_shell:
    ipython_shell.run_line_magic('reset', '-sf')

## Create KittiDatasetCustom with range-based densification

In [None]:
from pcdet.datasets.augmentor.data_augmentor import DataAugmentor
from pcdet.utils import common_utils

import yaml
from easydict import EasyDict
from pathlib import Path
import pickle
from tqdm import tqdm
import gc
gc.enable()

# Load the dataset YAML inside a context manager so the file handle is closed
# deterministically instead of being left open until garbage collection.
with open('/home/rlab10/OpenPCDet/tools/cfgs/dataset_configs/DG_KITTI/kitti_dataset_custom_densification.yaml') as cfg_file:
    dataset_cfg = EasyDict(yaml.safe_load(cfg_file))
class_names = ['Car', 'Pedestrian', 'Cyclist']
file_path = '/home/rlab10/OpenPCDet/pcdet/datasets/kitti/kitti_dataset_custom.py'
ROOT_DIR = (Path(file_path).resolve().parent / '../../../').resolve()
data_path = ROOT_DIR / 'data' / 'kitti' # raw data path
save_path = ROOT_DIR / 'data' / 'kitti' / 'Domain Generalization' / 'densification'

# Parameters of the first configured augmentor; used for file names and logging only.
aug_cfg = dataset_cfg.DATA_AUGMENTOR['AUG_CONFIG_LIST'][0]
name = aug_cfg.get('NAME')
num_point_copies = aug_cfg.get('NUM_POINT_COPIES', 3)
delta_r = aug_cfg.get('DELTA_R_RANGE', [0.05, 0.1])

# Input pickles are read from data_path, densified outputs go to save_path.
train_split, val_split = 'train', 'val'
train_filename = data_path / ('kitti_%s_dataset.pkl' % train_split)
dens_train_filename = save_path / ('kitti_%s_dataset_%sx_densified.pkl' % (train_split, num_point_copies))
val_filename = data_path / ('kitti_%s_dataset.pkl' % val_split)
dens_val_filename = save_path / ('kitti_%s_dataset_%sx_densified.pkl' % (val_split, num_point_copies))

augmentor = DataAugmentor(root_path=data_path, augmentor_configs=dataset_cfg.DATA_AUGMENTOR, class_names=class_names, logger=common_utils.create_logger())
if num_point_copies and delta_r and augmentor.logger is not None:
    augmentor.logger.info('Range based densification enabled with Δr %s and num_copies %d' % (str(delta_r), num_point_copies))

# training 
# with open(train_filename, 'rb') as f:
#     data_list = pickle.load(f)

# # Reduce point precision, bc .pkl is gettin really big
# for sample in data_list:
#     for data_dict in sample:
#         if 'points' in data_dict:
#             data_dict['points'] = data_dict['points'].astype('float16')

# for sample in tqdm(data_list, desc="Samples"):
#     #print(f"Processing frame_id: {sample[0].get('frame_id', 'N/A')}")
#     for i, data_dict in enumerate(sample):
#         data_dict.pop('cam_info')
#         data_dict.pop('lidar_aug_matrix')
#         data_dict.pop('use_lead_xyz')
#         data_dict.pop('flip_x', None)
#         data_dict.pop('num_aug_beams', None)
#         data_dict.pop('noise_world_rotation', None)
#         data_dict.pop('noise_local_rotation', None)
#         data_dict.pop('noise_local_scaling', None)
#         data_dict.pop('noise_world_translation', None)
#         for aug_func in augmentor.data_augmentor_queue:
#             sample[i] = aug_func(data_dict)

# with open(dens_train_filename, 'wb') as f:
#         pickler = pickle.Pickler(f, protocol=pickle.HIGHEST_PROTOCOL)
#         pickler.dump(data_list)
#         print('Kitti info train dataset densified is saved to %s\n' % dens_train_filename)
#         pickler.clear_memo()

# # empty RAM
# del data_list
# gc.collect()

# validation
# NOTE: pickle.load can execute arbitrary code; only load pickles created by this pipeline.
with open(val_filename, 'rb') as f:
    data_list = pickle.load(f)

# Store the augmentor output back into the list. Rebinding only the loop
# variable would silently drop the result whenever an augmentor returns a new
# object instead of mutating its input in place.
for sample_idx, sample in enumerate(tqdm(data_list, desc="Samples")):
    for aug_func in augmentor.data_augmentor_queue:
        sample = aug_func(sample)
    data_list[sample_idx] = sample

# Make sure the output directory exists before writing.
dens_val_filename.parent.mkdir(parents=True, exist_ok=True)
with open(dens_val_filename, 'wb') as f:
    pickle.dump(data_list, f, protocol=pickle.HIGHEST_PROTOCOL)
print('Kitti info val dataset densified is saved to %s\n' % dens_val_filename)

## Create KittiDatasetCustom with Random Beam Re-Sampling (RBRS)

In [None]:
from pcdet.datasets.augmentor.data_augmentor import DataAugmentor
from pcdet.utils import common_utils

import yaml
from easydict import EasyDict
from pathlib import Path
import pickle
from tqdm import tqdm
import gc
gc.enable()

# Load the dataset YAML inside a context manager so the file handle is closed
# deterministically instead of being left open until garbage collection.
with open('/home/rlab10/OpenPCDet/tools/cfgs/dataset_configs/DG_KITTI/kitti_dataset_custom_rbrs.yaml') as cfg_file:
    dataset_cfg = EasyDict(yaml.safe_load(cfg_file))
class_names = ['Car', 'Pedestrian', 'Cyclist']
file_path = '/home/rlab10/OpenPCDet/pcdet/datasets/kitti/kitti_dataset_custom.py'
ROOT_DIR = (Path(file_path).resolve().parent / '../../../').resolve()
data_path = ROOT_DIR / 'data' / 'kitti'
save_path = ROOT_DIR / 'data' / 'kitti' / 'Domain Generalization' / 'random beam re-sampling'

# Input pickles are read from data_path, re-sampled outputs go to save_path.
train_split, val_split = 'train', 'val'
train_filename = data_path / ('kitti_%s_dataset.pkl' % train_split)
rbrs_train_filename = save_path / ('kitti_%s_dataset_rbrs.pkl' % train_split)
val_filename = data_path / ('kitti_%s_dataset.pkl' % val_split)
rbrs_val_filename = save_path / ('kitti_%s_dataset_rbrs.pkl' % val_split)

# Parameters of the first configured augmentor; used for logging only.
aug_cfg = dataset_cfg.DATA_AUGMENTOR['AUG_CONFIG_LIST'][0]
name = aug_cfg.get('NAME')
upsampling_prob = aug_cfg.get('BEAM_UPSAMPLE_PROB', 1)
phi = aug_cfg.get('PHI_THRESHOLD', 0.00301592894)
# Named r_threshold rather than `range`: a global called `range` shadows the
# builtin and breaks every later notebook cell that calls range(...).
r_threshold = aug_cfg.get('R_THRESHOLD', 2.0)
num_interp_beams = aug_cfg.get('NUM_INTERP_BEAMS', 1)
num_workers = 32

augmentor = DataAugmentor(root_path=data_path, augmentor_configs=dataset_cfg.DATA_AUGMENTOR, class_names=class_names, logger=common_utils.create_logger())
if phi and r_threshold and augmentor.logger is not None:
    augmentor.logger.info('Random beam re-sampling enabled with upsample prob: %s, φ %s, r: %s and num. interp. beams: %s' % (upsampling_prob, phi, r_threshold, num_interp_beams))

# train
# with open(train_filename, 'rb') as f:
#     data_list = pickle.load(f)

# for sample in tqdm(data_list, desc="Samples"):
#     #print(f"Processing frame_id: {sample[0].get('frame_id', 'N/A')}")
#     for i, data_dict in enumerate(sample):
#         data_dict.pop('cam_info')
#         data_dict.pop('lidar_aug_matrix')
#         data_dict.pop('use_lead_xyz')
#         data_dict.pop('flip_x', None)
#         data_dict.pop('num_aug_beams', None)
#         data_dict.pop('noise_world_rotation', None)
#         data_dict.pop('noise_local_rotation', None)
#         data_dict.pop('noise_local_scaling', None)
#         data_dict.pop('noise_world_translation', None)
#         for aug_func in augmentor.data_augmentor_queue:
#             sample[i] = aug_func(data_dict)

# with open(rbrs_train_filename, 'wb') as f:
#         pickler = pickle.Pickler(f, protocol=pickle.HIGHEST_PROTOCOL)
#         pickler.dump(data_list)
#         print('Kitti info train dataset upsampled is saved to %s\n' % rbrs_train_filename)
#         pickler.clear_memo()

# # empty RAM
# del data_list
# gc.collect()

# validation
# NOTE: pickle.load can execute arbitrary code; only load pickles created by this pipeline.
with open(val_filename, 'rb') as f:
    data_list = pickle.load(f)

# Store the augmentor output back into the list. Rebinding only the loop
# variable would silently drop the result whenever an augmentor returns a new
# object instead of mutating its input in place.
for sample_idx, sample in enumerate(tqdm(data_list, desc="Samples")):
    for aug_func in augmentor.data_augmentor_queue:
        sample = aug_func(sample)
    data_list[sample_idx] = sample

# Make sure the output directory exists before writing.
rbrs_val_filename.parent.mkdir(parents=True, exist_ok=True)
with open(rbrs_val_filename, 'wb') as f:
    pickle.dump(data_list, f, protocol=pickle.HIGHEST_PROTOCOL)
print('Kitti info val dataset upsampled is saved to %s\n' % rbrs_val_filename)

## Create KittiDatasetCustom with PDRW

In [None]:
from pcdet.datasets.augmentor.data_augmentor import DataAugmentor
from pcdet.utils import common_utils
from pcdet.datasets.augmentor import augmentor_utils

import yaml
from easydict import EasyDict
from pathlib import Path
import pickle
from tqdm import tqdm
import numpy as np
import concurrent.futures as futures
import time
import gc
gc.enable()

# Load the dataset YAML inside a context manager so the file handle is closed
# deterministically instead of being left open until garbage collection.
with open('/home/rlab10/OpenPCDet/tools/cfgs/dataset_configs/DG_KITTI/kitti_dataset_custom_pdrw.yaml') as cfg_file:
    dataset_cfg = EasyDict(yaml.safe_load(cfg_file))
class_names = ['Car', 'Pedestrian', 'Cyclist']
file_path = '/home/rlab10/OpenPCDet/pcdet/datasets/kitti/kitti_dataset_custom.py'
ROOT_DIR = (Path(file_path).resolve().parent / '../../../').resolve()
data_path = ROOT_DIR / 'data' / 'kitti'
save_path = ROOT_DIR / 'data' / 'kitti' / 'Domain Generalization' / 'pdrw interpolation'

# Input pickles are read from data_path, interpolated outputs go to save_path.
train_split, val_split = 'train', 'val'
train_filename = data_path / ('kitti_%s_dataset.pkl' % train_split)
pdrw_train_filename = save_path / ('kitti_%s_dataset_pdrw.pkl' % train_split)
val_filename = data_path / ('kitti_%s_dataset.pkl' % val_split)
pdrw_val_filename = save_path / ('kitti_%s_dataset_pdrw.pkl' % val_split)

# Text files loaded with np.loadtxt below and passed to get_range_image_hdl64e.
incl_file = data_path / 'training' / 'kitti scanning parameters' / 'incl.txt'
height_file = data_path / 'training' / 'kitti scanning parameters' / 'height.txt'

beam_layers = dataset_cfg.get('NUM_BEAMS', 64)
range_image_width = dataset_cfg.get('WIDTH', 2048)
max_range = dataset_cfg.get('MAX_RANGE', 60.0)

incl = np.loadtxt(incl_file)
height = np.loadtxt(height_file)
num_workers = 32

# The augmentor is created with an empty config list; only its
# pixel_distance_range_weighted_interpolation method is used by this cell.
augmentor = DataAugmentor(root_path=data_path, augmentor_configs=[], class_names=class_names, logger=common_utils.create_logger())
if beam_layers and range_image_width and augmentor.logger is not None:
    augmentor.logger.info('Pixel-Distance and Range Weighted Interpolation with beams: %s, width rng_img: %s' % (beam_layers, range_image_width, ))

# train
# with open(train_filename, 'rb') as f:
#      data_list = pickle.load(f)

# def interpolate_sample_list(sample_list):
#     for data_dict in sample_list:
#         data_dict.pop('cam_info')
#         data_dict.pop('lidar_aug_matrix')
#         data_dict.pop('use_lead_xyz')
#         data_dict.pop('flip_x', None)
#         data_dict.pop('num_aug_beams', None)
#         data_dict.pop('noise_world_rotation', None)
#         data_dict.pop('noise_local_rotation', None)
#         data_dict.pop('noise_local_scaling', None)
#         data_dict.pop('noise_world_translation', None)
#         points = data_dict['points']
#         #beam_label = points[:, -1].astype(int)
    
#         range_image = augmentor_utils.get_range_image_hdl64e(
#         points=points[:, :4],
#         incl=incl,
#         height=height
#         )
#         # range_image = augmentor_utils.get_range_image_hdl64e_beam_labels(points=points[:,:4],
#         #                                                                 beam_labels=beam_label,
#         #                                                                 num_beams=beam_layers,
#         #                                                                 width=range_image_width)

#         range_image_upsampled = augmentor.pixel_distance_range_weighted_interpolation(range_image, MAX_RANGE=max_range)
#         points_interp = augmentor_utils.range_image_to_cartesian(range_image=range_image_upsampled, beam_label=True)

#         data_dict['points'] = points_interp

#     return sample_list

# start_time = time.time()
# with futures.ProcessPoolExecutor(num_workers) as executor:
# # with futures.ThreadPoolExecutor(num_workers) as executor:
#     data_list = list(tqdm(executor.map(interpolate_sample_list, data_list), total=len(data_list)))
# end_time = time.time()
# print("Total time for loading dataset: ", end_time - start_time, "s")
# print("Loading speed for data: ", len(data_list) / (end_time - start_time), "sample/s")

# with open(pdrw_train_filename, 'wb') as f:
#         pickler = pickle.Pickler(f, protocol=pickle.HIGHEST_PROTOCOL)
#         pickler.dump(data_list)
#         print('Kitti info train dataset interpolated is saved to %s\n' % pdrw_train_filename)
#         pickler.clear_memo()

# # empty RAM
# del data_list
# gc.collect()

# validation
# Load the previously generated val dataset pickle; its entries are processed
# one by one by interpolate_sample below.
# NOTE: pickle.load can execute arbitrary code; only load pickles created by this pipeline.
with open(val_filename, 'rb') as f:
     data_list = pickle.load(f)

def interpolate_sample(sample):
    """Replace ``sample['points']`` with the PDRW-interpolated points.

    The first four point columns are passed to
    ``augmentor_utils.get_range_image_hdl64e`` together with the module-level
    ``incl`` and ``height`` arrays; the resulting range image is upsampled by
    ``augmentor.pixel_distance_range_weighted_interpolation`` and converted back
    with ``augmentor_utils.range_image_to_cartesian``.

    Args:
        sample: Mapping with a ``'points'`` array; modified in place.

    Returns:
        The same ``sample`` object.
    """
    # Alternative kept from the original for reference: build the range image
    # from per-point beam labels (last point column) via
    # augmentor_utils.get_range_image_hdl64e_beam_labels(points=..., beam_labels=...,
    # num_beams=beam_layers, width=range_image_width).
    source_points = sample['points']
    sparse_image = augmentor_utils.get_range_image_hdl64e(points=source_points[:, :4], incl=incl, height=height)
    dense_image = augmentor.pixel_distance_range_weighted_interpolation(sparse_image, MAX_RANGE=max_range)
    sample['points'] = augmentor_utils.range_image_to_cartesian(range_image=dense_image, beam_label=True)
    return sample

# Run the interpolation in worker processes and time it.
start_time = time.time()
with futures.ProcessPoolExecutor(num_workers) as executor:
    data_list = list(tqdm(executor.map(interpolate_sample, data_list), total=len(data_list)))
end_time = time.time()
# NOTE: the messages say "loading", but the timed section is the interpolation above.
print("Total time for loading dataset: ", end_time - start_time, "s")
print("Loading speed for data: ", len(data_list) / (end_time - start_time), "sample/s")

# Create the output directory first so the write cannot fail on a missing
# folder after the expensive interpolation has finished.
pdrw_val_filename.parent.mkdir(parents=True, exist_ok=True)
with open(pdrw_val_filename, 'wb') as f:
    pickle.dump(data_list, f, protocol=pickle.HIGHEST_PROTOCOL)
print('Kitti info val dataset interpolated is saved to %s\n' % pdrw_val_filename)

## Create ZODDatasetCustom

In [None]:
from pcdet.datasets.zod.zod_dataset_custom import *
from pcdet.datasets.dataset import *
import yaml
from easydict import EasyDict
from pathlib import Path
from pcdet.utils import common_utils

# Load the dataset YAML inside a context manager so the file handle is closed
# deterministically instead of being left open until garbage collection.
with open('/home/rlab10/OpenPCDet/tools/cfgs/dataset_configs/zod_dataset_custom.yaml') as cfg_file:
    dataset_cfg = EasyDict(yaml.safe_load(cfg_file))
class_names = ['Vehicle_Car', 'Pedestrian', 'VulnerableVehicle_Bicycle']
file_path = '/home/rlab10/OpenPCDet/pcdet/datasets/zod/zod_dataset_custom.py'
# Three directory levels above the folder that contains zod_dataset_custom.py.
ROOT_DIR = (Path(file_path).resolve().parent / '../../../').resolve()
data_path = ROOT_DIR / 'data' / 'zod'
save_path = ROOT_DIR / 'data' / 'zod'
zod_infos = []
# Number of entries in the config's source feature list.
num_features = len(dataset_cfg.POINT_FEATURE_ENCODING.src_feature_list)

def create_zod_infos(dataset_cfg, class_names, data_path, save_path, workers=4):
    """Write the ZOD info pickles and build the ground-truth database.

    In this order the function writes the train infos, the val dataset and the
    concatenation of both below ``save_path`` (version ``'full'``), pausing
    three seconds after each file, and finally calls
    ``create_groundtruth_database`` on the train info file.

    Args:
        dataset_cfg: Dataset config; ``POINT_FEATURE_ENCODING.src_feature_list``
            is read from it.
        class_names: Class names passed to the dataset constructor and to
            ``get_infos_val``.
        data_path: Root path handed to ``ZODDatasetCustom``.
        save_path: Path object of the directory that receives the ``.pkl`` files.
        workers: Worker count forwarded to the info collection calls.
    """
    from time import sleep

    zod_set = ZODDatasetCustom(
        dataset_cfg=dataset_cfg,
        class_names=class_names,
        root_path=data_path,
        training=False,
        logger=common_utils.create_logger(),
        creating_pkl_infos=True,
    )

    train_split, val_split = 'train', 'val'
    version = 'full'
    feature_count = len(dataset_cfg.POINT_FEATURE_ENCODING.src_feature_list)

    # Output files, all located directly below save_path.
    train_pkl = save_path / ('zod_infos_%s_%s.pkl' % (train_split, version))
    val_pkl = save_path / ('zod_%s_dataset.pkl' % val_split)
    trainval_pkl = save_path / ('zod_infos_trainval_%s.pkl' % version)

    print('\n' + '-' * 36 + 'Start to generate data infos' + '-' * 37)
    print('---------------CAUTION: Custom code is configured to serve as Augmentor NOT training-----------------')

    # --- train infos ---
    zod_set.set_split(train_split, version)
    train_infos = zod_set.get_infos(
        num_workers=workers,
        has_label=True,
        count_inside_pts=True,
        num_features=feature_count,
    )
    with open(train_pkl, 'wb') as out_file:
        pickle.dump(train_infos, out_file)
    print('Zod info train file is saved to %s\n' % train_pkl)
    sleep(3)

    # --- val dataset ---
    zod_set.set_split(val_split, version)
    zod_set.training = False
    val_dataset = zod_set.get_infos_val(
        num_workers=workers,
        has_label=True,
        count_inside_pts=True,
        num_features=feature_count,
        class_names=class_names,
    )
    with open(val_pkl, 'wb') as out_file:
        pickle.dump(val_dataset, out_file)
    print('Zod info val file is saved to %s\n' % val_pkl)
    sleep(3)

    # --- train + val ---
    with open(trainval_pkl, 'wb') as out_file:
        pickle.dump(train_infos + val_dataset, out_file)
    print('Zod info trainval file is saved to %s\n' % trainval_pkl)
    sleep(3)

    print('\n---------------Start creating groundtruth database for later data augmentation-------------------------')
    print('---------------CAUTION: Custom code is configured to serve as Augmentor NOT training-------------------')
    print('---------------No DataProcessor and PointFeatureEncoder required, handled by training data creation----')

    # The train info file written above is the input of the gt database.
    zod_set.set_split(train_split, version)
    zod_set.create_groundtruth_database(info_path=train_pkl, version=version, split=train_split)
    print(f'---------------These groundtruth {train_split} objects are randomly inserted into samples (augmentation)-------')
    print('-' * 41 + 'Data preparation Done' + '-' * 41)

def save_data_list_zod(data_list=None, save_path=None, root_path=None, sample_id_list=None, augmentors=None):
    """Pickle the (original + augmented) ZOD training samples.

    The object is written to ``<save_path>/zod_train_dataset.pkl`` with the
    highest available pickle protocol.

    Args:
        data_list: Object to serialise.
        save_path: Target directory, given as ``str`` or path object. It is
            created (including parents) when it does not exist yet.
        root_path: Unused; kept for backward compatibility.
        sample_id_list: Unused; kept for backward compatibility.
        augmentors: Unused; kept for backward compatibility.

    Raises:
        TypeError: If ``save_path`` is ``None``.
    """
    # Earlier versions computed root_path, split, aug_config_list and
    # num_features from the global dataset_cfg without ever using them; that
    # dead code (and the hidden dependency on the global) has been removed.
    target_dir = Path(save_path)
    target_dir.mkdir(parents=True, exist_ok=True)

    train_split = 'train'
    train_filename = target_dir / ('zod_%s_dataset.pkl' % train_split)

    print('\n' + '-' * 35 + 'Start to save data infos(original+augmented)' + '-' * 37)

    # Experimental idea noted by the original author (not enabled): compress
    # the pickle stream with zstandard, either as one buffer or via
    # ZstdCompressor.stream_writer, to shrink the output file.
    with open(train_filename, 'wb') as f:
        pickle.dump(data_list, f, protocol=pickle.HIGHEST_PROTOCOL)

    print('Zod info train/aug file is saved to %s' % train_filename)
    print('-' * 49 + 'Data saving Done' + '-' * 51 + '\n')


# Step 1: Create the data_infos; only the validation data_infos and the gt_database are important.
# The val data gets post-processed through DataProcessor, PointFeatureEncoder, also includes points (w FoV).
# The gt_database is necessary for successfully creating augmented training samples.
# NOTE: this call writes the .pkl files below save_path and sleeps after each file.
create_zod_infos(dataset_cfg, class_names, data_path, save_path, workers=32)

# Step 2: Create the training set with data augmentation (a fresh dataset instance with training=True)
dataset = ZODDatasetCustom(dataset_cfg=dataset_cfg, class_names=class_names, root_path=data_path, training=True, logger=common_utils.create_logger(), creating_pkl_infos=False)

# Step 3: Call the member method to collect the infos
train_split = 'train'
version = 'full'
# Same file name that create_zod_infos writes for the train infos
# (data_path and save_path point to the same directory in this cell).
train_filename = data_path / ('zod_infos_%s_%s.pkl' % (train_split, version))

with open(train_filename, 'rb') as f:
    zod_infos_train_full = pickle.load(f)

# Restrict the info collection to the lidar indices stored in the train info file.
sample_id_list = [info['point_cloud']['lidar_idx'] for info in zod_infos_train_full]

dataset.dataset_w_all_infos = dataset.get_infos(num_workers=24, has_label=True, count_inside_pts=True, sample_id_list=sample_id_list, num_features=num_features)

## Save ZODDatasetCustom

In [None]:
# Step 4: save it (new)
# Each dataset item is indexed with [0] to keep only its first element (the
# sample data); the list is built in one pass and handed to the saver.
# sample_id_list and augmentors are passed along but only data_list and
# save_path determine what is written.
save_data_list_zod(
    data_list=[dataset[i][0] for i in range(len(dataset.dataset_w_all_infos))], # dataset.dataset_w_all_infos
    save_path=save_path,
    root_path=None,
    sample_id_list=sample_id_list,
    augmentors=[cfg.get('NAME', str(cfg)) for cfg in dataset_cfg.DATA_AUGMENTOR.AUG_CONFIG_LIST],
)

# Step 4: save it (old, slower, OOM danger)
#dataset_as_list = []

#for idx in range(len(dataset)):
#    data, applied_augmentors = dataset[idx]
    # debug
    #sample_idx = data[0]['frame_id']
    #print(f"{sample_idx}")
#    dataset_as_list.append(data)   
    # dataset_as_list.append(dataset[idx])

#save_data_list_zod(data_list=dataset_as_list, save_path=save_path, root_path=None, sample_id_list=sample_id_list, augmentors=applied_augmentors)

In [None]:
import json
import pickle
from pathlib import Path
from collections import Counter
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm

# Pickle whose frame ids are analysed by this cell.
PKL_PATH = Path("/home/rlab10/OpenPCDet/data/zod/zod_val_dataset.pkl")
# Directory that holds one sub-folder per frame id, each with a metadata.json.
ZOD_FRAMES_ROOT = Path("/media/rlab10/Dataset/zod/single_frames")

def iter_frame_ids_from_pkl(obj):
    """Yield every frame id found in a nested list/tuple/dict structure.

    Lists and tuples are traversed recursively in order. For a dict the value
    of ``"frame_id"`` is used, falling back to ``obj["point_cloud"]["lidar_idx"]``
    when it is missing or ``None``; the id is yielded as ``str``. Dicts are not
    descended into, and any other object yields nothing.
    """
    if isinstance(obj, (list, tuple)):
        for child in obj:
            yield from iter_frame_ids_from_pkl(child)
    elif isinstance(obj, dict):
        frame_id = obj.get("frame_id")
        if frame_id is None:
            frame_id = obj.get("point_cloud", {}).get("lidar_idx")
        if frame_id is not None:
            yield str(frame_id)

def read_road_type_for_frame(frame_id: str):
    """Read ``road_type`` from a frame's ``metadata.json``.

    Args:
        frame_id: Name of the frame folder below ``ZOD_FRAMES_ROOT``.

    Returns:
        Tuple ``(frame_id, road_type, error)``. ``error`` is ``None`` on
        success, ``"metadata_missing"`` when the file does not exist, or
        ``"json_error: <ExceptionName>"`` when reading or parsing fails;
        ``road_type`` is ``None`` in both error cases or when the key is absent.
    """
    metadata_file = ZOD_FRAMES_ROOT / frame_id / "metadata.json"
    if metadata_file.exists():
        try:
            with metadata_file.open("r", encoding="utf-8") as handle:
                metadata = json.load(handle)
            road_type = metadata.get("road_type", None)
        except Exception as exc:
            # Best effort: report the failure kind instead of aborting the scan.
            return frame_id, None, f"json_error: {type(exc).__name__}"
        return frame_id, road_type, None
    return frame_id, None, "metadata_missing"

def raw_key(val):
    """Return the stripped string form of ``val``, or ``"<missing>"``.

    ``None`` and values whose string form is empty after stripping whitespace
    both map to ``"<missing>"``.
    """
    text = "" if val is None else str(val).strip()
    return text or "<missing>"

# NOTE: pickle.load can execute arbitrary code; only load pickles created by this pipeline.
with PKL_PATH.open("rb") as f:
    data = pickle.load(f)

frame_ids = list(iter_frame_ids_from_pkl(data))
frame_id_freq = Counter(frame_ids)              # counts duplicates in the PKL
frame_ids_unique = list(frame_id_freq.keys())   # read each metadata.json only once

raw_counts = Counter()  # road_type_raw -> count (including PKL duplicates)
errors = Counter()      # error string -> count (including PKL duplicates)

# Read the metadata files concurrently in a thread pool.
max_workers = 32
with ThreadPoolExecutor(max_workers=max_workers) as ex:
    futures = [ex.submit(read_road_type_for_frame, fid) for fid in frame_ids_unique]
    for fut in tqdm(as_completed(futures), total=len(futures), desc="Reading metadata.json"):
        fid, road_raw, err = fut.result()
        # Weight each result by how often the frame id occurs in the PKL.
        mult = frame_id_freq[fid]
        if err:
            errors[err] += mult
        # Frames with an error have road_raw None and are counted as "<missing>".
        raw_counts[raw_key(road_raw)] += mult

print(f"Total PKL frame_id occurrences (incl duplicates): {sum(frame_id_freq.values())}")
print(f"Unique frames (metadata reads): {len(frame_ids_unique)}")

print("\nAll unique road_type strings (raw):")
for s in sorted(raw_counts.keys()):
    print(f"  {s}")

print("\nRaw road_type counts (incl duplicates):")
for s, c in raw_counts.most_common():
    print(f"  {s:25s} {c}")

if errors:
    print("\nMetadata read issues (top):")
    for k, v in errors.most_common(10):
        print(f"  {k}: {v}")

In [None]:
import pickle
import numpy as np
from pathlib import Path
from collections import Counter
from tqdm import tqdm

# --- Config ---
# Pickle to inspect; switch the commented line to look at the KITTI file instead.
#PKL_PATH = Path("/home/rlab10/OpenPCDet/data/kitti/kitti_train_dataset.pkl")
PKL_PATH = Path("/home/rlab10/OpenPCDet/data/zod/zod_train_dataset.pkl")

# Label id -> display name used in the printed counts.
# NOTE(review): the names are the KITTI class names while PKL_PATH points to the
# ZOD file; presumably ids 1..3 follow the order of class_names — confirm.
LABEL_TO_NAME = {1: "Car", 2: "Pedestrian", 3: "Cyclist"}
REQUIRED_LABELS = {1, 2, 3}   # must be present in ONE sample
N_FIND = 5                    # how many matching samples to print
MAX_SCAN = None               # e.g. 5000 to limit search, or None for all

# Optional: if you already know an index and just want to inspect it
SAMPLE_IDX = None             # e.g. 3400, or None to skip direct inspection


# --- Helpers ---
def get_original_dict(sample):
    """Return the dict that represents the original (first) entry of a sample.

    A dict is returned unchanged; for a non-empty list or tuple whose first
    element is a dict, that first element is returned.

    Raises:
        TypeError: For any other input.
    """
    # Per the original author: each pkl element is usually [original, aug1, aug2, ...].
    candidate = sample
    if isinstance(sample, (list, tuple)) and len(sample) > 0:
        candidate = sample[0]
    if isinstance(candidate, dict):
        return candidate
    raise TypeError(f"Unexpected sample type/shape: {type(sample)}")

def extract_gt_boxes_lidar(d):
    """Return the ground-truth boxes stored in a sample dict as an array.

    Lookup order: ``d["gt_boxes_lidar"]``, ``d["gt_boxes"]``, then the same two
    keys inside ``d["annos"]`` when that entry is a dict. The first entry that
    is present (not ``None``) and non-empty is used.

    Args:
        d: Sample dict.

    Returns:
        The boxes as ``np.ndarray``; a 1-D entry is reshaped to a single row.
        When no boxes are found an empty ``(0, 0)`` ``float32`` array is
        returned, so callers can test ``shape[1] == 0``.
    """
    sources = [d]
    annos = d.get("annos", None)
    if isinstance(annos, dict):
        sources.append(annos)

    for source in sources:
        for key in ("gt_boxes_lidar", "gt_boxes"):
            boxes = source.get(key, None)
            # Explicit None / size checks: chaining the lookups with `or`
            # evaluates the truth value of a numpy array, which raises
            # ValueError for arrays with more than one element.
            if boxes is None:
                continue
            arr = np.asarray(boxes)
            if arr.size == 0:
                continue
            return arr.reshape(1, -1) if arr.ndim == 1 else arr

    return np.zeros((0, 0), dtype=np.float32)

def get_frame_id(d0):
    """Return ``d0["frame_id"]``, else ``d0["point_cloud"]["lidar_idx"]``, else ``"<unknown>"``.

    The fallback is only used when the ``"frame_id"`` key is absent; a stored
    ``None`` is returned as is.
    """
    # The fallback is computed up front, exactly as a default argument would be.
    fallback = d0.get("point_cloud", {}).get("lidar_idx", "<unknown>")
    return d0.get("frame_id", fallback)

def per_class_counts_from_labels(labels):
    """Tally how often each class id occurs in ``labels``.

    Args:
        labels: array of class ids; it is cast to ``int`` before counting.

    Returns:
        A ``(known, unknown)`` pair: ``known`` maps the class name of every id
        in ``REQUIRED_LABELS`` to its count (0 when absent); ``unknown`` maps
        every id that is missing from ``LABEL_TO_NAME`` to its count.
    """
    tally = Counter(labels.astype(int, copy=False))

    known = {}
    for label_id in sorted(REQUIRED_LABELS):
        known[LABEL_TO_NAME[label_id]] = int(tally.get(label_id, 0))

    unknown = {}
    for label_id, n_seen in tally.items():
        if int(label_id) not in LABEL_TO_NAME:
            unknown[int(label_id)] = int(n_seen)

    return known, unknown


# --- Load ---
# NOTE: unpickling can execute arbitrary code; only load files you trust.
with open(PKL_PATH, "rb") as pkl_file:
    data = pickle.load(pkl_file)

print(f"Loaded PKL: {PKL_PATH}")
print(f"Total samples in PKL: {len(data)}")


# --- Optional: inspect one chosen sample ---
# Runs only when SAMPLE_IDX has been set in the config section.
if SAMPLE_IDX is not None:
    d0 = get_original_dict(data[SAMPLE_IDX])
    gt = extract_gt_boxes_lidar(d0)

    print("\n--- Inspect one sample ---")
    print("SAMPLE_IDX:", SAMPLE_IDX)
    print("frame_id:", get_frame_id(d0))
    print("gt shape:", gt.shape)

    if gt.shape[1] > 0:
        # The class id is read from the last column of every gt row.
        counts, unknown = per_class_counts_from_labels(gt[:, -1])
        print("Present classes (counts):", counts)
        if unknown:
            print("Unknown label IDs:", unknown)
    else:
        print("No gt boxes found in this sample.")


# --- Search for samples that contain ALL 3 classes in original ---
print("\n--- Search samples with all 3 classes (original only) ---")

found = []
# Optionally restrict the scan to the first MAX_SCAN entries.
data_iter = data[:MAX_SCAN] if MAX_SCAN is not None else data

for idx, sample in enumerate(tqdm(data_iter, desc="Scanning")):
    d0 = get_original_dict(sample)
    gt = extract_gt_boxes_lidar(d0)
    if gt.shape[1] == 0:
        continue  # no boxes stored for this sample

    # The class id is read from the last column of every gt row.
    labels = gt[:, -1].astype(int, copy=False)
    if not REQUIRED_LABELS.issubset(set(labels.tolist())):
        continue

    counts, unknown = per_class_counts_from_labels(labels)
    found.append((idx, get_frame_id(d0), counts, unknown))

    # Stop as soon as enough hits have been collected.
    if len(found) >= N_FIND:
        break

print(f"Found {len(found)} matching samples.")
for hit_idx, hit_frame_id, hit_counts, hit_unknown in found:
    print(f"  SAMPLE_IDX={hit_idx}  frame_id={hit_frame_id}  counts={hit_counts}")
    if hit_unknown:
        print(f"    unknown_label_ids={hit_unknown}")

if not found:
    print("\nNo sample found with all 3 classes within the scan range.")
    print("Try MAX_SCAN=None (full scan), or verify that label IDs are really {1,2,3} in the last gt column.")
else:
    # Each hit is (index, frame_id, counts, unknown); point at the first one.
    best_idx = found[0][0]
    print(f"\nTip: set SAMPLE_IDX = {best_idx} to inspect the first hit.")

## Check Beam-Labels with Range Image

In [None]:
import pickle
from pcdet.datasets.augmentor.data_augmentor import DataAugmentor
import matplotlib.pyplot as plt

pkl_path = "/home/rlab10/OpenPCDet/data/kitti/kitti_val_dataset_beamlabels.pkl"

# NOTE: unpickling can execute arbitrary code; only load files you trust.
with open(pkl_path, 'rb') as f:
    data_list = pickle.load(f)

# First sample only; the last point column is used as the beam label.
points = data_list[0]['points']
beam_label = points[:, -1].astype(int)

augmentor = DataAugmentor(root_path=None, augmentor_configs=[], class_names=[])
polar_image = augmentor.get_polar_image(points[:, :3], with_limit_range=False)
phi = polar_image[:, 0]
theta = polar_image[:, 1]
# Column 2 of polar_image is the per-point range. It is deliberately NOT bound
# to a variable called `range`: that would shadow the builtin for every later
# cell running in this kernel.

# Colour by beam label, which is what the title and the colorbar announce.
# NOTE(review): assumes polar_image keeps one row per input point, in input
# order (no filtering with with_limit_range=False) - confirm in get_polar_image.
plt.figure(figsize=(12, 6))
sc = plt.scatter(phi, theta, c=beam_label, cmap='jet', s=1)
plt.xlabel('Azimuth (phi)')
plt.ylabel('Elevation (theta)')
plt.title('Range Image colored by Beam Label')
plt.colorbar(sc, label='Beam Label')
plt.show()

## Debug with MultiProcessing

In [None]:
# source: https://github.com/microsoft/debugpy/issues/1168#issuecomment-1377998813

import sys

# Show where the debugpy package is installed. The lookup reads sys.modules,
# so it raises KeyError unless debugpy has already been imported in this kernel.
sys.modules['debugpy'].__file__

# Manual fix, following the issue comment linked above:
# 1. go to '/home/rlab10/anaconda3/envs/pcdet/lib/python3.11/site-packages/debugpy'
# 2. find debugpy/server/api.py
# 3. change "subProcess": True to "subProcess": False

## Create Range Image upsampled and 3D Point Cloud

In [None]:
from pcdet.datasets.augmentor.data_augmentor import DataAugmentor
from pcdet.utils import common_utils
from pcdet.datasets.augmentor import augmentor_utils

import yaml
from easydict import EasyDict
from pathlib import Path
import pickle
from tqdm import tqdm
import gc
import matplotlib.pyplot as plt
import numpy as np

# Dataset config of the PDRW (pixel-distance and range weighted) variant.
# Opened in a `with` block so the file handle is closed right after parsing.
cfg_file = '/home/rlab10/OpenPCDet/tools/cfgs/dataset_configs/DG_KITTI/kitti_dataset_custom_pdrw.yaml'
with open(cfg_file) as cfg_stream:
    dataset_cfg = EasyDict(yaml.safe_load(cfg_stream))
class_names = ['Car', 'Pedestrian', 'Cyclist']

# The repository root is three levels above the dataset module's directory.
file_path = '/home/rlab10/OpenPCDet/pcdet/datasets/kitti/kitti_dataset_custom.py'
ROOT_DIR = (Path(file_path).resolve().parent / '../../../').resolve()
data_path = ROOT_DIR / 'data' / 'kitti'
save_path = ROOT_DIR / 'data' / 'kitti' / 'Domain Generalization' / 'pdrw interpolation'

# Input pickle (validation split) and the output pickle for the PDRW result.
val_split = 'val'
val_filename = data_path / ('kitti_%s_dataset.pkl' % val_split)
pdrw_val_filename = save_path / ('kitti_%s_dataset_pdrw.pkl' % val_split)

# aug_cfg = dataset_cfg.DATA_AUGMENTOR['AUG_CONFIG_LIST'][0]
# name = aug_cfg.get('NAME')
# upsample_factor = aug_cfg.get('UPSAMPLE_FACTOR', 1)
# sigma_d = aug_cfg.get('SIGMA_D', 0.5)
# use_intensity = aug_cfg.get('USE_INTENSITY', True)

# augmentor = DataAugmentor(root_path=data_path, augmentor_configs=dataset_cfg.DATA_AUGMENTOR, class_names=class_names, logger=common_utils.create_logger())
# if upsample_factor and sigma_d and augmentor.logger is not None:
#     augmentor.logger.info('Pixel-Distance and Range Weighted Interpolation with factor: %s, σ_d: %s, intensity: %s' % (upsample_factor, sigma_d, use_intensity))

# validation
# NOTE: unpickling can execute arbitrary code; only load files you trust.
with open(val_filename, 'rb') as f:
    data_list = pickle.load(f)

# Index of the validation sample to visualise; change it in this one place.
val_sample_idx = 5
val_sample = data_list[val_sample_idx]

points = val_sample['points']  # (N, 5) per the original note; last column is read as beam label
lidar_idx = val_sample['point_cloud']['lidar_idx']
print(lidar_idx)
beam_label = points[:, -1].astype(int)
#augmentor = DataAugmentor(root_path=data_path, augmentor_configs=dataset_cfg.DATA_AUGMENTOR, class_names=class_names, logger=common_utils.create_logger())
# Empty config list, so none of the augmentations from dataset_cfg is passed in.
augmentor = DataAugmentor(root_path=data_path, augmentor_configs=[], class_names=class_names, logger=common_utils.create_logger())

# KITTI scanning parameters, obtained from Hough transformation
scan_param_dir = Path("/home/rlab10/OpenPCDet/data/kitti/training/kitti scanning parameters")
height = np.loadtxt(scan_param_dir / "height.txt")
zenith = np.loadtxt(scan_param_dir / "zenith.txt")
incl = np.loadtxt(scan_param_dir / "incl.txt")

# Step 1: project the point cloud (first four point columns) onto a
# low-resolution 2D range image.
range_image = augmentor_utils.get_range_image_hdl64e(
    points[:, :4],
    incl=incl,
    height=height,
)

# Channel layout per the original note: 0: Range, 1-3: x/y/z, 4: intensity
range_channel = range_image[:, :, 0]

plt.figure(figsize=(16, 6))
plt.imshow(range_channel, cmap='turbo')
plt.title('Range-Image (without calculated Beam-Labels)')
plt.ylabel('Beam (row)')
plt.xlabel('Azimuth')
plt.colorbar(label='Range [m]')
plt.show()

# Just an alternative to the function get_range_image_hdl64e()
# range_image_2 = augmentor_utils.get_range_image_hdl64e_beam_labels(points=points[:, :4], beam_labels=beam_label, num_beams=64, width=2048)
# plt.figure(figsize=(16, 6))
# plt.imshow(range_image_2[:, :, 0], cmap='turbo')  # 0: Range, 1-3: x/y/z, 4: intensity
# plt.title('Range-Image (with calculated Beam-Labels)')
# plt.xlabel('Azimuth')
# plt.ylabel('Beam (row)')
# plt.colorbar(label='Range [m]')
# plt.show()

# print("Empty pixels before:", (range_image[:, :, 0] < 0).sum())
# print("Empty pixels after Pixel-Distance Weighted Interpolation:", (range_image_upsampled[:, :, 0] < 0).sum())

# Upsample/interpolate the range image with the pixel-distance and range
# weighted interpolation of the augmentor.
range_image_upsampled_ = augmentor.pixel_distance_range_weighted_interpolation(
    range_image,
    MAX_RANGE=60.0,
)

# Channel layout per the original note: 0: Range, 1-3: x/y/z, 4: intensity
upsampled_range_channel = range_image_upsampled_[:, :, 0]

plt.figure(figsize=(16, 6))
plt.imshow(upsampled_range_channel, cmap='turbo')
plt.title('Range-Image (Upsampled Interpolation (advanced))')
plt.ylabel('Beam (row)')
plt.xlabel('Azimuth')
plt.colorbar(label='Range [m]')
plt.show()

#print("Empty pixels before:", (range_image_2[:, :, 0] < 0).sum())
#print("Empty pixels after Pixel-Distance and Range Weighted Interpolation:", (range_image_upsampled_[:, :, 0] < 0).sum())

# Convert the interpolated range image back into a point array.
points_result = augmentor_utils.range_image_to_cartesian(
    range_image=range_image_upsampled_,
    beam_label=True,
)

# 3D scatter of the first three columns, coloured by the third one (Z axis).
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, projection='3d')
x_vals = points_result[:, 0]
y_vals = points_result[:, 1]
z_vals = points_result[:, 2]
ax.scatter(x_vals, y_vals, z_vals, s=0.1, c=z_vals, cmap='jet')
ax.set_title('3D Point Cloud interpolated')
ax.set_zlabel('Z')
ax.set_ylabel('Y')
ax.set_xlabel('X')
plt.show()