# Train a sensor processing model using a Convolutional Variational Autoencoder 

Using the Julian-8897-Conv-VAE-PyTorch implementation to train a sensor processing model based on convolutional variational autoencoder. 

The parameters of the training are described by an experiment run of type "sensorprocessing_conv_vae". The result of running the code in this notebook is the set of model files stored in the experiment directory. 

As the model files will have unpredictable, date-time dependent names, after training a satisfactory model, the model name and directory need to be copied to the experiment/run yaml file, into the model_subdir and model_checkpoint fields.


In [1]:
import sys
sys.path.append("..")
from settings import Config
import pathlib
from pprint import pprint
import shutil
import torch

# adding the Julian-8897-Conv-VAE-PyTorch into the path
sys.path.append(Config()["conv_vae"]["code_dir"])

# At some point in the development, this hack was necessary for some reason.
# It seems that as of Feb 2025, the code runs on Windows and Linux without it.
#temp = pathlib.PosixPath
#pathlib.PosixPath = pathlib.WindowsPath

from conv_vae import get_conv_vae_config, create_configured_vae_json, train

# Prefer the GPU whenever CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

Loading pointer config file: /home/ssheikholeslami/.config/BerryPicker/mainsettings.yaml
Loading machine-specific config file: /home/ssheikholeslami/SaharaBerryPickerData/settings-sahara.yaml
Using device: cuda


In [2]:
# Notebook-wide switch read by copy_images_to_training_dir: when it is True,
# the copy operations are only printed and no file is actually copied.
dry_run = False

# Specify and load the experiment: the experiment name and the run name
# are passed to Config().get_experiment().
experiment = "sensorprocessing_conv_vae"
run = "proprio_128"
# run = "proprio_256"
# The loaded experiment is indexed like a dictionary by the later cells
# (e.g. exp["data_dir"], exp["training_task"]).
exp = Config().get_experiment(experiment, run)
pprint(exp)

No system dependent experiment file
 /home/ssheikholeslami/SaharaBerryPickerData/experiments-Config/sensorprocessing_conv_vae/proprio_128_sysdep.yaml,
 that is ok, proceeding.
Configuration for experiment: sensorprocessing_conv_vae/proprio_128 successfully loaded
{'data_dir': PosixPath('/home/ssheikholeslami/SaharaBerryPickerData/experiment_data/sensorprocessing_conv_vae/proprio_128'),
 'epochs': 300,
 'exp_run_sys_indep_file': PosixPath('/lustre/fs1/home/ssheikholeslami/BerryPicker/src/experiment_configs/sensorprocessing_conv_vae/proprio_128.yaml'),
 'group_name': 'sensorprocessing_conv_vae',
 'json_template_name': 'conv-vae-config-default.json',
 'latent_size': 128,
 'model_checkpoint': 'checkpoint-epoch300.pth',
 'model_dir': 'models',
 'model_name': 'VAE_Robot',
 'model_subdir': '0313_151618',
 'run_name': 'proprio_128',
 'save_period': 5,
 'training_data_dir': 'vae-training-data',
 'training_task': 'proprio_sp_training',
 'validation_demo': '2024_10_26__16_23_22',
 'validation_tas

### Create the training data for the Conv-VAE

We collect the training data for the Conv-VAE by gathering all the pictures from all the demonstrations of a specific task. One can select the pictures by creating a specific task and copying all the relevant demonstrations into it. 

The collected pictures are put in a newly created training directory for the run:

```
$experiment\vae-training-data\Images\*.jpg
```

In [4]:
def copy_images_to_training_dir(taskname, training_image_dir, *,
                                source_dir=None, skip_copy=None):
    """Copy all the images from a specific task into the training image dir.

    Every ".jpg" file found directly inside a demonstration directory of the
    task is copied to training_image_dir. The copy is named
    "<demonstration dir name>_<original file stem>.jpg", so that images with
    the same file name in different demonstrations do not overwrite each
    other. Entries of the task directory that are not directories, and files
    with any other suffix, are skipped.

    Args:
        taskname: name of the task directory, relative to source_dir.
        training_image_dir: existing directory that receives the copies.
        source_dir: directory that contains the task directories. When None
            (the default), the notebook-level variable demos_dir is used.
        skip_copy: when True, the operations are only printed and nothing is
            copied. When None (the default), the notebook-level variable
            dry_run is used.
    """
    # Fall back to the notebook-level settings only when the caller did not
    # pass explicit values, so that existing two-argument calls still work.
    if source_dir is None:
        source_dir = demos_dir
    if skip_copy is None:
        skip_copy = dry_run

    task_dir = pathlib.Path(source_dir, taskname)

    for demo in task_dir.iterdir():
        if not demo.is_dir():
            continue
        for item in demo.iterdir():
            if item.suffix != ".jpg":
                continue
            # Prefix with the demonstration name to keep the names unique.
            name = f"{demo.name}_{item.stem}.jpg"
            destination = pathlib.Path(training_image_dir, name)
            print(f"copy {item} to \n{destination}")
            if not skip_copy:
                shutil.copyfile(item, destination)

In [5]:
# Locate the demonstrations. copy_images_to_training_dir looks up the task
# directories under demos_dir.
demos_top = pathlib.Path(Config()["demos"]["directory"])
demos_dir = pathlib.Path(demos_top, "demos")

subdir_count = sum(1 for item in demos_dir.iterdir() if item.is_dir())
print(f"Number of demo directories: {subdir_count}")

# Deciding on the location of the training data
training_data_dir = pathlib.Path(exp["data_dir"], exp["training_data_dir"])
# training_data_dir = pathlib.Path(Config()["conv_vae"]["training_data_dir"])
training_image_dir = pathlib.Path(training_data_dir, "Images")
# exist_ok=True makes this cell safe to re-run: with exist_ok=False the
# second run raised FileExistsError, so the "images already there" branch
# below could never be reached.
training_image_dir.mkdir(exist_ok=True, parents=True)

print(f"Training data dir={training_image_dir}")

# Define a set of common image file extensions
image_extensions = {".jpg", ".jpeg", ".png", ".gif", ".bmp", ".tiff", ".webp"}
# Count the image files that are already in the training image directory
image_count = sum(1 for item in training_image_dir.iterdir() if item.suffix.lower() in image_extensions and item.is_file())

print(f"Number of image files in training dir: {image_count}")

# Copy only into an empty training image directory, so that re-running the
# cell does not repeat the copying.
if image_count == 0:
    taskname = exp['training_task']
    copy_images_to_training_dir(
        taskname = taskname, training_image_dir=training_image_dir)
else:
    # The f prefix was missing, so the directory name was not substituted.
    print(f"There are already images in training image dir {training_image_dir}. Do not repeat the copying.")


Number of demo directories: 11
Training data dir=/home/ssheikholeslami/SaharaBerryPickerData/experiment_data/sensorprocessing_conv_vae/proprio_128/vae-training-data/Images
Number of image files in training dir: 0
copy /home/ssheikholeslami/SaharaBerryPickerData/demonstrations/demos/proprio_sp_training/2025_03_08__14_49_59/00135_dev2.jpg to 
/home/ssheikholeslami/SaharaBerryPickerData/experiment_data/sensorprocessing_conv_vae/proprio_128/vae-training-data/Images/2025_03_08__14_49_59_00135_dev2.jpg
copy /home/ssheikholeslami/SaharaBerryPickerData/demonstrations/demos/proprio_sp_training/2025_03_08__14_49_59/00008_dev2.jpg to 
/home/ssheikholeslami/SaharaBerryPickerData/experiment_data/sensorprocessing_conv_vae/proprio_128/vae-training-data/Images/2025_03_08__14_49_59_00008_dev2.jpg
copy /home/ssheikholeslami/SaharaBerryPickerData/demonstrations/demos/proprio_sp_training/2025_03_08__14_49_59/00167_dev2.jpg to 
/home/ssheikholeslami/SaharaBerryPickerData/experiment_data/sensorprocessing_co

# Run the training

Actually run the training. This is done by creating the json-based configuration file of the Conv-VAE library with the parameters specified in the experiment. Then we call the code of the library to perform the training. 

In [6]:
# Create the vae configuration, based on the experiment.
# The value returned by create_configured_vae_json is printed and then
# handed to get_conv_vae_config; it appears to be the path of the generated
# json configuration file - confirm in conv_vae.
file = create_configured_vae_json(exp)
print(file)
vae_config = get_conv_vae_config(file)

# Actually run the training. The number of epochs is read from the "trainer"
# section of the loaded configuration. The returned trainer object is kept,
# because the following cells read the metrics and the checkpoint location
# from it.
print(f'Running the trainer from scratch for {vae_config["trainer"]["epochs"]}')
trainer = train(vae_config)

/lustre/fs1/home/ssheikholeslami/BerryPicker/src/sensorprocessing/conv-vae-config-default.json
{'name': 'VAE_Robot', 'n_gpu': 1, 'arch': {'type': 'VanillaVAE', 'args': {'in_channels': 3, 'latent_dims': 128, 'flow': False}}, 'data_loader': {'type': 'CelebDataLoader', 'args': {'data_dir': '/home/ssheikholeslami/SaharaBerryPickerData/experiment_data/sensorprocessing_conv_vae/proprio_128/vae-training-data', 'batch_size': 64, 'shuffle': True, 'validation_split': 0.2, 'num_workers': 2}}, 'optimizer': {'type': 'Adam', 'args': {'lr': 0.005, 'weight_decay': 0.0, 'amsgrad': True}}, 'loss': 'elbo_loss', 'metrics': [], 'lr_scheduler': {'type': 'StepLR', 'args': {'step_size': 50, 'gamma': 0.1}}, 'trainer': {'epochs': 300, 'save_dir': '/home/ssheikholeslami/SaharaBerryPickerData/experiment_data/sensorprocessing_conv_vae/proprio_128/models', 'save_period': 5, 'verbosity': 2, 'monitor': 'min val_loss', 'early_stop': 10, 'tensorboard': True}}
/home/ssheikholeslami/SaharaBerryPickerData/experiment_data/

2025-03-14 00:39:43.985537: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-03-14 00:39:48.812742: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1741927189.622932 2459988 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1741927190.184776 2459988 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-03-14 00:39:52.126239: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

In [7]:
# These are the metrics recorded by the trainer for training and validation.
# They are of type MetricTracker (utils/util.py), which keeps its data in a
# pandas dataframe.
# Printing the trackers themselves only shows their object representation.
print(trainer.train_metrics)
print(trainer.valid_metrics)
# As the last expression of the cell, the dataframe below is what the
# notebook displays. NOTE: _data is a private attribute of MetricTracker.
trainer.train_metrics._data
# trainer.valid_metrics._data

<utils.util.MetricTracker object at 0x7f5f6db0d250>
<utils.util.MetricTracker object at 0x7f5f0106db90>


Unnamed: 0,total,counts,average
loss,75964.678711,27,2813.506619


__Important__ After the training has finished, in order to use the resulting system, one needs to edit the run file (e.g. vae_01.yaml) and enter into it the location of the checkpoint. This is the content printed by the code cell below.

In [8]:

# Print the two fields in the form in which they are entered into the
# experiment/run yaml file: the name of the checkpoint directory of this
# training run and the name of the checkpoint file.
# NOTE(review): the checkpoint name is built from trainer.epochs, so it
# assumes that a checkpoint was saved for that epoch - confirm that the file
# exists, e.g. if the training stopped early.
print(f"model_subdir: '{trainer.checkpoint_dir.name}'")
print(f"model_checkpoint: 'checkpoint-epoch{trainer.epochs}.pth'")


model_subdir: '0314_003916'
model_checkpoint: 'checkpoint-epoch300.pth'


In [9]:
# Tell the user which experiment run file should receive the two fields
# printed above: the system dependent one if the experiment has it,
# otherwise the system independent one. Only the selected message is built.
print(
    f'The text above to be put into \n the system dependent experiment run file {exp["exp_run_sys_dep_file"]}'
    if "exp_run_sys_dep_file" in exp
    else f'As the system dependent experiment run file does not exist,\n the text can be put into the system independent file\n {exp["exp_run_sys_indep_file"]}'
)


As the system dependent experiment run file does not exist,
 the text can be put into the system independent file
 /lustre/fs1/home/ssheikholeslami/BerryPicker/src/experiment_configs/sensorprocessing_conv_vae/proprio_128.yaml
