# Preprocessing

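#### This should be run on your local machine and will create 3 compressed NumPy archives (`.npz`, one each for the train, val and test splits, each holding an `X_data` and a `Y_data` array) that you can upload to the drive.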

## Import Packages


In [2]:
import numpy as np
import os
import csv
from tqdm import tqdm
import random

from PIL import Image
import glob

## Image Preprocessing

In [None]:
def get_paths(trial_folders):
    """Collect the image and label file paths found in the given directories.

    Each directory is searched for ``images/*.jpg`` and for ``labels.csv``.

    Args:
        trial_folders (list[str]): All the directories to extract data from

    Returns:
        image_paths (list[str]): Flat list with the path of every matching
            image across all directories, so it can be sampled and iterated
            one image at a time.
        label_paths (list[list[str]]): One list of matching label files per
            directory (empty when the directory has no ``labels.csv``).
    """
    image_paths = []
    label_paths = []

    for trial_folder in trial_folders:
        image_pattern = os.path.join(trial_folder, 'images/*.jpg')
        label_pattern = os.path.join(trial_folder, 'labels.csv')

        # extend (not append): glob returns a list, and every element of
        # image_paths must be a single file path.
        image_paths.extend(glob.glob(image_pattern))
        # Label matches stay grouped per directory.
        label_paths.append(glob.glob(label_pattern))

    return image_paths, label_paths

In [None]:
def get_dict(label_paths):
    """Build a mapping from image file name to its logged control data.

    Each label file is read as tab-delimited text whose first row is skipped
    as a header. Every following non-blank row must hold, in order: a
    timestamp, a ';'-separated list of integers (steering lookback), an
    integer throttle pulse index and an integer steering command.

    Args:
        label_paths (list[str] | list[list[str]]): Label file paths, given
            either as a flat list or grouped into one list per directory.

    Returns:
        dict{str : list[int]}: Maps '<timestamp>.jpg' to
            [*steering_lookback, throttle_pulse_idx, steering_command].
            If the same timestamp occurs more than once, the last row wins.
    """
    labels_dict = {}

    for entry in label_paths:
        # Accept both a single path and a group of paths.
        group = [entry] if isinstance(entry, str) else entry

        for label_path in group:
            # newline='' lets the csv module handle line endings itself.
            with open(label_path, mode='r', newline='') as f:
                reader = csv.reader(f, delimiter='\t')
                # Skip the header row; the default avoids StopIteration on
                # an empty file.
                next(reader, None)

                for row in reader:
                    if not row:
                        # csv.reader yields [] for blank lines.
                        continue

                    timestamp = row[0]
                    steering_lookback = [int(x) for x in row[1].split(";")]
                    throttle_pulse_idx = int(row[2])
                    steering_command = int(row[3])
                    image_name = timestamp + '.jpg'

                    labels_dict[image_name] = [*steering_lookback, throttle_pulse_idx, steering_command]
    return labels_dict

In [None]:
def get_random_sample(image_paths, sample_proportion):
    """Draw a random subset of the given paths without replacement.

    Args:
        image_paths (list): Population to sample from.
        sample_proportion (float): Fraction of the population to keep; the
            resulting count is truncated to an integer.

    Returns:
        list: The randomly chosen elements, in random order.
    """
    population_size = len(image_paths)
    keep_count = int(population_size * sample_proportion)
    return random.sample(image_paths, keep_count)

In [None]:
# Apparently, as long as the model 1) uses global average pooling or 2) does not use a pretrained model (e.g. ResNet), the 4:3 aspect ratio should be fine

def preprocess(image_path):
    """Load an image, shrink it to 80 x 60 and divide its pixel values by 255.

    Args:
        image_path (str): Path to the image file to load.

    Returns:
        np.ndarray: Float array of the resized image. Values lie in [0, 1]
            when the source has 8-bit channels (assumed, not checked here).
    """
    # PIL sizes are (width, height). The expected 640 x 480 input becomes
    # 80 x 60, i.e. 8x smaller along each axis.
    target_image_size = (80, 60)

    # The context manager closes the underlying file deterministically
    # instead of leaving it to garbage collection. resize() returns a new,
    # fully loaded image, so it stays usable after the file is closed.
    with Image.open(image_path) as PIL_image:
        PIL_image_resized = PIL_image.resize(target_image_size, resample=Image.LANCZOS)

    # Convert to NumPy array and normalize
    image_arr = np.array(PIL_image_resized)
    normalized_arr = image_arr / 255.0

    return normalized_arr

In [None]:
def build_arrays(image_paths, labels_dict):
    """Preprocess every labelled image and stack images and labels into arrays.

    Images whose file name has no entry in ``labels_dict`` are skipped after
    printing an error line, so both outputs stay index-aligned.

    Args:
        image_paths (list[str]): Paths to image files that will be included in output
        labels_dict (dict{str : list}): Mapping from image filename to its associated logged data.

    Returns:
        X_data (np.ndarray): Stacked results of ``preprocess`` for the kept images.
        Y_data (np.ndarray): The matching label rows as float32.
    """
    kept_images = []
    kept_labels = []

    for path in tqdm(image_paths, desc="Preprocessing"):
        file_name = os.path.basename(path)

        # Guard clause: an image without a label cannot be used.
        if file_name not in labels_dict:
            print("Error: No matching label found.")
            continue

        row = labels_dict[file_name]
        kept_images.append(preprocess(path))
        kept_labels.append(row)

    return np.array(kept_images), np.array(kept_labels, dtype=np.float32)



Preprocessing: 100%|██████████| 3673/3673 [01:11<00:00, 51.57it/s]


52891200
7346


## Edit the save_dir below.

In [None]:
def driver(trial_folders, data_proportion=1.0):
    """Run the full pipeline for a set of folders: find, sample, preprocess.

    Args:
        trial_folders (list[str]): Directories to extract data from.
        data_proportion (float): Fraction of the discovered images to keep.
            Defaults to 1.0 (keep all) so single-argument calls work.

    Returns:
        X_data (np.ndarray), Y_data (np.ndarray): Output of ``build_arrays``
            for the sampled images.
    """
    image_paths, label_paths = get_paths(trial_folders)
    labels_dict = get_dict(label_paths)

    # Sampling and preprocessing work on individual files, so flatten the
    # result in case the paths come back grouped per folder.
    flat_image_paths = []
    for entry in image_paths:
        if isinstance(entry, str):
            flat_image_paths.append(entry)
        else:
            flat_image_paths.extend(entry)

    image_paths_sample = get_random_sample(flat_image_paths, data_proportion)
    return build_arrays(image_paths_sample, labels_dict)
    

In [None]:
# Edit these directories accordingly
data_folder_dir = "/Users/trevorchartier/Documents/Career/School/ECE528/Project/ImitationLearningLaymo/data/processed"
save_dir = "/Users/trevorchartier/Documents/Career/School/ECE528/Project/ImitationLearningLaymo/data/processed"

# Modify this when doing DAgger to something like  dataset_dagger_iter_1, etc
DATA_NAME = "dataset_initial.npz"

# Fraction of the images found for each split that is kept (1.0 keeps all).
DATA_PROPORTION = 1.0

# NOTE(review): the three patterns below are identical, so train, val and
# test are all drawn from the same folders and will overlap. Point each
# pattern at its own set of folders before using the splits for evaluation.
glob_folder_pattern_train = os.path.join(data_folder_dir, '*')
glob_folder_pattern_val = os.path.join(data_folder_dir, '*')
glob_folder_pattern_test = os.path.join(data_folder_dir, '*')

trial_folders_train = glob.glob(glob_folder_pattern_train)
trial_folders_val = glob.glob(glob_folder_pattern_val)
trial_folders_test = glob.glob(glob_folder_pattern_test)

trial_folders = {
    "train" : trial_folders_train,
    "val" : trial_folders_val,
    "test" : trial_folders_test
}

for split, split_folders in trial_folders.items():
    # driver requires the sampling proportion as its second argument.
    X_data, Y_data = driver(split_folders, DATA_PROPORTION)
    # Separate the split name from the dataset name,
    # e.g. "train_dataset_initial.npz".
    file_name = split + "_" + DATA_NAME

    save_path = os.path.join(save_dir, file_name)
    np.savez_compressed(save_path, X_data=X_data, Y_data=Y_data)

train_dataset.npz
val_dataset.npz
test_dataset.npz


## References

1.   https://pillow.readthedocs.io/en/stable/reference/Image.html