# Preprocessing

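#### This notebook creates a single compressed NumPy archive (`dataset.npz`) containing two arrays, `X_data` (images) and `Y_data` (labels), which you can upload to the drive.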

## Import Packages


In [1]:
import numpy as np
import os
import csv
from tqdm import tqdm
import random

from PIL import Image
import glob

## Image Preprocessing

In [2]:
def preprocess(image_path, target_image_size=(80, 60)):
  """Load an image, shrink it, and scale its pixel values by 1/255.

  Args:
    image_path: Path of the image file to open with PIL.
    target_image_size: (width, height) in pixels handed to PIL's resize.
      The default (80, 60) is 8x smaller per side than a 640 x 480 frame.

  Returns:
    A NumPy array holding the resized image divided by 255.0, i.e. values
    in [0, 1] for 8-bit images.
  """
  # Open inside a context manager so the underlying file is released even
  # if decoding or resizing raises. resize() returns a new, independent
  # image, so it stays valid after the source image is closed.
  with Image.open(image_path) as PIL_image:
    PIL_image_resized = PIL_image.resize(target_image_size, resample=Image.LANCZOS)

  # Convert to NumPy array and normalize
  image_arr = np.array(PIL_image_resized)
  normalized_arr = image_arr / 255.0

  return normalized_arr

## Edit the data_folder_dir below.

In [3]:
images = []
labels = []

# Edit this directory accordingly
data_folder_dir = "../data/raw_data"


# Use glob to find every image jpg and every label.csv.
# Expected layout: <data_folder_dir>/trial_*/images/*.jpg and
# <data_folder_dir>/trial_*/labels.csv
glob_image_pattern = os.path.join(data_folder_dir, 'trial_*/images/*.jpg')
glob_label_pattern = os.path.join(data_folder_dir, 'trial_*/labels.csv')

# glob.glob returns matches in arbitrary (filesystem-dependent) order;
# sorting makes the path lists, and anything sampled from them, reproducible.
image_paths = sorted(glob.glob(glob_image_pattern))
label_paths = sorted(glob.glob(glob_label_pattern))

In [4]:
# Take 40% of the original dataset
SAMPLE_FRACTION = 0.4
# Fixed seed so that re-running the notebook picks the same subset
# (for an identical image_paths list).
SAMPLE_SEED = 42

sample_size = int(len(image_paths) * SAMPLE_FRACTION)
# A dedicated Random instance keeps the global random state untouched.
image_paths_sample = random.Random(SAMPLE_SEED).sample(image_paths, sample_size)

In [5]:
# Get every label into a dictionary:
#   '<timestamp>.jpg' -> [throttle_on (int), steering_command (float)]
# NOTE(review): keys are bare file names, so identical timestamps in two
# different trials would overwrite each other -- confirm timestamps are
# unique across trials.
labels_dict = {}

for label_path in label_paths:
  # newline='' is the mode the csv module documents for files given to csv.reader
  with open(label_path, mode='r', newline='') as f:
    reader = csv.reader(f)
    # Skip the header row; the default avoids StopIteration on an empty file.
    next(reader, None)

    for row in reader:
      # csv.reader yields [] for blank lines (e.g. a trailing newline); skip them.
      if not row:
        continue

      timestamp = row[0]
      throttle_on = int(row[1])
      steering_command = float(row[2])

      image_name = timestamp + '.jpg'

      labels_dict[image_name] = [throttle_on, steering_command]

In [6]:
# Loop through every sampled image, preprocess it, and pair it with its label.
# Use tqdm to monitor progress

# Start from empty lists so that re-running this cell does not append the
# same samples a second time.
images = []
labels = []

for image_path in tqdm(image_paths_sample, desc="Preprocessing"):
  image_name = os.path.basename(image_path)

  label = labels_dict.get(image_name)
  if label is None:
    # Say which image is affected so the missing label can be tracked down.
    print(f"Error: No matching label found for {image_path}.")
    continue

  images.append(preprocess(image_path))
  labels.append(label)

# NOTE(review): preprocess() divides by 255.0, so X_data ends up float64;
# consider float32 if memory or file size becomes a problem.
X_data = np.array(images)
Y_data = np.array(labels, dtype=np.float32)

print(X_data.size)
print(Y_data.size)

Preprocessing: 100%|██████████| 23187/23187 [04:07<00:00, 93.66it/s] 


333892800
46374


In [8]:
# Report the element type of each array before saving.
for caption, array in (("Images data type:", X_data), ("Labels data type:", Y_data)):
  print(caption, array.dtype)

Images data type: float64
Labels data type: float32


## Edit the save_dir below.

In [11]:
save_dir = "../data/processed"
file_name = 'dataset.npz'

# Make sure the destination folder exists before building the output path.
os.makedirs(save_dir, exist_ok=True)
save_path = os.path.join(save_dir, file_name)

# Write both arrays into one compressed archive, stored under the keys
# "X_data" and "Y_data".
np.savez_compressed(
  save_path,
  X_data=X_data,
  Y_data=Y_data,
)

## References

1.   https://pillow.readthedocs.io/en/stable/reference/Image.html