# Data Preprocessing for all the Datasets

## Preprocessing Goals


- **COCO** dataset:
    1. Transform the folder structure from [images, annotations] to [train, validation]
    2. Transform the instances_train2017.json and instances_val2017.json to text files.
        - Bounding Boxes 
        - Class code
        - Segmentation data (mask data with format [(x_i, y_i), (x_(i+1), y_(i+1)), ...])
    3. Resize images and annotations
    4. Compute mean and standard deviation for the data using 2500 images as samples


- **Vis-Drone** dataset:
    1. Convert the bounding boxes from format [x_min, y_min, width, height] to [x_min, y_min, x_max, y_max]
    2. Resize images and annotations
    3. Produce the segmentation data from the bounding box data: [(x_min, y_min), (x_min, y_max), (x_max, y_min), (x_max, y_max)]
    4. Compute mean and standard deviation for the data


- **UAV-SOD Drone** dataset:
    1. Extract the bounding box data and the class codes from XML files and create the text file equivalent
    2. Resize images and annotations
    3. Produce the segmentation data from the bounding box data: [(x_min, y_min), (x_min, y_max), (x_max, y_min), (x_max, y_max)]
    4. Compute mean and standard deviation for the data


- **City Scapes** dataset:
    1. Transform the folder structure from [images, annotations] to [train, validation]
    2. Resize images and annotations
    3. Produce the segmentation data from the bounding box data: [(x_min, y_min), (x_min, y_max), (x_max, y_min), (x_max, y_max)]
    4. Compute mean and standard deviation for the data

### Import Libraries and data paths

In [None]:
# Import the libraries used by the notebook
import os
# NOTE(review): module_path (absolute path of the parent directory) is computed
# but never used in the visible code -- presumably it was meant to be added to
# sys.path so that the `src` package can be imported; confirm.
module_path = os.path.abspath(os.path.join('..')) 
import warnings
import os, random
from matplotlib import pyplot as plt
from tqdm import tqdm
from PIL import Image, ImageDraw
import matplotlib.pyplot as plt
import src.data_preprocessing as preprocessing

 
# Silence every warning for the rest of the session
warnings.filterwarnings("ignore")


# Base directory of each dataset (relative paths)
COCO_DATA_PATH = "data/coco2017/"
VIS_DATA_PATH = "data/vis_drone_data/"
SOD_DATA_PATH = "data/uav_sod_data/"
CITY_DATA_PATH = "data/city-scapes-data/"

### COCO2017 Data

In [None]:
# Root directories of the two COCO splits that are preprocessed
# (only train and validation are defined here; there is no test split).
train_path      = os.path.join(COCO_DATA_PATH, "train")
validation_path = os.path.join(COCO_DATA_PATH, "validation")

coco_paths = [train_path, validation_path]

# Run the annotation conversion, the annotation verification and the resize
# step on every split. Each helper receives the split root directory.
for path in coco_paths:
    preprocessing.convert_coco_annotations(path)
    preprocessing.verify_annotations(path)
    preprocessing.resize_data(path)

# Get the mean and standard deviation for the COCO training set
preprocessing.compute_mean_std(os.path.join(train_path, "images"), "coco_data")

### Vis-Drone Data

In [None]:
# Root directories of the two Vis-Drone splits that are preprocessed
# (only train and validation are defined here; there is no test split).
train_path      = os.path.join(VIS_DATA_PATH, "train")
validation_path = os.path.join(VIS_DATA_PATH, "validation")

vis_paths = [train_path, validation_path]


# Fix the annotations format and resize the images of every Vis-Drone split.
# The loop must iterate over vis_paths: iterating over coco_paths would leave
# the Vis-Drone data untouched and re-process the COCO splits instead.
for path in vis_paths:
    annotations_path = os.path.join(path, "annotations")

    # Annotations and image transformations
    preprocessing.extract_annotation_values(annotations_path)
    preprocessing.resize_data(path)


# Get the mean and standard deviation for the Vis-Drone training set
preprocessing.compute_mean_std(os.path.join(train_path, "images"), "vis_data")

### UAV-SOD Data

In [None]:
# Root directory of each UAV-SOD split (train, test and validation)
train_path, test_path, validation_path = (
    os.path.join(SOD_DATA_PATH, split_name)
    for split_name in ("train", "test", "validation")
)
uav_paths = [train_path, test_path, validation_path]

# Per split: convert the XML annotations to text files, then resize the data
for path in uav_paths:
    # images_path is not used further down in this cell
    images_path = os.path.join(path, "images")
    annotations_path = os.path.join(path, "annotations")

    preprocessing.xml_to_txt(annotations_path)
    preprocessing.resize_data(path)

# Mean and standard deviation are computed on the UAV training images
preprocessing.compute_mean_std(
    os.path.join(train_path, "images"),
    "uav_data",
)

### CityScapes Data

In [None]:
# Start the process by re-organizing the folder structure of the raw
# CityScapes images and annotations
images_path      = "city_scapes_images/leftImg8bit"
annotations_path = "city_scapes_annotations"
preprocessing.reorganize_cityscapes(images_path, annotations_path)


# Root directories of the train, test and validation splits
train_path      = os.path.join(CITY_DATA_PATH, "train")
test_path       = os.path.join(CITY_DATA_PATH, "test"  )
validation_path = os.path.join(CITY_DATA_PATH, "validation")

city_paths = [train_path, test_path, validation_path]

# Fix the annotations format and resize the images of every split
for path in city_paths:
    annotations_path = os.path.join(path, "annotations")

    # Annotation and image transformations; the annotations folder is passed
    # as both arguments of json_to_text
    preprocessing.json_to_text(annotations_path, annotations_path)
    preprocessing.resize_data(path)


# Get the mean and standard deviation for the CityScapes training set.
# The training images folder is built explicitly, as for the other datasets,
# instead of relying on a loop variable that would point at the last split
# (validation).
preprocessing.compute_mean_std(os.path.join(train_path, "images"), "city_scapes")

### Preprocessing Validation


To validate the pre-processing, we plot images together with their corresponding annotations for each dataset.

In [None]:
# COCO2017: plot images and their annotations after the pre-processing
preprocessing.plot_images_and_annotations(COCO_DATA_PATH)

In [None]:
# Vis-Drone: plot images and their annotations after the pre-processing
preprocessing.plot_images_and_annotations(VIS_DATA_PATH)

In [None]:
# UAV-SOD: plot images and their annotations after the pre-processing
preprocessing.plot_images_and_annotations(SOD_DATA_PATH)

In [None]:
# CityScapes: plot images and their annotations after the pre-processing
preprocessing.plot_images_and_annotations(CITY_DATA_PATH)