## Convert GTSRB dataset into Yolov8 Format
The __[German Traffic Sign Recognition Benchmark (GTSRB)](https://paperswithcode.com/dataset/gtsrb)__ dataset is a publicly available dataset commonly used for benchmarking algorithms in traffic sign recognition. It was created for the purpose of evaluating the performance of various machine learning models, particularly in the field of computer vision.

The structure of GTSRB is different from the YOLO format. GTSRB contains 43 folders inside `GTSRB/train/images/`, one for each of its 43 classes. Each folder contains all the images of that specific ClassId and a CSV file with the ROI (region of interest) details for the images in that folder. In simple words, we have 43 CSV files with label data.
In the first step, we combine all the images into a single folder and all the labels into a single CSV file.

In [None]:
import os
import shutil
import pandas as pd

# Define paths
base_dir = '/home/ubaurr/repositorio/traffic_signs/traffic_env/BDD_training/GTSRB/'
images_dir = os.path.join(base_dir, 'images')
output_dir = os.path.join(base_dir, 'all_images')
output_csv = os.path.join(base_dir, 'all_labels.csv')

# Create the output directory; exist_ok makes re-running the cell safe and
# avoids the check-then-create race of a separate os.path.exists() test
os.makedirs(output_dir, exist_ok=True)

# One relabelled DataFrame per class; they are concatenated once at the end
all_data = []

# Loop through each class directory (folders are named 00000 ... 00042)
for class_id in range(43):
    class_dir = os.path.join(images_dir, f'{class_id:05d}')
    csv_file = os.path.join(class_dir, f'GT-{class_id:05d}.csv')

    # Read the ';'-separated annotation file of this class
    class_data = pd.read_csv(csv_file, sep=';')

    # Copy every image listed in the CSV under a new, class-prefixed name so
    # that files coming from different class folders cannot collide
    new_filenames = []
    for index, filename in enumerate(class_data['Filename']):
        new_filename = f'{class_id:05d}_{index:05d}.ppm'
        shutil.copyfile(
            os.path.join(class_dir, filename),
            os.path.join(output_dir, new_filename),
        )
        new_filenames.append(new_filename)

    # Replace the whole column at once instead of mutating iterrows() copies;
    # this keeps the numeric columns' dtypes intact
    class_data['Filename'] = new_filenames
    all_data.append(class_data)

# Combine the per-class tables into a single DataFrame
all_data_df = pd.concat(all_data, ignore_index=True)

# Save the combined CSV file
all_data_df.to_csv(output_csv, index=False, sep=';')

print(f'All images have been consolidated into {output_dir}')
print(f'All labels have been consolidated into {output_csv}')

## Converting to Yolo format
Once all the images are gathered into a single folder and the labels into a single CSV file, we convert the images from .ppm to .jpg and the labels into YOLO format.

In [None]:
import os
import pandas as pd
from PIL import Image

# Define paths
base_dir = '/home/ubaurr/repositorio/traffic_signs/traffic_env/BDD_training/GTSRB/'
images_dir = os.path.join(base_dir, 'all_images')
output_images_dir = os.path.join(base_dir, 'images_yolo')
output_labels_dir = os.path.join(base_dir, 'labels_yolo')
csv_file = os.path.join(base_dir, 'all_labels.csv')

# Create output directories; exist_ok makes re-running the cell safe and
# avoids the check-then-create race of a separate os.path.exists() test
os.makedirs(output_images_dir, exist_ok=True)
os.makedirs(output_labels_dir, exist_ok=True)

# Read the combined, ';'-separated CSV file
data = pd.read_csv(csv_file, sep=';')

def convert_to_yolo_format(row, img_width, img_height):
    """Convert one ROI annotation into a YOLO label line.

    Args:
        row: Mapping-like record (for example a pandas Series) providing the
            keys 'Roi.X1', 'Roi.Y1', 'Roi.X2', 'Roi.Y2' and 'ClassId'.
        img_width: Width of the image the ROI refers to, in the same units
            as the ROI x coordinates.
        img_height: Height of the image the ROI refers to, in the same units
            as the ROI y coordinates.

    Returns:
        The string "class x_center y_center width height", where the four
        box values are divided by the image width/height.

    Raises:
        ValueError: If 'ClassId' cannot be converted to an integer (for
            example when it is NaN).
    """
    x1 = row['Roi.X1']
    y1 = row['Roi.Y1']
    x2 = row['Roi.X2']
    y2 = row['Roi.Y2']
    # The class index must be written as an integer; a row whose values were
    # upcast to float would otherwise be emitted as e.g. "3.0"
    class_id = int(row['ClassId'])

    # Calculate center, width, and height relative to the image size
    x_center = (x1 + x2) / 2.0 / img_width
    y_center = (y1 + y2) / 2.0 / img_height
    width = (x2 - x1) / img_width
    height = (y2 - y1) / img_height

    return f"{class_id} {x_center} {y_center} {width} {height}"

# Loop through each row in the CSV file: convert the image to JPEG and write
# a matching YOLO label file
for index, row in data.iterrows():
    filename = row['Filename']
    src_path = os.path.join(images_dir, filename)

    # splitext swaps only the real extension; str.replace would rewrite every
    # '.ppm' occurrence and silently keep the name unchanged for other
    # extensions, which could put a label file under an image extension
    stem = os.path.splitext(filename)[0]
    new_filename = stem + '.jpg'
    dst_image_path = os.path.join(output_images_dir, new_filename)

    # Convert image to JPG; the context manager releases the source file
    # handle even if the conversion fails
    with Image.open(src_path) as img:
        # Get image dimensions
        img_width, img_height = img.size
        img.convert('RGB').save(dst_image_path, 'JPEG')

    # Create YOLO format label
    yolo_label = convert_to_yolo_format(row, img_width, img_height)

    # Save the YOLO format label to a .txt file sharing the image's stem
    label_filename = stem + '.txt'
    label_path = os.path.join(output_labels_dir, label_filename)
    with open(label_path, 'w') as f:
        f.write(yolo_label)

print(f'Images have been converted to JPG and saved in {output_images_dir}')
print(f'Labels have been converted to YOLO format and saved in {output_labels_dir}')

### Dividing dataset in Train and val
Once the labels and images are converted, they are split into training and validation sets (80% / 20%), each with its own `images` and `labels` folders, as the YOLO format expects.

In [None]:
import os
import random
import shutil

# Source folders produced by the conversion step
base_dir = '/home/ubaurr/repositorio/traffic_signs/traffic_env/BDD_training/GTSRB/'
images_dir = os.path.join(base_dir, 'images_yolo')
labels_dir = os.path.join(base_dir, 'labels_yolo')

# Destination folders: <base>/train/{images,labels} and <base>/val/{images,labels}
train_root = os.path.join(base_dir, 'train')
val_root = os.path.join(base_dir, 'val')
output_images_train_dir = os.path.join(train_root, 'images')
output_images_val_dir = os.path.join(val_root, 'images')
output_labels_train_dir = os.path.join(train_root, 'labels')
output_labels_val_dir = os.path.join(val_root, 'labels')

# Make sure all four destination folders exist
for split_dir in (
    output_images_train_dir,
    output_images_val_dir,
    output_labels_train_dir,
    output_labels_val_dir,
):
    os.makedirs(split_dir, exist_ok=True)

# Collect the JPEG names and derive the label name that goes with each one
all_images = [name for name in os.listdir(images_dir) if name.endswith('.jpg')]
all_labels = [name.replace('.jpg', '.txt') for name in all_images]

# Shuffle the (image, label) pairs and cut them 80/20 into train and val
data = list(zip(all_images, all_labels))
random.shuffle(data)
split_index = int(0.8 * len(data))
train_data, val_data = data[:split_index], data[split_index:]

# Relocate image/label pairs into the folders of one split
def move_files(data, output_images_dir, output_labels_dir):
    """Move each (image, label) filename pair out of the source folders.

    Args:
        data: Iterable of (image filename, label filename) pairs.
        output_images_dir: Folder that receives the image files.
        output_labels_dir: Folder that receives the label files.

    The source locations are the module-level `images_dir` and `labels_dir`.
    """
    for image_name, label_name in data:
        image_src = os.path.join(images_dir, image_name)
        image_dst = os.path.join(output_images_dir, image_name)
        shutil.move(image_src, image_dst)

        label_src = os.path.join(labels_dir, label_name)
        label_dst = os.path.join(output_labels_dir, label_name)
        shutil.move(label_src, label_dst)

# Move each split into its destination folders: training first, then validation
for split_pairs, split_images_dir, split_labels_dir in (
    (train_data, output_images_train_dir, output_labels_train_dir),
    (val_data, output_images_val_dir, output_labels_val_dir),
):
    move_files(split_pairs, split_images_dir, split_labels_dir)

# Report where each split ended up
print(f'Training images and labels have been moved to {output_images_train_dir} and {output_labels_train_dir}')
print(f'Validation images and labels have been moved to {output_images_val_dir} and {output_labels_val_dir}')
