# Imports

In [29]:
import numpy as np
import os
import shutil
from tqdm import tqdm
import zipfile
import urllib.request
import splitfolders

# Download the dataset

In [30]:
def download_file(url, file_name, chunk_size=64 * 1024):
    """Download ``url`` to ``dataset/<file_name>`` unless that file already exists.

    The payload is streamed into ``dataset/<file_name>.part`` and only renamed
    to its final name after the transfer completes. An interrupted or failed
    download therefore never leaves a truncated archive that a later run would
    mistake for a finished one.

    Args:
        url: Address of the file to fetch.
        file_name: Name of the file to create inside ``dataset/``.
        chunk_size: Number of bytes requested per read from the response.

    Raises:
        OSError: If the server announced a ``Content-Length`` and the number
            of bytes received differs from it.
    """
    target_path = 'dataset/' + file_name
    if os.path.exists(target_path):
        print(f"{file_name} already exists.")
        return

    partial_path = target_path + '.part'
    try:
        with urllib.request.urlopen(url) as response, open(partial_path, 'wb') as out_file:
            # Content-Length is optional; tqdm accepts total=None and then
            # shows a running byte counter instead of a percentage.
            length_header = response.headers.get('Content-Length')
            content_length = int(length_header) if length_header is not None else None
            bytes_written = 0
            with tqdm(total=content_length, unit='B', unit_scale=True, desc=url.split('/')[-1]) as pbar:
                while True:
                    chunk = response.read(chunk_size)
                    if not chunk:
                        break
                    out_file.write(chunk)
                    bytes_written += len(chunk)
                    pbar.update(len(chunk))
        if content_length is not None and bytes_written != content_length:
            raise OSError(
                f"Incomplete download of {url}: expected {content_length} bytes, got {bytes_written}"
            )
    except BaseException:
        # Also covers KeyboardInterrupt (kernel interrupt): drop the partial
        # file so the next run starts the download again.
        if os.path.exists(partial_path):
            os.remove(partial_path)
        raise
    # Publish the finished file under its final name in one step.
    os.replace(partial_path, target_path)


os.makedirs("dataset/", exist_ok=True)

# All three GTSRB archives are published under the same base URL, and each is
# saved locally under its remote file name.
gtsrb_base_url = 'https://sid.erda.dk/public/archives/daaeac0d7ce1152aea9b61d9f1e19370/'
gtsrb_archives = (
    'GTSRB_Final_Training_Images.zip',  # Training
    'GTSRB_Final_Test_Images.zip',      # Testing
    'GTSRB_Final_Test_GT.zip',          # Ground truth
)
for archive_name in gtsrb_archives:
    download_file(gtsrb_base_url + archive_name, archive_name)

GTSRB_Final_Training_Images.zip already exists.
GTSRB_Final_Test_Images.zip already exists.
GTSRB_Final_Test_GT.zip already exists.


# Extract zip files

In [31]:
def extract_file(file_name):
    """Unpack ``dataset/<file_name>`` into ``dataset/`` with a per-member progress bar.

    Args:
        file_name: Name of a zip archive located inside ``dataset/``.
    """
    archive_path = f"dataset/{file_name}"
    with zipfile.ZipFile(archive_path, 'r') as archive:
        members = archive.namelist()
        progress = tqdm(total=len(members), desc="Extracting")
        with progress:
            # Extract member by member so the bar advances once per entry.
            for member in members:
                archive.extract(member, 'dataset/')
                progress.update(1)


# Unpack the training images, the test images and the test ground truth, in that order.
for downloaded_archive in (
    'GTSRB_Final_Training_Images.zip',
    'GTSRB_Final_Test_Images.zip',
    'GTSRB_Final_Test_GT.zip',
):
    extract_file(downloaded_archive)

Extracting: 100%|██████████| 39299/39299 [00:38<00:00, 1013.55it/s]
Extracting: 100%|██████████| 12635/12635 [00:10<00:00, 1172.30it/s]
Extracting: 100%|██████████| 1/1 [00:00<?, ?it/s]


# Loading CSV file

In [32]:
# Test images (after extraction): './dataset/GTSRB/Final_Test/Images/'
# Test ground-truth CSV (after extraction): './dataset/GT-final_test.csv'
def csv_loader(csv_path):
    """Load a semicolon-separated CSV file as a 2-D array of strings.

    The first line is skipped as a header. ``ndmin=2`` keeps the result
    two-dimensional even when the file has a single data row, so callers can
    always index it as ``data[:, column]``.

    Args:
        csv_path: Path of the CSV file to read.

    Returns:
        ``numpy.ndarray`` of ``str`` with shape ``(rows, columns)``.
    """
    data = np.loadtxt(csv_path,
                      delimiter=";", dtype=str, skiprows=1, ndmin=2)
    return data


# The test images ('GTSRB_Final_Test_Images.zip') ship without labels; the labels
# come from the separate ground-truth archive ('GTSRB_Final_Test_GT.zip').
# The ground-truth CSV is read from './dataset/GT-final_test.csv'.
# Keep only two of its columns: 0 (filename) and 7 (class id).
annotations = csv_loader('./dataset/GT-final_test.csv')[:, [0, 7]]
num_samples = annotations.shape[0]
# After the selection: column 0 = filename, column 1 = class id.
# Reorder the rows by ascending numeric class id.
annotations = annotations[np.argsort(annotations[:, 1].astype(int))]

# Move the training class folders into `dataset/GTSRB/train`

In [33]:
def move_directories(source, destination):
    """Move every immediate sub-directory of ``source`` into ``destination``.

    ``destination`` is created first if it does not exist. Plain files located
    directly in ``source`` are left where they are.

    Args:
        source: Directory whose sub-directories are moved.
        destination: Directory that receives them.
    """
    if not os.path.exists(destination):
        os.makedirs(destination)

    # Collect only the entries of `source` that are directories.
    subdir_names = []
    for entry in os.listdir(source):
        if os.path.isdir(os.path.join(source, entry)):
            subdir_names.append(entry)

    # Relocate each one, keeping its name.
    for name in tqdm(subdir_names):
        shutil.move(os.path.join(source, name), os.path.join(destination, name))


# Lift the class folders out of Final_Training/Images into train/, then drop
# what remains of the Final_Training tree.
final_training_dir = "./dataset/GTSRB/Final_Training"
move_directories(f"{final_training_dir}/Images", "./dataset/GTSRB/train")
shutil.rmtree(final_training_dir)

100%|██████████| 43/43 [00:00<00:00, 305.71it/s]


# Sort the test images into per-class folders

In [34]:
# Build one folder per class id (zero-padded to five characters) under test/
# and move each test image into the folder of its annotated class.
final_test_images = './dataset/GTSRB/Final_Test/Images/'
for class_id in tqdm(np.unique(annotations[:, 1]), desc='Class_ID'):
    class_dir = './dataset/GTSRB/test/' + class_id.zfill(5)
    if not os.path.exists(class_dir):
        os.makedirs(class_dir)
    # Filenames of all rows annotated with this class id.
    class_filenames = annotations[annotations[:, 1] == class_id][:, 0]
    for filename in class_filenames:
        shutil.move(final_test_images + filename, class_dir + '/' + filename)

# Remove what remains of the extracted Final_Test tree.
shutil.rmtree("./dataset/GTSRB/Final_Test")

Class_ID: 100%|██████████| 43/43 [00:06<00:00,  6.58it/s]


# Merge the train and test sets, then re-split them 70/30

In [35]:
# Per-class image folders produced by the two previous steps.
train_dir = './dataset/GTSRB/train'  # training set
test_dir = './dataset/GTSRB/test'    # test set

def merge(source_folder, destination_folder):
    """Move the tree under ``source_folder`` into ``destination_folder``.

    The relative layout is preserved: sub-directories are recreated in the
    destination (existing ones are reused) and files are moved, not copied.
    The emptied directory skeleton of ``source_folder`` is left behind for the
    caller to remove.

    Args:
        source_folder: Root of the tree to move.
        destination_folder: Root that receives the files; created if missing.
    """
    os.makedirs(destination_folder, exist_ok=True)

    # Count every file and directory up front so the progress bar has a total.
    total_items = sum(len(files) + len(dirs) for _, dirs, files in os.walk(source_folder))

    # The context manager closes the bar even if a move fails part-way.
    with tqdm(total=total_items,
              desc='Moving: ' + source_folder + ' --> ' + destination_folder,
              position=0, leave=True) as progress:
        # The walk is top-down, so a directory is recreated in the destination
        # while its parent is processed, before its own files are visited.
        for root, dirs, files in os.walk(source_folder):
            for item in files + dirs:
                source_item = os.path.join(root, item)
                destination_item = os.path.join(destination_folder,
                                                os.path.relpath(source_item, source_folder))

                if os.path.isfile(source_item):
                    # Files are moved to the mirrored location.
                    shutil.move(source_item, destination_item)
                elif os.path.isdir(source_item):
                    # Directories are only recreated; their contents follow later.
                    os.makedirs(destination_item, exist_ok=True)

                progress.update(1)
    
def merge_folders(source_folders, target_folder):
    """Merge each folder in ``source_folders`` into ``target_folder``, then delete it.

    Args:
        source_folders: Iterable of folder paths, processed in order.
        target_folder: Folder that receives the contents of all of them.
    """
    for folder in source_folders:
        merge(folder, target_folder)
        # Remove the source folder once its contents have been merged.
        shutil.rmtree(folder)

# Pool the train and test sets in a temporary folder.
merged_dir = "./dataset/GTSRB/merged"
merge_folders([train_dir, test_dir], merged_dir)

# Re-split the pooled data: 70 % train, 0 % val, 30 % test, with a fixed seed.
# NOTE(review): every file below merged_dir is split, not only images. The
# training class folders presumably still hold their per-class annotation CSVs,
# which would end up in train/ or test/ too - TODO confirm and filter if so.
splitfolders.ratio(merged_dir, output="./dataset/GTSRB/", seed=123, ratio=(.7, 0, 0.3), move=True)

# Clean up: the temporary merge folder, the val split (ratio 0) and the readme files.
for leftover_dir in (merged_dir, './dataset/GTSRB/val'):
    shutil.rmtree(leftover_dir)
for readme_name in ('Readme-Images-Final-test.txt', 'Readme-Images.txt'):
    os.remove('./dataset/GTSRB/' + readme_name)

Moving: ./dataset/GTSRB/train --> ./dataset/GTSRB/merged: 100%|██████████| 39295/39295 [00:25<00:00, 1543.81it/s]
Moving: ./dataset/GTSRB/test --> ./dataset/GTSRB/merged: 100%|██████████| 12673/12673 [00:07<00:00, 1755.18it/s]
Copying files: 51882 files [00:35, 1447.04 files/s]
