# 01. Data Preparation for Detection
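This notebook processes the raw datasets (generic and Belgian plates), extracts bounding-box coordinates from the XML annotation files, and writes the training CSV (`train_detection.csv`). All Belgian plates are used for training; test samples are managed manually in `data/processed/test_samples`.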

In [1]:
import os
import pandas as pd
import shutil
import sys
from sklearn.model_selection import train_test_split
from tqdm import tqdm

sys.path.append('../src')
from utils import parse_xml_annotation

In [2]:
# Paths
# All locations are relative, so they resolve against the kernel's current
# working directory when the cells run.
# Raw inputs: one folder per dataset, each passed to process_dataset() below.
RAW_GENERIC = '../data/raw/generic_plates'
RAW_BELGIAN = '../data/raw/belgian_plates'
# Output locations used by this notebook.
PROCESSED_DIR = '../data/processed'
TEST_SAMPLES_DIR = os.path.join(PROCESSED_DIR, 'test_samples')
CSV_PATH = os.path.join(PROCESSED_DIR, 'train_detection.csv')

# makedirs also creates the parent PROCESSED_DIR when it is missing, so the
# folder that will hold CSV_PATH exists too; exist_ok makes re-runs harmless.
os.makedirs(TEST_SAMPLES_DIR, exist_ok=True)

In [3]:
def process_dataset(folder_path, is_belgian=False):
    """Build one training record per annotated image found in ``folder_path``.

    Every XML file in the folder is handed to ``parse_xml_annotation``. When
    that returns a truthy bounding box and an image with the same base name
    sits next to the XML (``.png``, ``.jpg`` or ``.jpeg``, tried in that
    order), a record is emitted. Extensions are matched case-insensitively
    and files are visited in sorted order, so the result does not depend on
    the platform's directory-listing order or file-name casing.

    Args:
        folder_path: Directory holding the XML annotations and their images.
        is_belgian: Flag copied as-is into each record's ``is_belgian`` field.

    Returns:
        A list of dicts with the keys ``image_path`` (absolute path), ``x``,
        ``y``, ``w``, ``h`` (items 0-3 of the parsed box, in that order),
        ``is_belgian`` and ``filename``. An empty list is returned, after
        printing a warning, when ``folder_path`` does not exist.
    """
    if not os.path.exists(folder_path):
        print(f'Warning: Folder {folder_path} does not exist.')
        return []

    # Image extensions in lookup-priority order.
    image_exts = ('.png', '.jpg', '.jpeg')

    # os.listdir() yields entries in arbitrary order; sort for reproducibility.
    entries = sorted(os.listdir(folder_path))

    # (base name, lower-cased extension) -> real file name on disk. The base
    # name stays case-sensitive; only the extension is normalised.
    images = {}
    for entry in entries:
        stem, ext = os.path.splitext(entry)
        if ext.lower() in image_exts:
            images.setdefault((stem, ext.lower()), entry)

    xml_files = [f for f in entries if f.lower().endswith('.xml')]

    data = []
    missing_bbox = 0
    missing_image = 0
    for xml_file in tqdm(xml_files, desc=f'Processing {os.path.basename(folder_path)}'):
        bbox = parse_xml_annotation(os.path.join(folder_path, xml_file))
        if not bbox:
            missing_bbox += 1
            continue

        # Find corresponding image (try png, jpg, jpeg)
        base_name = os.path.splitext(xml_file)[0]
        image_name = next(
            (images[(base_name, ext)] for ext in image_exts if (base_name, ext) in images),
            None,
        )
        if image_name is None:
            missing_image += 1
            continue

        data.append({
            'image_path': os.path.abspath(os.path.join(folder_path, image_name)),
            'x': bbox[0],
            'y': bbox[1],
            'w': bbox[2],
            'h': bbox[3],
            'is_belgian': is_belgian,
            'filename': image_name
        })

    # Report dropped annotations instead of discarding them silently.
    skipped = missing_bbox + missing_image
    if skipped:
        print(
            f'Skipped {skipped} of {len(xml_files)} annotations in {folder_path} '
            f'({missing_bbox} without a bounding box, {missing_image} without a matching image).'
        )
    return data

In [4]:
# Collect the annotated samples of the generic dataset
generic_data = process_dataset(RAW_GENERIC, is_belgian=False)
generic_count = len(generic_data)
print(f'Found {generic_count} generic plates.')

Processing generic_plates: 100%|██████████| 433/433 [00:00<00:00, 6786.77it/s]

Found 433 generic plates.





In [5]:
# Collect the annotated samples of the Belgian dataset
belgian_data = process_dataset(RAW_BELGIAN, is_belgian=True)
belgian_count = len(belgian_data)
print(f'Found {belgian_count} Belgian plates.')

Processing belgian_plates: 100%|██████████| 451/451 [00:00<00:00, 5223.73it/s]

Found 395 Belgian plates.





In [6]:
# No hold-out split here: every Belgian sample goes into the training set
belgian_train = belgian_data
belgian_train_count = len(belgian_train)
print(f'Belgian Train: {belgian_train_count} (All used for training)')

Belgian Train: 395 (All used for training)


In [7]:
# The contents of ../data/processed/test_samples are managed by hand;
# this cell only reports where that directory is.
test_dir_message = f'Test samples directory: {TEST_SAMPLES_DIR}'
print(test_dir_message)

Test samples directory: ../data/processed\test_samples


In [8]:
# Merge both sources into a single training table (generic rows first)
train_data = [*generic_data, *belgian_train]
df = pd.DataFrame(train_data)

# Write the CSV only when there is at least one row
if df.empty:
    print('No training data to save.')
else:
    df.to_csv(CSV_PATH, index=False)
    print(f'Saved training data to {CSV_PATH}')
    print(df.head())

Saved training data to ../data/processed\train_detection.csv
                                          image_path    x    y    w    h  \
0  c:\Users\Paco\Documents\github\Inteligencia_ar...  226  125  193   48   
1  c:\Users\Paco\Documents\github\Inteligencia_ar...  134  128  128   32   
2  c:\Users\Paco\Documents\github\Inteligencia_ar...  140    5  163  143   
3  c:\Users\Paco\Documents\github\Inteligencia_ar...  175  114   39   17   
4  c:\Users\Paco\Documents\github\Inteligencia_ar...  167  202   73   18   

   is_belgian     filename  
0       False    Cars0.png  
1       False    Cars1.png  
2       False   Cars10.png  
3       False  Cars100.png  
4       False  Cars101.png  
