# Preprocessing data

In [1]:
import os
import cv2
import pandas as pd
import xml.etree.ElementTree as ET

### Path to the dataset folder

In [2]:
# Root folder of the dataset; the XML annotations are read from its
# "annotations" subfolder. Relative paths resolve against the working directory.
dataset_path = "archive"

### Output CSV file

In [3]:
# Destination file for the flattened annotation table.
csv_file = "dataset.csv"

### Create a DataFrame to store image information

In [4]:
# Empty frame that fixes the column names and their order for the rows
# gathered from the annotation files.
df = pd.DataFrame(
    columns=[
        'filename',
        'width', 'height',
        'class',
        'xmin', 'ymin', 'xmax', 'ymax',
    ]
)

### Loop through each XML file in the annotations folder

In [5]:
# Flatten the XML annotations into one table row per annotated object.
annotations_path = os.path.join(dataset_path, "annotations")

# Rows are collected as plain dicts and turned into a DataFrame once at the end.
# Calling pd.concat inside the loop copies the whole frame on every iteration
# and makes the cell append duplicates when it is re-run.
records = []

# sorted() gives a reproducible row order; os.listdir returns entries in
# arbitrary order.
for xml_file in sorted(os.listdir(annotations_path)):
    if not xml_file.endswith(".xml"):
        continue

    xml_path = os.path.join(annotations_path, xml_file)

    # Parse XML file
    root = ET.parse(xml_path).getroot()

    # Image-level fields are shared by every object annotated in this file.
    filename = root.find('filename').text
    width = int(root.find('size/width').text)
    height = int(root.find('size/height').text)

    # A file may describe several objects. Emit one row for each of them
    # instead of only the first; a file without objects contributes no rows.
    for obj in root.findall('object'):
        records.append({
            'filename': filename,
            'width': width,
            'height': height,
            'class': obj.find('name').text,
            'xmin': int(obj.find('bndbox/xmin').text),
            'ymin': int(obj.find('bndbox/ymin').text),
            'xmax': int(obj.find('bndbox/xmax').text),
            'ymax': int(obj.find('bndbox/ymax').text),
        })

# Rebuild df from scratch, reusing the column names and order it was created with,
# so the cell yields the same result however many times it is executed.
df = pd.DataFrame(records, columns=df.columns)


### Save the DataFrame to a CSV file

In [6]:
# index=False leaves the DataFrame's row index out of the written file.
df.to_csv(csv_file, index=False)

In [7]:
# Confirmation message; it is printed unconditionally and does not verify the file contents.
print("Dataset preprocessing complete. CSV file created.")

Dataset preprocessing complete. CSV file created.
