In [None]:
import os

import kagglehub
import pandas as pd
from lxml import etree


In [None]:
# Download the hazmat-detection dataset via kagglehub. The returned location
# is kept in `path`, which the annotation-loading cell builds on.
dataset_handle = "stanislavlevendeev/hazmat-detection"
path = kagglehub.dataset_download(dataset_handle)

print("Path to dataset files:", path)

In [None]:
# Locate the CVAT annotation export inside the downloaded dataset.
# os.path.join picks the separator of the host OS; hard-coded backslashes
# would only resolve on Windows, and "\c" is not a valid string escape.
xml_file = os.path.join(path, "cvat-for-video-dataset-hazmat-codes", "annotations.xml")
print("Path to xml file:", xml_file)

# Parse the XML document and keep its root element for the following cells.
tree = etree.parse(xml_file)
root = tree.getroot()
print("Root element:", root.tag)


In [None]:
def _text_or(node, xpath, default):
    """Return the text of the first element matching ``xpath`` under ``node``.

    ``default`` is returned when no such element exists. The element is looked
    up once, rather than once for the existence check and again for the value.
    """
    found = node.find(xpath)
    return found.text if found is not None else default


# Build one record per <task> found under the first <tasks> element,
# keyed by the task id text.
tasks = root.find(".//tasks")
tasks_json = {}
if tasks is not None:
    number_frames = 0  # total frames of all tasks processed before the current one
    i = 0  # position of the current task in document order
    for task in tasks.findall("task"):
        task_id = _text_or(task, "id", "No ID")
        job_id = _text_or(task, ".//segment/id", "No ID")
        task_name = _text_or(task, "name", "No Name")
        task_source = _text_or(task, "source", "No source")
        task_frames = _text_or(task, "size", "No frames")

        # The frame count feeds the running offset below, so a missing or
        # non-numeric <size> is reported with context instead of surfacing as
        # a bare int() failure on the placeholder text.
        try:
            frame_count = int(task_frames)
        except (TypeError, ValueError):
            raise ValueError(
                f"Task {task_id!r} ({task_name!r}) has no usable <size> value "
                f"({task_frames!r}); frame offsets cannot be computed."
            ) from None

        # A repeated task id keeps its first record; the counters below still
        # advance for every <task> element.
        if task_id not in tasks_json:
            tasks_json[task_id] = {
                "id": task_id,
                "name": task_name,
                "job_id": job_id,
                "source": task_source,
                "frames": task_frames,
                # Offset later subtracted from a box's frame number to obtain
                # its frame index relative to this task.
                "frames_id": number_frames,
                "idx": i,
                "labels": []
            }
        i += 1
        number_frames += frame_count
        print(f"Task ID: {task_id}, Task Name: {task_name}, Task Source: {task_source}")
else:
    print("No tasks found in the XML.")


In [None]:
# Inspect the collected task records (id, name, job id, source, frame count,
# frame offset, index and label list per task).
print(tasks_json)

In [None]:
# Attach every annotated <box> to the task record that owns its <track>.
tracks = root.findall("track")

# Start from empty label lists so that re-running this cell does not append
# the same boxes a second time.
for task_record in tasks_json.values():
    task_record["labels"] = []

for track in tracks:
    attributes = track.attrib
    task_id = attributes.get("task_id")
    print("Attributes:", attributes)
    print("Task ID:", task_id)

    # Fail with context when a track points at a task that was not parsed,
    # rather than with a bare KeyError on the lookup.
    task_record = tasks_json.get(task_id)
    if task_record is None:
        raise KeyError(
            f"Track references task_id {task_id!r}, which is not among the parsed tasks."
        )

    frame_offset = task_record["frames_id"]
    for box in track.findall("box"):
        box_attributes = box.attrib
        label_obj = {
            # Kept as the raw attribute string; the frame is converted to int
            # only for the relative index below.
            "absolute_frame": box_attributes.get("frame"),
            "relative_frame": int(box_attributes.get("frame")) - frame_offset,
            "xtl": box_attributes.get("xtl"),
            "ytl": box_attributes.get("ytl"),
            "xbr": box_attributes.get("xbr"),
            "ybr": box_attributes.get("ybr")
        }
        # Each nested <attribute name="..."> becomes an extra key on the label.
        for attr in box.findall("attribute"):
            label_obj[attr.attrib.get("name")] = attr.text
        task_record["labels"].append(label_obj)


In [None]:
# Flatten the per-task label lists into one row per annotated box.
columns = [
    "Task ID",
    "Task Name",
    "Job Id",
    "Source",
    "Frames",
    "Absolute Frame",
    "Relative Frame",
    "XTL",
    "YTL",
    "XBR",
    "YBR",
    "Code",
    "Issue",
]

rows = []
for task_data in tasks_json.values():
    for label in task_data["labels"]:
        # Combine task and label data into a single row
        rows.append({
            "Task ID": task_data["id"],
            "Task Name": task_data["name"],
            "Job Id": task_data["job_id"],
            "Source": task_data["source"],
            "Frames": task_data["frames"],
            "Absolute Frame": int(label["absolute_frame"]),
            "Relative Frame": label["relative_frame"],
            "XTL": label["xtl"],
            "YTL": label["ytl"],
            "XBR": label["xbr"],
            "YBR": label["ybr"],
            # "code" and "issue" come from per-box <attribute> elements, so a
            # box without them yields an empty cell rather than a KeyError.
            "Code": label.get("code"),
            "Issue": label.get("issue"),
        })

# Create a DataFrame; passing the columns explicitly keeps the header (and the
# sort key) present even when no labels were collected.
df = pd.DataFrame(rows, columns=columns)

# Sort by absolute frame ascending
df = df.sort_values("Absolute Frame", ascending=True)

# Display the DataFrame
print(df.head())

# Save the DataFrame to a CSV file, creating the target directory if needed
output_file = "./data/labels_dataframe.csv"
os.makedirs(os.path.dirname(output_file), exist_ok=True)
df.to_csv(output_file, index=False)
print(f"CSV file '{output_file}' created successfully.")