In [None]:
from bs4 import BeautifulSoup

PATH = "../data/HS2024070201MF_VXS Versuchsdatei.XML"
OUTPUT = "../data/output.csv"

# Read the raw bytes instead of decoding with the platform's default text
# encoding (which differs between operating systems). Both XML parsers used in
# this notebook accept bytes and then honor the encoding declared in the XML
# prolog, so non-ASCII characters are decoded correctly everywhere.
with open(PATH, "rb") as f:
    data = f.read()

# Quick sanity check: collect and show every <PartNumber> element in the file.
bs_data = BeautifulSoup(data, "xml")
b_unique = bs_data.find_all("PartNumber")

print(b_unique)

In [None]:
import xml.etree.ElementTree as ET
from collections import defaultdict

# Parse the XML data
namespace = {"ns": "http://www.dat.de/vxs"}
root = ET.fromstring(data)

# Aggregated data per part number. "total_price" is added to an entry the first
# time a complete position for that part number is seen.
parts_data = defaultdict(
    lambda: {"description": "", "amount": 0, "value_per_unit": None}
)

# Get the VIN; fail with a clear message instead of an AttributeError on None.
vin_elem = root.find(".//ns:VehicleIdentNumber", namespaces=namespace)
if vin_elem is None:
    raise ValueError("No VehicleIdentNumber element found in the XML document")
vin = vin_elem.text

# Iterate over each MaterialPosition and sum up the amounts per part number
for material_position in root.findall(
    ".//ns:MaterialPosition", namespaces=namespace
):
    part_number_elem = material_position.find("ns:PartNumber", namespaces=namespace)
    description_elem = material_position.find("ns:Description", namespaces=namespace)
    amount_elem = material_position.find("ns:Amount", namespaces=namespace)
    value_elem = material_position.find("ns:ValuePerUnit", namespaces=namespace)

    # Positions lacking a part number, amount or unit price are skipped.
    if part_number_elem is None or amount_elem is None or value_elem is None:
        continue

    part_number = part_number_elem.text
    part = parts_data[part_number]

    # Description is optional: keep the previously stored one when this
    # position does not carry a <Description> element.
    if description_elem is not None:
        part["description"] = description_elem.text

    part["amount"] += float(amount_elem.text)

    unit_price = float(value_elem.text)
    previous_price = part["value_per_unit"]
    if previous_price is not None and unit_price != previous_price:
        print(
            f"Value per unit changed from {previous_price} to {unit_price} for part number {part_number}"
        )

    # NOTE: the most recent unit price wins, and the total is the accumulated
    # amount times that latest price (so it always equals amount * unit price
    # of the exported row, even if earlier positions had a different price).
    part["value_per_unit"] = unit_price
    part["total_price"] = part["amount"] * part["value_per_unit"]

# Convert to dataframe

import numpy as np
import pandas as pd

# Column order of the exported CSV:
# Pos., Typ, Teilenummer, Beschreibung, TG, UPE, Menge, Gesamtpreis
csv_columns = [
    "Pos.",
    "Typ",
    "Teilenummer",
    "Beschreibung",
    "TG",
    "UPE",
    "Menge",
    "Gesamtpreis",
]

# The first row (Pos. 1) carries the VIN as a "TEXT" position without any
# numeric values; the parts follow as "TNR" rows in the order they were
# collected, numbered from 2. Building the rows explicitly also works when
# parts_data is empty (the result is then just the VIN row).
csv_rows = [
    {
        "Pos.": 1,
        "Typ": "TEXT",
        "Teilenummer": vin,
        "Beschreibung": None,
        "TG": None,
        "UPE": None,
        "Menge": None,
        "Gesamtpreis": None,
    }
]
csv_rows.extend(
    {
        "Pos.": position,
        "Typ": "TNR",
        "Teilenummer": number,
        "Beschreibung": part["description"],
        "TG": None,
        "UPE": part["value_per_unit"],
        "Menge": part["amount"],
        "Gesamtpreis": part["total_price"],
    }
    for position, (number, part) in enumerate(parts_data.items(), start=2)
)

df = pd.DataFrame(csv_rows, columns=csv_columns)

# Make every numeric column a real float column *before* rounding/exporting:
# rounding an object-dtype column does not reliably round anything, and
# to_csv only applies decimal="," to float columns ("Menge" included).
df = df.astype(
    {
        "TG": np.float64,
        "UPE": np.float64,
        "Menge": np.float64,
        "Gesamtpreis": np.float64,
    }
)

# Round to 2 decimal places
df["UPE"] = df["UPE"].round(2)

# Save to csv (semicolon separated, decimal comma)
df.to_csv(OUTPUT, index=False, sep=";", columns=df.columns, decimal=",")

df.info()