In [6]:
import os
import xml.etree.ElementTree as ET
import pandas as pd

# Folder containing the XML files.
# NOTE: this is an absolute, machine-specific path; point it at wherever the
# XML files live before running the cell.
XML_FOLDER = "/Users/macbook/Desktop/data mining project/Data mining/data"   # change this to your folder path

# Namespace (important!): every element lookup below is qualified through this
# prefix mapping, so unqualified tag names would not match.
ns = {"ns": "http://www.iata.org/IATA/2007/00"}

# Column order of the output CSV. Passed explicitly to the DataFrame so the
# header is written even when no file yields a row.
_COLUMNS = [
    "airline",
    "flightnumber",
    "departureairport",
    "arrivalairport",
    "origindate",
    "operationtime",
    "departureorarrival",
    "aircraftterminal",
    "status",
    "aircrafttype",
]


def _child_text(parent, path):
    """Return the text of the first element matching ``path`` under ``parent``.

    Returns None when ``parent`` is None or when nothing matches, so a missing
    section produces empty fields instead of an AttributeError.
    """
    if parent is None:
        return None
    return parent.findtext(path, default=None, namespaces=ns)


def _flight_record(root):
    """Build one output row (a dict keyed like ``_COLUMNS``) from an XML root.

    Only the first direct ``FlightLeg`` child of ``root`` is read. Returns None
    when there is no such child. Any missing sub-element yields None for the
    corresponding field.
    """
    flight = root.find("ns:FlightLeg", ns)
    if flight is None:
        return None

    leg_id = flight.find("ns:LegIdentifier", ns)
    leg_data = flight.find("ns:LegData", ns)

    # The resource element lives under LegData, so it can only be looked up
    # when LegData itself is present.
    resource = None
    if leg_data is not None:
        resource = leg_data.find("ns:AirportResources/ns:Resource", ns)

    return {
        "airline": _child_text(leg_id, "ns:Airline"),
        "flightnumber": _child_text(leg_id, "ns:FlightNumber"),
        "departureairport": _child_text(leg_id, "ns:DepartureAirport"),
        "arrivalairport": _child_text(leg_id, "ns:ArrivalAirport"),
        "origindate": _child_text(leg_id, "ns:OriginDate"),
        "operationtime": _child_text(leg_data, "ns:OperationTime"),
        # DepartureOrArrival is an attribute of the resource element, not a child.
        "departureorarrival": resource.attrib.get("DepartureOrArrival") if resource is not None else None,
        "aircraftterminal": _child_text(resource, "ns:AircraftTerminal"),
        "status": _child_text(leg_data, "ns:PublicStatus"),
        "aircrafttype": _child_text(leg_data, "ns:AircraftInfo/ns:AircraftType"),
    }


rows = []

# os.listdir returns names in arbitrary order; sorting makes the row order of
# the CSV identical on every run.
for filename in sorted(os.listdir(XML_FOLDER)):
    if not filename.endswith(".xml"):
        continue

    file_path = os.path.join(XML_FOLDER, filename)

    # Only the parse step is guarded: an unreadable or malformed file is
    # reported and skipped, while mistakes in the extraction code surface
    # normally instead of being printed as per-file errors.
    try:
        root = ET.parse(file_path).getroot()
    except Exception as e:
        print(f"Error processing {filename}: {e}")
        continue

    record = _flight_record(root)
    if record is not None:
        rows.append(record)

# Create DataFrame (explicit columns keep the schema stable for empty input)
df = pd.DataFrame(rows, columns=_COLUMNS)

# Save to CSV
df.to_csv("flights_clean.csv", index=False)

print("CSV file created successfully!")


CSV file created successfully!


In [7]:
# Preview the first five rows of the parsed flights table
# (five is also what head() returns when called without an argument).
df.head(n=5)


Unnamed: 0,airline,flightnumber,departureairport,arrivalairport,origindate,operationtime,departureorarrival,aircraftterminal,status,aircrafttype
0,SF,1803,HME,ALG,2024-12-04,2024-12-04T08:05:00Z,Arrival,1,LAN,738
1,AH,1009,ORY,ALG,2024-12-04,2024-12-04T09:46:00Z,Arrival,4,ONB,738
2,SF,2320,ALG,AZR,2024-12-04,2024-12-04T13:30:00Z,Departure,1,SCT,DH4
3,AH,6106,ALG,ORN,2024-12-04,2024-12-04T22:30:00Z,Departure,1,SCT,738
4,AF,1655,ALG,CDG,2024-12-03,2024-12-03T19:15:00Z,Departure,4,SCT,223
