# Flight CSV Header Comparison

This notebook inspects all CSV files in the `~/flight_data` directory, compares each file's header against that of the first file (in sorted order), and highlights any mismatches so they can be resolved before converting the data to Parquet.


In [None]:
from pathlib import Path
import pandas as pd

# Location of the flight CSV files and the glob used to select them.
pattern = "*.csv"
input_dir = Path.home() / "flight_data"

# Collect the glob matches, drop anything that is not a regular file,
# and sort the result by path.
csv_files = sorted(filter(Path.is_file, input_dir.glob(pattern)))

print(f"Found {len(csv_files)} CSV files in {input_dir}")
for csv_path in csv_files:
    print(f"- {csv_path.name}")

# Nothing to compare: fail now rather than in a later cell.
if len(csv_files) == 0:
    raise FileNotFoundError(f"No CSV files found in {input_dir} matching {pattern}")


In [None]:
# Compare every file's header with the header of the first file, which
# serves as the reference. One summary row is recorded per file.
header_info = []
reference_columns = None
reference_file = None

for path in csv_files:
    # nrows=0 reads the header only; no data rows are loaded.
    columns = pd.read_csv(path, nrows=0).columns.tolist()

    # The first file processed becomes the reference. Comparing it with
    # itself below yields no missing/extra columns and no order mismatch.
    if reference_columns is None:
        reference_columns, reference_file = columns, path.name

    expected_names = set(reference_columns)
    found_names = set(columns)
    missing = sorted(expected_names - found_names)
    extra = sorted(found_names - expected_names)

    # An order mismatch is only reported when the column names agree as a
    # set but their sequence differs from the reference.
    same_names = not missing and not extra
    order_mismatch = same_names and columns != reference_columns

    # Joining an empty list gives "", so files without differences get
    # empty strings in the two column-list fields.
    summary_row = {
        "file": path.name,
        "missing_columns": ", ".join(missing),
        "extra_columns": ", ".join(extra),
        "order_mismatch": order_mismatch,
        "column_count": len(columns),
    }
    header_info.append(summary_row)

results_df = pd.DataFrame(header_info)
results_df


In [None]:
# A file is a mismatch when it has missing columns, extra columns, or the
# same columns in a different order. The two column-list fields hold ""
# when there is nothing to report, which astype(bool) maps to False.
has_missing = results_df["missing_columns"].astype(bool)
has_extra = results_df["extra_columns"].astype(bool)
is_reordered = results_df["order_mismatch"].astype(bool)
mismatch_mask = has_missing | has_extra | is_reordered

if not mismatch_mask.any():
    print("All CSV files share identical headers.")
else:
    # Show only the offending files, renumbered from zero.
    print(f"Reference file: {reference_file}")
    mismatched_rows = results_df.loc[mismatch_mask].reset_index(drop=True)
    display(mismatched_rows)
