In [1]:
import pandas as pd

from pathlib import Path
import os

# Data location. (Original note: fuel_train includes fuel consumption in
# segments; the code below reads the "flights_train" folder.)
DATA_ORIGIN = Path("data")
PATH = str(DATA_ORIGIN / "flights_train")  # plain-string form of the folder path

# === CONFIG ===
DATA_DIR = DATA_ORIGIN / "flights_train"  # <- change this

# Columns every parquet file must provide; all other columns are discarded.
COLUMNS = [
    "timestamp",
    "longitude",
    "latitude",
    "altitude",
    "groundspeed",
    "track",
    "vertical_rate",
]


def load_all_parquet(dir_path: Path) -> pd.DataFrame:
    """
    Load every ``*.parquet`` file directly inside ``dir_path`` and stack the
    rows into one DataFrame restricted to ``COLUMNS``.

    Files are visited in sorted path order. A file that lacks any of the
    ``COLUMNS`` is reported on stdout and left out of the result. The
    ``timestamp`` column of the combined frame is passed through
    ``pd.to_datetime`` before returning.

    Raises:
        FileNotFoundError: the folder contains no ``.parquet`` files.
        ValueError: none of the files provides all required columns.
    """
    parquet_paths = sorted(dir_path.glob("*.parquet"))
    if len(parquet_paths) == 0:
        raise FileNotFoundError(f"No .parquet files found in {dir_path}")

    frames = []
    for path in parquet_paths:
        print(f"Reading {path.name} ...")
        frame = pd.read_parquet(path)

        # a file is usable only when it carries every required column
        absent = [name for name in COLUMNS if name not in frame.columns]
        if not absent:
            frames.append(frame[COLUMNS].copy())
        else:
            print(f"  -> skipping {path.name}, missing columns: {absent}")

    if len(frames) == 0:
        raise ValueError("No usable parquet files found (all missing required columns).")

    combined = pd.concat(frames, ignore_index=True)
    # make sure timestamps are datetimes regardless of how they were stored
    combined["timestamp"] = pd.to_datetime(combined["timestamp"])
    return combined

import numpy as np
import matplotlib.pyplot as plt

def plot_distributions(df: pd.DataFrame):
    """
    Draw one 200-bin histogram per numeric flight variable.

    Each sample is weighted by ``100 / (number of non-NA samples)`` so the
    bar heights of a histogram add up to 100 and the y-axis reads as a
    percentage of samples. A column with no non-NA values is reported on
    stdout and skipped.
    """
    numeric_columns = (
        "longitude",
        "latitude",
        "altitude",
        "groundspeed",
        "track",
        "vertical_rate",
    )

    for name in numeric_columns:
        values = df[name].dropna().values
        n_valid = values.size

        if n_valid == 0:
            print(f"Skipping {name}: no non-NA values.")
            continue

        # per-sample weight; all bins together sum to 100 (%)
        pct_weights = np.ones_like(values, dtype=float) / n_valid * 100.0

        plt.figure(figsize=(8, 4))
        plt.hist(values, bins=200, weights=pct_weights)
        plt.title(f"Histogram of {name}")
        plt.xlabel(name)
        plt.ylabel("Percentage of samples [%]")
        plt.tight_layout()
        plt.show()



def plot_timestamp_series(df: pd.DataFrame, max_points: int = 500_000):
    """
    Time-series style plots: altitude, groundspeed and vertical_rate
    versus timestamp.

    Rows are sorted by ``timestamp`` first. To avoid over-plotting / memory
    issues, if there are more than ``max_points`` rows only every
    ``step``-th row is kept, with ``step`` chosen so that at most
    ``max_points`` rows remain.

    Args:
        df: frame containing ``timestamp`` and the three plotted columns.
        max_points: upper bound on the number of rows plotted per figure.
    """
    df_ts = df.sort_values("timestamp")

    n = len(df_ts)
    if n > max_points:
        # Ceiling division. Floor division (n // max_points) yields a step
        # of 1 for every n below 2 * max_points, i.e. no thinning at all,
        # and can keep almost twice the requested cap.
        step = max(-(-n // max_points), 1)
        df_ts = df_ts.iloc[::step]
        print(f"Downsampled time-series from {n} to {len(df_ts)} points for plotting.")

    vars_num = ["altitude", "groundspeed", "vertical_rate"]
    for col in vars_num:
        plt.figure(figsize=(12, 4))
        plt.plot(df_ts["timestamp"], df_ts[col], linewidth=0.5)
        plt.title(f"{col} over time (all flights)")
        plt.xlabel("timestamp")
        plt.ylabel(col)
        plt.tight_layout()
        plt.show()


def plot_extra_helpful(df: pd.DataFrame):
    """
    Three supplementary 2D scatter plots:
      - longitude vs latitude (rough flight map)
      - groundspeed vs altitude
      - vertical rate vs altitude (only if vertical_rate has any non-NA value)
    """

    def _scatter(x_col, y_col, size, x_label, y_label, heading):
        # one figure: tiny, mostly transparent markers so dense regions stand out
        plt.figure(figsize=size)
        plt.scatter(df[x_col], df[y_col], s=1, alpha=0.1)
        plt.xlabel(x_label)
        plt.ylabel(y_label)
        plt.title(heading)
        plt.tight_layout()
        plt.show()

    # geographic coverage; obvious position outliers show up here
    _scatter(
        "longitude", "latitude", (6, 6),
        "longitude", "latitude",
        "Longitude vs Latitude (all flights)",
    )

    # altitude against groundspeed
    _scatter(
        "groundspeed", "altitude", (6, 4),
        "groundspeed [kt]", "altitude [ft]",
        "Altitude vs Groundspeed",
    )

    # altitude against vertical rate, skipped when the column is entirely NA
    has_vertical_rate = df["vertical_rate"].notna().any()
    if has_vertical_rate:
        _scatter(
            "vertical_rate", "altitude", (6, 4),
            "vertical_rate [ft/min?]", "altitude [ft]",
            "Altitude vs Vertical Rate",
        )


def main():
    """
    Run the exploration end to end: load the parquet files from DATA_DIR,
    print summary statistics, then produce the histogram, time-series and
    scatter plots.
    """
    # 1) load and merge all parquet files
    flights = load_all_parquet(DATA_DIR)

    # quick numeric summary (timestamp excluded)
    numeric_columns = [
        "longitude",
        "latitude",
        "altitude",
        "groundspeed",
        "track",
        "vertical_rate",
    ]
    tail_percentiles = [0.01, 0.05, 0.5, 0.95, 0.99]
    print("\n=== Summary statistics ===")
    summary = flights[numeric_columns].describe(percentiles=tail_percentiles)
    print(summary)

    # 2) distributions, 3) timestamp-based plots, 4) extra helpful plots
    plot_distributions(flights)
    plot_timestamp_series(flights)
    plot_extra_helpful(flights)


if __name__ == "__main__":
    main()


Reading prc770822360.parquet ...
Reading prc770831136.parquet ...
Reading prc770835414.parquet ...
Reading prc770844923.parquet ...
Reading prc770847190.parquet ...
Reading prc770852254.parquet ...
Reading prc770853645.parquet ...
Reading prc770859206.parquet ...
Reading prc770860078.parquet ...
Reading prc770864956.parquet ...
Reading prc770866486.parquet ...
Reading prc770867379.parquet ...
Reading prc770868424.parquet ...
Reading prc770870642.parquet ...
Reading prc770872129.parquet ...
Reading prc770875579.parquet ...
Reading prc770876146.parquet ...
Reading prc770876159.parquet ...
Reading prc770876744.parquet ...
Reading prc770878608.parquet ...
Reading prc770879349.parquet ...
Reading prc770881363.parquet ...
Reading prc770882585.parquet ...
Reading prc770882767.parquet ...
Reading prc770885136.parquet ...
Reading prc770887177.parquet ...
Reading prc770887278.parquet ...
Reading prc770887555.parquet ...
Reading prc770887933.parquet ...
Reading prc770888915.parquet ...
Reading pr

KeyboardInterrupt: 