In [1]:
import sys
import os
from dotenv import load_dotenv

# Project root is the parent of the directory the notebook is run from.
root_dir = os.path.abspath("..")
# Guarded so that re-running this cell does not stack duplicate entries on sys.path.
if root_dir not in sys.path:
    sys.path.append(root_dir)
dotenv_path = os.path.join(root_dir, ".env")
# Kept as the last expression so the boolean returned by load_dotenv is shown as the cell output.
load_dotenv(dotenv_path)

False

In [2]:
from math import radians, sin, cos, asin, sqrt
from typing import Dict, Optional, Tuple

import polars as pl
from pydantic import BaseModel

In [3]:
# Competition inputs live under <root>/kaggle.
train_file, test_file = (
    os.path.join(root_dir, "kaggle", parquet_name)
    for parquet_name in ("train.parquet", "test.parquet")
)

## 1. Data Engineering Plan

* Dataset has 3 major components
  * User information data
  * Flight information data (total flight)
  * Pricing data

### 1.1. Engineer Flight Data

* Create flight ontology layer: using ICAO's and IATA's publicly available data, enrich the flight information data
* Create flight embedding layer
  * Has text embedding (use Qwen 8B)
  * Has tabular embedding (numerical value + simple categorical (boolean) value)

In [4]:
# Additional Engineering file
def haversine(lat1, lon1, lat2, lon2, radius_km=6371):
    """
    Great-circle distance between two points given in decimal degrees.

    Args:
        lat1, lon1: latitude / longitude of the first point, in degrees.
        lat2, lon2: latitude / longitude of the second point, in degrees.
        radius_km: radius of the sphere the distance is measured on. Defaults
            to 6371 (Earth radius in km), so the result is in kilometres.

    Returns:
        Distance along the sphere's surface, in the unit of ``radius_km``.
    """
    lat1, lon1, lat2, lon2 = map(radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = sin(dlat / 2) ** 2 + cos(lat1) * cos(lat2) * sin(dlon / 2) ** 2
    # Floating-point rounding can push `a` marginally outside [0, 1] for
    # (near-)antipodal points, which would make asin raise a math domain error.
    a = min(1.0, max(0.0, a))
    c = 2 * asin(sqrt(a))
    return radius_km * c

def build_airline_name_map(airline_df: pl.DataFrame) -> dict:
    """
    Map each airline's IATA code to a short description string.

    The description has the form "<Airline> (<Country/Region>'s <type>)",
    where <type> is "LCC" when the row's LCC value is truthy and
    "full service carrier" otherwise. Rows whose IATA value is None are
    skipped; when a code occurs more than once the last row wins.
    """
    lookup = {}
    for record in airline_df.iter_rows(named=True):
        iata = record.get("IATA")
        if iata is not None:
            service_type = "LCC" if record.get("LCC") else "full service carrier"
            name, country = record.get("Airline"), record.get("Country/Region")
            lookup[iata] = f"{name} ({country}'s {service_type})"
    return lookup

def build_airport_maps(
    airport_df: pl.DataFrame,
) -> Tuple[Dict[str, str], Dict[str, str], Dict[str, float], Dict[str, Tuple[float, float]]]:
    """
    Build four IATA-keyed lookup dictionaries from a Polars airport dataframe.

    Returns, in this order:
      - meta_map:    { IATA -> "<City_Name> <AirportName> (<Country_Name>)" }
      - country_map: { IATA -> Country_Name }
      - offset_map:  { IATA -> UTC_Offset_Hours as float }
      - coord_map:   { IATA -> (GeoPointLat, GeoPointLong) as floats }

    Rows whose IATA value is None, NaN or the empty string are skipped.
    An airport only appears in country_map / offset_map / coord_map when the
    corresponding value is present; (0.0, 0.0) coordinates are treated as a
    placeholder and left out of coord_map.
    """

    def is_missing(value) -> bool:
        # None, or NaN (the only value that is not equal to itself).
        return value is None or value != value

    meta_map: Dict[str, str] = {}
    offset_map: Dict[str, float] = {}
    coord_map: Dict[str, Tuple[float, float]] = {}
    country_map: Dict[str, str] = {}

    for row in airport_df.iter_rows(named=True):
        code = row.get("IATA")
        if is_missing(code) or code == "":
            continue  # skip invalid codes

        # location metadata
        city = row.get("City_Name")
        name = row.get("AirportName")
        country = row.get("Country_Name")
        meta_map[code] = f"{city} {name} ({country})"
        if country and not is_missing(country) and code.strip() != "":
            country_map[code] = country

        # UTC offset
        offset = row.get("UTC_Offset_Hours")
        if not is_missing(offset):
            offset_map[code] = float(offset)

        # coordinates: both values must be present. A null value is None, and
        # `None == None` is True, so a NaN-only check would let it reach
        # float(None) and raise TypeError.
        lat = row.get("GeoPointLat")
        lon = row.get("GeoPointLong")
        if not is_missing(lat) and not is_missing(lon):
            if not (lat == 0.0 and lon == 0.0):  # skip placeholder coords
                coord_map[code] = (float(lat), float(lon))

    return meta_map, country_map, offset_map, coord_map

def build_aircraft_map(aircraft_df: pl.DataFrame) -> Tuple[Dict[str, str], Dict[str, str]]:
    """
    Build two lookup dicts keyed by aircraft code (the "IATA" column).

    Returns:
        (aircraft_map, aircraft_size_map) where
          - aircraft_map:      { code -> "Model" value }
          - aircraft_size_map: { code -> "WTC" value }

    Rows without a usable code (None, NaN or empty) are skipped entirely.
    A code is added to each map only when the corresponding value is present.
    """
    aircraft_map: Dict[str, str] = {}
    aircraft_size_map: Dict[str, str] = {}
    for row in aircraft_df.iter_rows(named=True):
        code = row.get("IATA")
        # Without this guard a row with a WTC but no code would be stored under
        # a None/NaN key, and every flight row with a missing aircraft code
        # would then pick up that unrelated size class.
        if not code or code != code:
            continue

        model = row.get("Model")
        if model and model == model:
            aircraft_map[code] = model

        wtc = row.get("WTC")
        if wtc and wtc == wtc:
            aircraft_size_map[code] = wtc

    return aircraft_map, aircraft_size_map

# Cabin class map: numeric cabinClass code -> wording used in the flight text.
# NOTE(review): the code-to-name assignment is taken as given here — confirm it
# against the dataset's documentation.
CABIN_MAP_TEXT = {
    1: "Economy class",
    2: "Economy Plus class",
    3: "Business class",
    4: "Premium class"
}

# Aircraft size wording keyed by the single-letter "WTC" value read from the
# aircraft list (presumably the ICAO wake turbulence category — TODO confirm).
WTC_MAP = {
    "L": "light aircraft",
    "M": "medium-size jet",
    "H": "heavy widebody jet",
    "J": "super heavy jet"
}

# Rough seat capacity assumed per WTC letter; used as the denominator when
# estimating the share of seats still available on a segment.
WTC_CAPACITY = {
    "L": 20,
    "M": 150,
    "H": 300,
    "J": 500
}

# Reference tables (airlines, airports, aircraft) live under <root>/kaggle/support.
airline_file = os.path.join(root_dir, "kaggle", "support", "airlines_lccs.csv")
airport_file = os.path.join(root_dir, "kaggle", "support", "Global Airports Dataset.csv")
aircraft_file = os.path.join(root_dir, "kaggle", "support", "Aircraft List.csv")

# Load each table into a Polars dataframe.
airline = pl.read_csv(airline_file)
airport = pl.read_csv(airport_file)
aircraft = pl.read_csv(aircraft_file)

# Turn the tables into plain-dict lookups keyed by IATA code.
airline_map = build_airline_name_map(airline)
(
    airport_map,
    airport_country_map,
    airport_offset_map,
    airport_coord_map,
) = build_airport_maps(airport)
aircraft_map, aircraft_size_map = build_aircraft_map(aircraft)

In [5]:
class Airport(BaseModel):
    """Bundle of airport lookup dicts, each keyed by airport IATA code as populated in this notebook."""
    meta: dict  # code -> description text used in flight sentences
    country: dict  # code -> country name
    timezone: dict  # code -> UTC offset in hours
    coord: dict  # code -> (latitude, longitude)

class Aircraft(BaseModel):
    """Bundle of aircraft lookup dicts, each keyed by aircraft code as populated in this notebook."""
    meta: dict  # code -> aircraft model name
    size: dict  # code -> WTC size letter (a key of WTC_MAP / WTC_CAPACITY when recognised)

class Airline(BaseModel):
    """Airline lookup wrapper, keyed by airline IATA code as populated in this notebook."""
    meta: dict  # code -> airline description text

In [6]:
# Wrap the raw lookup dicts in their typed containers so they can be passed around as one object each.
airport_info = Airport(
    meta=airport_map,
    country=airport_country_map,
    timezone=airport_offset_map,
    coord=airport_coord_map,
)
aircraft_info = Aircraft(
    meta=aircraft_map,
    size=aircraft_size_map,
)
airline_info = Airline(
    meta=airline_map,
)

In [7]:
def time_of_day_label(timestr: str) -> Tuple[Optional[str], Optional[bool]]:
    """
    Describe a departure time and flag whether it falls at night.

    Args:
        timestr: clock time such as '23:50:00' or '06:30:00'. Only the part
            before the first ':' (the hour) is interpreted.

    Returns:
        ``(label, is_night)``:
          - hours 22-23 and 0-4  -> "departing at <timestr> (night flight)", True
          - hours 5-9            -> "departing at <timestr> (morning flight)", False
          - any other valid hour -> "departing at <timestr>", False
        ``(None, None)`` when the input is None, NaN, empty, not a string,
        not parseable, or its hour lies outside 0-23.
    """
    if timestr is None or timestr != timestr or timestr == "":
        return None, None
    try:
        hour = int(timestr.split(":")[0])
    except (AttributeError, TypeError, ValueError):
        # not a string, or the hour part is not an integer
        return None, None
    if not 0 <= hour <= 23:
        # e.g. "25:00:00" must not be reported as a night flight
        return None, None
    if hour >= 22 or hour < 5:
        return f"departing at {timestr} (night flight)", True
    if hour < 10:
        return f"departing at {timestr} (morning flight)", False
    return f"departing at {timestr}", False

def duration_to_minutes(val):
    """
    Convert a duration value to whole minutes.

    Accepted inputs:
      - int / float: taken to be minutes already and truncated to int;
      - strings "HH:MM:SS", "HH:MM" or "MM"; the hour field may carry a leading
        day count separated by a dot ("D.HH:MM:SS"). Seconds are truncated to
        whole minutes;
      - timedelta-like objects exposing ``total_seconds()``.

    Returns:
        int number of minutes, or None for None, NaN, infinite or
        unparseable input.
    """
    if val is None or val != val:  # None or NaN
        return None
    # If already numeric
    if isinstance(val, (int, float)):
        try:
            return int(val)
        except OverflowError:  # +/- infinity cannot be converted
            return None
    # If string like "02:40:00"
    if isinstance(val, str):
        parts = val.split(":")
        try:
            if len(parts) in (2, 3):
                days = 0
                hours_txt = parts[0]
                if "." in hours_txt:
                    # day-prefixed form, e.g. "1.02:40:00" = 1 day, 2 h, 40 min
                    days_txt, hours_txt = hours_txt.split(".", 1)
                    days = int(days_txt)
                hours = int(hours_txt)
                minutes = int(parts[1])
                seconds = int(parts[2]) if len(parts) == 3 else 0
                return (days * 24 + hours) * 60 + minutes + (seconds // 60)
            if len(parts) == 1:
                return int(parts[0])
        except ValueError:
            return None
        return None
    # timedelta-like values (duck-typed so no extra import is needed)
    total_seconds = getattr(val, "total_seconds", None)
    if callable(total_seconds):
        return int(total_seconds() // 60)
    return None

def _departure_hour(value):
    """Return the hour (0-23) of a departure time value, or None if it cannot be read."""
    hour = getattr(value, "hour", None)  # datetime / time objects
    if isinstance(hour, int):
        return hour
    if not isinstance(value, str):
        return None
    # Accept plain "HH:MM:SS" as well as date-prefixed forms such as
    # "YYYY-MM-DDTHH:MM:SS" or "YYYY-MM-DD HH:MM:SS": keep only the clock part.
    clock = value.strip().replace("T", " ").split(" ")[-1]
    try:
        hour = int(clock.split(":")[0])
    except ValueError:
        return None
    return hour if 0 <= hour <= 23 else None


def engineer_flight_features(row: dict, aircraft_info: Aircraft, airport_info: Airport) -> dict:
    """
    Derive numeric features for the first segment of the outbound leg.

    Args:
        row: one dataset row as a dict; the "legs0_segments0_*" columns and
            "legs0_departureAt" are read.
        aircraft_info: provides ``size`` (aircraft code -> WTC letter).
        airport_info: provides ``coord`` and ``country`` lookups by IATA code.

    Returns:
        dict with the keys flight_duration_minutes, baggage_pieces, baggage_kg,
        aircraft_capacity_est, seats_available_pct, distance_km, haul_class,
        is_night_flight and is_domestic. Every feature falls back to 0 / 0.0
        when its inputs are missing or unparseable; note that haul_class is
        therefore 0 both for short flights and for unknown distances.
    """
    # single-segment example (you can loop if you want aggregate over segments)
    base = "legs0_segments0_"

    # duration
    dur_minutes = duration_to_minutes(row.get(base + "duration")) or 0

    # baggage
    qty = row.get(base + "baggageAllowance_quantity")
    unit = row.get(base + "baggageAllowance_weightMeasurementType")
    bag_pieces = 0
    bag_kg = 0.0  # float default keeps the column dtype stable across rows/chunks
    # `qty == qty` alone lets None through (None == None), which used to crash
    # in float()/int() below.
    if qty is not None and qty == qty and unit and unit == unit:
        try:
            if str(unit).lower() in ["kg", "kilo", "kgs"]:
                bag_kg = float(qty)
            else:
                bag_pieces = int(qty)
        except (TypeError, ValueError, OverflowError):
            # unparseable quantity: treat as "no allowance known"
            bag_pieces, bag_kg = 0, 0.0

    # aircraft capacity (approx)
    ac_code = row.get(base + "aircraft_code")
    wtc = aircraft_info.size.get(ac_code)
    cap_est = WTC_CAPACITY.get(wtc, 0)

    # seats available %
    seats = row.get(base + "seatsAvailable")
    seats_pct = 0.0
    if seats is not None and seats == seats and cap_est > 0:
        try:
            s_int = int(seats)
        except (TypeError, ValueError, OverflowError):
            s_int = None
        if s_int is not None:
            # capacity is only an estimate, so never report more than 100 %
            seats_pct = round(s_int / max(cap_est, s_int) * 100, 2)

    # distance & haul
    dep = row.get(base + "departureFrom_airport_iata")
    arr = row.get(base + "arrivalTo_airport_iata")
    dep_coord = airport_info.coord.get(dep)
    arr_coord = airport_info.coord.get(arr)
    distance_km = 0
    if dep_coord and arr_coord:
        try:
            distance_km = int(haversine(dep_coord[0], dep_coord[1], arr_coord[0], arr_coord[1]))
        except (TypeError, ValueError, IndexError):
            distance_km = 0
    haul_class = 0 if distance_km < 800 else 1 if distance_km < 3500 else 2

    # night flight? (22:00-04:59)
    dep_hour = _departure_hour(row.get("legs0_departureAt"))
    night_flag = 1 if dep_hour is not None and (dep_hour >= 22 or dep_hour < 5) else 0

    # domestic?
    dep_country = airport_info.country.get(dep)
    arr_country = airport_info.country.get(arr)
    domestic_flag = 1 if (dep_country and arr_country and dep_country == arr_country) else 0

    return {
        "flight_duration_minutes": dur_minutes,
        "baggage_pieces": bag_pieces,
        "baggage_kg": bag_kg,
        "aircraft_capacity_est": cap_est,
        "seats_available_pct": seats_pct,
        "distance_km": distance_km,
        "haul_class": haul_class,
        "is_night_flight": night_flag,
        "is_domestic": domestic_flag,
    }

def textify_flight_row(
    row: dict,
    airline_info: Airline,
    aircraft_info: Aircraft,
    airport_info: Airport,
) -> str:
    """
    Given a row of the dataset, return a text description of the flight.

    Up to 2 legs x 4 segments are inspected ("legs{leg}_segments{seg}_*"
    columns). A segment slot is used only when its marketing carrier code is
    present. Each used segment yields one sentence; optional clauses (aircraft,
    duration, distance, haul, cabin, baggage, seats, time difference, region)
    are appended only when the underlying values are available.

    Args:
        row: one dataset row as a dict.
        airline_info: provides ``meta`` (carrier code -> description).
        aircraft_info: provides ``meta`` (code -> model) and ``size`` (code -> WTC letter).
        airport_info: provides ``meta``, ``country``, ``timezone`` and ``coord`` lookups.

    Returns:
        The sentences joined with newlines; an empty string when no segment is present.
    """
    sentences = []
    for leg in range(2):
        seg_count = 0  # number of segments found so far in this leg
        for seg in range(4):
            base = f"legs{leg}_segments{seg}_"

            mkt_val = row.get(base + "marketingCarrier_code")
            if mkt_val is None or mkt_val != mkt_val or mkt_val == "":
                continue  # this segment slot is not used

            # carrier
            carrier_desc = airline_info.meta.get(mkt_val, f"Carrier {mkt_val}")

            # departure & arrival airports
            dep_code = row.get(base + "departureFrom_airport_iata")
            arr_code = row.get(base + "arrivalTo_airport_iata")

            dep_desc = airport_info.meta.get(dep_code, f"Airport {dep_code}")
            arr_desc = airport_info.meta.get(arr_code, f"Airport {arr_code}")

            # region context: only stated when both countries are actually known.
            # (A placeholder default here would make every unknown airport look
            # like a country and produce a false "international flight" claim.)
            dep_country = airport_info.country.get(dep_code)
            arr_country = airport_info.country.get(arr_code)
            region_context = None
            if dep_country and arr_country:
                if dep_country == arr_country:
                    region_context = "domestic flight"
                else:
                    region_context = f"international flight from {dep_country} to {arr_country}"

            # aircraft model and size class
            aircraft_code = row.get(base + "aircraft_code")
            aircraft_desc = aircraft_info.meta.get(aircraft_code, None)
            wtc = aircraft_info.size.get(aircraft_code)
            size_class = WTC_MAP[wtc] if wtc and wtc in WTC_MAP else None

            # baggage allowance
            bag_qty = row.get(base + "baggageAllowance_quantity")
            bag_type = row.get(base + "baggageAllowance_weightMeasurementType")
            bag_desc = None
            # explicit None check: `bag_qty == bag_qty` is True for None and
            # would render "baggage allowance None ..."
            if bag_qty is not None and bag_qty == bag_qty and bag_type and bag_type == bag_type:
                # bag_type might be "kg" or "piece"
                if str(bag_type).lower() in ["kg", "kilo", "kgs"]:
                    bag_desc = f"baggage allowance {bag_qty} kg"
                else:
                    bag_desc = f"baggage allowance {bag_qty} pieces"

            # seats available (capacity is a per-WTC estimate, never below the seat count)
            seats = row.get(base + "seatsAvailable")
            seats_desc = None
            if seats == seats and seats is not None and wtc and wtc in WTC_CAPACITY:
                try:
                    seats_int = int(seats)
                    pct = round(seats_int / max(WTC_CAPACITY[wtc], seats_int) * 100, 2)
                    seats_desc = f"seats available {seats_int} ({pct}%) (capacity {max(WTC_CAPACITY[wtc], seats_int)})"
                except (TypeError, ValueError, OverflowError):
                    seats_desc = None  # unparseable seat count: omit the clause

            # duration
            dur_minutes = duration_to_minutes(row.get(base + "duration"))

            # cabin
            cab_val = row.get(base + "cabinClass")
            cabin_desc = CABIN_MAP_TEXT.get(cab_val, cab_val) if cab_val == cab_val else None

            # time difference (absolute difference of the two UTC offsets)
            time_diff = None
            dep_offset = airport_info.timezone.get(dep_code)
            arr_offset = airport_info.timezone.get(arr_code)
            if dep_offset is not None and arr_offset is not None:
                try:
                    time_diff = abs(float(dep_offset) - float(arr_offset))
                except (TypeError, ValueError):
                    time_diff = None

            # distance & haul length
            distance_km = None
            dep_coord = airport_info.coord.get(dep_code)
            arr_coord = airport_info.coord.get(arr_code)
            if dep_coord and arr_coord:
                try:
                    distance_km = int(haversine(dep_coord[0], dep_coord[1], arr_coord[0], arr_coord[1]))
                except (TypeError, ValueError, IndexError):
                    distance_km = None

            haul = None
            if distance_km is not None:
                if distance_km < 800:
                    haul = "short-haul"
                elif distance_km < 3500:
                    haul = "medium-haul"
                else:
                    haul = "long-haul"

            # choose label
            # NOTE(review): the first segment of a leg is labelled "Direct flight"
            # even when connecting segments follow — confirm this wording is intended.
            if seg_count == 0:
                label = "Direct flight" if leg == 0 else "\nReturn flight"
            else:
                label = "Connecting flight"

            # build sentence
            sentence = f"{label}: {carrier_desc} from {dep_desc} to {arr_desc}"
            if aircraft_desc:
                sentence += f", aircraft {aircraft_desc}"
            if size_class:
                sentence += f" ({size_class})"
            if dur_minutes is not None:
                sentence += f", flight duration {dur_minutes} minutes"
            if distance_km is not None:
                sentence += f", travel distance {distance_km} km"
            if haul:
                sentence += f", {haul} flight"
            if cabin_desc:
                sentence += f", cabin class {cabin_desc}"
            if bag_desc:
                sentence += f", {bag_desc}"
            if seats_desc:
                sentence += f", {seats_desc}"
            if time_diff is not None:
                sentence += f", destination time difference {time_diff} hours"
            if region_context:
                sentence += f", {region_context}"

            sentences.append(sentence + ".")  # Add a period to each
            seg_count += 1

    return "\n".join(sentences)

In [8]:
def process_flight_parquet(
    parquet_path: str,
    airline_info: Airline,
    aircraft_info: Aircraft,
    airport_info: Airport,
    chunk_size: int = 200_000,
    output_path: str = None,
):
    """
    Build one flight_text string per row of a parquet file.

    Only the per-segment columns needed by textify_flight_row are selected.
    The selection is collected into memory once and then walked in slices of
    `chunk_size` rows.

    When `output_path` is given, each slice is written to
    "{output_path}_part_{i}.parquet" (one file per slice) and nothing is
    returned. Otherwise the list of all texts is returned.
    """
    segment_fields = [
        "marketingCarrier_code",
        "departureFrom_airport_iata",
        "arrivalTo_airport_iata",
        "aircraft_code",
        "baggageAllowance_quantity",
        "baggageAllowance_weightMeasurementType",
        "seatsAvailable",
        "duration",
        "cabinClass",
    ]
    needed_cols = []
    for leg in range(2):
        for seg in range(4):
            for col in segment_fields:
                needed_cols.append(f"legs{leg}_segments{seg}_{col}")

    df = pl.scan_parquet(parquet_path).select(needed_cols).collect(engine="streaming")

    all_texts = []
    for start in range(0, df.height, chunk_size):
        texts = []
        for row in df.slice(start, chunk_size).iter_rows(named=True):
            texts.append(textify_flight_row(row, airline_info, aircraft_info, airport_info))

        if output_path:
            # one parquet file per chunk
            chunk_frame = pl.DataFrame({"flight_text": texts})
            chunk_frame.write_parquet(f"{output_path}_part_{start//chunk_size}.parquet")
        else:
            all_texts.extend(texts)

    if not output_path:
        return all_texts
        

def process_flight_parquet_with_features(
    parquet_path: str,
    airline_info: Airline,
    aircraft_info: Aircraft,
    airport_info: Airport,
    chunk_size: int = 200_000,
    output_path: str = None,
):
    """
    Build engineered features plus a flight_text column for every row of a parquet file.

    The file is scanned lazily and collected `chunk_size` rows at a time; each
    chunk is turned into a dataframe holding the engineer_flight_features
    columns and a "flight_text" column.

    Args:
        parquet_path: input parquet file.
        airline_info, aircraft_info, airport_info: lookup containers passed on
            to textify_flight_row / engineer_flight_features.
        chunk_size: rows per chunk; must be positive.
        output_path: when non-empty, each chunk is written to
            "{output_path}_part_{i}.parquet" and None is returned. When None
            or empty, all chunks are concatenated and returned.

    Returns:
        A polars DataFrame (empty if the input has no rows) when no
        output_path is given, otherwise None.

    Raises:
        ValueError: if chunk_size is not positive.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    needed_cols = [
        "legs0_departureAt",
        *(f"legs{leg}_segments{seg}_{col}"
          for leg in range(2) for seg in range(4)
          for col in [
              "marketingCarrier_code",
              "departureFrom_airport_iata",
              "arrivalTo_airport_iata",
              "aircraft_code",
              "baggageAllowance_quantity",
              "baggageAllowance_weightMeasurementType",
              "seatsAvailable",
              "duration",
              "cabinClass",
          ])
    ]

    scan = pl.scan_parquet(parquet_path).select(needed_cols)
    row_count = scan.select(pl.len()).collect(engine="streaming")[0, 0]

    # One truthiness test decides the mode everywhere, so an empty-string
    # output_path is handled as "return the result" instead of crashing.
    write_parts = bool(output_path)
    all_rows = []

    for start in range(0, row_count, chunk_size):
        batch_df = scan.slice(start, chunk_size).collect(engine="streaming")
        texts = []
        feats = []

        for row in batch_df.iter_rows(named=True):
            texts.append(textify_flight_row(row, airline_info, aircraft_info, airport_info))
            feats.append(engineer_flight_features(row, aircraft_info, airport_info))

        # Infer dtypes from every row of the chunk: a feature can be an int in
        # the first rows and a float later on.
        result_df = pl.DataFrame(feats, infer_schema_length=None)
        result_df = result_df.with_columns(pl.Series("flight_text", texts))

        if write_parts:
            result_df.write_parquet(f"{output_path}_part_{start // chunk_size}.parquet")
        else:
            all_rows.append(result_df)

        del batch_df, result_df, texts, feats

    if not write_parts:
        if not all_rows:
            return pl.DataFrame()  # empty input: pl.concat([]) would raise
        # "vertical_relaxed" casts columns to a common supertype when chunks
        # were inferred with different numeric dtypes.
        return pl.concat(all_rows, how="vertical_relaxed")

In [9]:
# Build features + text for both splits; no output_path, so the full frames are returned.
flight_train = process_flight_parquet_with_features(
    train_file, airline_info, aircraft_info, airport_info, chunk_size=100_000
)

flight_test = process_flight_parquet_with_features(
    test_file, airline_info, aircraft_info, airport_info, chunk_size=100_000
)

In [10]:
# Make sure the output folder exists so write_parquet does not fail on a fresh checkout.
os.makedirs(os.path.join(root_dir, "data"), exist_ok=True)

# Guarded so that re-running this cell does not try to add a second `row_id` column.
if "row_id" not in flight_train.columns:
    flight_train = flight_train.with_row_index(name='row_id')
flight_train.write_parquet(os.path.join(root_dir, "data", "processed_flight_features_train.parquet"))

if "row_id" not in flight_test.columns:
    flight_test = flight_test.with_row_index(name='row_id')
flight_test.write_parquet(os.path.join(root_dir, "data", "processed_flight_features_test.parquet"))