In [1]:
import sys
import os
from dotenv import load_dotenv

# Project root is the parent of the notebook's working directory.
root_dir = os.path.abspath("..")
# Guard the append so re-running this cell does not pile up duplicate
# entries on sys.path.
if root_dir not in sys.path:
    sys.path.append(root_dir)
# Load environment variables from the project's .env file; the call is left
# as the last expression so the cell still displays its boolean result.
dotenv_path = os.path.join(root_dir, ".env")
load_dotenv(dotenv_path)

True

In [2]:
import polars as pl

In [3]:
# Both raw inputs sit side by side in <root_dir>/kaggle.
train_file, test_file = (
    os.path.join(root_dir, "kaggle", filename)
    for filename in ("train.parquet", "test.parquet")
)

In [4]:
def get_session_ranks(df: pl.DataFrame) -> pl.DataFrame:
    """Compute per-``ranker_id`` rank features and a preferred-airline flag.

    Args:
        df: Frame containing ``ranker_id``, ``totalPrice``,
            ``legs0_departureAt``, ``frequentFlyer`` and at least one of the
            ``legs{0,1}_segments{0,1,2}_marketingCarrier_code`` columns.

    Returns:
        A frame in the same row order as ``df`` with the columns:

        * ``row_id`` - positional index of the row in ``df``.
        * ``price_rank`` - dense rank of ``totalPrice`` within ``ranker_id``.
        * ``price_delta`` - ``totalPrice`` minus the minimum ``totalPrice``
          of the same ``ranker_id``; nulls are replaced by ``0.0``.
        * ``departure_rank`` - dense rank of ``legs0_departureAt`` within
          ``ranker_id``.
        * ``is_preferred_airline`` - 1 if any segment's marketing carrier
          code appears in the ``/``-separated ``frequentFlyer`` value, else 0.

    Raises:
        ValueError: If none of the marketing-carrier columns exist in ``df``.
    """
    candidate_cols = [
        "legs0_segments0_marketingCarrier_code",
        "legs0_segments1_marketingCarrier_code",
        "legs0_segments2_marketingCarrier_code",
        "legs1_segments0_marketingCarrier_code",
        "legs1_segments1_marketingCarrier_code",
        "legs1_segments2_marketingCarrier_code",
    ]
    # Only use the carrier columns actually present; otherwise a frame that
    # lacks one of them would fail with an opaque column-not-found error and
    # the emptiness check below could never trigger.
    available = set(df.columns)
    segment_cols = [c for c in candidate_cols if c in available]
    if not segment_cols:
        raise ValueError("No *_marketingCarrier_code columns found in DataFrame!")

    # A missing frequentFlyer value becomes an empty list so that the
    # membership test below evaluates to False rather than null.
    ff_list_expr = (
        pl.when(pl.col("frequentFlyer").is_null())
        .then(pl.lit([]))
        .otherwise(pl.col("frequentFlyer").str.split("/"))
        .alias("ff_list")
    )

    exprs = [pl.col("ff_list").list.contains(pl.col(c)) for c in segment_cols]
    # any_horizontal uses Kleene logic (null when nothing is True and at
    # least one input is null); fill it so the flag is always 0 or 1.
    is_pref_expr = (
        pl.any_horizontal(exprs)
        .fill_null(False)
        .cast(pl.Int8)
        .alias("is_preferred_airline")
    )

    # Window expressions (.over) are independent of row order, so no sorting
    # is required and the frame stays in its original (row_id) order.
    ranked = (
        df
        .with_row_index(name="row_id")
        .with_columns([ff_list_expr])
        .with_columns([is_pref_expr])
        .with_columns([
            pl.col("totalPrice").rank(method="dense").over("ranker_id").alias("price_rank"),
            (pl.col("totalPrice") - pl.col("totalPrice").min().over("ranker_id"))
            .fill_null(0.0)
            .alias("price_delta"),
            pl.col("legs0_departureAt").rank(method="dense").over("ranker_id").alias("departure_rank"),
        ])
    )

    return ranked.select([
        "row_id", "price_rank", "price_delta", "departure_rank", "is_preferred_airline"
    ])

def get_lead_time(df: pl.DataFrame) -> pl.DataFrame:
    """Return the number of days between ``requestDate`` and ``legs0_departureAt``.

    Both columns are cast to microsecond-precision datetimes before being
    subtracted. The result has one row per input row with the columns
    ``row_id`` (positional index in ``df``) and ``lead_time_days``.
    """
    departure_ts = pl.col("legs0_departureAt").cast(pl.Datetime("us"))
    request_ts = pl.col("requestDate").cast(pl.Datetime("us"))
    lead_time_expr = (departure_ts - request_ts).dt.total_days().alias("lead_time_days")

    return (
        df
        .with_row_index(name="row_id")
        .with_columns(lead_time_expr)
        .select("row_id", "lead_time_days")
    )

def get_departure_month(df: pl.DataFrame) -> pl.DataFrame:
    """Extract the month of ``legs0_departureAt`` and flag July/August trips.

    The result has one row per input row with the columns ``row_id``
    (positional index in ``df``), ``departure_month`` and ``is_summer_trip``
    (1 when the departure month is 7 or 8, else 0).
    """
    # Build the month expression once and reuse it for both features.
    month_expr = pl.col("legs0_departureAt").cast(pl.Datetime("us")).dt.month()
    feature_exprs = [
        month_expr.alias("departure_month"),
        month_expr.is_in([7, 8]).cast(pl.Int8).alias("is_summer_trip"),
    ]

    with_features = df.with_row_index(name="row_id").with_columns(feature_exprs)
    return with_features.select("row_id", "departure_month", "is_summer_trip")

In [5]:
# Read the training table, then derive the three feature frames from it
# (session ranks, lead time, departure month), each keyed by row_id.
d = pl.read_parquet(train_file)
dsr, dlt, ddm = get_session_ranks(d), get_lead_time(d), get_departure_month(d)

In [6]:
# Read the test table, then derive the same three feature frames from it
# (session ranks, lead time, departure month), each keyed by row_id.
t = pl.read_parquet(test_file)
tsr, tlt, tdm = get_session_ranks(t), get_lead_time(t), get_departure_month(t)

In [7]:
# Combine the per-row feature frames on row_id. Join output order is not
# guaranteed unless explicitly requested, so sort by row_id to keep the
# result aligned with the source table's row order.
merged_train = (
    dsr
    .join(dlt, on="row_id")
    .join(ddm, on="row_id")
    .sort("row_id")
)

merged_test = (
    tsr
    .join(tlt, on="row_id")
    .join(tdm, on="row_id")
    .sort("row_id")
)

# The joins are inner joins: fail loudly if any row was dropped or duplicated.
assert merged_train.height == dsr.height, "train feature join changed the row count"
assert merged_test.height == tsr.height, "test feature join changed the row count"

In [8]:
# Persist the engineered features under <root_dir>/data/v1, creating the
# directory first so a fresh checkout does not fail on the write.
output_dir = os.path.join(root_dir, "data", "v1")
os.makedirs(output_dir, exist_ok=True)
merged_train.write_parquet(os.path.join(output_dir, "processed_additional_features_train.parquet"))
merged_test.write_parquet(os.path.join(output_dir, "processed_additional_features_test.parquet"))