In [1]:
import sys
import os
from dotenv import load_dotenv

# Project root: the parent of the current working directory. It is used both
# as an import root and as the location of the shared .env file.
root_dir = os.path.abspath("..")
# Guard the append so re-running this cell does not keep adding duplicate
# entries to sys.path.
if root_dir not in sys.path:
    sys.path.append(root_dir)
dotenv_path = os.path.join(root_dir, ".env")
# Kept as the last expression so the cell still displays load_dotenv's return value.
load_dotenv(dotenv_path)

False

In [None]:
import polars as pl

In [3]:
# Input parquet files for the two splits, both located under <root_dir>/kaggle.
train_file, test_file = (
    os.path.join(root_dir, "kaggle", filename)
    for filename in ("train.parquet", "test.parquet")
)

In [4]:
def _is_missing(value) -> bool:
    """Return True for ``None`` and for float NaN (the only value unequal to itself)."""
    return value is None or value != value


def engineer_user_features(row: dict) -> dict:
    """Derive user-level features from a single raw record.

    Args:
        row: Mapping of column name to value for one record. Columns are read
            with ``.get`` so absent keys are tolerated. Missing values may be
            represented either as ``None`` or as float NaN.

    Returns:
        A dict of integer features:

        - ``is_self_booking``, ``is_vip``, ``is_access_3d``: 1 when the
          corresponding ``bySelf`` / ``isVip`` / ``isAccess3D`` value is
          truthy, else 0.
        - ``has_corporate_tariff``: 1 when ``corporateTariffCode`` is neither
          missing (``None`` / NaN) nor the empty string, else 0.
        - ``frequent_flyer_count``: number of distinct non-empty programme
          codes in the ``/``-separated ``frequentFlyer`` string.
        - ``any_segment_in_ff`` / ``count_segments_in_ff``: whether / how many
          of the 2 x 4 leg-segment slots have a marketing or operating carrier
          code that appears among the frequent-flyer codes.
        - ``nationality_code``: ``int(nationality)``, or -1 when missing.
        - ``is_male``: 1 if ``sex`` is ``True``, 0 if ``False``, -1 otherwise.

    Raises:
        ValueError: If ``nationality`` is present but not convertible by ``int``.
    """
    # Simple booleans
    is_self_booking = 1 if row.get("bySelf") else 0
    is_vip = 1 if row.get("isVip") else 0
    is_access_3d = 1 if row.get("isAccess3D") else 0

    # Corporate info. NaN must be tested explicitly: a tuple membership test
    # such as `x in (..., float("nan"))` never matches a NaN value because NaN
    # is unequal to itself.
    tariff = row.get("corporateTariffCode")
    has_corporate_tariff = 0 if (_is_missing(tariff) or tariff == "") else 1

    # Frequent flyer programs. Only a string can carry codes; None/NaN (or any
    # other non-string) yields no programmes. Empty tokens produced by leading,
    # trailing or doubled "/" are dropped so they are not counted as programmes.
    ff_raw = row.get("frequentFlyer")
    if isinstance(ff_raw, str):
        ff_set = {code.strip() for code in ff_raw.split("/")}
        ff_set.discard("")
    else:
        ff_set = set()
    frequent_flyer_count = len(ff_set)

    # --- Frequent flyer match with flight carriers ---
    # One flag per (leg, segment) slot: True when either carrier code of that
    # slot is one of the traveller's frequent-flyer codes.
    flags = []
    for leg in range(2):
        for seg in range(4):
            mkt_val = row.get(f"legs{leg}_segments{seg}_marketingCarrier_code")
            op_val = row.get(f"legs{leg}_segments{seg}_operatingCarrier_code")
            flags.append(
                bool((mkt_val and mkt_val in ff_set) or (op_val and op_val in ff_set))
            )
    any_segment_in_ff = 1 if any(flags) else 0
    count_segments_in_ff = sum(flags)

    # Nationality. None has to be handled as well as NaN: `None == None` is
    # True, so a self-equality check alone would let int(None) raise TypeError.
    nat_raw = row.get("nationality")
    nat_code = -1 if _is_missing(nat_raw) else int(nat_raw)

    # Sex: identity checks keep the three states (True / False / anything else) apart.
    sex_val = row.get("sex")
    is_male = 1 if sex_val is True else (0 if sex_val is False else -1)

    return {
        "is_self_booking": is_self_booking,
        "is_vip": is_vip,
        "is_access_3d": is_access_3d,
        "has_corporate_tariff": has_corporate_tariff,
        "frequent_flyer_count": frequent_flyer_count,
        "any_segment_in_ff": any_segment_in_ff,
        "count_segments_in_ff": count_segments_in_ff,
        "nationality_code": nat_code,
        "is_male": is_male
    }



In [5]:
def process_user_parquet(parquet_path: str, chunk_size: int = 200_000, output_path: str = None):
    """Compute user features for a parquet file, one slice of rows at a time.

    The file is scanned lazily, restricted to the columns selected below, and
    given a ``row_id`` row index. Each slice of at most ``chunk_size`` rows is
    collected, passed row by row through ``engineer_user_features``, and the
    resulting feature frame gets that slice's ``row_id`` values attached.

    Args:
        parquet_path: Path of the parquet file to read.
        chunk_size: Maximum number of rows collected per slice.
        output_path: Optional path prefix. When truthy, every slice's features
            are written to their own ``<output_path>_user_part_<n>.parquet``
            file (``n`` is the 0-based slice number) and nothing is kept in
            memory.

    Returns:
        The concatenation of all per-slice feature frames when ``output_path``
        is falsy; ``None`` otherwise.
    """
    # Carrier-code columns (2 legs x 4 segments, marketing then operating)
    # needed for the frequent-flyer match.
    flight_cols = []
    for leg in range(2):
        for seg in range(4):
            for col in ["marketingCarrier_code", "operatingCarrier_code"]:
                flight_cols.append(f"legs{leg}_segments{seg}_{col}")

    user_cols = [
        "bySelf", "isVip", "isAccess3D",
        "companyID", "corporateTariffCode",
        "frequentFlyer", "nationality", "sex",
    ]

    lazy_rows = pl.scan_parquet(parquet_path).select(user_cols + flight_cols)
    lazy_rows = lazy_rows.with_row_index(name="row_id")

    # Total number of rows, used to lay out the slice offsets.
    total_rows = lazy_rows.select(pl.len()).collect(engine="streaming")[0, 0]

    in_memory_parts = []
    for start in range(0, total_rows, chunk_size):
        chunk = lazy_rows.slice(start, chunk_size).collect(engine="streaming")
        records = list(map(engineer_user_features, chunk.iter_rows(named=True)))
        row_ids = pl.Series("row_id", chunk["row_id"])
        part = pl.DataFrame(records).with_columns(row_ids)

        if not output_path:
            in_memory_parts.append(part)
            continue
        part.write_parquet(f"{output_path}_user_part_{start // chunk_size}.parquet")

    if output_path:
        # Everything was written to disk; there is nothing to hand back.
        return None
    return pl.concat(in_memory_parts)

In [6]:
# Build the user feature tables for both splits, 100k rows per slice, keeping
# the results in memory (no output_path is given).
user_train = process_user_parquet(train_file, chunk_size=100_000)
user_test = process_user_parquet(test_file, chunk_size=100_000)

In [7]:
# Persist both feature tables under <root_dir>/data. The directory is created
# first so that a missing folder cannot fail the write after the expensive
# feature computation has already run.
os.makedirs(os.path.join(root_dir, "data"), exist_ok=True)
user_train.write_parquet(os.path.join(root_dir, "data", "processed_user_features_train.parquet"))
user_test.write_parquet(os.path.join(root_dir, "data", "processed_user_features_test.parquet"))