In [1]:
import sys
import os
from dotenv import load_dotenv

# Project root = parent of the current working directory (the notebook is
# expected to be launched from a sub-folder of the repo).
root_dir = os.path.abspath("..")

# Make project modules importable. The membership check keeps the cell
# idempotent: re-running it no longer piles duplicate entries onto sys.path.
if root_dir not in sys.path:
    sys.path.append(root_dir)

# Load environment variables from <root>/.env. The call is left as the last
# expression so its return value is displayed as the cell output.
# NOTE(review): a displayed `False` presumably means nothing was loaded
# (e.g. the .env file is missing) — confirm against python-dotenv docs.
dotenv_path = os.path.join(root_dir, ".env")
load_dotenv(dotenv_path)

False

In [2]:
import polars as pl

In [3]:
# Raw Kaggle inputs: <root>/kaggle/train.parquet and <root>/kaggle/test.parquet.
train_file, test_file = (
    os.path.join(root_dir, "kaggle", parquet_name)
    for parquet_name in ("train.parquet", "test.parquet")
)

In [10]:
def _coerce_number(value, cast=float, default=0.0):
    """Return ``cast(value)``, or ``default`` when ``value`` is None or NaN."""
    # NaN is the only value that compares unequal to itself. None must be
    # checked separately because ``None == None`` is True.
    if value is None or value != value:
        return default
    return cast(value)


def engineer_pricing_features(row: dict) -> dict:
    """Derive flat pricing features from one raw record.

    Args:
        row: Mapping of raw column name to value. Keys read:
            ``totalPrice``, ``taxes``, ``miniRules0_monetaryAmount``,
            ``miniRules0_percentage``, ``miniRules1_monetaryAmount``,
            ``miniRules1_percentage``, ``pricingInfo_isAccessTP`` and
            ``pricingInfo_passengerCount``. Any key may be absent, None or NaN.

    Returns:
        A dict with keys ``total_price``, ``taxes``, ``tax_ratio``,
        ``mini_rule0_penalty``, ``mini_rule0_pct``, ``mini_rule1_penalty``,
        ``mini_rule1_pct`` (floats), ``is_access_tp`` (0/1) and
        ``passenger_count`` (int). Missing, None and NaN inputs become
        0.0 / 0 instead of raising.
    """
    # Raw monetary values, with missing data collapsed to 0.0.
    total_price = _coerce_number(row.get("totalPrice"))
    taxes = _coerce_number(row.get("taxes"))

    # Share of the price that is tax; 0.0 when the price is not positive so we
    # never divide by zero.
    tax_ratio = (taxes / total_price) if total_price > 0 else 0.0

    # Mini-rule penalties and percentages.
    m0_penalty = _coerce_number(row.get("miniRules0_monetaryAmount"))
    m0_pct = _coerce_number(row.get("miniRules0_percentage"))
    m1_penalty = _coerce_number(row.get("miniRules1_monetaryAmount"))
    m1_pct = _coerce_number(row.get("miniRules1_percentage"))

    # AccessTP flag as 0/1 (any truthy value counts as set).
    is_access_tp = 1 if row.get("pricingInfo_isAccessTP") else 0

    # Passenger count as an int; missing data becomes 0.
    passenger_count = _coerce_number(
        row.get("pricingInfo_passengerCount"), cast=int, default=0
    )

    return {
        "total_price": total_price,
        "taxes": taxes,
        "tax_ratio": tax_ratio,
        "mini_rule0_penalty": m0_penalty,
        "mini_rule0_pct": m0_pct,
        "mini_rule1_penalty": m1_penalty,
        "mini_rule1_pct": m1_pct,
        "is_access_tp": is_access_tp,
        "passenger_count": passenger_count
    }

In [11]:
def process_pricing_parquet(parquet_path: str, chunk_size: int = 200_000, output_path: str = None):
    """Compute pricing features for every row of a parquet file, in batches.

    The file is scanned lazily, each row is tagged with a ``row_id`` row
    index, and rows are pulled in slices of ``chunk_size``. Every slice is
    passed row by row through ``engineer_pricing_features``.

    Args:
        parquet_path: Path of the parquet file to read.
        chunk_size: Number of rows materialised per batch; must be positive.
        output_path: Optional path prefix. When given, each batch is written
            to ``f"{output_path}_pricing_part_{i}.parquet"`` and nothing is
            kept in memory. When omitted (None or empty), the batches are
            concatenated and returned.

    Returns:
        A ``pl.DataFrame`` holding the feature columns plus ``row_id`` when
        ``output_path`` is not given (zero rows if the input is empty);
        otherwise ``None``.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    # A zero step makes range() fail with an unrelated message and a negative
    # step silently skips every row, so reject both up front.
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

    # Only the columns engineer_pricing_features actually reads are projected,
    # which keeps unused columns off the disk read path.
    needed_cols = [
        "miniRules0_monetaryAmount", "miniRules0_percentage",
        "miniRules1_monetaryAmount", "miniRules1_percentage",
        "pricingInfo_isAccessTP", "pricingInfo_passengerCount",
        "taxes", "totalPrice"
    ]

    scan = (
        pl.scan_parquet(parquet_path)
        .select(needed_cols)
        .with_row_index(name="row_id")
    )

    row_count = scan.select(pl.len()).collect(engine="streaming")[0, 0]
    all_chunks = []

    for start in range(0, row_count, chunk_size):
        batch_df = scan.slice(start, chunk_size).collect(engine="streaming")
        feats = [engineer_pricing_features(row) for row in batch_df.iter_rows(named=True)]
        # Carry the source row index so features can be joined back later.
        feats_df = pl.DataFrame(feats).with_columns(pl.Series("row_id", batch_df["row_id"]))

        if output_path:
            feats_df.write_parquet(f"{output_path}_pricing_part_{start // chunk_size}.parquet")
        else:
            all_chunks.append(feats_df)

    if output_path:
        # Batches were streamed to disk; there is nothing to return.
        return None

    if not all_chunks:
        # pl.concat rejects an empty list. For an empty input, build a
        # zero-row frame with the usual columns: derive the feature schema
        # from one all-zero row, then attach the (empty) row_id column.
        template_row = engineer_pricing_features(dict.fromkeys(needed_cols, 0))
        empty_ids = scan.slice(0, 0).collect()["row_id"]
        return pl.DataFrame([template_row]).clear().with_columns(empty_ids)

    return pl.concat(all_chunks)

In [12]:
# Build in-memory pricing features for both splits, reading 100k rows per batch.
pricing_train = process_pricing_parquet(train_file, chunk_size=100_000)
pricing_test = process_pricing_parquet(test_file, chunk_size=100_000)

In [13]:
# Persist the engineered features under <root>/data. Create the directory
# first so a missing folder does not fail the write after the expensive
# feature computation has already finished.
processed_dir = os.path.join(root_dir, "data")
os.makedirs(processed_dir, exist_ok=True)

pricing_train.write_parquet(os.path.join(processed_dir, "processed_pricing_features_train.parquet"))
pricing_test.write_parquet(os.path.join(processed_dir, "processed_pricing_features_test.parquet"))