In [1]:
import sys
import os
from dotenv import load_dotenv

# Resolve the project root as the parent of the current working directory;
# every data path below is built from it.
root_dir = os.path.abspath("..")
# Guard the append so re-running this cell does not pile up duplicate
# sys.path entries.
if root_dir not in sys.path:
    sys.path.append(root_dir)
dotenv_path = os.path.join(root_dir, ".env")
# Kept as the last expression so the cell displays load_dotenv's return value.
load_dotenv(dotenv_path)

True

In [2]:
import polars as pl

In [3]:
# Locations of the processed TRAIN feature tables (one parquet file per feature group).
_TRAIN_FEATURE_DIR = os.path.join(root_dir, "data", "v1")

FILE_FLIGHT_TRAIN = os.path.join(
    _TRAIN_FEATURE_DIR, "processed_flight_features_train.parquet"
)
FILE_PRICE_TRAIN = os.path.join(
    _TRAIN_FEATURE_DIR, "processed_pricing_features_train.parquet"
)
FILE_USER_TRAIN = os.path.join(
    _TRAIN_FEATURE_DIR, "processed_user_features_train.parquet"
)
FILE_ADDITIONAL_TRAIN = os.path.join(
    _TRAIN_FEATURE_DIR, "processed_additional_features_train.parquet"
)

In [4]:
# Locations of the processed TEST feature tables (one parquet file per feature group).
_TEST_FEATURE_DIR = os.path.join(root_dir, "data", "v1")

FILE_FLIGHT_TEST = os.path.join(
    _TEST_FEATURE_DIR, "processed_flight_features_test.parquet"
)
FILE_PRICE_TEST = os.path.join(
    _TEST_FEATURE_DIR, "processed_pricing_features_test.parquet"
)
FILE_USER_TEST = os.path.join(
    _TEST_FEATURE_DIR, "processed_user_features_test.parquet"
)
FILE_ADDITIONAL_TEST = os.path.join(
    _TEST_FEATURE_DIR, "processed_additional_features_test.parquet"
)

In [5]:
# Load the four TRAIN feature tables eagerly, in a fixed order
# (flight, price, user, additional), then report their shapes.
print("[INFO] Reading TRAIN parquet files...")
train_flight, train_price, train_user, train_add = (
    pl.read_parquet(parquet_path)
    for parquet_path in (
        FILE_FLIGHT_TRAIN,
        FILE_PRICE_TRAIN,
        FILE_USER_TRAIN,
        FILE_ADDITIONAL_TRAIN,
    )
)

print("[INFO] Shapes:")
for table_label, table in (
    ("flight:", train_flight),
    ("price :", train_price),
    ("user  :", train_user),
    ("add   :", train_add),
):
    print(table_label, table.shape)

# Same procedure for the TEST feature tables.
print("[INFO] Reading TEST parquet files...")
test_flight, test_price, test_user, test_add = (
    pl.read_parquet(parquet_path)
    for parquet_path in (
        FILE_FLIGHT_TEST,
        FILE_PRICE_TEST,
        FILE_USER_TEST,
        FILE_ADDITIONAL_TEST,
    )
)

print("[INFO] Shapes:")
for table_label, table in (
    ("flight:", test_flight),
    ("price :", test_price),
    ("user  :", test_user),
    ("add   :", test_add),
):
    print(table_label, table.shape)

[INFO] Reading TRAIN parquet files...
[INFO] Shapes:
flight: (18145372, 53)
price : (18145372, 10)
user  : (18145372, 10)
add   : (18145372, 8)
[INFO] Reading TEST parquet files...
[INFO] Shapes:
flight: (6897776, 53)
price : (6897776, 10)
user  : (6897776, 10)
add   : (6897776, 8)


In [6]:
# Merge the four feature tables of each split into one wide frame keyed on
# row_id (polars' default inner join).
#
# validate="1:1" makes polars raise if row_id is duplicated on either side of
# a join, instead of silently multiplying rows in the merged frame.
train_df = (
    train_flight
    .join(train_price, on="row_id", validate="1:1")
    .join(train_user, on="row_id", validate="1:1")
    .join(train_add, on="row_id", validate="1:1")
)

test_df = (
    test_flight
    .join(test_price, on="row_id", validate="1:1")
    .join(test_user, on="row_id", validate="1:1")
    .join(test_add, on="row_id", validate="1:1")
)

### Add ranker_id and selected

In [7]:
# Paths to the original train/test parquet files under <root>/kaggle.
_KAGGLE_DIR = os.path.join(root_dir, "kaggle")
train_org_file = os.path.join(_KAGGLE_DIR, "train.parquet")
test_org_file = os.path.join(_KAGGLE_DIR, "test.parquet")

In [12]:
def _column_with_row_id(lazy_source, column_name):
    """Lazily select one column and add a ``row_id`` row-index column to it."""
    return lazy_source.select(column_name).with_row_index('row_id')


# Lazy scans of the original files; nothing is read until .collect() is called.
train_org = pl.scan_parquet(train_org_file)
test_org = pl.scan_parquet(test_org_file)

# Group key per row, for both splits.
train_org_group_id = _column_with_row_id(train_org, 'ranker_id')
test_org_group_id = _column_with_row_id(test_org, 'ranker_id')

# The 'selected' column per row, train split only.
train_target = _column_with_row_id(train_org, 'selected')

In [None]:
def _add_flight_time_features(frame, group_ids):
    """Attach ``ranker_id`` and per-group flight-time rank/delta features.

    Parameters
    ----------
    frame : pl.DataFrame
        Feature frame containing ``row_id``, ``go_total_flight_time`` and
        ``rtn_total_flight_time``.
    group_ids : pl.LazyFrame
        Lazy frame with ``row_id`` and ``ranker_id``; it is collected only
        when ``frame`` does not already carry ``ranker_id``.

    Returns
    -------
    pl.DataFrame
        ``frame`` joined with ``ranker_id``, extended with, for each of the
        ``go`` and ``rtn`` legs, ``<leg>_total_flight_time_rank`` (dense rank
        within ``ranker_id``) and ``<leg>_total_flight_time_delta``
        (difference to the group's minimum, nulls filled with 0.0), and
        sorted by ``row_id``.

    Notes
    -----
    * Window expressions (``.over``) write each result back to the row it
      was computed for, so the frame does not need to be sorted before
      ranking. Only the final ``sort("row_id")`` is kept; the intermediate
      full-frame sorts were redundant work.
    * The join is skipped when ``ranker_id`` is already present, so
      re-running the cell does not create a ``ranker_id_right`` column.
    """
    if "ranker_id" not in frame.columns:
        frame = frame.join(group_ids.collect(), on="row_id")

    feature_exprs = []
    for leg in ("go", "rtn"):
        column = f"{leg}_total_flight_time"
        duration = pl.col(column)
        feature_exprs.append(
            duration.rank(method="dense").over("ranker_id").alias(f"{column}_rank")
        )
        feature_exprs.append(
            (duration - duration.min().over("ranker_id"))
            .fill_null(0.0)
            .alias(f"{column}_delta")
        )

    return frame.with_columns(feature_exprs).sort("row_id")


train_df = _add_flight_time_features(train_df, train_org_group_id)
test_df = _add_flight_time_features(test_df, test_org_group_id)

In [27]:
# Report frame shapes before writing; the target is collected to obtain its shape.
target_shape = train_target.collect().shape

print(f"train_df.shape {train_df.shape}")
print(f"train_target.shape {target_shape}")
print()
print(f"test_df.shape {test_df.shape}")

train_df.shape (18145372, 83)
train_target.shape (18145372, 2)

test_df.shape (6897776, 83)


In [29]:
# Persist the merged train/test frames and the train target under <root>/data/v1.
_output_dir = os.path.join(root_dir, "data", "v1")

train_df.write_parquet(os.path.join(_output_dir, "train.parquet"))

# train_target is lazy, so it is collected before being written.
_train_target_df = train_target.collect()
_train_target_df.write_parquet(os.path.join(_output_dir, "train_target.parquet"))

test_df.write_parquet(os.path.join(_output_dir, "test.parquet"))