In [1]:
import sys
import os
from dotenv import load_dotenv

# Resolve the parent of the current working directory and append it to
# sys.path.
root_dir = os.path.abspath(os.pardir)
sys.path.append(root_dir)

# Load environment variables from the .env file that sits in that directory.
# The call is kept as the cell's last expression, so its return value is
# what the cell displays.
dotenv_path = os.path.join(root_dir, ".env")
load_dotenv(dotenv_path)

True

In [2]:
import polars as pl
import numpy as np

### Final prep

* Join features and target
* Split them into 10 (near-)equal-sized splits by `ranker_id`, so that all rows sharing a `ranker_id` stay in the same split
  * Use 8 to train
  * Use 2 to validate


In [3]:
# Name of the target column.
TARGET_COL = "selected"

# Raw training file under <root>/kaggle.
SESSION_ID = os.path.join(root_dir, "kaggle", "train.parquet")

# Processed feature / target tables under <root>/data.
_data_dir = os.path.join(root_dir, "data")
TRAIN_FEATURES = os.path.join(_data_dir, "processed_train_features.parquet")
TRAIN_TARGETS = os.path.join(_data_dir, "processed_train_targets.parquet")

In [None]:
# Only the `ranker_id` column of the raw training file is used, so ask the
# parquet reader for just that column instead of loading every column and
# discarding the rest afterwards. The positional `row_id` is the join key
# used further down.
session_id = pl.read_parquet(SESSION_ID, columns=["ranker_id"]).with_row_index("row_id")

# The features are joined on `row_id` later, so the processed file is
# presumably stored with that column already present -- TODO confirm.
feat = pl.read_parquet(TRAIN_FEATURES)

# Targets receive a positional `row_id`; this assumes the targets file keeps
# the same row order as the raw training file -- TODO confirm.
targ = pl.read_parquet(TRAIN_TARGETS).with_row_index("row_id")

In [8]:
# Distinct ranker ids in first-appearance order. `unique()` gives no ordering
# guarantee unless `maintain_order=True`, and the seeded shuffle applied to
# this list afterwards is only reproducible if it starts from the same
# ordering on every run.
ranker_ids = (
    session_id.select("ranker_id")
    .unique(maintain_order=True)
    .to_series()
    .to_list()
)
total_ranker_ids = len(ranker_ids)
print("[INFO] Total ranker ids:", total_ranker_ids)

[INFO] Total ranker ids: 105539


In [11]:
# Shuffle a *copy* of the id list. `np.random.shuffle` works in place, so
# shuffling `ranker_ids` directly would compound on every re-run of this cell
# (the seed is reset, but the input order is not) and the split membership
# would depend on how many times the cell had been executed.
np.random.seed(42)
shuffled_ranker_ids = list(ranker_ids)
np.random.shuffle(shuffled_ranker_ids)

# Ten chunks; `array_split` allows the chunk sizes to differ when the total
# is not divisible by 10.
splits = np.array_split(shuffled_ranker_ids, 10)
print("[INFO] Each split sizes:")
for i, s in enumerate(splits):
    print(f"  Split {i}: {len(s)} ranker_ids")

[INFO] Each split sizes:
  Split 0: 10554 ranker_ids
  Split 1: 10554 ranker_ids
  Split 2: 10554 ranker_ids
  Split 3: 10554 ranker_ids
  Split 4: 10554 ranker_ids
  Split 5: 10554 ranker_ids
  Split 6: 10554 ranker_ids
  Split 7: 10554 ranker_ids
  Split 8: 10554 ranker_ids
  Split 9: 10553 ranker_ids


In [15]:
# Attach the targets to the features; rows are matched on `row_id`.
feat_with_target = feat.join(targ, on="row_id", how="inner")

split_dfs = []
for split_idx, chunk_ids in enumerate(splits):
    # Rows of the session table whose ranker_id belongs to this split.
    chunk_rows = session_id.filter(pl.col("ranker_id").is_in(set(chunk_ids)))
    # Keep only the feature/target rows with a matching `row_id`; the inner
    # join also brings the `ranker_id` column along.
    split_df = feat_with_target.join(chunk_rows, on="row_id", how="inner")
    split_dfs.append(split_df)
    print(f"[INFO] Split {split_idx}")

[INFO] Split 0
[INFO] Split 1
[INFO] Split 2
[INFO] Split 3
[INFO] Split 4
[INFO] Split 5
[INFO] Split 6
[INFO] Split 7
[INFO] Split 8
[INFO] Split 9


In [16]:
# Persist every split as <root>/data/train/train_split_<i>.parquet.
# The output directory is created up front: writing into a directory that
# does not exist yet (e.g. on a fresh checkout) would otherwise fail.
train_dir = os.path.join(root_dir, "data", "train")
os.makedirs(train_dir, exist_ok=True)
for i, df in enumerate(split_dfs):
    df.write_parquet(os.path.join(train_dir, f"train_split_{i}.parquet"))