In [4]:
# --- 1. Setup & Imports ---
import polars as pl
import numpy as np
import lightgbm as lgb
import xgboost as xgb
import optuna
import gc
from pathlib import Path
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from scipy.sparse import csr_matrix

# --- Notebook-wide configuration (read by the later cells) ---
RANDOM_STATE = 42      # seed handed to numpy and to every model below
N_COMPONENTS = 15      # number of TruncatedSVD components / embedding columns
OPTUNA_TRIALS = 25     # number of Optuna trials for the XGBoost search
INPUT_DIR = Path("/kaggle/input/aeroclub-recsys-2025")

# Seed numpy's legacy global RNG once, up front.
np.random.seed(RANDOM_STATE)

print("--- Ultimate Flight Recommender: Clean Paths Version ---")


--- Ultimate Flight Recommender: Clean Paths Version ---


In [5]:
def hitrate_at_3(y_true, y_pred, groups):
    """Return the share of groups whose three highest-scored rows contain a hit.

    Args:
        y_true: per-row labels; the per-group maximum over the top-3 rows is
            what gets averaged.
        y_pred: per-row scores, higher meaning "ranked earlier".
        groups: per-row group key, aligned with ``y_true`` / ``y_pred``.

    Returns:
        The mean, over groups, of the maximum label found among each group's
        three best-scored rows.
    """
    scored = pl.DataFrame({'group': groups, 'pred': y_pred, 'true': y_true})

    # Groups ascending, scores descending inside each group.
    ordered = scored.sort(["group", "pred"], descending=[False, True])

    # Keep the three best-scored rows of every group.
    top_three = ordered.group_by("group", maintain_order=True).head(3)

    # Collapse each group to the maximum label among those rows.
    best_label_per_group = top_three.group_by("group").agg(pl.col("true").max())

    return best_label_per_group.select(pl.col("true").mean()).item()

def re_rank(test: pl.DataFrame, submission: pl.DataFrame, penalty_factor=0.1):
    """Penalise duplicate itineraries inside a search and return final ranks.

    Rows of the same ``ranker_id`` that share the same departure/arrival
    timestamps and first-segment flight numbers are treated as the same
    flight. Every row is pushed down by ``penalty_factor`` times its score
    gap to the best-scored row of its flight, then rows are ranked per
    ``ranker_id`` by the adjusted score (1 = best).

    Args:
        test: frame with ``Id``, ``ranker_id`` and, optionally, the itinerary
            columns listed in ``COLS_TO_COMPARE``.
        submission: frame with ``Id``, ``ranker_id`` and ``pred_score``.
        penalty_factor: weight of the score gap to the best duplicate.

    Returns:
        A frame with ``Id``, ``ranker_id`` and ``new_selected`` (Int32 rank).
    """
    COLS_TO_COMPARE = ["legs0_departureAt", "legs0_arrivalAt", "legs1_departureAt", "legs1_arrivalAt", "legs0_segments0_flightNumber", "legs1_segments0_flightNumber"]
    key_cols = [c for c in COLS_TO_COMPARE if c in test.columns]

    # Only the join keys and the itinerary columns are needed; joining the
    # whole test frame would just duplicate every other column in memory.
    flight_keys = test.select(
        ["Id", "ranker_id"] + [pl.col(c).cast(pl.Utf8).fill_null("NULL") for c in key_cols]
    )
    df = submission.join(flight_keys, on=["Id", "ranker_id"], how="left")

    if key_cols:
        # A separator keeps distinct field tuples distinct: without one,
        # e.g. flight numbers ("12", "34") and ("123", "4") build the same key.
        flight_hash = pl.concat_str([pl.col(c) for c in key_cols], separator="|")
    else:
        # No itinerary columns available: every row of a search shares one key.
        flight_hash = pl.lit("")
    df = df.with_columns(flight_hash.alias("flight_hash"))

    df = df.with_columns(
        pl.col("pred_score").max().over(["ranker_id", "flight_hash"]).alias("max_score_same_flight")
    )
    df = df.with_columns(
        (pl.col("pred_score") - penalty_factor * (pl.col("max_score_same_flight") - pl.col("pred_score"))).alias("reorder_score")
    )
    # Ordinal ranking gives every row of a search a distinct position.
    df = df.with_columns(
        pl.col("reorder_score").rank(method="ordinal", descending=True).over("ranker_id").cast(pl.Int32).alias("new_selected")
    )
    return df.select(["Id", "ranker_id", "new_selected"])


In [None]:
# Load both splits; train rows come first in the stacked frame, and
# `train_height` is what later cells use to separate them again.
train_raw = pl.read_parquet(INPUT_DIR / 'train.parquet')
test_raw = pl.read_parquet(INPUT_DIR / 'test.parquet')
train_height = train_raw.height

# Test gets a placeholder `selected` column so both frames share one schema.
# A vertical concat lines columns up by position, so reorder test to train's
# column order explicitly instead of relying on where the placeholder lands.
test_aligned = (
    test_raw
    .with_columns(pl.lit(0, dtype=pl.Int64).alias("selected"))
    .select(train_raw.columns)
)

data = pl.concat([train_raw, test_aligned], how="vertical_relaxed")
assert data.height == train_height + test_aligned.height, "Stacked frame lost rows"

# The split frames are no longer needed once stacked; train_raw is kept
# because later cells read it.
del test_raw, test_aligned
gc.collect()

# Duration parsing
def dur_to_min(col):
    """Build an expression converting a duration string column to whole minutes.

    The regexes read an optional leading ``<digits>.`` as days, the digits
    before the first ``:`` as hours and the digits between two ``:`` as
    minutes; anything after that is ignored. Each part that is missing (or the
    whole value being null) contributes 0, so the result is never null.

    Args:
        col: a polars string expression.

    Returns:
        An Int64 polars expression holding ``days*1440 + hours*60 + minutes``.
    """
    def _int_part(expr, pattern):
        # First capture group as Int64, 0 when the pattern does not match.
        return expr.str.extract(pattern, 1).cast(pl.Int64).fill_null(0)

    has_day_prefix = col.str.contains(r"^\d+\.")
    # Drop the "<days>." prefix (when present) before reading hours/minutes.
    clock = pl.when(has_day_prefix).then(col.str.replace(r"^\d+\.", "")).otherwise(col)

    day_minutes = _int_part(col, r"^(\d+)\.") * 1440
    hour_minutes = _int_part(clock, r"^(\d+):") * 60
    plain_minutes = _int_part(clock, r":(\d+):")

    total = day_minutes + hour_minutes + plain_minutes
    return total.fill_null(0)

df = data.clone()

# Count populated segment slots BEFORE parsing durations: dur_to_min() maps a
# missing duration to 0, so an is_not_null() check made afterwards would count
# all four slots for every row (making total_segments / is_min_segments constant).
df = df.with_columns([
    pl.sum_horizontal(
        [pl.col(f"legs{leg}_segments{i}_duration").is_not_null() for i in range(4)]
    ).alias(f"n_segments_leg{leg}")
    for leg in (0, 1)
])

# Parse every string-typed duration column into minutes, in place.
dur_cols = [c for c in df.columns if "duration" in c and df[c].dtype == pl.Utf8]
df = df.with_columns([dur_to_min(pl.col(c)).alias(c) for c in dur_cols])

# Stage 1: base columns. Expressions inside a single with_columns() call are
# evaluated against the incoming frame and cannot see each other's outputs, so
# anything that is reused below has to be materialised first.
df = df.with_columns([
    (pl.col("legs0_duration").fill_null(0) + pl.col("legs1_duration").fill_null(0)).alias("total_duration"),
    pl.col("Id").count().over("ranker_id").alias("group_size"),
    (pl.col("n_segments_leg0") + pl.col("n_segments_leg1")).alias("total_segments"),
])

# Stage 2: within-search comparisons built on the stage-1 columns.
df = df.with_columns([
    (pl.col("totalPrice").rank("average").over("ranker_id") / pl.col("group_size")).alias("price_pct_rank"),
    (pl.col("totalPrice") == pl.col("totalPrice").min().over("ranker_id")).alias("is_cheapest"),
    (pl.col("total_duration") == pl.col("total_duration").min().over("ranker_id")).alias("is_fastest"),
    (pl.col("total_segments") == pl.col("total_segments").min().over("ranker_id")).alias("is_min_segments"),
])

# Carrier popularity: mean of `selected` per first-segment marketing carrier,
# computed on train only and attached to every row.
# NOTE(review): train rows receive a statistic derived from their own labels
# (in-sample target encoding) — consider an out-of-fold variant.
# NOTE(review): later cells slice train/test by position (head/tail), which
# assumes this left join keeps the left frame's row order — confirm for the
# installed polars version.
rows_before_join = df.height
df = df.join(
    train_raw.group_by('legs0_segments0_marketingCarrier_code').agg(pl.mean('selected').alias('carrier0_pop')),
    on='legs0_segments0_marketingCarrier_code', how='left'
)
assert df.height == rows_before_join, "carrier0_pop join changed the row count"


In [None]:
# --- Profile x carrier embeddings via truncated SVD (fitted on train only) ---
USER_KEY = "profileId"
ITEM_KEY = "legs0_segments0_marketingCarrier_code"

# Give every distinct profile / carrier a dense integer code.
svd_source_df = train_raw.select([USER_KEY, ITEM_KEY])
user_map = svd_source_df.select(USER_KEY).unique().with_row_index("user_code")
item_map = svd_source_df.select(ITEM_KEY).unique().with_row_index("item_code")
svd_source_df = svd_source_df.join(user_map, on=USER_KEY)
svd_source_df = svd_source_df.join(item_map, on=ITEM_KEY)

# One unit per (profile, carrier) row; repeated pairs are summed by the sparse
# constructor, so each cell holds a co-occurrence count.
row_codes = svd_source_df['user_code']
col_codes = svd_source_df['item_code']
cell_values = np.ones(len(svd_source_df))
sparse_matrix = csr_matrix(
    (cell_values, (row_codes, col_codes)),
    shape=(len(user_map), len(item_map)),
)

svd = TruncatedSVD(n_components=N_COMPONENTS, random_state=RANDOM_STATE)
user_embeddings = svd.fit_transform(sparse_matrix)   # one row per user_code
item_embeddings = svd.components_.T                  # one row per item_code

# Embedding row i belongs to code i, i.e. to row i of the matching map, so the
# key column can be attached positionally.
user_svd_cols = [f"user_svd_{i}" for i in range(N_COMPONENTS)]
item_svd_cols = [f"item_svd_{i}" for i in range(N_COMPONENTS)]
user_svd_df = pl.DataFrame(user_embeddings, schema=user_svd_cols).with_columns(user_map.get_column(USER_KEY))
item_svd_df = pl.DataFrame(item_embeddings, schema=item_svd_cols).with_columns(item_map.get_column(ITEM_KEY))

# Attach both embeddings; keys unseen in train get nulls, which the final
# fill_null(0) then replaces.
df = df.join(user_svd_df, on=USER_KEY, how="left")
df = df.join(item_svd_df, on=ITEM_KEY, how="left")
data = df.fill_null(0)


In [None]:
# --- Step 5: Model Training & Submission ---
print("\n📦 Preparing data for modeling...")

# Boolean columns (e.g. is_cheapest / is_fastest / is_min_segments) are not
# "numeric" for polars and would be dropped by the dtype filter below, so cast
# them to small integers before selecting features.
model_data = data.with_columns(pl.col(pl.Boolean).cast(pl.Int8))

# Train rows were stacked first, so the split is purely positional.
train_df = model_data.head(train_height)
test_df = model_data.tail(model_data.height - train_height)

NON_FEATURE_COLS = [
    'Id', 'ranker_id', 'selected', 'profileId', 'requestDate',
    'legs0_departureAt', 'legs0_arrivalAt', 'legs1_departureAt', 'legs1_arrivalAt'
]
feature_cols = [
    col for col in model_data.columns
    if col not in NON_FEATURE_COLS and model_data[col].dtype.is_numeric()
]

# -------------------------------
# Optuna Hyperparameter Tuning (XGBoost)
# -------------------------------
print("\n🎯 Starting Optuna tuning for XGBoost...")

# XGBoost's `group` argument describes consecutive runs of rows, so every
# ranker_id must be contiguous. Make that explicit while keeping groups in
# their order of first appearance (so the positional hold-out stays the tail).
tuning_df = (
    train_df
    .with_row_index("_row")
    .with_columns(pl.col("_row").min().over("ranker_id").alias("_group_first_row"))
    .sort(["_group_first_row", "_row"])
)

X_all = tuning_df.select(feature_cols)
y_all = tuning_df.select('selected')
groups_all = tuning_df.select('ranker_id')

# Group sizes in layout order; with contiguous groups this matches the rows.
group_sizes_all = groups_all.group_by('ranker_id', maintain_order=True).agg(pl.len())['len'].to_numpy()
assert len(group_sizes_all) >= 2, "Need at least two ranker_id groups to build a validation split"

# Hold out roughly the last 20% of rows, but move the cut to a group boundary
# so that no ranker_id is shared between the training and validation parts.
group_ends = np.cumsum(group_sizes_all)
n_groups_tr = int(np.searchsorted(group_ends, int(0.8 * train_height), side="right"))
n_groups_tr = min(max(n_groups_tr, 1), len(group_sizes_all) - 1)
split_point = int(group_ends[n_groups_tr - 1])

X_tr, X_val = X_all[:split_point], X_all[split_point:]
y_tr, y_val = y_all[:split_point], y_all[split_point:]
groups_tr = groups_all[:split_point]
groups_val = groups_all[split_point:]

group_sizes_tr = group_sizes_all[:n_groups_tr]
group_sizes_val = group_sizes_all[n_groups_tr:]
assert int(group_sizes_tr.sum()) == X_tr.height and int(group_sizes_val.sum()) == X_val.height

dtrain = xgb.DMatrix(X_tr.to_pandas(), label=y_tr.to_pandas(), group=group_sizes_tr)
dval = xgb.DMatrix(X_val.to_pandas(), label=y_val.to_pandas(), group=group_sizes_val)

def objective(trial):
    """Optuna objective: train a pairwise XGBoost ranker and score it on `dval`.

    Samples learning rate, depth, row/column subsampling and L1/L2
    regularisation, trains on the module-level `dtrain` with early stopping on
    `dval`, and records the number of boosting rounds of the best iteration in
    the trial's user attributes under ``'best_iteration'`` so the final model
    can reuse it.

    Args:
        trial: the `optuna` trial used to sample hyperparameters.

    Returns:
        The best validation score (``ndcg@3``) reached during training.
    """
    params = {
        'objective': 'rank:pairwise',
        'eval_metric': 'ndcg@3',
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1),
        'max_depth': trial.suggest_int('max_depth', 6, 12),
        'subsample': trial.suggest_float('subsample', 0.6, 0.95),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 0.9),
        'lambda': trial.suggest_float('lambda', 1e-2, 10.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-2, 10.0, log=True),
        'seed': RANDOM_STATE
    }
    model = xgb.train(
        params, dtrain, num_boost_round=1000,
        evals=[(dval, "val")], early_stopping_rounds=50,
        verbose_eval=False
    )
    # Without this the study never learns how many rounds the best trial
    # needed and the caller silently falls back to its default. The value is
    # consumed as a round count, and best_iteration is 0-based, hence the +1.
    trial.set_user_attr('best_iteration', int(model.best_iteration) + 1)
    return model.best_score

# Seed the sampler so the search proposes the same trials on every run.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=RANDOM_STATE),
)
study.optimize(objective, n_trials=OPTUNA_TRIALS)

best_xgb_params = study.best_params

# Round count recorded by the best trial; fall back to 300 (loudly) when the
# objective did not record one.
tuned_rounds = study.best_trial.user_attrs.get('best_iteration')
if tuned_rounds is None:
    print("⚠️ Best trial recorded no 'best_iteration'; falling back to 300 rounds.")
    tuned_rounds = 300
# Train ~20% longer on the full data, and never request zero rounds.
best_iteration = max(1, int(tuned_rounds * 1.2))
print(f"🔧 Best XGBoost NDCG@3: {study.best_value:.5f}")

In [None]:
print("\n🚀 Training final models...")

# Full training
# Both rankers read `group` as sizes of consecutive row runs, so lay the rows
# out with every ranker_id contiguous (groups kept in order of first
# appearance) and derive the sizes from that same layout.
train_grouped = (
    train_df
    .with_row_index("_row")
    .with_columns(pl.col("_row").min().over("ranker_id").alias("_group_first_row"))
    .sort(["_group_first_row", "_row"])
)
X_train = train_grouped.select(feature_cols).to_pandas()
y_train = train_grouped.select('selected').to_pandas()
group_sizes_train = train_grouped.group_by('ranker_id', maintain_order=True).agg(pl.len())['len'].to_numpy()
assert int(group_sizes_train.sum()) == len(X_train), "Group sizes do not cover the training rows"

# Test rows keep their original order so predictions line up with test_df.
X_test = test_df.select(feature_cols).to_pandas()

# Scale for LR (fit on train only, then applied to test)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# --- Model 1: XGBoost ---
dtrain_full = xgb.DMatrix(X_train, label=y_train, group=group_sizes_train)
dtest = xgb.DMatrix(X_test)

# Tuned hyperparameters come last so they override nothing but add to the
# fixed objective / metric / seed.
final_xgb = xgb.train(
    {'objective': 'rank:pairwise', 'eval_metric': 'ndcg@3', 'seed': RANDOM_STATE, **best_xgb_params},
    dtrain_full,
    num_boost_round=best_iteration
)
preds_xgb = final_xgb.predict(dtest)

# --- Model 2: LightGBM ---
lgb_train = lgb.Dataset(X_train, label=y_train, group=group_sizes_train)
lgb_params = {
    'objective': 'lambdarank',
    'metric': 'ndcg',
    'eval_at': [3],
    'verbose': -1,
    'seed': RANDOM_STATE
}
final_lgbm = lgb.train(lgb_params, lgb_train, num_boost_round=1200)
preds_lgbm = final_lgbm.predict(X_test)

# --- Model 3: Logistic Regression ---
# Pointwise classifier on the scaled features; its positive-class probability
# is used as a score.
lr_model = LogisticRegression(random_state=RANDOM_STATE, class_weight='balanced', max_iter=1000, C=0.01)
lr_model.fit(X_train_scaled, y_train.values.ravel())
preds_lr = lr_model.predict_proba(X_test_scaled)[:, 1]


In [None]:
print("\n📤 Creating submission...")

# Ensemble
# The three models score on different scales (two ranker margins and one
# probability), so a weighted sum of raw outputs would not reflect the intended
# weights. Convert each model's scores to within-search percentile ranks
# (higher = better) and blend those instead.
MODEL_WEIGHTS = {"xgb": 0.45, "lgbm": 0.45, "lr": 0.10}
score_frame = test_df.select(['Id', 'ranker_id']).with_columns([
    pl.Series('xgb', preds_xgb),
    pl.Series('lgbm', preds_lgbm),
    pl.Series('lr', preds_lr),
])
score_frame = score_frame.with_columns([
    (pl.col(name).rank('average').over('ranker_id') / pl.len().over('ranker_id')).alias(name)
    for name in MODEL_WEIGHTS
])
sub_df = score_frame.select([
    'Id',
    'ranker_id',
    pl.sum_horizontal([pl.col(name) * weight for name, weight in MODEL_WEIGHTS.items()]).alias('pred_score'),
])
ensemble_preds = sub_df['pred_score'].to_numpy()

# Re-rank and output
# The raw test file is read again because re_rank needs the original
# itinerary columns.
raw_test_df = pl.read_parquet(INPUT_DIR / 'test.parquet')
final_submission = re_rank(raw_test_df, sub_df).rename({"new_selected": "selected"})

# Check and save
assert final_submission.shape[0] == raw_test_df.shape[0], "Row count mismatch!"
assert final_submission['selected'].null_count() == 0, "Submission contains missing ranks!"
final_submission.write_csv("submission.csv")
print("✅ Final 'submission.csv' generated!")
