In [None]:
# %%
import pandas as pd
import numpy as np
from sklearn.metrics import root_mean_squared_error as rmse
from sklearn.model_selection import KFold
from lightgbm import LGBMRegressor, early_stopping

# Feature columns taken from the base training table. The list order is the
# column order of the model matrix, so keep it stable between runs.
default_cols = [
    "seg_duration",
    "aircraft_type",
    "full_flight_dist",
    "altitude_mean",
    "track_mean",
    "track_std",
    "vertical_rate_mean",
    "phase",
    "seg_dist",
    "vertical_rate_std",
    "groundspeed_mean",
    "groundspeed_std",
    "mach_mean",
    "mach_std",
    "TAS_mean",
    "TAS_std",
    "CAS_mean",
    "CAS_std",
    "vertical_rate_min",
    "vertical_rate_max",
    "m_tow",
    "oew",
    "mass_est_mean",
    "ff_kgs_est_mean",
    "ff_kgs_est_std",
    "mass_est_std",
    "tow_est_kg",
    # Candidate features currently left out:
    #                 'tau_s', 'tau_e',
    #                 'cumdist_min', 'cumdist_max',
]

# Run the cross-validation cell before fitting the final model.
try_kfold = True
# Append the "vrate" columns of the alt/trajectory table to the feature set.
# Set to False to train on default_cols only.
use_traj_features = True

# -------------------------------------------------
# Load data
# -------------------------------------------------
df_features_alt = pd.read_parquet("data/fuel_train_with_alt_10parts.parquet")
df_features_train = pd.read_parquet("data/df_train_best_v0.parquet")
# Disabled row filter, kept for reference:
# df_features_train = df_features_train[(df_features_train['tau_e'] <= 1) & (df_features_train['tau_s'] >= 0)]
df_features_train = df_features_train[default_cols + ["idx", "ff_kgs", "fuel_kg"]]

# Trajectory features are selected by name: every column containing "vrate".
traj_cols = (
    [c for c in df_features_alt.columns if "vrate" in c] if use_traj_features else []
)

feature_cols = default_cols + traj_cols

## Merge with traj features
# Left join on "idx" keeps every training row even when the alt table has no
# matching entry (its trajectory columns are then NaN).
df_features_train = df_features_train.merge(
    df_features_alt[["idx"] + traj_cols], on="idx", how="left"
)

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [None]:
# Keep only rows whose target ff_kgs lies strictly inside (0.05, 6.5);
# rows on or outside either bound are dropped.
df_features_train = df_features_train.loc[
    lambda frame: (frame["ff_kgs"] > 0.05) & (frame["ff_kgs"] < 6.5)
]

In [None]:
# Rank (test) data used for prediction after CV: same column selection and
# trajectory-feature merge as the training table.
df_features_rank_alt = pd.read_parquet("data/fuel_rank_with_alt_10parts.parquet")
rank_base_cols = default_cols + ["idx", "ff_kgs", "fuel_kg"]
df_features_rank = (
    pd.read_parquet("data/df_rank_best_v0.parquet")
    .loc[:, rank_base_cols]
    .merge(
        df_features_rank_alt.loc[:, ["idx"] + traj_cols],
        on="idx",
        how="left",
    )
)

In [None]:
# -------------------------------------------------
# Settings
# -------------------------------------------------
target_col = "ff_kgs"

# Shared LightGBM hyper-parameters; the final-model cell overrides
# "random_state" per seed.
base_params = {
    "n_estimators": 7500,
    "learning_rate": 0.01,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "reg_lambda": 0.5,
    "reg_alpha": 0.1,
    "metric": "rmse",
    "random_state": 42,
}

categorical_features = ["aircraft_type", "phase"]

# -------------------------------------------------
# Build X / y
# -------------------------------------------------
# Explicit copy: X gets its own data, so the dtype casts below modify X only
# and do not raise pandas' SettingWithCopyWarning.
X = df_features_train[feature_cols].copy()
y = df_features_train[target_col]

# Only cast the categorical columns that are actually part of the feature
# matrix; the same filtered list is passed to LightGBM as categorical_feature.
cat_feats_actual = [c for c in categorical_features if c in X.columns]
for c in cat_feats_actual:
    X[c] = X[c].astype("category")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[c] = X[c].astype("category")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[c] = X[c].astype("category")


In [None]:
# -------------------------------------------------
# K-fold CV (5 folds)
# -------------------------------------------------
if try_kfold:
    # Contiguous, unshuffled folds over the row order of X.
    kf = KFold(n_splits=5, shuffle=False)

    fold_predictions = []
    # Out-of-fold predictions, filled one validation slice per fold.
    oof_preds = np.zeros(len(df_features_train))

    print(
        "\n============================",
        "  5-FOLD CROSS VALIDATION",
        "============================\n",
        sep="\n",
    )

    for fold, (train_idx, valid_idx) in enumerate(kf.split(X)):
        print(f"\n----- Fold {fold + 1} -----")

        X_train = X.iloc[train_idx]
        y_train = y.iloc[train_idx]
        X_valid = X.iloc[valid_idx]
        y_valid = y.iloc[valid_idx]

        # The held-out fold doubles as the early-stopping set
        # (stop after 200 rounds without improvement).
        fit_kwargs = dict(
            eval_set=[(X_valid, y_valid)],
            categorical_feature=cat_feats_actual,
            eval_metric="rmse",
            callbacks=[early_stopping(200)],
        )
        model = LGBMRegressor(**base_params)
        model.fit(X_train, y_train, **fit_kwargs)

        valid_pred = model.predict(X_valid)
        oof_preds[valid_idx] = valid_pred

        # Score in kg: multiply the per-second rate by the segment duration.
        seg_duration_valid = X_valid["seg_duration"]
        rmse_fold = rmse(
            y_valid * seg_duration_valid,
            valid_pred * seg_duration_valid,
        )
        print(f"Fold {fold + 1} RMSE: {rmse_fold:.5f}")

    # Overall out-of-fold score, again converted to kg per segment.
    seg_duration_all = X["seg_duration"]
    rmse_oof = rmse(y * seg_duration_all, oof_preds * seg_duration_all)
    print(f"\n============================")
    print(f"OOF RMSE (5 fold): {rmse_oof:.5f}")
    print("============================\n")


  5-FOLD CROSS VALIDATION


----- Fold 1 -----
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003375 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 10940
[LightGBM] [Info] Number of data points in the train set: 104812, number of used features: 47
[LightGBM] [Info] Start training from score 0.902043
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[3115]	valid_0's rmse: 0.317974




Fold 1 RMSE: 182.43269

----- Fold 2 -----
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003333 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 10927
[LightGBM] [Info] Number of data points in the train set: 104813, number of used features: 47
[LightGBM] [Info] Start training from score 0.910584
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[4218]	valid_0's rmse: 0.360886




Fold 2 RMSE: 290.52998

----- Fold 3 -----
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003665 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 10948
[LightGBM] [Info] Number of data points in the train set: 104813, number of used features: 47
[LightGBM] [Info] Start training from score 0.931779
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[4547]	valid_0's rmse: 0.303108




Fold 3 RMSE: 173.77424

----- Fold 4 -----
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003355 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 10966
[LightGBM] [Info] Number of data points in the train set: 104813, number of used features: 47
[LightGBM] [Info] Start training from score 0.926671
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[3042]	valid_0's rmse: 0.295379




Fold 4 RMSE: 192.89095

----- Fold 5 -----
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004221 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 10958
[LightGBM] [Info] Number of data points in the train set: 104813, number of used features: 47
[LightGBM] [Info] Start training from score 0.950587
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[2682]	valid_0's rmse: 0.286982
Fold 5 RMSE: 136.19214

OOF RMSE (5 fold): 201.81267





In [None]:
# -------------------------------------------------
# Train final model on FULL TRAINING SET
# -------------------------------------------------
import random  # NOTE(review): not used in this cell; kept in case other cells rely on it

print("Training final FULL models (for submission)...")

# Explicit copy: X_test gets its own data, so the dtype casts below do not
# raise pandas' SettingWithCopyWarning.
X_test = df_features_rank[feature_cols].copy()
for c in cat_feats_actual:
    X_test[c] = X_test[c].astype("category")

# The model is fitted on X, so X_test must expose the same columns in the
# same order, not merely the same number of columns.
assert list(X_test.columns) == list(X.columns), "train/test feature columns differ"

# One model is trained per seed and the per-row median is taken across models.
# Only a single seed is configured; add more seeds here to build an ensemble.
seeds = [46]
preds = []

for seed in seeds:
    params = {**base_params, "random_state": seed}
    model = LGBMRegressor(**params)
    # NOTE(review): no eval_set is passed here, so there is no early stopping
    # and all base_params["n_estimators"] rounds are trained, whereas the CV
    # cell fits with early_stopping(200). Confirm this is intended.
    model.fit(
        X,
        y,
        categorical_feature=cat_feats_actual,
        eval_metric="rmse",
    )
    preds.append(model.predict(X_test))

# Shape (n_models, n_rows) -> median over models gives one value per row.
preds = np.vstack(preds)
ff_kgs_pred = np.median(preds, axis=0)

# Overwrite the rank table's target columns with the predictions;
# fuel_kg is the predicted rate multiplied by the segment duration.
df_features_rank["ff_kgs"] = ff_kgs_pred
df_features_rank["fuel_kg"] = ff_kgs_pred * df_features_rank["seg_duration"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test[c] = X_test[c].astype("category")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test[c] = X_test[c].astype("category")


Training final FULL models (for submission)...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005984 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 10975
[LightGBM] [Info] Number of data points in the train set: 131016, number of used features: 47
[LightGBM] [Info] Start training from score 0.924333


KeyboardInterrupt: 

In [None]:
# -------------------------------------------------
# Save submission
# -------------------------------------------------
submission_path = "data/resourceful-quiver_v170.parquet"

df_final = pd.read_parquet("data/fuel_rank_submission.parquet")

# Predictions are copied positionally (.values drops the index), so the
# submission template and the rank feature table must list the same "idx"
# values in the same order. Fail loudly instead of writing misaligned rows.
if not np.array_equal(
    df_final["idx"].to_numpy(), df_features_rank["idx"].to_numpy()
):
    raise ValueError(
        "Row order of data/fuel_rank_submission.parquet does not match "
        "df_features_rank (compared on 'idx'); refusing to write a misaligned submission."
    )

df_final["fuel_kg"] = df_features_rank["fuel_kg"].values
df_final.to_parquet(submission_path, index=False)

print(f"Saved submission → {submission_path}")
print(df_final[["idx", "fuel_kg"]].head())
