In [10]:
# %%
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from lightgbm import LGBMRegressor, early_stopping
import os

# -------------------------------------------------
# Run switches
# -------------------------------------------------
select_cols = True               # use the hand-picked feature list below (otherwise an empty list)
get_auto_version = True          # derive next_version from the latest remote file
try_kfold = False                # run the 5-fold cross-validation cell
save_model_and_metadata = True   # write the versioned metadata JSON and model file
do_submission = False            # NOTE(review): not referenced in the visible cells
final_model = True               # additionally write the "final" metadata/model files

# Free-text run summary; it is stored in the metadata JSON.
# NOTE(review): the text says "n_estimators 5000" while base_params uses 7500 — confirm.
description = "Use physical features. ADSB + ACARS + Wind + Vrate + Mass. ff_kgs filter. n_estimators 5000"

# -------------------------------------------------
# Feature selection
# -------------------------------------------------
# Candidate columns that are currently switched off (kept for quick re-enabling):
#   tau_s, tau_e
#   latitude_{mean,std,min,max}, longitude_{mean,std,min,max}
#   {altitude,groundspeed,track,mach,TAS,CAS}_{min,max}
#   compute_{TAS,mach,CAS}_{mean,std}, compute_TAS_{min,max}
#   cumdist_{mean,std,min,max}, distance_km_{mean,std,min,max}
#   ff_kgs_est_{mean,std,min,max}, mass_est_{mean,std}
#   mass_est_tf_{min,max}, ff_kgs_est_mass_tf_{min,max}
#   ts_{min,std,mean,max}
#   drag_{mean,std}, acceleration_{mean,std}, thrust_{mean,std}, work_{mean,std,min,max}
cols_to_take = []
if select_cols:
    # Identifier / target columns; they are listed in must_drop and removed again
    # before the feature matrix is built.
    cols_to_take += ['idx', 'fuel_kg', 'ff_kgs']

    # Segment and whole-flight extent, plus the two categorical descriptors.
    cols_to_take += ['seg_duration', 'seg_dist']
    cols_to_take += ['flight_duration', 'full_flight_dist']
    cols_to_take += ['aircraft_type', 'phase']

    # Per-segment trajectory statistics.
    cols_to_take += ['altitude_mean']
    cols_to_take += ['groundspeed_mean', 'groundspeed_std']
    cols_to_take += ['track_mean', 'track_std']
    cols_to_take += ['vertical_rate_min', 'vertical_rate_max']
    cols_to_take += ['vertical_rate_mean', 'vertical_rate_std']

    # Speed statistics.
    cols_to_take += ['mach_mean', 'mach_std']
    cols_to_take += ['TAS_mean', 'TAS_std']
    cols_to_take += ['CAS_mean', 'CAS_std']

    # Mass-related columns and the fuel-flow estimates derived from them.
    cols_to_take += ['tow_est_kg']
    cols_to_take += ['m_tow', 'oew']
    cols_to_take += [
        "mass_est_tf_mean",
        "mass_est_tf_std",
        "ff_kgs_est_mass_tf_mean",
        "ff_kgs_est_mass_tf_std",
    ]

    # Ten indexed vertical-rate columns (suffix _0 .. _9): all means first, then all stds.
    cols_to_take += [f'vertical_rate_mean_{i}' for i in range(10)]
    cols_to_take += [f'vertical_rate_std_{i}' for i in range(10)]

# -------------------------------------------------
# Load data
# -------------------------------------------------
df_features_train = pd.read_parquet('data/df_train_v2.parquet')

# Columns that must never reach the model: identifiers, the targets themselves
# and the timestamp-like bookkeeping columns.
must_drop = ["idx", "flight_id", "fuel_kg", "ff_kgs", "start", "end", "flight_date", "takeoff", "landed"]

# Weather-derived columns are recognised by a substring of their name and are
# excluded as well.
weather_tokens = ("temperature", "specific_humidity", "component_of_wind")
weather_drop = [
    col for col in df_features_train.columns
    if any(token in col for token in weather_tokens)
]
must_drop += weather_drop

# Filter out impossible fuel rates: keep rows with low < ff_kgs < high (both
# bounds exclusive). The bounds are stored later in the run metadata.
filter_low_high = {'low': 0.05, "high": 6.5}
fuel_flow = df_features_train['ff_kgs']
rate_is_plausible = (fuel_flow < filter_low_high['high']) & (fuel_flow > filter_low_high['low'])
df_features_train = df_features_train[rate_is_plausible]

# Ranking set, i.e. the rows predictions are produced for further down.
df_features_rank = pd.read_parquet('data/df_rank_v2.parquet')

In [7]:
# -------------------------------------------------
# Settings
# -------------------------------------------------
target_col = "ff_kgs"

# Hyper-parameters handed to LGBMRegressor; also stored in the run metadata.
base_params = {
    "n_estimators": 7500,
    "learning_rate": 0.01,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "reg_lambda": 0.5,
    "reg_alpha": 0.1,
    "metric": "rmse",
    "random_state": 46
}

categorical_features = ["aircraft_type", "phase"]

# -------------------------------------------------
# Build X / y
# -------------------------------------------------
# An empty cols_to_take means "use every column that is not in must_drop".
if len(cols_to_take) > 0:
    X = df_features_train[cols_to_take].drop(columns=must_drop, errors='ignore')
else:
    X = df_features_train.drop(columns=must_drop, errors='ignore')

# %%

y = df_features_train[target_col]

# Only cast the categorical columns that actually made it into X. Iterating
# over categorical_features directly raised KeyError whenever one of them was
# left out of the selected columns, which defeated the purpose of this guard.
cat_feats_actual = [c for c in categorical_features if c in X.columns]
X = X.astype({col: "category" for col in cat_feats_actual})

print("Nb features: ", len(X.columns))
X.columns

Nb features:  48


Index(['seg_duration', 'seg_dist', 'flight_duration', 'full_flight_dist',
       'aircraft_type', 'phase', 'altitude_mean', 'groundspeed_mean',
       'groundspeed_std', 'track_mean', 'track_std', 'vertical_rate_min',
       'vertical_rate_max', 'vertical_rate_mean', 'vertical_rate_std',
       'mach_mean', 'mach_std', 'TAS_mean', 'TAS_std', 'CAS_mean', 'CAS_std',
       'tow_est_kg', 'm_tow', 'oew', 'mass_est_tf_mean', 'mass_est_tf_std',
       'ff_kgs_est_mass_tf_mean', 'ff_kgs_est_mass_tf_std',
       'vertical_rate_mean_0', 'vertical_rate_mean_1', 'vertical_rate_mean_2',
       'vertical_rate_mean_3', 'vertical_rate_mean_4', 'vertical_rate_mean_5',
       'vertical_rate_mean_6', 'vertical_rate_mean_7', 'vertical_rate_mean_8',
       'vertical_rate_mean_9', 'vertical_rate_std_0', 'vertical_rate_std_1',
       'vertical_rate_std_2', 'vertical_rate_std_3', 'vertical_rate_std_4',
       'vertical_rate_std_5', 'vertical_rate_std_6', 'vertical_rate_std_7',
       'vertical_rate_std_8', '

In [3]:
# -------------------------------------------------
# K-fold CV (5 folds)
# -------------------------------------------------
# Optional sanity check: fit one model per fold and report the RMSE of
# ff_kgs * seg_duration per fold and out-of-fold over the whole training set.
if try_kfold:
    kf = KFold(n_splits=5, shuffle=False)

    fold_predictions = []  # kept for compatibility; nothing in this cell fills it
    # One out-of-fold prediction slot per training row, filled by position.
    oof_preds = np.zeros(len(X))

    print("\n============================")
    print("  5-FOLD CROSS VALIDATION")
    print("============================\n")

    for fold, (train_idx, valid_idx) in enumerate(kf.split(X)):
        print(f"\n----- Fold {fold+1} -----")

        X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
        y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]

        model = LGBMRegressor(**base_params)

        # No eval_set is passed, so each fold trains all n_estimators.
        # Early stopping was tried and is left disabled:
        #   eval_set=[(X_valid, y_valid)], callbacks=[early_stopping(200)]
        model.fit(
            X_train, y_train,
            categorical_feature=cat_feats_actual,
            eval_metric="rmse",
        )

        valid_pred = model.predict(X_valid)
        oof_preds[valid_idx] = valid_pred

        # RMSE in kg: ff_kgs * deltat.
        # np.sqrt(MSE) replaces mean_squared_error(..., squared=False): the
        # `squared` argument is deprecated and removed in recent scikit-learn.
        seg_duration_valid = X_valid['seg_duration']
        rmse_fold = np.sqrt(mean_squared_error(
            y_valid * seg_duration_valid, valid_pred * seg_duration_valid
        ))
        print(f"Fold {fold+1} RMSE: {rmse_fold:.5f}")

    # OOF score over all rows, on the same ff_kgs * seg_duration scale.
    rmse_oof = np.sqrt(mean_squared_error(
        y * X['seg_duration'], oof_preds * X['seg_duration']
    ))
    print(f"\n============================")
    print(f"OOF RMSE (5 fold): {rmse_oof:.5f}")
    print("============================\n")

In [8]:
# -------------------------------------------------
# Train final model on FULL TRAINING SET
# -------------------------------------------------
import random

print("Training final FULL models (for submission)...")

# Build the ranking matrix from exactly the columns the model is trained on,
# in the same order. Selecting via cols_to_take broke when it was empty
# (select_cols = False) and also required the target columns to exist in the
# rank frame; the old assert only compared column counts, not names or order.
feature_cols = list(X.columns)
missing_cols = [col for col in feature_cols if col not in df_features_rank.columns]
if missing_cols:
    raise KeyError(f"Rank set is missing training features: {missing_cols}")

X_test = df_features_rank[feature_cols].copy()
# Cast the same categorical columns that were cast in X.
X_test = X_test.astype({col: "category" for col in cat_feats_actual})

assert list(X_test.columns) == list(X.columns)

# Fit on every training row; verbose=-1 is passed on top of base_params.
params = {**base_params}
model = LGBMRegressor(**params, verbose = -1)
model.fit(
    X, y,
    categorical_feature=cat_feats_actual,
    eval_metric="rmse",
)

# The model predicts ff_kgs; fuel_kg is that prediction times seg_duration.
ff_kgs_pred = model.predict(X_test)

df_features_rank["ff_kgs"] = ff_kgs_pred
df_features_rank["fuel_kg"] = ff_kgs_pred * df_features_rank["seg_duration"]

Training final FULL models (for submission)...


In [9]:
import json
import re
import subprocess
import time

# Find remote latest version: list the bucket with `mc ls` and take the highest
# number found in names of the form resourceful-quiver_v<digits>.parquet.
cmd = ["mc", "ls", "opensky/prc-2025-resourceful-quiver/"]
result = subprocess.run(cmd, capture_output=True, text=True, check=True)
output = result.stdout

versions = re.findall(r"resourceful-quiver_v(\d+)\.parquet", output)
if not versions:
    raise ValueError("No resourceful-quiver_vXXX.parquet files found.")

latest_version = max(map(int, versions))

if get_auto_version:
    next_version = latest_version + 1
elif "next_version" not in globals():
    # Without this check the cell failed further down with a bare NameError.
    # A manually assigned next_version is still honoured.
    raise NameError(
        "get_auto_version is False and next_version is not defined; "
        "assign next_version manually before running this cell."
    )

# -------------------------------------------------
# Save submission
# -------------------------------------------------
# Take the submission template and overwrite its fuel_kg column with the
# predictions. The values are copied positionally (.values), so the template
# and df_features_rank are presumably in the same row order — TODO confirm.
df_final = pd.read_parquet('data/fuel_rank_submission.parquet')
df_final = df_final.assign(fuel_kg=df_features_rank["fuel_kg"].values)
df_final.to_parquet(f'data/resourceful-quiver_v{next_version}.parquet', index=False)

print(f"Saved submission → data/resourceful-quiver_v{next_version}.parquet")
submission_preview = df_final[['idx', 'fuel_kg']].head()
print(submission_preview)

# Snapshot of the training feature matrix, written under the same version number.
X.to_parquet(f'v{next_version}.parquet',index = False)

# -------------------------------------------------
# Upload to MinIO
# -------------------------------------------------
# Copy the submission file into the bucket with `mc cp`; check=True turns a
# failed upload into an exception.
cmd_upload = ["mc", "cp"]
cmd_upload.append(f"data/resourceful-quiver_v{next_version}.parquet")
cmd_upload.append("opensky/prc-2025-resourceful-quiver")

subprocess.run(cmd_upload, check=True)
print(f"Uploaded resourceful-quiver_v{next_version}.parquet to opensky/prc-2025-resourceful-quiver/")

# -------------------------------------------------
# Wait for the scoring result
# -------------------------------------------------
# The result is read from "<submission>.parquet_result.json" in the bucket.
# A failing `mc cat` is treated as "not ready yet" and retried, but only until
# the deadline: the previous unbounded loop would spin forever on a permanent
# failure (wrong alias, credentials, rejected file, ...).
POLL_INTERVAL_S = 15
POLL_TIMEOUT_S = 30 * 60

cmd = [ "mc", "cat", f"opensky/prc-2025-resourceful-quiver/resourceful-quiver_v{next_version}.parquet_result.json" ]

poll_deadline = time.monotonic() + POLL_TIMEOUT_S
while True:
    try:
        print("Trying to fetch result JSON...")
        result = subprocess.run(cmd, capture_output=True, text=True, check=True)
    except subprocess.CalledProcessError as exc:
        if time.monotonic() >= poll_deadline:
            raise TimeoutError(
                f"No result JSON after {POLL_TIMEOUT_S} seconds; "
                f"last mc error: {(exc.stderr or '').strip()}"
            ) from exc
        print(f"File not ready yet. Retrying in {POLL_INTERVAL_S} seconds...")
        time.sleep(POLL_INTERVAL_S)
    else:
        json_text = result.stdout
        print("Success! File found.")
        break

# Parse JSON
info = json.loads(json_text)

# Print score
print("Status:", info.get("status"))
print("File:", info.get("file"))
print("Score:", info.get("score"))
print("Used pairs:", info.get("used_pairs"))

# The score is needed for the metadata, so fail with the reported status
# instead of a bare KeyError when the result carries none.
if "score" not in info:
    raise RuntimeError(f"Scoring result has no score (status={info.get('status')!r})")

# If you want only the score:
print("\nFinal score:", info["score"])

# %%
# Run metadata as JSON text: description, the feature columns actually used,
# the returned score, the model hyper-parameters and the ff_kgs filter bounds.
text_json = json.dumps(
    {"description": description,
     "cols_to_take": list(X.columns),
     "score": info["score"],
     "base_params": base_params,
     "filter_low_high": filter_low_high},
    indent=2
)


def _save_metadata_and_model(metadata_text, fitted_model, metadata_path, model_path):
    """Write the metadata JSON text and the fitted LightGBM booster to disk.

    Both parent directories are created first. Previously only models/ was
    created, so writing into a missing metadata/ directory raised
    FileNotFoundError.
    """
    for target_path in (metadata_path, model_path):
        parent_dir = os.path.dirname(target_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

    with open(metadata_path, "w") as f:
        f.write(metadata_text)
    print(f"Saved to {metadata_path}")

    print(f"Saving model to {model_path}")
    fitted_model.booster_.save_model(model_path)


if save_model_and_metadata:
    # Versioned copy: metadata/<version>_metadata.json + models/v<version>_model.lgb
    _save_metadata_and_model(
        text_json, model,
        f"metadata/{next_version}_metadata.json",
        f"models/v{next_version}_model.lgb",
    )

if final_model:
    # Fixed-name copy, overwritten by every run that has final_model enabled.
    _save_metadata_and_model(
        text_json, model,
        "metadata/final_metadata.json",
        "models/final_model.lgb",
    )

Saved submission → data/resourceful-quiver_v241.parquet
   idx     fuel_kg
0    0  181.084973
1    1  183.340290
2    2  183.256997
3    3  181.718074
4    4  182.219017
`/Users/mfrahman/Python/PRC-2025/data/resourceful-quiver_v241.parquet` -> `opensky/prc-2025-resourceful-quiver/resourceful-quiver_v241.parquet`
Total: 816.71 KiB, Transferred: 816.71 KiB, Speed: 484.46 KiB/s
Uploaded resourceful-quiver_v241.parquet to opensky/prc-2025-resourceful-quiver/
Trying to fetch result JSON...
File not ready yet. Retrying in 15 seconds...
Trying to fetch result JSON...
Success! File found.
Status: Succeeded
File: resourceful-quiver_v241.parquet
Score: 199.9065
Used pairs: 24289

Final score: 199.9065
Saved to metadata/241_metadata.json
Saving model to models/v241model.lgb
