In [1]:
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
import optuna

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load the dataset after the exploratory data analysis
challenge_set_updated = pd.read_csv("./data/challenge_set_updated_v11.csv")
submission_set = pd.read_csv("./data/submission_set.csv")
submission_set_updated = pd.read_csv("./data/submission_set_updated_v11.csv")



In [4]:
# Separating features and target variable
X = challenge_set_updated.drop('tow', axis=1)
y = challenge_set_updated['tow']

In [7]:
# Dropping unnecessary columns
to_drop = [
    'offblock_to_arrival_duration', 'normalized_taxi_ratio', 'MALW_kg', 'wind_distance_ARR_100', 
    'average_airspeed_ARR_100', 'track_variation_ARR_100', 'is_offblock_weekend', 'Num_Engines', 
    'flown_distance_ARR_100', 'average_humidity_ARR_100', 'average_temperature_ARR_100', 
    'arrival_minute', 'track_variation_ENR', 'groundspeed_ARR_100', 'average_vertical_rate_ARR_100', 
    'taxiout_time', 'offblock_minute', 
    'average_airspeed_ENR', 'specific_energy_ENR', 'taxi_ratio',  
    'specific_energy_ARR_100', 'is_offblock_rush_hour', 'wind_distance_ENR', 'groundspeed_ENR', 
    'altitude_difference', 'average_vertical_rate_ENR', 'bearing', 'Altitude_ades'
]


# Function to filter out non-existing columns before dropping
def filter_existing_columns(df, columns_to_drop):
    return [col for col in columns_to_drop if col in df.columns]

# Check the columns in the DataFrame X
print("Current Columns in X:", X.columns.tolist())

# Filter the columns to drop
existing_columns_to_drop = filter_existing_columns(X, to_drop)

# Drop the unnecessary columns
X.drop(existing_columns_to_drop, axis=1, inplace=True)


Current Columns in X: ['taxiout_time', 'flown_distance', 'track_variation_ARR_100', 'track_variation_DEP_40', 'track_variation_ENR', 'average_vertical_rate_ARR_100', 'average_vertical_rate_DEP_40', 'average_vertical_rate_ENR', 'average_airspeed_ARR_100', 'average_airspeed_DEP_40', 'average_airspeed_ENR', 'groundspeed_ARR_100', 'groundspeed_DEP_40', 'groundspeed_ENR', 'wind_distance_ARR_100', 'wind_distance_DEP_40', 'wind_distance_ENR', 'average_temperature_ARR_100', 'average_temperature_DEP_40', 'average_temperature_ENR', 'average_humidity_ARR_100', 'average_humidity_DEP_40', 'average_humidity_ENR', 'specific_energy_ARR_100', 'specific_energy_DEP_40', 'specific_energy_ENR', 'flown_distance_ARR_100', 'flown_distance_DEP_40', 'flown_distance_ENR', 'average_altitude_ARR_100', 'average_altitude_DEP_40', 'average_altitude_ENR', 'specific_energy', 'taxi_ratio', 'flight_speed', 'normalized_taxi_ratio', 'speed_per_distance', 'cumulative_avg_speed', 'airspeed_specific_energy_ARR', 'airspeed_spe

In [8]:
cat_names = [
    'adep', 'ades', 'aircraft_type', 'wtc', 'airline', 'offblock_hour', 'offblock_minute', 
    'offblock_day_of_week', 'offblock_month', 'offblock_week_of_year', 'offblock_season', 
    'arrival_hour', 'arrival_minute', 'is_offblock_weekend', 'is_offblock_rush_hour', 
    'flight_duration_category', 'adep_region', 'ades_region', 'same_country_flight', 
    'same_region_flight', 'flight_direction', 'is_intercontinental', 'Manufacturer', 
    'Model_FAA', 'Physical_Class_Engine', 'FAA_Weight', 'adep_geo_cluster', 'ades_geo_cluster'
]

# Filtering categorical features that are still present after dropping columns
selected_cat_names = [x for x in cat_names if x not in to_drop]

In [10]:
from sklearn.preprocessing import LabelEncoder

# Check available columns in X
print("Current Columns in X:", X.columns.tolist())

# Handle categorical columns by Label Encoding
label_encoder = LabelEncoder()

# Specify the columns that are categorical
categorical_cols = ['adep', 'ades', 'aircraft_type', 'wtc', 'airline', 'offblock_season', 
                    'flight_duration_category', 'adep_region', 'ades_region', 'flight_direction', 
                    'Manufacturer', 'Model_FAA', 'Physical_Class_Engine', 'FAA_Weight']

# Filter out only those categorical columns that exist in X
existing_categorical_cols = [col for col in categorical_cols if col in X.columns]

# Convert these categorical columns into numerical form using Label Encoding
for col in existing_categorical_cols:
    X[col] = label_encoder.fit_transform(X[col])

# Now, you can proceed with training LightGBM with this processed data


Current Columns in X: ['flown_distance', 'track_variation_DEP_40', 'average_vertical_rate_DEP_40', 'average_airspeed_DEP_40', 'groundspeed_DEP_40', 'wind_distance_DEP_40', 'average_temperature_DEP_40', 'average_temperature_ENR', 'average_humidity_DEP_40', 'average_humidity_ENR', 'specific_energy_DEP_40', 'flown_distance_DEP_40', 'flown_distance_ENR', 'average_altitude_ARR_100', 'average_altitude_DEP_40', 'average_altitude_ENR', 'specific_energy', 'flight_speed', 'speed_per_distance', 'cumulative_avg_speed', 'airspeed_specific_energy_ARR', 'airspeed_specific_energy_DEP', 'airspeed_specific_energy_ENR', 'vertical_rate_airspeed_ratio_ARR', 'vertical_rate_airspeed_ratio_DEP', 'Approach_Speed_knot', 'Wingspan_ft_without_winglets_sharklets', 'Wingspan_ft_with_winglets_sharklets', 'Length_ft', 'Tail_Height_at_OEW_ft', 'Wheelbase_ft', 'Cockpit_to_Main_Gear_ft', 'Main_Gear_Width_ft', 'Parking_Area_ft2', 'Total_IFR_Operations_2021_2022', 'MTOW_kg', 'Latitude_adep', 'Longitude_adep', 'Altitude_ad

In [11]:
# Splitting the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Identify categorical feature indices for LightGBM
cat_feature_indices = [X_train.columns.get_loc(col) for col in selected_cat_names if col in X_train.columns]

# Define the objective function for hyperparameter tuning with Optuna
def objective(trial):
    params = {
        'objective': 'regression',
        'metric': 'rmse',
        'verbosity': -1,
        'boosting_type': 'gbdt',
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1),
        'num_leaves': trial.suggest_int('num_leaves', 20, 3000),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 1, 100),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.4, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.4, 1.0),
        'lambda_l1': trial.suggest_float('lambda_l1', 0.0, 10.0),
        'lambda_l2': trial.suggest_float('lambda_l2', 0.0, 10.0),
        'min_gain_to_split': trial.suggest_float('min_gain_to_split', 0.0, 0.2),
        'min_child_weight': trial.suggest_float('min_child_weight', 0.0, 10.0),
        'device_type': 'gpu',  # Use GPU for training

    }

    model = lgb.LGBMRegressor(**params, n_estimators=5000)

    model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        eval_metric='rmse',
        callbacks=[lgb.early_stopping(stopping_rounds=50)],
        categorical_feature=cat_feature_indices
    )
    
    val_pred = model.predict(X_val)
    rmse = mean_squared_error(y_val, val_pred, squared=False)
    
    return rmse

In [12]:
# Create the Optuna study and start optimization
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=30)

# Display the best hyperparameters found
print(f"Best trial: {study.best_trial.params}")


[I 2024-09-26 09:47:09,692] A new study created in memory with name: no-name-2a58bf00-646a-40d2-b24f-aeb3c329794b


Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[4998]	valid_0's rmse: 2762.88


[I 2024-09-26 09:47:49,648] Trial 0 finished with value: 2762.8829082473276 and parameters: {'learning_rate': 0.035615551544055386, 'num_leaves': 2059, 'max_depth': 8, 'min_data_in_leaf': 77, 'feature_fraction': 0.5785152976273922, 'bagging_fraction': 0.9296503790920617, 'lambda_l1': 5.874308412280258, 'lambda_l2': 7.256133483225944, 'min_gain_to_split': 0.035909932886852784, 'min_child_weight': 3.2229204107038543}. Best is trial 0 with value: 2762.8829082473276.


Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[5000]	valid_0's rmse: 2788.2


[I 2024-09-26 09:48:17,882] Trial 1 finished with value: 2788.1957382239652 and parameters: {'learning_rate': 0.03326177130240085, 'num_leaves': 1252, 'max_depth': 7, 'min_data_in_leaf': 69, 'feature_fraction': 0.6719196873411559, 'bagging_fraction': 0.6537943478041194, 'lambda_l1': 0.04541017854773233, 'lambda_l2': 0.23878889991223673, 'min_gain_to_split': 0.18061753212130904, 'min_child_weight': 6.765029338322633}. Best is trial 0 with value: 2762.8829082473276.


Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[5000]	valid_0's rmse: 2778.38


[I 2024-09-26 09:49:04,519] Trial 2 finished with value: 2778.3818480398218 and parameters: {'learning_rate': 0.024142734143911056, 'num_leaves': 2376, 'max_depth': 8, 'min_data_in_leaf': 47, 'feature_fraction': 0.9492726306546486, 'bagging_fraction': 0.9185196043177036, 'lambda_l1': 1.3480115693539674, 'lambda_l2': 4.128619312021061, 'min_gain_to_split': 0.16202305878215403, 'min_child_weight': 8.499883167614687}. Best is trial 0 with value: 2762.8829082473276.


Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[4998]	valid_0's rmse: 3037.26


[I 2024-09-26 09:49:13,220] Trial 3 finished with value: 3037.2598907895485 and parameters: {'learning_rate': 0.08362789981467116, 'num_leaves': 1903, 'max_depth': 3, 'min_data_in_leaf': 100, 'feature_fraction': 0.9272323492455561, 'bagging_fraction': 0.9112729495951775, 'lambda_l1': 7.883340749343805, 'lambda_l2': 5.962609025320182, 'min_gain_to_split': 0.006489066568723701, 'min_child_weight': 7.572474090678489}. Best is trial 0 with value: 2762.8829082473276.


Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[5000]	valid_0's rmse: 2873.23


[I 2024-09-26 09:49:25,079] Trial 4 finished with value: 2873.230789394959 and parameters: {'learning_rate': 0.08689146187355191, 'num_leaves': 2509, 'max_depth': 4, 'min_data_in_leaf': 25, 'feature_fraction': 0.6711966542761167, 'bagging_fraction': 0.5619287964574358, 'lambda_l1': 9.43587619638359, 'lambda_l2': 1.3765829240318084, 'min_gain_to_split': 0.1548000109526796, 'min_child_weight': 7.209025077880396}. Best is trial 0 with value: 2762.8829082473276.


Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[4999]	valid_0's rmse: 3078.05


[I 2024-09-26 09:49:38,087] Trial 5 finished with value: 3078.0494423220716 and parameters: {'learning_rate': 0.028321343595049092, 'num_leaves': 111, 'max_depth': 4, 'min_data_in_leaf': 93, 'feature_fraction': 0.6187620830879583, 'bagging_fraction': 0.5245381253559697, 'lambda_l1': 7.0640165346988395, 'lambda_l2': 1.8773959822493813, 'min_gain_to_split': 0.021855052306481507, 'min_child_weight': 0.5350891242206979}. Best is trial 0 with value: 2762.8829082473276.


Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[4999]	valid_0's rmse: 2835.79


[I 2024-09-26 09:49:53,677] Trial 6 finished with value: 2835.7930031159976 and parameters: {'learning_rate': 0.068657399366399, 'num_leaves': 2753, 'max_depth': 5, 'min_data_in_leaf': 72, 'feature_fraction': 0.7138179061178785, 'bagging_fraction': 0.9685812006241037, 'lambda_l1': 2.3550491466865022, 'lambda_l2': 9.327699982869072, 'min_gain_to_split': 0.19590443328319998, 'min_child_weight': 2.4414478931789385}. Best is trial 0 with value: 2762.8829082473276.


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[3015]	valid_0's rmse: 2755.51


[I 2024-09-26 09:50:52,183] Trial 7 finished with value: 2755.5135276304222 and parameters: {'learning_rate': 0.060378280243872755, 'num_leaves': 1396, 'max_depth': 13, 'min_data_in_leaf': 93, 'feature_fraction': 0.40973281933911343, 'bagging_fraction': 0.7011468457374967, 'lambda_l1': 8.588999585087034, 'lambda_l2': 4.193112055043297, 'min_gain_to_split': 0.18321914386596905, 'min_child_weight': 9.290715602424754}. Best is trial 7 with value: 2755.5135276304222.


Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[5000]	valid_0's rmse: 2814.97


[I 2024-09-26 09:51:36,282] Trial 8 finished with value: 2814.9735382466806 and parameters: {'learning_rate': 0.013185453373689643, 'num_leaves': 853, 'max_depth': 9, 'min_data_in_leaf': 98, 'feature_fraction': 0.6910517118737867, 'bagging_fraction': 0.9459874223305709, 'lambda_l1': 9.483923221967276, 'lambda_l2': 1.653842273313697, 'min_gain_to_split': 0.08327518636377669, 'min_child_weight': 6.836374919473377}. Best is trial 7 with value: 2755.5135276304222.


Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[5000]	valid_0's rmse: 2840.09


[I 2024-09-26 09:51:53,040] Trial 9 finished with value: 2840.092916376986 and parameters: {'learning_rate': 0.05173261828496909, 'num_leaves': 1965, 'max_depth': 5, 'min_data_in_leaf': 23, 'feature_fraction': 0.6918553953773501, 'bagging_fraction': 0.8620904695503425, 'lambda_l1': 3.851532770939933, 'lambda_l2': 6.730133257588972, 'min_gain_to_split': 0.14309946793830133, 'min_child_weight': 6.8727039401204575}. Best is trial 7 with value: 2755.5135276304222.


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[2740]	valid_0's rmse: 2753.27


[I 2024-09-26 09:54:09,804] Trial 10 finished with value: 2753.273317584168 and parameters: {'learning_rate': 0.0611192798975334, 'num_leaves': 844, 'max_depth': 14, 'min_data_in_leaf': 6, 'feature_fraction': 0.40353112688405385, 'bagging_fraction': 0.4210713295981399, 'lambda_l1': 4.136403626995856, 'lambda_l2': 3.760833237058865, 'min_gain_to_split': 0.1135225703791216, 'min_child_weight': 9.87143921567126}. Best is trial 10 with value: 2753.273317584168.


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[3139]	valid_0's rmse: 2747.49


[I 2024-09-26 09:57:06,414] Trial 11 finished with value: 2747.4894807750456 and parameters: {'learning_rate': 0.05798401361365572, 'num_leaves': 790, 'max_depth': 14, 'min_data_in_leaf': 3, 'feature_fraction': 0.41384772662043745, 'bagging_fraction': 0.42194835404337244, 'lambda_l1': 4.368551366840768, 'lambda_l2': 3.943089298462867, 'min_gain_to_split': 0.10804857380700328, 'min_child_weight': 9.82887994400401}. Best is trial 11 with value: 2747.4894807750456.


Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[5000]	valid_0's rmse: 2733.82


[I 2024-09-26 10:00:14,134] Trial 12 finished with value: 2733.8237752918453 and parameters: {'learning_rate': 0.0510386922436786, 'num_leaves': 529, 'max_depth': 15, 'min_data_in_leaf': 2, 'feature_fraction': 0.4033354755243845, 'bagging_fraction': 0.4012078232332666, 'lambda_l1': 4.361366440817587, 'lambda_l2': 3.399395476492106, 'min_gain_to_split': 0.10443444471464444, 'min_child_weight': 9.955017269638407}. Best is trial 12 with value: 2733.8237752918453.


Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[5000]	valid_0's rmse: 2772.42


[I 2024-09-26 10:00:46,959] Trial 13 finished with value: 2772.4206163337135 and parameters: {'learning_rate': 0.04776604177411088, 'num_leaves': 60, 'max_depth': 12, 'min_data_in_leaf': 5, 'feature_fraction': 0.5168460634804681, 'bagging_fraction': 0.40796049563169406, 'lambda_l1': 5.535037143240543, 'lambda_l2': 3.064065822051835, 'min_gain_to_split': 0.07693440499338929, 'min_child_weight': 4.726793977527949}. Best is trial 12 with value: 2733.8237752918453.


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1945]	valid_0's rmse: 2756.75


[I 2024-09-26 10:02:03,204] Trial 14 finished with value: 2756.7535883358482 and parameters: {'learning_rate': 0.07235139742899235, 'num_leaves': 569, 'max_depth': 15, 'min_data_in_leaf': 23, 'feature_fraction': 0.4947343912652503, 'bagging_fraction': 0.5057780342226598, 'lambda_l1': 3.5499498163068126, 'lambda_l2': 5.055570870133552, 'min_gain_to_split': 0.1160658022078206, 'min_child_weight': 8.66901764622183}. Best is trial 12 with value: 2733.8237752918453.


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[3371]	valid_0's rmse: 2731.14


[I 2024-09-26 10:03:07,255] Trial 15 finished with value: 2731.1358037110317 and parameters: {'learning_rate': 0.043439488503074865, 'num_leaves': 635, 'max_depth': 11, 'min_data_in_leaf': 40, 'feature_fraction': 0.8421909122661454, 'bagging_fraction': 0.6510732011334237, 'lambda_l1': 6.203249915219508, 'lambda_l2': 2.7068452107197705, 'min_gain_to_split': 0.06594324740440607, 'min_child_weight': 5.460585277082647}. Best is trial 15 with value: 2731.1358037110317.


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[3111]	valid_0's rmse: 2733.71


[I 2024-09-26 10:04:02,856] Trial 16 finished with value: 2733.711332243817 and parameters: {'learning_rate': 0.04251147975480407, 'num_leaves': 396, 'max_depth': 11, 'min_data_in_leaf': 44, 'feature_fraction': 0.8248050768227795, 'bagging_fraction': 0.7812040475399622, 'lambda_l1': 6.67264828254082, 'lambda_l2': 2.6371005655523714, 'min_gain_to_split': 0.05991326984263978, 'min_child_weight': 5.1881515689200075}. Best is trial 15 with value: 2731.1358037110317.


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[3235]	valid_0's rmse: 2735.98


[I 2024-09-26 10:05:06,395] Trial 17 finished with value: 2735.9833241589436 and parameters: {'learning_rate': 0.04084320670921195, 'num_leaves': 458, 'max_depth': 11, 'min_data_in_leaf': 44, 'feature_fraction': 0.827342447897695, 'bagging_fraction': 0.777243603139686, 'lambda_l1': 6.689111989103696, 'lambda_l2': 0.3967222712424112, 'min_gain_to_split': 0.04460797459251714, 'min_child_weight': 5.014108921760135}. Best is trial 15 with value: 2731.1358037110317.


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1501]	valid_0's rmse: 2768.24


[I 2024-09-26 10:05:40,623] Trial 18 finished with value: 2768.241221332797 and parameters: {'learning_rate': 0.09994794458096701, 'num_leaves': 1212, 'max_depth': 11, 'min_data_in_leaf': 37, 'feature_fraction': 0.8085158715167241, 'bagging_fraction': 0.7655876876962671, 'lambda_l1': 6.634096710261541, 'lambda_l2': 2.779292855269564, 'min_gain_to_split': 0.06094683847654775, 'min_child_weight': 5.339033851952656}. Best is trial 15 with value: 2731.1358037110317.


Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[5000]	valid_0's rmse: 2761.04


[I 2024-09-26 10:06:54,661] Trial 19 finished with value: 2761.0429403760663 and parameters: {'learning_rate': 0.016457577720957545, 'num_leaves': 1662, 'max_depth': 10, 'min_data_in_leaf': 60, 'feature_fraction': 0.8510416366570338, 'bagging_fraction': 0.6547291309637523, 'lambda_l1': 7.946192299791277, 'lambda_l2': 2.2574146478811565, 'min_gain_to_split': 0.05985516299866374, 'min_child_weight': 3.7108758335623944}. Best is trial 15 with value: 2731.1358037110317.


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[3894]	valid_0's rmse: 2732.36


[I 2024-09-26 10:08:07,438] Trial 20 finished with value: 2732.359462466091 and parameters: {'learning_rate': 0.04191372745510007, 'num_leaves': 347, 'max_depth': 12, 'min_data_in_leaf': 58, 'feature_fraction': 0.7724286918446021, 'bagging_fraction': 0.7893179361930602, 'lambda_l1': 5.4834647534394385, 'lambda_l2': 5.222195043656004, 'min_gain_to_split': 0.08545806776239126, 'min_child_weight': 5.826131298869141}. Best is trial 15 with value: 2731.1358037110317.


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[4412]	valid_0's rmse: 2734.23


[I 2024-09-26 10:09:32,015] Trial 21 finished with value: 2734.227084751908 and parameters: {'learning_rate': 0.04322693349463416, 'num_leaves': 285, 'max_depth': 12, 'min_data_in_leaf': 61, 'feature_fraction': 0.762796148289436, 'bagging_fraction': 0.8200705117596455, 'lambda_l1': 5.214746400351307, 'lambda_l2': 5.433554132316387, 'min_gain_to_split': 0.08154113479396564, 'min_child_weight': 5.89597526447815}. Best is trial 15 with value: 2731.1358037110317.


Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[4998]	valid_0's rmse: 2725.1


[I 2024-09-26 10:10:59,971] Trial 22 finished with value: 2725.1024902499903 and parameters: {'learning_rate': 0.040232447540238, 'num_leaves': 1035, 'max_depth': 10, 'min_data_in_leaf': 35, 'feature_fraction': 0.8880567898770956, 'bagging_fraction': 0.7109490701551315, 'lambda_l1': 6.156991762243389, 'lambda_l2': 7.71117214126504, 'min_gain_to_split': 0.051274563395863657, 'min_child_weight': 4.021845555499864}. Best is trial 22 with value: 2725.1024902499903.


Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[5000]	valid_0's rmse: 2748.88


[I 2024-09-26 10:12:26,382] Trial 23 finished with value: 2748.876924021506 and parameters: {'learning_rate': 0.02147101480045263, 'num_leaves': 1003, 'max_depth': 10, 'min_data_in_leaf': 34, 'feature_fraction': 0.9023325164778059, 'bagging_fraction': 0.7086051370472111, 'lambda_l1': 5.818445518099778, 'lambda_l2': 9.3036894565535, 'min_gain_to_split': 0.04052651918971402, 'min_child_weight': 3.7533169457441646}. Best is trial 22 with value: 2725.1024902499903.


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[4442]	valid_0's rmse: 2729.57


[I 2024-09-26 10:14:21,478] Trial 24 finished with value: 2729.571842509032 and parameters: {'learning_rate': 0.03278869165537788, 'num_leaves': 1092, 'max_depth': 13, 'min_data_in_leaf': 55, 'feature_fraction': 0.8963043374583974, 'bagging_fraction': 0.6160860973070215, 'lambda_l1': 3.104788104489984, 'lambda_l2': 8.024326120463424, 'min_gain_to_split': 0.0936865798969131, 'min_child_weight': 1.8348492757984083}. Best is trial 22 with value: 2725.1024902499903.


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[3158]	valid_0's rmse: 2742.52


[I 2024-09-26 10:15:58,325] Trial 25 finished with value: 2742.5220768907093 and parameters: {'learning_rate': 0.031509848880958036, 'num_leaves': 1588, 'max_depth': 13, 'min_data_in_leaf': 51, 'feature_fraction': 0.9978374747216763, 'bagging_fraction': 0.6026921761618478, 'lambda_l1': 2.838052890578157, 'lambda_l2': 8.285559968900744, 'min_gain_to_split': 0.13541550011281167, 'min_child_weight': 1.431441341092622}. Best is trial 22 with value: 2725.1024902499903.


Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[5000]	valid_0's rmse: 2740.67


[I 2024-09-26 10:17:06,711] Trial 26 finished with value: 2740.674035684798 and parameters: {'learning_rate': 0.036842411058338846, 'num_leaves': 1194, 'max_depth': 9, 'min_data_in_leaf': 33, 'feature_fraction': 0.8672616196094547, 'bagging_fraction': 0.6275710433296907, 'lambda_l1': 1.970923858772546, 'lambda_l2': 8.252626919572457, 'min_gain_to_split': 0.0017775909960288105, 'min_child_weight': 2.037427827086125}. Best is trial 22 with value: 2725.1024902499903.


Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[4998]	valid_0's rmse: 2742.35


[I 2024-09-26 10:18:57,254] Trial 27 finished with value: 2742.3542735196943 and parameters: {'learning_rate': 0.02504523405361413, 'num_leaves': 1012, 'max_depth': 10, 'min_data_in_leaf': 15, 'feature_fraction': 0.9940426715042445, 'bagging_fraction': 0.733658865067458, 'lambda_l1': 0.9336983346309582, 'lambda_l2': 7.9906492472006665, 'min_gain_to_split': 0.06833094687037669, 'min_child_weight': 0.026287020793487104}. Best is trial 22 with value: 2725.1024902499903.


Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[4998]	valid_0's rmse: 2736.22


[I 2024-09-26 10:21:35,632] Trial 28 finished with value: 2736.215953760433 and parameters: {'learning_rate': 0.01781140506372878, 'num_leaves': 676, 'max_depth': 13, 'min_data_in_leaf': 37, 'feature_fraction': 0.9032931764855036, 'bagging_fraction': 0.5957893744650885, 'lambda_l1': 3.4431471917350724, 'lambda_l2': 6.864105933603034, 'min_gain_to_split': 0.09310943475057917, 'min_child_weight': 4.190548190460737}. Best is trial 22 with value: 2725.1024902499903.


Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[5000]	valid_0's rmse: 2788.1


[I 2024-09-26 10:22:06,022] Trial 29 finished with value: 2788.103937955437 and parameters: {'learning_rate': 0.03428774108057072, 'num_leaves': 1059, 'max_depth': 7, 'min_data_in_leaf': 53, 'feature_fraction': 0.7701605715402617, 'bagging_fraction': 0.6703326119680071, 'lambda_l1': 4.856842525798516, 'lambda_l2': 9.952080142998154, 'min_gain_to_split': 0.0316325009866814, 'min_child_weight': 2.767631378386426}. Best is trial 22 with value: 2725.1024902499903.


Best trial: {'learning_rate': 0.040232447540238, 'num_leaves': 1035, 'max_depth': 10, 'min_data_in_leaf': 35, 'feature_fraction': 0.8880567898770956, 'bagging_fraction': 0.7109490701551315, 'lambda_l1': 6.156991762243389, 'lambda_l2': 7.71117214126504, 'min_gain_to_split': 0.051274563395863657, 'min_child_weight': 4.021845555499864}


In [13]:
# Train the final LightGBM model with the best hyperparameters
best_params = study.best_trial.params

best_model = lgb.LGBMRegressor(**best_params, n_estimators=10000)

best_model.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    eval_metric='rmse',
  #  early_stopping_rounds=50,
    categorical_feature=cat_feature_indices
)



In [14]:
# Inspect the model's feature names (from the training phase)
model_features = best_model.feature_name_  # Or check best_model.booster_.feature_name() for lightgbm

# Print model's features and submission_set columns for debugging
print("Model features:", model_features)
print("Submission set columns:", submission_set.columns.tolist())

# Ensure the submission_set has the same features as the model
# Add missing columns with default values (e.g., 0 or the mean)
for feature in model_features:
    if feature not in submission_set.columns:
        submission_set[feature] = 0  # You can choose a more appropriate fill value


Model features: ['flown_distance', 'track_variation_DEP_40', 'average_vertical_rate_DEP_40', 'average_airspeed_DEP_40', 'groundspeed_DEP_40', 'wind_distance_DEP_40', 'average_temperature_DEP_40', 'average_temperature_ENR', 'average_humidity_DEP_40', 'average_humidity_ENR', 'specific_energy_DEP_40', 'flown_distance_DEP_40', 'flown_distance_ENR', 'average_altitude_ARR_100', 'average_altitude_DEP_40', 'average_altitude_ENR', 'specific_energy', 'flight_speed', 'speed_per_distance', 'cumulative_avg_speed', 'airspeed_specific_energy_ARR', 'airspeed_specific_energy_DEP', 'airspeed_specific_energy_ENR', 'vertical_rate_airspeed_ratio_ARR', 'vertical_rate_airspeed_ratio_DEP', 'Approach_Speed_knot', 'Wingspan_ft_without_winglets_sharklets', 'Wingspan_ft_with_winglets_sharklets', 'Length_ft', 'Tail_Height_at_OEW_ft', 'Wheelbase_ft', 'Cockpit_to_Main_Gear_ft', 'Main_Gear_Width_ft', 'Parking_Area_ft2', 'Total_IFR_Operations_2021_2022', 'MTOW_kg', 'Latitude_adep', 'Longitude_adep', 'Altitude_adep', '

  submission_set[feature] = 0  # You can choose a more appropriate fill value
  submission_set[feature] = 0  # You can choose a more appropriate fill value
  submission_set[feature] = 0  # You can choose a more appropriate fill value
  submission_set[feature] = 0  # You can choose a more appropriate fill value
  submission_set[feature] = 0  # You can choose a more appropriate fill value
  submission_set[feature] = 0  # You can choose a more appropriate fill value
  submission_set[feature] = 0  # You can choose a more appropriate fill value
  submission_set[feature] = 0  # You can choose a more appropriate fill value
  submission_set[feature] = 0  # You can choose a more appropriate fill value
  submission_set[feature] = 0  # You can choose a more appropriate fill value
  submission_set[feature] = 0  # You can choose a more appropriate fill value
  submission_set[feature] = 0  # You can choose a more appropriate fill value
  submission_set[feature] = 0  # You can choose a more appropria

In [15]:
from sklearn.preprocessing import LabelEncoder

# List of categorical columns that need to be converted to numeric
categorical_columns = ['adep', 'ades', 'aircraft_type', 'wtc', 'airline']

# Initialize LabelEncoder
label_encoders = {}

# Convert each categorical column to numeric using LabelEncoder
for col in categorical_columns:
    label_encoders[col] = LabelEncoder()
    submission_set[col] = label_encoders[col].fit_transform(submission_set[col].astype(str))

# Print to check the converted types
print("Submission set dtypes after label encoding:", submission_set.dtypes)

# Ensure the submission_set has the same features as the model
submission_set_features = submission_set[model_features]

# Predict using the aligned features
submission_set['tow'] = best_model.predict(submission_set_features)

# Inspect the result
print(submission_set.head())


Submission set dtypes after label encoding: flight_id               int64
date                   object
callsign               object
adep                    int64
name_adep              object
                        ...  
ades_geo_cluster_15     int64
ades_geo_cluster_16     int64
ades_geo_cluster_17     int64
ades_geo_cluster_18     int64
ades_geo_cluster_19     int64
Length: 292, dtype: object
   flight_id        date                          callsign  adep  \
0  248753821  2022-01-01  3b3de0f3ad0ee192513995c02f7bf7cf   331   
1  248753822  2022-01-01  e06dd03d4a879ca37d9e18c1bd7cad16    11   
2  248754498  2022-01-01  2d3b1c962c78c4ebeef11bcd51b9e94c   158   
3  248757623  2022-01-01  81564432d3ee97c4bdf4cd8f006753dc    45   
4  248763603  2022-01-01  84be079d7e660db105d91f600b4b3d59    74   

                name_adep country_code_adep  ades        name_ades  \
0  Istanbul Sabiha Gokcen                TR   185             Lyon   
1                Brussels                BE   138 

In [16]:

from datetime import datetime
import pytz

saopaulo_tz = pytz.timezone('America/Sao_Paulo')

timestamp = datetime.now(saopaulo_tz).strftime('%Y%m%d_%H%M%S')


In [17]:

import os

# Define the submissions directory and create it if it doesn't exist
submissions_dir = 'submissions'
os.makedirs(submissions_dir, exist_ok=True)

# Save the submission with a timestamp in the filename
submission_file = os.path.join(submissions_dir, f"submission_{timestamp}.csv")
submission_set.to_csv(submission_file, index=False)