In [1]:
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
import optuna

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the dataset after the exploratory data analysis
challenge_set_updated = pd.read_csv("./data/challenge_set_updated_v9_median.csv")
submission_set = pd.read_csv("./data/submission_set.csv")
submission_set_updated = pd.read_csv("./data/submission_set_updated_v9_median.csv")



In [3]:
# Separating features and target variable
X = challenge_set_updated.drop('tow', axis=1)
y = challenge_set_updated['tow']

In [4]:
# Dropping unnecessary columns
to_drop = [
    'offblock_to_arrival_duration', 'normalized_taxi_ratio', 'MALW_kg', 'wind_distance_ARR_100', 
    'average_airspeed_ARR_100', 'track_variation_ARR_100', 'is_offblock_weekend', 'Num_Engines', 
    'flown_distance_ARR_100', 'average_humidity_ARR_100', 'average_temperature_ARR_100', 
    'arrival_minute', 'track_variation_ENR', 'groundspeed_ARR_100', 'average_vertical_rate_ARR_100', 
    'taxiout_time', 'offblock_minute', 
    'average_airspeed_ENR', 'specific_energy_ENR', 'taxi_ratio',  
    'specific_energy_ARR_100', 'is_offblock_rush_hour', 'wind_distance_ENR', 'groundspeed_ENR', 
    'altitude_difference', 'average_vertical_rate_ENR', 'bearing', 'Altitude_ades'
]


X.drop(to_drop, axis=1, inplace=True)

In [5]:
cat_names = [
    'adep', 'ades', 'aircraft_type', 'wtc', 'airline', 'offblock_hour', 'offblock_minute', 
    'offblock_day_of_week', 'offblock_month', 'offblock_week_of_year', 'offblock_season', 
    'arrival_hour', 'arrival_minute', 'is_offblock_weekend', 'is_offblock_rush_hour', 
    'flight_duration_category', 'adep_region', 'ades_region', 'same_country_flight', 
    'same_region_flight', 'flight_direction', 'is_intercontinental', 'Manufacturer', 
    'Model_FAA', 'Physical_Class_Engine', 'FAA_Weight', 'adep_geo_cluster', 'ades_geo_cluster'
]

# Filtering categorical features that are still present after dropping columns
selected_cat_names = [x for x in cat_names if x not in to_drop]

In [6]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Handle categorical columns by Label Encoding
label_encoder = LabelEncoder()

# Specify the columns that are categorical
categorical_cols = ['adep', 'ades', 'aircraft_type', 'wtc', 'airline', 'offblock_season', 
                    'flight_duration_category', 'adep_region', 'ades_region', 'flight_direction', 
                    'Manufacturer', 'Model_FAA', 'Physical_Class_Engine', 'FAA_Weight']

# Convert these categorical columns into numerical form using Label Encoding
for col in categorical_cols:
    X[col] = label_encoder.fit_transform(X[col])

# Now, you can proceed with training LightGBM with this processed data


In [7]:
# Splitting the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Identify categorical feature indices for LightGBM
cat_feature_indices = [X_train.columns.get_loc(col) for col in selected_cat_names if col in X_train.columns]

# Define the objective function for hyperparameter tuning with Optuna
def objective(trial):
    params = {
        'objective': 'regression',
        'metric': 'rmse',
        'verbosity': -1,
        'boosting_type': 'gbdt',
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1),
        'num_leaves': trial.suggest_int('num_leaves', 20, 3000),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 1, 100),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.4, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.4, 1.0),
        'lambda_l1': trial.suggest_float('lambda_l1', 0.0, 10.0),
        'lambda_l2': trial.suggest_float('lambda_l2', 0.0, 10.0),
        'min_gain_to_split': trial.suggest_float('min_gain_to_split', 0.0, 0.2),
        'min_child_weight': trial.suggest_float('min_child_weight', 0.0, 10.0),
        'device_type': 'gpu',  # Use GPU for training

    }

    model = lgb.LGBMRegressor(**params, n_estimators=5000)

    model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        eval_metric='rmse',
        callbacks=[lgb.early_stopping(stopping_rounds=50)],
        categorical_feature=cat_feature_indices
    )
    
    val_pred = model.predict(X_val)
    rmse = mean_squared_error(y_val, val_pred, squared=False)
    
    return rmse

In [8]:
# Create the Optuna study and start optimization
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=30)

# Display the best hyperparameters found
print(f"Best trial: {study.best_trial.params}")


[I 2024-09-25 11:17:04,340] A new study created in memory with name: no-name-1d30e969-a543-46a8-a9f1-6a201312e5c8


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[2606]	valid_0's rmse: 2708.85


[I 2024-09-25 11:17:20,147] Trial 0 finished with value: 2708.8505588056096 and parameters: {'learning_rate': 0.06824013703023082, 'num_leaves': 568, 'max_depth': 7, 'min_data_in_leaf': 26, 'feature_fraction': 0.7645742084353451, 'bagging_fraction': 0.762039990749708, 'lambda_l1': 4.0172032540830855, 'lambda_l2': 3.1558944189389737, 'min_gain_to_split': 0.1505361144667464, 'min_child_weight': 5.673169545674453}. Best is trial 0 with value: 2708.8505588056096.


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[3188]	valid_0's rmse: 2710.54


[I 2024-09-25 11:17:41,121] Trial 1 finished with value: 2710.539239300816 and parameters: {'learning_rate': 0.09482995200078194, 'num_leaves': 2931, 'max_depth': 7, 'min_data_in_leaf': 53, 'feature_fraction': 0.40083426098264774, 'bagging_fraction': 0.9495950196575347, 'lambda_l1': 7.796807558927977, 'lambda_l2': 9.05108001278872, 'min_gain_to_split': 0.17753972404427532, 'min_child_weight': 9.285691009651972}. Best is trial 0 with value: 2708.8505588056096.


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[2797]	valid_0's rmse: 2723.22


[I 2024-09-25 11:18:02,670] Trial 2 finished with value: 2723.216718314271 and parameters: {'learning_rate': 0.058245521635027546, 'num_leaves': 1255, 'max_depth': 7, 'min_data_in_leaf': 14, 'feature_fraction': 0.9341295608471015, 'bagging_fraction': 0.6974294892915671, 'lambda_l1': 5.669402065443439, 'lambda_l2': 9.278583241518115, 'min_gain_to_split': 0.04138080497925545, 'min_child_weight': 8.306677092263289}. Best is trial 0 with value: 2708.8505588056096.


Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[5000]	valid_0's rmse: 3072.69


[I 2024-09-25 11:18:11,576] Trial 3 finished with value: 3072.6885173831247 and parameters: {'learning_rate': 0.027267006964760122, 'num_leaves': 2223, 'max_depth': 3, 'min_data_in_leaf': 5, 'feature_fraction': 0.5513444704607725, 'bagging_fraction': 0.4921562579754231, 'lambda_l1': 1.8047041775534112, 'lambda_l2': 4.558295966875978, 'min_gain_to_split': 0.13235097146573044, 'min_child_weight': 5.846507109068744}. Best is trial 0 with value: 2708.8505588056096.


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1550]	valid_0's rmse: 2723


[I 2024-09-25 11:18:28,593] Trial 4 finished with value: 2723.0008646272 and parameters: {'learning_rate': 0.08536505437362495, 'num_leaves': 825, 'max_depth': 8, 'min_data_in_leaf': 32, 'feature_fraction': 0.9080277365974212, 'bagging_fraction': 0.5677706461363927, 'lambda_l1': 0.5354447010126018, 'lambda_l2': 3.0166039620308602, 'min_gain_to_split': 0.09600392329395449, 'min_child_weight': 3.0205568356102654}. Best is trial 0 with value: 2708.8505588056096.


Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[4992]	valid_0's rmse: 2693.6


[I 2024-09-25 11:19:05,363] Trial 5 finished with value: 2693.598551324118 and parameters: {'learning_rate': 0.0348869695013039, 'num_leaves': 113, 'max_depth': 10, 'min_data_in_leaf': 59, 'feature_fraction': 0.7884848137382204, 'bagging_fraction': 0.8724082867814367, 'lambda_l1': 1.87255114459271, 'lambda_l2': 5.345579585792784, 'min_gain_to_split': 0.06198318801538272, 'min_child_weight': 0.8869979318310584}. Best is trial 5 with value: 2693.598551324118.


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[2005]	valid_0's rmse: 2685.82


[I 2024-09-25 11:19:58,957] Trial 6 finished with value: 2685.8223051739274 and parameters: {'learning_rate': 0.0501424941382079, 'num_leaves': 405, 'max_depth': 14, 'min_data_in_leaf': 22, 'feature_fraction': 0.6426953935489712, 'bagging_fraction': 0.8849347168267577, 'lambda_l1': 4.000809799575446, 'lambda_l2': 5.529708620372766, 'min_gain_to_split': 0.1450262758410169, 'min_child_weight': 0.5737650878136513}. Best is trial 6 with value: 2685.8223051739274.


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1088]	valid_0's rmse: 2708.49


[I 2024-09-25 11:20:42,357] Trial 7 finished with value: 2708.494183005438 and parameters: {'learning_rate': 0.07036203438720712, 'num_leaves': 929, 'max_depth': 12, 'min_data_in_leaf': 24, 'feature_fraction': 0.8203154537548751, 'bagging_fraction': 0.7985710148762004, 'lambda_l1': 8.57800813126953, 'lambda_l2': 5.868547766831989, 'min_gain_to_split': 0.005484258911301332, 'min_child_weight': 0.40625478591800324}. Best is trial 6 with value: 2685.8223051739274.


Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[5000]	valid_0's rmse: 3063.57


[I 2024-09-25 11:20:52,206] Trial 8 finished with value: 3063.5666052943925 and parameters: {'learning_rate': 0.029755320322083265, 'num_leaves': 1833, 'max_depth': 3, 'min_data_in_leaf': 73, 'feature_fraction': 0.4515728280518596, 'bagging_fraction': 0.49191976942670224, 'lambda_l1': 3.2745486135130406, 'lambda_l2': 1.579029403252653, 'min_gain_to_split': 0.18320419714681396, 'min_child_weight': 2.718900450371952}. Best is trial 6 with value: 2685.8223051739274.


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[3448]	valid_0's rmse: 2717.13


[I 2024-09-25 11:21:07,885] Trial 9 finished with value: 2717.1341765097354 and parameters: {'learning_rate': 0.07975893412963032, 'num_leaves': 2171, 'max_depth': 6, 'min_data_in_leaf': 90, 'feature_fraction': 0.860371729148602, 'bagging_fraction': 0.6206172561969556, 'lambda_l1': 4.197763332427162, 'lambda_l2': 4.460738074745342, 'min_gain_to_split': 0.06362415319455912, 'min_child_weight': 0.7398607322643425}. Best is trial 6 with value: 2685.8223051739274.


Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[5000]	valid_0's rmse: 2759.3


[I 2024-09-25 11:21:37,475] Trial 10 finished with value: 2759.3023451336176 and parameters: {'learning_rate': 0.012825035736641698, 'num_leaves': 68, 'max_depth': 15, 'min_data_in_leaf': 38, 'feature_fraction': 0.637691696809199, 'bagging_fraction': 0.9668213368159375, 'lambda_l1': 5.738069161874885, 'lambda_l2': 7.197066079536035, 'min_gain_to_split': 0.12402337728870036, 'min_child_weight': 3.471216984494418}. Best is trial 6 with value: 2685.8223051739274.


Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[5000]	valid_0's rmse: 2706.79


[I 2024-09-25 11:21:58,901] Trial 11 finished with value: 2706.790106108739 and parameters: {'learning_rate': 0.04587511221381087, 'num_leaves': 40, 'max_depth': 11, 'min_data_in_leaf': 58, 'feature_fraction': 0.6798792538549379, 'bagging_fraction': 0.8524332943736322, 'lambda_l1': 2.378663267536495, 'lambda_l2': 6.807951166042289, 'min_gain_to_split': 0.08236644549012555, 'min_child_weight': 0.11709000400983616}. Best is trial 6 with value: 2685.8223051739274.


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1712]	valid_0's rmse: 2671.15


[I 2024-09-25 11:22:46,225] Trial 12 finished with value: 2671.1521251914496 and parameters: {'learning_rate': 0.043844284290387614, 'num_leaves': 518, 'max_depth': 15, 'min_data_in_leaf': 69, 'feature_fraction': 0.5953264125299325, 'bagging_fraction': 0.8753546638417105, 'lambda_l1': 0.0919917970052857, 'lambda_l2': 7.626896981305079, 'min_gain_to_split': 0.04340255522610508, 'min_child_weight': 1.5637526018119017}. Best is trial 12 with value: 2671.1521251914496.


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1442]	valid_0's rmse: 2671.88


[I 2024-09-25 11:23:30,792] Trial 13 finished with value: 2671.8785050972056 and parameters: {'learning_rate': 0.04828880664129436, 'num_leaves': 596, 'max_depth': 15, 'min_data_in_leaf': 100, 'feature_fraction': 0.5785949191747294, 'bagging_fraction': 0.999898760479947, 'lambda_l1': 0.646815986943782, 'lambda_l2': 7.423033662584855, 'min_gain_to_split': 0.0008390258666972697, 'min_child_weight': 1.7338396201849635}. Best is trial 12 with value: 2671.1521251914496.


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1697]	valid_0's rmse: 2668.22


[I 2024-09-25 11:24:19,938] Trial 14 finished with value: 2668.2205701330645 and parameters: {'learning_rate': 0.04463961594305718, 'num_leaves': 1270, 'max_depth': 13, 'min_data_in_leaf': 96, 'feature_fraction': 0.5650234551485831, 'bagging_fraction': 0.9866897623006018, 'lambda_l1': 0.07871650874003659, 'lambda_l2': 8.0685173528918, 'min_gain_to_split': 0.009725576230935618, 'min_child_weight': 2.0084774249330533}. Best is trial 14 with value: 2668.2205701330645.


Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[5000]	valid_0's rmse: 2648.95


[I 2024-09-25 11:27:00,677] Trial 15 finished with value: 2648.9534808293793 and parameters: {'learning_rate': 0.010201035547956504, 'num_leaves': 1243, 'max_depth': 13, 'min_data_in_leaf': 78, 'feature_fraction': 0.5200300191154457, 'bagging_fraction': 0.9065294189412629, 'lambda_l1': 0.0016504718350366515, 'lambda_l2': 8.345756878455427, 'min_gain_to_split': 0.031741825759914924, 'min_child_weight': 4.318153436301136}. Best is trial 15 with value: 2648.9534808293793.


Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[5000]	valid_0's rmse: 2649.09


[I 2024-09-25 11:29:29,688] Trial 16 finished with value: 2649.085641985118 and parameters: {'learning_rate': 0.010338744901694757, 'num_leaves': 1468, 'max_depth': 13, 'min_data_in_leaf': 86, 'feature_fraction': 0.5030955059585589, 'bagging_fraction': 0.7262443940387098, 'lambda_l1': 1.4380583857096587, 'lambda_l2': 9.612759340749548, 'min_gain_to_split': 0.023103721911607172, 'min_child_weight': 4.481907307576972}. Best is trial 15 with value: 2648.9534808293793.


Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[4998]	valid_0's rmse: 2652.82


[I 2024-09-25 11:31:40,758] Trial 17 finished with value: 2652.818560046912 and parameters: {'learning_rate': 0.011134361298782544, 'num_leaves': 1669, 'max_depth': 12, 'min_data_in_leaf': 83, 'feature_fraction': 0.4985626441436933, 'bagging_fraction': 0.6949797900685445, 'lambda_l1': 1.3261402966559335, 'lambda_l2': 9.613269242037383, 'min_gain_to_split': 0.0376216813698424, 'min_child_weight': 4.3997251172646195}. Best is trial 15 with value: 2648.9534808293793.


Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[4995]	valid_0's rmse: 2653.21


[I 2024-09-25 11:33:02,562] Trial 18 finished with value: 2653.2084389078645 and parameters: {'learning_rate': 0.0198000188800777, 'num_leaves': 1352, 'max_depth': 10, 'min_data_in_leaf': 78, 'feature_fraction': 0.49336238370872976, 'bagging_fraction': 0.7696954366526322, 'lambda_l1': 2.8512766244401604, 'lambda_l2': 8.520393211214694, 'min_gain_to_split': 0.027723030354542252, 'min_child_weight': 6.926767236460147}. Best is trial 15 with value: 2648.9534808293793.


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[3249]	valid_0's rmse: 2669.71


[I 2024-09-25 11:35:39,909] Trial 19 finished with value: 2669.714261910574 and parameters: {'learning_rate': 0.020631113935262527, 'num_leaves': 2009, 'max_depth': 13, 'min_data_in_leaf': 43, 'feature_fraction': 0.7263355030246372, 'bagging_fraction': 0.6172339360836181, 'lambda_l1': 1.354640022146738, 'lambda_l2': 9.964474451799262, 'min_gain_to_split': 0.06939854131843542, 'min_child_weight': 4.226041534754213}. Best is trial 15 with value: 2648.9534808293793.


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[4771]	valid_0's rmse: 2707.08


[I 2024-09-25 11:36:38,036] Trial 20 finished with value: 2707.084186827637 and parameters: {'learning_rate': 0.02006048091219813, 'num_leaves': 2654, 'max_depth': 9, 'min_data_in_leaf': 65, 'feature_fraction': 0.995754483241825, 'bagging_fraction': 0.8109776016250146, 'lambda_l1': 6.875997138894256, 'lambda_l2': 8.445316596324055, 'min_gain_to_split': 0.02099423721230105, 'min_child_weight': 6.648433328746529}. Best is trial 15 with value: 2648.9534808293793.


Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[5000]	valid_0's rmse: 2657.28


[I 2024-09-25 11:38:32,316] Trial 21 finished with value: 2657.284946382836 and parameters: {'learning_rate': 0.010158580638921152, 'num_leaves': 1691, 'max_depth': 12, 'min_data_in_leaf': 83, 'feature_fraction': 0.5014183757410755, 'bagging_fraction': 0.6874419965632301, 'lambda_l1': 0.9389929889143102, 'lambda_l2': 9.960935337187388, 'min_gain_to_split': 0.04558631152813275, 'min_child_weight': 4.550561517440494}. Best is trial 15 with value: 2648.9534808293793.


Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[5000]	valid_0's rmse: 2648.52


[I 2024-09-25 11:40:43,097] Trial 22 finished with value: 2648.5227099065073 and parameters: {'learning_rate': 0.01009418569687866, 'num_leaves': 1559, 'max_depth': 13, 'min_data_in_leaf': 88, 'feature_fraction': 0.42589539111138475, 'bagging_fraction': 0.4055190455436449, 'lambda_l1': 1.4484843497691309, 'lambda_l2': 9.12420444567433, 'min_gain_to_split': 0.028092504472655155, 'min_child_weight': 4.075077560931486}. Best is trial 22 with value: 2648.5227099065073.


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[2661]	valid_0's rmse: 2657.23


[I 2024-09-25 11:42:00,631] Trial 23 finished with value: 2657.225709537356 and parameters: {'learning_rate': 0.034951246601194724, 'num_leaves': 1028, 'max_depth': 13, 'min_data_in_leaf': 91, 'feature_fraction': 0.40169238660341433, 'bagging_fraction': 0.41645448710830335, 'lambda_l1': 2.422386690256837, 'lambda_l2': 0.16904607875853195, 'min_gain_to_split': 0.02088171282471115, 'min_child_weight': 5.288674236418073}. Best is trial 22 with value: 2648.5227099065073.


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[4260]	valid_0's rmse: 2643.36


[I 2024-09-25 11:44:08,834] Trial 24 finished with value: 2643.3637905333035 and parameters: {'learning_rate': 0.017780312906297557, 'num_leaves': 1474, 'max_depth': 14, 'min_data_in_leaf': 86, 'feature_fraction': 0.45292978398075145, 'bagging_fraction': 0.4201674433320353, 'lambda_l1': 3.209550539996979, 'lambda_l2': 6.5289134117884915, 'min_gain_to_split': 0.10540894611143609, 'min_child_weight': 3.7742275788805566}. Best is trial 24 with value: 2643.3637905333035.


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[3640]	valid_0's rmse: 2645.13


[I 2024-09-25 11:46:19,847] Trial 25 finished with value: 2645.12860039945 and parameters: {'learning_rate': 0.02500612523389948, 'num_leaves': 1141, 'max_depth': 14, 'min_data_in_leaf': 76, 'feature_fraction': 0.4513754801797484, 'bagging_fraction': 0.4081765575637231, 'lambda_l1': 3.2679573590481863, 'lambda_l2': 6.543375678791744, 'min_gain_to_split': 0.10054708274054301, 'min_child_weight': 3.5507495606120845}. Best is trial 24 with value: 2643.3637905333035.


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[2829]	valid_0's rmse: 2650.26


[I 2024-09-25 11:47:50,172] Trial 26 finished with value: 2650.264950417102 and parameters: {'learning_rate': 0.023960017623298236, 'num_leaves': 1069, 'max_depth': 14, 'min_data_in_leaf': 74, 'feature_fraction': 0.4267647406566788, 'bagging_fraction': 0.40339263994603375, 'lambda_l1': 9.866749529264808, 'lambda_l2': 6.575738248360427, 'min_gain_to_split': 0.10589712606837266, 'min_child_weight': 3.443433920910425}. Best is trial 24 with value: 2643.3637905333035.


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1791]	valid_0's rmse: 2663.64


[I 2024-09-25 11:48:41,275] Trial 27 finished with value: 2663.6398505917564 and parameters: {'learning_rate': 0.03646046215259334, 'num_leaves': 1559, 'max_depth': 14, 'min_data_in_leaf': 94, 'feature_fraction': 0.45315648022408306, 'bagging_fraction': 0.4663188477014925, 'lambda_l1': 3.3280312170418007, 'lambda_l2': 6.26617065181266, 'min_gain_to_split': 0.10245020812697038, 'min_child_weight': 6.607504124930977}. Best is trial 24 with value: 2643.3637905333035.


Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[4971]	valid_0's rmse: 2646.51


[I 2024-09-25 11:50:26,788] Trial 28 finished with value: 2646.509352133502 and parameters: {'learning_rate': 0.01712847927294276, 'num_leaves': 1979, 'max_depth': 11, 'min_data_in_leaf': 65, 'feature_fraction': 0.44853227301909065, 'bagging_fraction': 0.5383732940213594, 'lambda_l1': 4.6172226581130555, 'lambda_l2': 4.228239666885414, 'min_gain_to_split': 0.11765746086511997, 'min_child_weight': 2.4459990397622433}. Best is trial 24 with value: 2643.3637905333035.


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1918]	valid_0's rmse: 2694.13


[I 2024-09-25 11:51:05,670] Trial 29 finished with value: 2694.1336328860834 and parameters: {'learning_rate': 0.061797949861180757, 'num_leaves': 2444, 'max_depth': 11, 'min_data_in_leaf': 64, 'feature_fraction': 0.4638016088341487, 'bagging_fraction': 0.5409719765162476, 'lambda_l1': 4.817668945407842, 'lambda_l2': 3.7934097891791323, 'min_gain_to_split': 0.16450621717049108, 'min_child_weight': 2.3368013250940995}. Best is trial 24 with value: 2643.3637905333035.


Best trial: {'learning_rate': 0.017780312906297557, 'num_leaves': 1474, 'max_depth': 14, 'min_data_in_leaf': 86, 'feature_fraction': 0.45292978398075145, 'bagging_fraction': 0.4201674433320353, 'lambda_l1': 3.209550539996979, 'lambda_l2': 6.5289134117884915, 'min_gain_to_split': 0.10540894611143609, 'min_child_weight': 3.7742275788805566}


In [9]:
# Train the final LightGBM model with the best hyperparameters
best_params = study.best_trial.params

best_model = lgb.LGBMRegressor(**best_params, n_estimators=10000)

best_model.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    eval_metric='rmse',
  #  early_stopping_rounds=50,
    categorical_feature=cat_feature_indices
)



In [None]:
# Inspect the model's feature names (from the training phase)
model_features = best_model.feature_name_  # Or check best_model.booster_.feature_name() for lightgbm

# Print model's features and submission_set columns for debugging
print("Model features:", model_features)
print("Submission set columns:", submission_set.columns.tolist())

# Ensure the submission_set has the same features as the model
# Add missing columns with default values (e.g., 0 or the mean)
for feature in model_features:
    if feature not in submission_set.columns:
        submission_set[feature] = 0  # You can choose a more appropriate fill value


In [12]:
from sklearn.preprocessing import LabelEncoder

# List of categorical columns that need to be converted to numeric
categorical_columns = ['adep', 'ades', 'aircraft_type', 'wtc', 'airline']

# Initialize LabelEncoder
label_encoders = {}

# Convert each categorical column to numeric using LabelEncoder
for col in categorical_columns:
    label_encoders[col] = LabelEncoder()
    submission_set[col] = label_encoders[col].fit_transform(submission_set[col].astype(str))

# Print to check the converted types
print("Submission set dtypes after label encoding:", submission_set.dtypes)

# Ensure the submission_set has the same features as the model
submission_set_features = submission_set[model_features]

# Predict using the aligned features
submission_set['tow'] = best_model.predict(submission_set_features)

# Inspect the result
print(submission_set.head())


Submission set dtypes after label encoding: flight_id              int64
date                  object
callsign              object
adep                   int64
name_adep             object
                       ...  
Latitude_ades          int64
Longitude_ades         int64
elevation_gradient     int64
adep_geo_cluster       int64
ades_geo_cluster       int64
Length: 64, dtype: object
   flight_id        date                          callsign  adep  \
0  248753821  2022-01-01  3b3de0f3ad0ee192513995c02f7bf7cf   331   
1  248753822  2022-01-01  e06dd03d4a879ca37d9e18c1bd7cad16    11   
2  248754498  2022-01-01  2d3b1c962c78c4ebeef11bcd51b9e94c   158   
3  248757623  2022-01-01  81564432d3ee97c4bdf4cd8f006753dc    45   
4  248763603  2022-01-01  84be079d7e660db105d91f600b4b3d59    74   

                name_adep country_code_adep  ades        name_ades  \
0  Istanbul Sabiha Gokcen                TR   185             Lyon   
1                Brussels                BE   138     New York

In [13]:

from datetime import datetime
import pytz

saopaulo_tz = pytz.timezone('America/Sao_Paulo')

timestamp = datetime.now(saopaulo_tz).strftime('%Y%m%d_%H%M%S')


In [14]:

import os

# Define the submissions directory and create it if it doesn't exist
submissions_dir = 'submissions'
os.makedirs(submissions_dir, exist_ok=True)

# Save the submission with a timestamp in the filename
submission_file = os.path.join(submissions_dir, f"submission_{timestamp}.csv")
submission_set.to_csv(submission_file, index=False)