In [1]:
import pandas as pd
import numpy as np

## Loading Data

In [2]:
# Load the dataset after the exploratory data analysis
challenge_set_updated = pd.read_csv("./data/challenge_set_updated_v19.csv")
submission_set_updated = pd.read_csv("./data/submission_set_updated_v19.csv")
submission_set = pd.read_csv("./data/final_submission_set.csv")

## Predictive Model Learning

In [3]:
cat_names = ['callsign',
            'adep', 
            'ades', 
            'aircraft_type', 
            'wtc', 
            'airline',
            'offblock_hour',
            'offblock_minute', 
            'offblock_day_of_week',
            'offblock_weekday_name',
            'offblock_month',
            'offblock_week_of_year', 
            'offblock_season', 
            'arrival_hour',
            'arrival_minute',
            'arrival_season',
            'arrival_weekday_name',
            'is_offblock_weekend',
            'is_offblock_rush_hour',
            'flight_duration_category',                       
            'adep_region', 
            'ades_region', 
            'same_country_flight',
            'same_region_flight',                        
            'flight_direction',
            'is_intercontinental',
            'Manufacturer',
            'Model_FAA',
            'Physical_Class_Engine',
            'FAA_Weight',
            'adep_geo_cluster',
            'ades_geo_cluster']

In [4]:
dataset = pd.concat([challenge_set_updated, submission_set_updated], axis=0)

In [5]:
pd.set_option('display.max_rows', None)

print(dataset.isnull().mean().sort_values(ascending=False))

pd.reset_option('display.max_rows')

specific_energy_1                   0.406441
vlof_tas                            0.406441
sqrd_tas_1                          0.406441
sqrd_vlof_tas                       0.406441
vlof_groundspeed                    0.406441
tas_1                               0.406441
tas_2                               0.372550
specific_energy_2                   0.372550
sqrd_tas_2                          0.372550
tas_3                               0.359277
sqrd_tas_3                          0.359277
specific_energy_3                   0.359277
specific_energy_10                  0.356763
sqrd_tas_10                         0.356763
tas_10                              0.356763
sqrd_tas_9                          0.353705
specific_energy_9                   0.353705
tas_9                               0.353705
tas_4                               0.353320
specific_energy_4                   0.353320
sqrd_tas_4                          0.353320
tas_8                               0.351376
specific_e

In [6]:
# Dropping columns with too many NaNs
threshold = 0.4
df = dataset.dropna(thresh=int((1 - threshold) * len(dataset)), axis=1)

In [7]:
# Imputation of NaNs
columns_with_nan = dataset.isna().any()
for col in dataset.columns[columns_with_nan]:
    dataset.loc[:, col] = dataset.fillna(dataset[col].median())

In [8]:
df = dataset.iloc[0:challenge_set_updated.shape[0], :]

In [9]:
X = df.drop('tow', axis=1)
y = df.tow

In [10]:
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from lightgbm.basic import LightGBMError

# Specify your categorical columns
categorical_cols = [
    'callsign', 'adep', 'ades', 'aircraft_type', 'wtc', 'airline',
    'offblock_season', 'offblock_weekday_name', 'arrival_season', 'arrival_weekday_name',
    'flight_duration_category', 'adep_region', 'ades_region', 'flight_direction',
    'Manufacturer', 'Model_FAA', 'Physical_Class_Engine', 'FAA_Weight'
]

# Convert categorical columns to type 'category'
for col in categorical_cols:
    if col in X.columns:
        X[col] = X[col].astype('category')

# Splitting the dataset
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# LightGBM parameters
params = {
    'learning_rate': 0.01, 
    'max_depth': 3, 
    'min_child_weight': 10, 
    'colsample_bytree': 0.485499328628503, 
    'reg_alpha': 0.0003241894462645541, 
    'reg_lambda': 0.8218509198885221, 
    'objective': 'regression',
    'random_state': 42,
    'n_estimators': 50000,
    'metric': 'rmse',
    'n_jobs': -1,
    'device': 'gpu',  # Enable GPU usage
    'subsample': 1.0
}

# Creating LightGBM datasets with free_raw_data set to False
train_data = lgb.Dataset(X_train, label=y_train, 
                         categorical_feature=categorical_cols, 
                         free_raw_data=False)
val_data = lgb.Dataset(X_val, label=y_val, 
                       categorical_feature=categorical_cols, 
                       free_raw_data=False)

# Training the LightGBM model
try:
    model = lgb.train(
        params,
        train_data,
        valid_sets=[val_data],
        valid_names=['valid'],
        callbacks=[
            lgb.early_stopping(stopping_rounds=100),
            lgb.log_evaluation(period=100)  # Logs evaluation every 100 rounds
        ]
    )
except LightGBMError as e:
    # Handle GPU error, switch to CPU
    print("Switching to CPU due to GPU limitation.")
    params['device'] = 'cpu'  # Switch device to CPU
    model = lgb.train(
        params,
        train_data,
        num_boost_round=5000,
        valid_sets=[val_data],
        valid_names=['valid'],
        callbacks=[
            lgb.early_stopping(stopping_rounds=100),
            lgb.log_evaluation(period=100)  # Logs evaluation every 100 rounds
        ]
    )




[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 46211
[LightGBM] [Info] Number of data points in the train set: 295210, number of used features: 216
Switching to CPU due to GPU limitation.


[LightGBM] [Fatal] bin size 6666 cannot run on GPU


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.544972 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 46211
[LightGBM] [Info] Number of data points in the train set: 295210, number of used features: 216
[LightGBM] [Info] Start training from score 79542.054059
Training until validation scores don't improve for 100 rounds
[100]	valid's rmse: 21032.1
[200]	valid's rmse: 9914.67
[300]	valid's rmse: 6296.55
[400]	valid's rmse: 5118.2
[500]	valid's rmse: 4640.96
[600]	valid's rmse: 4368.3
[700]	valid's rmse: 4194.81
[800]	valid's rmse: 4063.47
[900]	valid's rmse: 3963.09
[1000]	valid's rmse: 3876.9
[1100]	valid's rmse: 3802.46
[1200]	valid's rmse: 3733.72
[1300]	valid's rmse: 3675.2
[1400]	valid's rmse: 3627.13
[1500]	valid's rmse: 3582.17
[1600]	valid's rmse: 3542.49
[1700]	valid's rmse: 3509.81
[1800]	valid's rmse: 3482.6
[1900]	valid's rm

In [11]:
# Printing the best iteration and RMSE
best_iteration = model.best_iteration
best_rmse = model.best_score['valid']['rmse']
print(f"Best iteration: {best_iteration}")
print(f"Best RMSE: {best_rmse}")

Best iteration: 49999
Best RMSE: 2531.0264471272317


In [12]:
# Continue training the LightGBM model for additional rounds
additional_rounds = 5000  # Specify the number of additional boosting rounds

# Train the model further
model = lgb.train(
    params,
    train_data,
    init_model=model,  # Continue from the existing model
    num_boost_round=additional_rounds,
    valid_sets=[val_data],
    valid_names=['valid'],
    callbacks=[
        lgb.early_stopping(stopping_rounds=100),
        lgb.log_evaluation(period=100)
    ]
)

# Print the updated best iteration and RMSE
best_iteration = model.best_iteration
best_rmse = model.best_score['valid']['rmse']
print(f"Updated best iteration: {best_iteration}")
print(f"Updated best RMSE: {best_rmse}")




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.541126 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 46211
[LightGBM] [Info] Number of data points in the train set: 295210, number of used features: 216
[50000]	valid's rmse: 2531.02
Training until validation scores don't improve for 100 rounds
[50100]	valid's rmse: 2530.72
[50200]	valid's rmse: 2530.41
[50300]	valid's rmse: 2530.04
[50400]	valid's rmse: 2529.81
[50500]	valid's rmse: 2529.62
[50600]	valid's rmse: 2529.39
[50700]	valid's rmse: 2529.14
[50800]	valid's rmse: 2528.85
[50900]	valid's rmse: 2528.51
[51000]	valid's rmse: 2528.25
[51100]	valid's rmse: 2528.03
[51200]	valid's rmse: 2527.86
[51300]	valid's rmse: 2527.56
[51400]	valid's rmse: 2527.26
[51500]	valid's rmse: 2527.07
[51600]	valid's rmse: 2526.8
[51700]	valid's rmse: 2526.45
[51800]	valid's rmse: 2526.28
[51900]	valid

In [13]:
model.save_model('models/lightgbm_model_v96.txt')

<lightgbm.basic.Booster at 0x76635cd5e980>

In [15]:
y_pred = model.predict(X_val)
pd.DataFrame(data={'tow': y_pred}).to_csv('output_data/lightgbm_val_v20.csv', index=False)

In [16]:
df_test = dataset.iloc[challenge_set_updated.shape[0]:, :]
# df_test = pd.read_csv("./data/submission_set_updated_v18.csv")
df_test.head()

Unnamed: 0,callsign,adep,ades,aircraft_type,wtc,airline,taxiout_time,flown_distance,track_variation_ARR_100,track_variation_DEP_40,...,Latitude_ades,Longitude_ades,Altitude_ades,actual_distance,altitude_difference,bearing,elevation_gradient,adep_geo_cluster,ades_geo_cluster,tow
0,3b3de0f3ad0ee192513995c02f7bf7cf,LTFJ,LFLL,B738,M,6351ec1b849adacc0cbb3b1313d8d39b,15.0,1122,1.668989,1.079187,...,45.726,5.091,251,2022.915548,-61,293.477205,-0.030154,11,17,63852.0
1,e06dd03d4a879ca37d9e18c1bd7cad16,EBBR,KJFK,A333,H,bdeeef3a675587d530de70a25d7118d2,15.0,3205,1.766098,1.147364,...,40.64,-73.779,4,5886.43037,-53,291.395141,-0.009004,6,1,63852.0
2,2d3b1c962c78c4ebeef11bcd51b9e94c,KMIA,EGLL,B77W,H,5543e4dc327359ffaf5b9c0e6faaf0e1,10.0,3965,6.253309,1.292737,...,51.477,-0.461,25,7108.920003,22,43.036806,0.003095,12,13,63852.0
3,35f7721f68bf85128195547ae38b0f04,EBBR,LEAL,B738,M,f53c55b5cf0cbb3be755bf50df6fa52d,9.0,802,1.775667,0.905718,...,38.282,-0.558,44,1458.405355,-13,197.753476,-0.008914,6,19,63852.0
4,eb56918bee9bc5204624186b9bcc4391,LSZH,LFPG,BCS3,M,2d5def0a5a844b343ba1b7cc9cb28fa9,11.0,292,1.200644,1.204058,...,49.013,2.55,120,476.291487,-312,293.398537,-0.655061,2,6,63852.0


In [17]:
X_test = df_test.drop('tow', axis=1)


In [18]:
# Assuming X_test is your test dataset

# Convert categorical columns to 'category' type with the same categories as in X_train
for col in categorical_cols:
    if col in X_test.columns:
        X_test[col] = pd.Categorical(X_test[col], categories=X_train[col].cat.categories)

# Now you can safely use the model for prediction
y_pred = model.predict(X_test)
y_test = y_pred

In [19]:
dft0 = pd.read_csv('./data/final_submission_set.csv')
dft0['tow'] = y_test
dft0[['flight_id', 'tow']].to_csv('submissions/lightligbm20.csv', index=False)

In [20]:
!mc cp ./submissions/lightligbm20.csv dc24/submissions/team_tiny_rainbow_v124_7ec66710-1eb8-478e-8976-584c090b6373.csv


...igbm20.csv: 4.25 MiB / 4.25 MiB ┃▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓┃ 1.21 MiB/s 3s[0;22m[0m[m[32;1m[m[32;1m[m[32;1m[m[32;1m[m[32;1m[m[32;1m[m[32;1m[m[32;1m[m[32;1m[m[32;1m[m[32;1m[m[32;1m[m[32;1m[m[32;1m[m[32;1m[m[32;1m[m[32;1m[m[32;1m[m[32;1m[m[32;1m[m[32;1m[m[32;1m[m[32;1m[m[32;1m[m[32;1m[m[32;1m[m[32;1m[m[32;1m

In [21]:
print("k")

k
