In [1]:
import pandas as pd
import numpy as np

## Loading Data

In [2]:
# Load the dataset after the exploratory data analysis
challenge_set_updated = pd.read_csv("./data/challenge_set_updated_v19.csv")
submission_set_updated = pd.read_csv("./data/submission_set_updated_v19.csv")
submission_set = pd.read_csv("./data/final_submission_set.csv")

## Predictive Model Learning

In [3]:
cat_names = ['callsign',
            'adep', 
            'ades', 
            'aircraft_type', 
            'wtc', 
            'airline',
            'offblock_hour',
            'offblock_minute', 
            'offblock_day_of_week',
            'offblock_weekday_name',
            'offblock_month',
            'offblock_week_of_year', 
            'offblock_season', 
            'arrival_hour',
            'arrival_minute',
            'arrival_season',
            'arrival_weekday_name',
            'is_offblock_weekend',
            'is_offblock_rush_hour',
            'flight_duration_category',                       
            'adep_region', 
            'ades_region', 
            'same_country_flight',
            'same_region_flight',                        
            'flight_direction',
            'is_intercontinental',
            'Manufacturer',
            'Model_FAA',
            'Physical_Class_Engine',
            'FAA_Weight',
            'adep_geo_cluster',
            'ades_geo_cluster']

In [4]:
dataset = pd.concat([challenge_set_updated, submission_set_updated], axis=0)

In [5]:
pd.set_option('display.max_rows', None)

print(dataset.isnull().mean().sort_values(ascending=False))

pd.reset_option('display.max_rows')

vlof_tas                            0.406441
sqrd_tas_1                          0.406441
tas_1                               0.406441
specific_energy_1                   0.406441
sqrd_vlof_tas                       0.406441
vlof_groundspeed                    0.406441
tas_2                               0.372550
specific_energy_2                   0.372550
sqrd_tas_2                          0.372550
sqrd_tas_3                          0.359277
specific_energy_3                   0.359277
tas_3                               0.359277
sqrd_tas_10                         0.356763
specific_energy_10                  0.356763
tas_10                              0.356763
sqrd_tas_9                          0.353705
tas_9                               0.353705
specific_energy_9                   0.353705
tas_4                               0.353320
specific_energy_4                   0.353320
sqrd_tas_4                          0.353320
tas_8                               0.351376
sqrd_tas_8

In [6]:
# Dropping columns with too many NaNs
threshold = 0.4
df = dataset.dropna(thresh=int((1 - threshold) * len(dataset)), axis=1)

In [7]:
# Imputation of NaNs
columns_with_nan = dataset.isna().any()
for col in dataset.columns[columns_with_nan]:
    dataset.loc[:, col] = dataset.fillna(dataset[col].median())

In [8]:
df = dataset.iloc[0:challenge_set_updated.shape[0], :]

In [9]:
X = df.drop('tow', axis=1)
y = df.tow

In [11]:
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from lightgbm.basic import LightGBMError

# Specify your categorical columns
categorical_cols = [
    'callsign', 'adep', 'ades', 'aircraft_type', 'wtc', 'airline',
    'offblock_season', 'offblock_weekday_name', 'arrival_season', 'arrival_weekday_name',
    'flight_duration_category', 'adep_region', 'ades_region', 'flight_direction',
    'Manufacturer', 'Model_FAA', 'Physical_Class_Engine', 'FAA_Weight'
]

# Convert categorical columns to type 'category'
for col in categorical_cols:
    if col in X.columns:
        X[col] = X[col].astype('category')

# Splitting the dataset
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# LightGBM parameters
params = {
    'learning_rate': 0.01,
    'reg_lambda': 0.46415888336127775,
    'reg_alpha': 0.166810053720005,
    'min_child_weight': 4,
    'max_depth': 13,
    'colsample_bytree': 0.6,
    'objective': 'regression',
    'random_state': 42,
    'n_estimators': 50000,
    'metric': 'rmse',
    'n_jobs': -1,
    'device': 'gpu',  # Enable GPU usage
    'subsample': 1.0
}

# Creating LightGBM datasets with free_raw_data set to False
train_data = lgb.Dataset(X_train, label=y_train, 
                         categorical_feature=categorical_cols, 
                         free_raw_data=False)
val_data = lgb.Dataset(X_val, label=y_val, 
                       categorical_feature=categorical_cols, 
                       free_raw_data=False)

# Training the LightGBM model
try:
    model = lgb.train(
        params,
        train_data,
        valid_sets=[val_data],
        valid_names=['valid'],
        callbacks=[
            lgb.early_stopping(stopping_rounds=20),
            lgb.log_evaluation(period=100)  # Logs evaluation every 100 rounds
        ]
    )
except LightGBMError as e:
    # Handle GPU error, switch to CPU
    print("Switching to CPU due to GPU limitation.")
    params['device'] = 'cpu'  # Switch device to CPU
    model = lgb.train(
        params,
        train_data,
        num_boost_round=5000,
        valid_sets=[val_data],
        valid_names=['valid'],
        callbacks=[
            lgb.early_stopping(stopping_rounds=20),
            lgb.log_evaluation(period=100)  # Logs evaluation every 100 rounds
        ]
    )




[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 45955
[LightGBM] [Info] Number of data points in the train set: 295210, number of used features: 215
Switching to CPU due to GPU limitation.


[LightGBM] [Fatal] bin size 6666 cannot run on GPU


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.146944 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 45955
[LightGBM] [Info] Number of data points in the train set: 295210, number of used features: 215
[LightGBM] [Info] Start training from score 79542.054059
Training until validation scores don't improve for 20 rounds
[100]	valid's rmse: 19987.2
[200]	valid's rmse: 8358.87
[300]	valid's rmse: 4699.27
[400]	valid's rmse: 3712.39
[500]	valid's rmse: 3406.51
[600]	valid's rmse: 3262.95
[700]	valid's rmse: 3174.95
[800]	valid's rmse: 3108.16
[900]	valid's rmse: 3055.84
[1000]	valid's rmse: 3010.78
[1100]	valid's rmse: 2973.59
[1200]	valid's rmse: 2940.91
[1300]	valid's rmse: 2914.23
[1400]	valid's rmse: 2890.43
[1500]	valid's rmse: 2869.75
[1600]	valid's rmse: 2852.09
[1700]	valid's rmse: 2835.1
[1800]	valid's rmse: 2819.59
[1900]	valid's

In [18]:
from sklearn.feature_selection import SelectFromModel
# Using SelectFromModel for feature selection
selector = SelectFromModel(model, max_features=80, threshold=-np.inf, prefit=True)
X_selected = selector.transform(X_train)

# Get the selected feature names
selected_features = X_train.columns[selector.get_support()]

# Print the selected features
print(f"Selected features: {selected_features.tolist()}")



TypeError: <lightgbm.basic.Booster object at 0x7ea0170b3850> is not an estimator instance.

In [12]:
df_test = dataset.iloc[challenge_set_updated.shape[0]:, :]
# df_test = pd.read_csv("./data/submission_set_updated_v18.csv")
df_test.head()

Unnamed: 0,callsign,adep,ades,aircraft_type,wtc,airline,taxiout_time,flown_distance,track_variation_ARR_100,track_variation_DEP_40,...,Latitude_ades,Longitude_ades,Altitude_ades,actual_distance,altitude_difference,bearing,elevation_gradient,adep_geo_cluster,ades_geo_cluster,tow
0,3b3de0f3ad0ee192513995c02f7bf7cf,LTFJ,LFLL,B738,M,6351ec1b849adacc0cbb3b1313d8d39b,15.0,1122,1.668989,1.079187,...,45.726,5.091,251,2022.915548,-61,293.477205,-0.030154,11,17,63852.0
1,e06dd03d4a879ca37d9e18c1bd7cad16,EBBR,KJFK,A333,H,bdeeef3a675587d530de70a25d7118d2,15.0,3205,1.766098,1.147364,...,40.64,-73.779,4,5886.43037,-53,291.395141,-0.009004,6,1,63852.0
2,2d3b1c962c78c4ebeef11bcd51b9e94c,KMIA,EGLL,B77W,H,5543e4dc327359ffaf5b9c0e6faaf0e1,10.0,3965,6.253309,1.292737,...,51.477,-0.461,25,7108.920003,22,43.036806,0.003095,12,13,63852.0
3,35f7721f68bf85128195547ae38b0f04,EBBR,LEAL,B738,M,f53c55b5cf0cbb3be755bf50df6fa52d,9.0,802,1.775667,0.905718,...,38.282,-0.558,44,1458.405355,-13,197.753476,-0.008914,6,19,63852.0
4,eb56918bee9bc5204624186b9bcc4391,LSZH,LFPG,BCS3,M,2d5def0a5a844b343ba1b7cc9cb28fa9,11.0,292,1.200644,1.204058,...,49.013,2.55,120,476.291487,-312,293.398537,-0.655061,2,6,63852.0


In [None]:
df_test = dataset.iloc[challenge_set_updated.shape[0]:, :]
# df_test = pd.read_csv("./data/submission_set_updated_v18.csv")
df_test.head()

Unnamed: 0,callsign,adep,ades,aircraft_type,wtc,airline,taxiout_time,flown_distance,track_variation_ARR_100,track_variation_DEP_40,...,Latitude_ades,Longitude_ades,Altitude_ades,actual_distance,altitude_difference,bearing,elevation_gradient,adep_geo_cluster,ades_geo_cluster,tow
0,3b3de0f3ad0ee192513995c02f7bf7cf,LTFJ,LFLL,B738,M,6351ec1b849adacc0cbb3b1313d8d39b,15.0,1122,1.668989,1.079187,...,45.726,5.091,251,2022.915548,-61,293.477205,-0.030154,11,17,63852.0
1,e06dd03d4a879ca37d9e18c1bd7cad16,EBBR,KJFK,A333,H,bdeeef3a675587d530de70a25d7118d2,15.0,3205,1.766098,1.147364,...,40.64,-73.779,4,5886.43037,-53,291.395141,-0.009004,6,1,63852.0
2,2d3b1c962c78c4ebeef11bcd51b9e94c,KMIA,EGLL,B77W,H,5543e4dc327359ffaf5b9c0e6faaf0e1,10.0,3965,6.253309,1.292737,...,51.477,-0.461,25,7108.920003,22,43.036806,0.003095,12,13,63852.0
3,35f7721f68bf85128195547ae38b0f04,EBBR,LEAL,B738,M,f53c55b5cf0cbb3be755bf50df6fa52d,9.0,802,1.775667,0.905718,...,38.282,-0.558,44,1458.405355,-13,197.753476,-0.008914,6,19,63852.0
4,eb56918bee9bc5204624186b9bcc4391,LSZH,LFPG,BCS3,M,2d5def0a5a844b343ba1b7cc9cb28fa9,11.0,292,1.200644,1.204058,...,49.013,2.55,120,476.291487,-312,293.398537,-0.655061,2,6,63852.0


In [13]:
X_test = df_test.drop('tow', axis=1)


In [14]:
# Assuming X_test is your test dataset

# Convert categorical columns to 'category' type with the same categories as in X_train
for col in categorical_cols:
    if col in X_test.columns:
        X_test[col] = pd.Categorical(X_test[col], categories=X_train[col].cat.categories)

# Now you can safely use the model for prediction
y_pred = model.predict(X_test)
y_test = y_pred

In [16]:
dft0 = pd.read_csv('./data/final_submission_set.csv')
dft0['tow'] = y_test
dft0[['flight_id', 'tow']].to_csv('submissions/lightligbm3.csv', index=False)

In [17]:
!mc cp ./submissions/lightligbm3.csv dc24/submissions/team_tiny_rainbow_v112_7ec66710-1eb8-478e-8976-584c090b6373.csv


...ligbm3.csv: 4.25 MiB / 4.25 MiB ┃▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓┃ 1.37 MiB/s 3s[0;22m[0m[m[32;1m[m[32;1m[m[32;1m[m[32;1m[m[32;1m[m[32;1m[m[32;1m[m[32;1m[m[32;1m[m[32;1m[m[32;1m[m[32;1m[m[32;1m[m[32;1m[m[32;1m[m[32;1m[m[32;1m[m[32;1m[m[32;1m[m[32;1m[m[32;1m[m[32;1m[m[32;1m[m[32;1m[m[32;1m