In [1]:
import pandas as pd
import numpy as np

## Loading Data

In [2]:
# Load the dataset after the exploratory data analysis
challenge_set_updated = pd.read_csv("./data/challenge_set_updated_v19.csv")
submission_set_updated = pd.read_csv("./data/submission_set_updated_v19.csv")
submission_set = pd.read_csv("./data/final_submission_set.csv")

## Predictive Model Learning

In [3]:
cat_names = ['callsign',
            'adep', 
            'ades', 
            'aircraft_type', 
            'wtc', 
            'airline',
            'offblock_hour',
            'offblock_minute', 
            'offblock_day_of_week',
            'offblock_weekday_name',
            'offblock_month',
            'offblock_week_of_year', 
            'offblock_season', 
            'arrival_hour',
            'arrival_minute',
            'arrival_season',
            'arrival_weekday_name',
            'is_offblock_weekend',
            'is_offblock_rush_hour',
            'flight_duration_category',                       
            'adep_region', 
            'ades_region', 
            'same_country_flight',
            'same_region_flight',                        
            'flight_direction',
            'is_intercontinental',
            'Manufacturer',
            'Model_FAA',
            'Physical_Class_Engine',
            'FAA_Weight',
            'adep_geo_cluster',
            'ades_geo_cluster']

In [4]:
dataset = pd.concat([challenge_set_updated, submission_set_updated], axis=0)

In [5]:
pd.set_option('display.max_rows', None)

print(dataset.isnull().mean().sort_values(ascending=False))

pd.reset_option('display.max_rows')

vlof_tas                            0.406441
sqrd_tas_1                          0.406441
tas_1                               0.406441
specific_energy_1                   0.406441
sqrd_vlof_tas                       0.406441
vlof_groundspeed                    0.406441
tas_2                               0.372550
specific_energy_2                   0.372550
sqrd_tas_2                          0.372550
sqrd_tas_3                          0.359277
specific_energy_3                   0.359277
tas_3                               0.359277
sqrd_tas_10                         0.356763
specific_energy_10                  0.356763
tas_10                              0.356763
sqrd_tas_9                          0.353705
tas_9                               0.353705
specific_energy_9                   0.353705
tas_4                               0.353320
specific_energy_4                   0.353320
sqrd_tas_4                          0.353320
tas_8                               0.351376
sqrd_tas_8

In [6]:
# Dropping columns with too many NaNs
threshold = 0.4
df = dataset.dropna(thresh=int((1 - threshold) * len(dataset)), axis=1)

In [7]:
# Imputation of NaNs
columns_with_nan = dataset.isna().any()
for col in dataset.columns[columns_with_nan]:
    dataset.loc[:, col] = dataset.fillna(dataset[col].median())

In [8]:
df = dataset.iloc[0:challenge_set_updated.shape[0], :]

In [9]:
X = df.drop('tow', axis=1)
y = df.tow

In [10]:
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from lightgbm.basic import LightGBMError

# Specify your categorical columns
categorical_cols = [
    'callsign', 'adep', 'ades', 'aircraft_type', 'wtc', 'airline',
    'offblock_season', 'offblock_weekday_name', 'arrival_season', 'arrival_weekday_name',
    'flight_duration_category', 'adep_region', 'ades_region', 'flight_direction',
    'Manufacturer', 'Model_FAA', 'Physical_Class_Engine', 'FAA_Weight'
]

# Convert categorical columns to type 'category'
for col in categorical_cols:
    if col in X.columns:
        X[col] = X[col].astype('category')

# Splitting the dataset
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# LightGBM parameters
params = {
    'learning_rate': 0.004965293773840623, 
    'max_depth': 3, 
    'min_child_weight': 10, 
    'colsample_bytree': 0.485499328628503, 
    'reg_alpha': 0.0003241894462645541, 
    'reg_lambda': 0.8218509198885221, 
    'objective': 'regression',
    'random_state': 42,
    'n_estimators': 50000,
    'metric': 'rmse',
    'n_jobs': -1,
    'device': 'gpu',  # Enable GPU usage
    'subsample': 1.0
}

# Creating LightGBM datasets with free_raw_data set to False
train_data = lgb.Dataset(X_train, label=y_train, 
                         categorical_feature=categorical_cols, 
                         free_raw_data=False)
val_data = lgb.Dataset(X_val, label=y_val, 
                       categorical_feature=categorical_cols, 
                       free_raw_data=False)

# Training the LightGBM model
try:
    model = lgb.train(
        params,
        train_data,
        valid_sets=[val_data],
        valid_names=['valid'],
        callbacks=[
            lgb.early_stopping(stopping_rounds=100),
            lgb.log_evaluation(period=100)  # Logs evaluation every 100 rounds
        ]
    )
except LightGBMError as e:
    # Handle GPU error, switch to CPU
    print("Switching to CPU due to GPU limitation.")
    params['device'] = 'cpu'  # Switch device to CPU
    model = lgb.train(
        params,
        train_data,
        num_boost_round=5000,
        valid_sets=[val_data],
        valid_names=['valid'],
        callbacks=[
            lgb.early_stopping(stopping_rounds=100),
            lgb.log_evaluation(period=100)  # Logs evaluation every 100 rounds
        ]
    )




[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 45955
[LightGBM] [Info] Number of data points in the train set: 295210, number of used features: 215
Switching to CPU due to GPU limitation.


[LightGBM] [Fatal] bin size 6666 cannot run on GPU


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.379403 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 45955
[LightGBM] [Info] Number of data points in the train set: 295210, number of used features: 215
[LightGBM] [Info] Start training from score 79542.054059
Training until validation scores don't improve for 100 rounds
[100]	valid's rmse: 33081.1
[200]	valid's rmse: 21203.3
[300]	valid's rmse: 14111.7
[400]	valid's rmse: 10012.6
[500]	valid's rmse: 7674.58
[600]	valid's rmse: 6330.36
[700]	valid's rmse: 5570.35
[800]	valid's rmse: 5127.68
[900]	valid's rmse: 4837.31
[1000]	valid's rmse: 4643.34
[1100]	valid's rmse: 4488.37
[1200]	valid's rmse: 4363.17
[1300]	valid's rmse: 4265.07
[1400]	valid's rmse: 4186.38
[1500]	valid's rmse: 4119.2
[1600]	valid's rmse: 4060.8
[1700]	valid's rmse: 4008.04
[1800]	valid's rmse: 3960.56
[1900]	valid's

In [11]:
model.save_model('models/lightgbm_model_v96.txt')

<lightgbm.basic.Booster at 0x749bd3e47a30>

In [13]:
# Printing the best iteration and RMSE
best_iteration = model.best_iteration
best_rmse = model.best_score['valid']['rmse']
print(f"Best iteration: {best_iteration}")
print(f"Best RMSE: {best_rmse}")

Best iteration: 49999
Best RMSE: 2676.7052237585035


In [14]:
# Continue training the LightGBM model for additional rounds
additional_rounds = 1000  # Specify the number of additional boosting rounds

# Train the model further
model = lgb.train(
    params,
    train_data,
    init_model=model,  # Continue from the existing model
    num_boost_round=additional_rounds,
    valid_sets=[val_data],
    valid_names=['valid'],
    callbacks=[
        lgb.early_stopping(stopping_rounds=100),
        lgb.log_evaluation(period=100)
    ]
)

# Print the updated best iteration and RMSE
best_iteration = model.best_iteration
best_rmse = model.best_score['valid']['rmse']
print(f"Updated best iteration: {best_iteration}")
print(f"Updated best RMSE: {best_rmse}")




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.397461 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 45955
[LightGBM] [Info] Number of data points in the train set: 295210, number of used features: 215
[50000]	valid's rmse: 2676.7
Training until validation scores don't improve for 100 rounds
[50100]	valid's rmse: 2676.33
[50200]	valid's rmse: 2675.94
[50300]	valid's rmse: 2675.48
[50400]	valid's rmse: 2675.1
[50500]	valid's rmse: 2674.74
[50600]	valid's rmse: 2674.33
[50700]	valid's rmse: 2673.96
[50800]	valid's rmse: 2673.45
[50900]	valid's rmse: 2673.06
[51000]	valid's rmse: 2672.73
[51100]	valid's rmse: 2672.31
[51200]	valid's rmse: 2671.93
[51300]	valid's rmse: 2671.31
[51400]	valid's rmse: 2670.9
[51500]	valid's rmse: 2670.48
[51600]	valid's rmse: 2670.09
[51700]	valid's rmse: 2669.52
[51800]	valid's rmse: 2668.99
[51900]	valid's

KeyboardInterrupt: 

In [15]:
# Now you can try saving the model again.
try:
    model.save_model('saved_model.txt')
    print("Model saved successfully.")
except NameError:
    print("Model variable is not defined, check where the model is being instantiated.")
except Exception as e:
    print("Failed to save the model:", str(e))

Model saved successfully.


In [16]:
# Now you can try saving the model again.
try:
    model.save_model('saved_model.txt')
    print("Model saved successfully.")
except NameError:
    print("Model variable is not defined, check where the model is being instantiated.")
except Exception as e:
    print("Failed to save the model:", str(e))

Model saved successfully.


In [12]:
y_pred = model.predict(X_val)
pd.DataFrame(data={'tow': y_pred}).to_csv('output_data/lightgbm_val_v19.csv', index=False)

OSError: Cannot save file into a non-existent directory: 'outout_data'

In [17]:
best_iteration = model.best_iteration
model.save_model('saved_model.txt', num_iteration=best_iteration)
print(f"Model saved successfully at iteration {best_iteration}.")

Model saved successfully at iteration 49999.


In [None]:
from sklearn.metrics import mean_absolute_percentage_error
mean_absolute_percentage_error(y_true, y_pred)

In [None]:
df_test = dataset.iloc[challenge_set_updated.shape[0]:, :]
# df_test = pd.read_csv("./data/submission_set_updated_v18.csv")
df_test.head()

In [13]:
X_test = df_test.drop('tow', axis=1)


In [14]:
# Assuming X_test is your test dataset

# Convert categorical columns to 'category' type with the same categories as in X_train
for col in categorical_cols:
    if col in X_test.columns:
        X_test[col] = pd.Categorical(X_test[col], categories=X_train[col].cat.categories)

# Now you can safely use the model for prediction
y_pred = model.predict(X_test)
y_test = y_pred

In [16]:
dft0 = pd.read_csv('./data/final_submission_set.csv')
dft0['tow'] = y_test
dft0[['flight_id', 'tow']].to_csv('submissions/lightligbm3.csv', index=False)

In [None]:
!mc cp ./submissions/lightligbm3.csv dc24/submissions/team_tiny_rainbow_v112_7ec66710-1eb8-478e-8976-584c090b6373.csv
