In [1]:
import pandas as pd
import numpy as np

## Loading Data

In [2]:
# Load the dataset after the exploratory data analysis
challenge_set_updated = pd.read_csv("./data/challenge_set_updated_v20.csv")
submission_set_updated = pd.read_csv("./data/submission_set_updated_v20.csv")
submission_set = pd.read_csv("./data/final_submission_set.csv")

## Predictive Model Learning

In [3]:
cat_names = ['callsign',
            'adep', 
            'ades', 
            'aircraft_type', 
            'wtc', 
            'airline',
            'offblock_hour',
            'offblock_minute', 
            'offblock_day_of_week',
            'offblock_weekday_name',
            'offblock_month',
            'offblock_week_of_year', 
            'offblock_season', 
            'arrival_hour',
            'arrival_minute',
            'arrival_season',
            'arrival_weekday_name',
            'is_offblock_weekend',
            'is_offblock_rush_hour',
            'flight_duration_category',                       
            'adep_region', 
            'ades_region', 
            'same_country_flight',
            'same_region_flight',                        
            'flight_direction',
            'is_intercontinental',
            'Manufacturer',
            'Model_FAA',
            'Physical_Class_Engine',
            'FAA_Weight',
            'adep_geo_cluster',
            'ades_geo_cluster']

In [4]:
dataset = pd.concat([challenge_set_updated, submission_set_updated], axis=0)

In [5]:
pd.set_option('display.max_rows', None)

print(dataset.isnull().mean().sort_values(ascending=False))

pd.reset_option('display.max_rows')

specific_energy_1                   0.406441
vlof_tas                            0.406441
sqrd_tas_1                          0.406441
sqrd_vlof_tas                       0.406441
vlof_groundspeed                    0.406441
tas_1                               0.406441
tas_2                               0.372550
specific_energy_2                   0.372550
sqrd_tas_2                          0.372550
tas_3                               0.359277
sqrd_tas_3                          0.359277
specific_energy_3                   0.359277
specific_energy_10                  0.356763
sqrd_tas_10                         0.356763
tas_10                              0.356763
sqrd_tas_9                          0.353705
specific_energy_9                   0.353705
tas_9                               0.353705
tas_4                               0.353320
specific_energy_4                   0.353320
sqrd_tas_4                          0.353320
tas_8                               0.351376
specific_e

In [6]:
# Dropping columns with too many NaNs
threshold = 0.4
df = dataset.dropna(thresh=int((1 - threshold) * len(dataset)), axis=1)

In [7]:
# Imputation of NaNs
columns_with_nan = dataset.isna().any()
for col in dataset.columns[columns_with_nan]:
    dataset.loc[:, col] = dataset.fillna(dataset[col].median())

In [8]:
df = dataset.iloc[0:challenge_set_updated.shape[0], :]

In [9]:
X = df.drop('tow', axis=1)
y = df.tow

In [None]:
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from lightgbm.basic import LightGBMError

# Specify your categorical columns
categorical_cols = [
    'callsign', 'adep', 'ades', 'aircraft_type', 'wtc', 'airline',
    'offblock_season', 'offblock_weekday_name', 'arrival_season', 'arrival_weekday_name',
    'flight_duration_category', 'adep_region', 'ades_region', 'flight_direction',
    'Manufacturer', 'Model_FAA', 'Physical_Class_Engine', 'FAA_Weight'
]

# Convert categorical columns to type 'category'
for col in categorical_cols:
    if col in X.columns:
        X[col] = X[col].astype('category')

# Splitting the dataset
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)



# LightGBM parameters
params = {
    'learning_rate': 0.01,
    'reg_lambda': 0.46415888336127775,
    'reg_alpha': 0.166810053720005,
    'min_child_weight': 4,
    'max_depth': 13,
    'colsample_bytree': 0.6,
    'objective': 'regression',
    'random_state': 42,
    'n_estimators': 50000,
    'metric': 'rmse',
    'n_jobs': -1,
    'device': 'gpu',  # Enable GPU usage
    'subsample': 1.0
}

# Creating LightGBM datasets with free_raw_data set to False
train_data = lgb.Dataset(X_train, label=y_train, 
                         categorical_feature=categorical_cols, 
                         free_raw_data=False)
val_data = lgb.Dataset(X_val, label=y_val, 
                       categorical_feature=categorical_cols, 
                       free_raw_data=False)

# Training the LightGBM model
try:
    model = lgb.train(
        params,
        train_data,
        valid_sets=[val_data],
        valid_names=['valid'],
        callbacks=[
            lgb.early_stopping(stopping_rounds=100),
            lgb.log_evaluation(period=100)  # Logs evaluation every 100 rounds
        ]
    )
except LightGBMError as e:
    # Handle GPU error, switch to CPU
    print("Switching to CPU due to GPU limitation.")
    params['device'] = 'cpu'  # Switch device to CPU
    model = lgb.train(
        params,
        train_data,
        num_boost_round=5000,
        valid_sets=[val_data],
        valid_names=['valid'],
        callbacks=[
            lgb.early_stopping(stopping_rounds=100),
            lgb.log_evaluation(period=100)  # Logs evaluation every 100 rounds
        ]
    )




[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 46211
[LightGBM] [Info] Number of data points in the train set: 295210, number of used features: 216
Switching to CPU due to GPU limitation.


[LightGBM] [Fatal] bin size 6666 cannot run on GPU


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.728449 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 46211
[LightGBM] [Info] Number of data points in the train set: 295210, number of used features: 216
[LightGBM] [Info] Start training from score 79542.054059
Training until validation scores don't improve for 100 rounds
[100]	valid's rmse: 19984.5
[200]	valid's rmse: 8352.76
[300]	valid's rmse: 4685.95
[400]	valid's rmse: 3698.43
[500]	valid's rmse: 3393.35
[600]	valid's rmse: 3249.8
[700]	valid's rmse: 3159.44
[800]	valid's rmse: 3093.29
[900]	valid's rmse: 3038.41
[1000]	valid's rmse: 2992.87
[1100]	valid's rmse: 2953.61
[1200]	valid's rmse: 2921.23
[1300]	valid's rmse: 2892.65
[1400]	valid's rmse: 2867.24
[1500]	valid's rmse: 2845.51
[1600]	valid's rmse: 2826.96
[1700]	valid's rmse: 2809.4
[1800]	valid's rmse: 2794.07
[1900]	valid's rmse: 2779.7
[2000]	valid's rmse: 2766.7
[2100]	valid's rmse: 2

In [None]:
# Save the model to a file (e.g., in LightGBM format)
model.save_model('modelsinterrupted_model.txt')


In [None]:
try:
    # Assuming the training code is running here
    pass
except KeyboardInterrupt:
    print("Training interrupted. You can still use the current model.")

    # Use the partially trained model
    y_pred = model.predict(X_val)  # Example prediction on validation data

    # Save the current model to a file
    model.save_model('interrupted_model.txt')


In [None]:
# Printing the best iteration and RMSE
best_iteration = model.best_iteration
best_rmse = model.best_score['valid']['rmse']
print(f"Best iteration: {best_iteration}")
print(f"Best RMSE: {best_rmse}")

In [None]:
# Continue training the LightGBM model for additional rounds
additional_rounds = 5000  # Specify the number of additional boosting rounds

# Train the model further
model = lgb.train(
    params,
    train_data,
    init_model=model,  # Continue from the existing model
    num_boost_round=additional_rounds,
    valid_sets=[val_data],
    valid_names=['valid'],
    callbacks=[
        lgb.early_stopping(stopping_rounds=100),
        lgb.log_evaluation(period=100)
    ]
)

# Print the updated best iteration and RMSE
best_iteration = model.best_iteration
best_rmse = model.best_score['valid']['rmse']
print(f"Updated best iteration: {best_iteration}")
print(f"Updated best RMSE: {best_rmse}")


In [None]:
model.save_model('models/lightgbm_model_v96.txt')

In [None]:
y_pred = model.predict(X_val)
pd.DataFrame(data={'tow': y_pred}).to_csv('outout_data/lightgbm_val_v19.csv', index=False)

In [None]:
df_test = dataset.iloc[challenge_set_updated.shape[0]:, :]
# df_test = pd.read_csv("./data/submission_set_updated_v18.csv")
df_test.head()

In [13]:
X_test = df_test.drop('tow', axis=1)


In [14]:
# Assuming X_test is your test dataset

# Convert categorical columns to 'category' type with the same categories as in X_train
for col in categorical_cols:
    if col in X_test.columns:
        X_test[col] = pd.Categorical(X_test[col], categories=X_train[col].cat.categories)

# Now you can safely use the model for prediction
y_pred = model.predict(X_test)
y_test = y_pred

In [16]:
dft0 = pd.read_csv('./data/final_submission_set.csv')
dft0['tow'] = y_test
dft0[['flight_id', 'tow']].to_csv('submissions/lightligbm3.csv', index=False)

In [None]:
!mc cp ./submissions/lightligbm3.csv dc24/submissions/team_tiny_rainbow_v112_7ec66710-1eb8-478e-8976-584c090b6373.csv
