In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import r2_score, mean_squared_error
from tqdm import tqdm
from datetime import datetime
import pytz
import json
import joblib 
import os
import optuna

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the dataset after the exploratory data analysis
challenge_set_updated = pd.read_csv("./data/challenge_set_updated_v16.csv")
submission_set = pd.read_csv("./data/submission_set.csv")
submission_set_updated = pd.read_csv("./data/submission_set_updated_v16.csv")

# If necessary change this part to test the model before the training process
df = challenge_set_updated.iloc[:,:]
# df = challenge_set_updated.sample(frac=0.001)

# Separating features and target variable
X = df.drop('tow', axis=1)
y = df['tow']

n_jobs = os.cpu_count() // 2

In [3]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Assuming df is your DataFrame
categorical_columns = ['adep', 'ades', 'aircraft_type', 'wtc', 'airline', 'offblock_season',
                       'flight_duration_category', 'adep_region', 'ades_region',
                       'flight_direction', 'Manufacturer', 'Model_FAA',
                       'Physical_Class_Engine', 'FAA_Weight']

# Encoding using LabelEncoder
for col in categorical_columns:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col].astype(str))  # Ensure data is string type before encoding


In [4]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [5]:
# Global variable to hold the trained model pipeline
global_model = None

# Define the objective function
def objective(trial):
    global global_model

    # Hyperparameters to be tuned by Optuna
    param = {
        'objective': 'regression',
        'metric': 'rmse',
        'boosting_type': 'gbdt',  # You can leave this as 'gbdt' for GPU acceleration
        'verbosity': -1,
        'num_leaves': trial.suggest_int('num_leaves', 20, 100),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'min_data_per_group': trial.suggest_int('min_data_per_group', 50, 150),
        'cat_smooth': trial.suggest_float('cat_smooth', 5, 20),
        'device': 'gpu'  # This enables GPU training
    }

    # Convert DataFrame to LightGBM Dataset, specifying categorical features
    train_data = lgb.Dataset(X_train, label=y_train, categorical_feature=categorical_columns, free_raw_data=False)
    valid_data = lgb.Dataset(X_val, label=y_val, categorical_feature=categorical_columns, free_raw_data=False)

    # Train the model with early stopping
    model = lgb.train(
        param,
        train_data,
        num_boost_round=1000,
        valid_sets=[valid_data],
        callbacks=[lgb.early_stopping(stopping_rounds=50)]
    )

    # Predictions on the validation set
    preds = model.predict(X_val)
    rmse = mean_squared_error(y_val, preds, squared=False)  # Calculate RMSE

    # Save the trained model pipeline globally
    global_model = model

    return rmse


In [6]:
# Create a study object and specify the direction of the optimization
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=3)  # Specify the number of trials

# Best trial results
print("Best trial:")
print(study.best_trial.params)
print("Best RMSE:", study.best_value)

[I 2024-10-16 13:17:38,959] A new study created in memory with name: no-name-0fbebc6f-fb3a-4482-b792-a02f31441557


Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[997]	valid_0's rmse: 2606.61


[I 2024-10-16 14:41:09,624] Trial 0 finished with value: 2606.6094999350394 and parameters: {'num_leaves': 26, 'learning_rate': 0.18763891802032998, 'min_child_samples': 72, 'min_data_per_group': 139, 'cat_smooth': 7.573846384937663}. Best is trial 0 with value: 2606.6094999350394.


Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[998]	valid_0's rmse: 2564.21


[I 2024-10-16 18:07:13,804] Trial 1 finished with value: 2564.214247256467 and parameters: {'num_leaves': 74, 'learning_rate': 0.13409243208928043, 'min_child_samples': 5, 'min_data_per_group': 85, 'cat_smooth': 9.750015618641832}. Best is trial 1 with value: 2564.214247256467.


Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's rmse: 2546.49


[I 2024-10-16 22:18:26,438] Trial 2 finished with value: 2546.4921435437986 and parameters: {'num_leaves': 96, 'learning_rate': 0.06256188395638097, 'min_child_samples': 63, 'min_data_per_group': 110, 'cat_smooth': 8.757471384663825}. Best is trial 2 with value: 2546.4921435437986.


Best trial:
{'num_leaves': 96, 'learning_rate': 0.06256188395638097, 'min_child_samples': 63, 'min_data_per_group': 110, 'cat_smooth': 8.757471384663825}
Best RMSE: 2546.4921435437986


In [7]:

best_params = study.best_trial.params

# Train the final model on the full training set using the best parameters
best_params.update({
    'metric': 'rmse',
    'verbosity': -1,
    'boosting_type': 'gbdt',
    'objective': 'regression'
})
bst = lgb.train(
    best_params,
    lgb.Dataset(X_train, label=y_train, categorical_feature=categorical_columns, free_raw_data=False),
    num_boost_round=1000
)

In [8]:
from sklearn.preprocessing import LabelEncoder

# List of columns to encode
categorical_columns = ['adep', 'ades', 'aircraft_type', 'wtc', 'airline', 'offblock_season', 'flight_duration_category', 
                       'adep_region', 'ades_region', 'flight_direction', 'Manufacturer', 'Model_FAA', 
                       'Physical_Class_Engine', 'FAA_Weight']

# Initialize the LabelEncoder
label_encoders = {}


# Apply Label Encoding to each categorical column100aa
for column in categorical_columns:
    label_encoders[column] = LabelEncoder()
    submission_set_updated[column] = label_encoders[column].fit_transform(submission_set_updated[column])


# Now you should be able to run LightGBM predictions
submission_set_features = submission_set_updated.iloc[:, :-1]
submission_set['tow'] = bst.predict(submission_set_features)

print(submission_set)

        flight_id        date                          callsign  adep  \
0       248753821  2022-01-01  3b3de0f3ad0ee192513995c02f7bf7cf  LTFJ   
1       248753822  2022-01-01  e06dd03d4a879ca37d9e18c1bd7cad16  EBBR   
2       248754498  2022-01-01  2d3b1c962c78c4ebeef11bcd51b9e94c  KMIA   
3       248757623  2022-01-01  81564432d3ee97c4bdf4cd8f006753dc  EGCN   
4       248763603  2022-01-01  84be079d7e660db105d91f600b4b3d59  EIDW   
...           ...         ...                               ...   ...   
105954  258066302  2022-12-31  2d3b4446c4d05a25196a9d52cab936fb  LTFJ   
105955  258068609  2022-12-31  253fd692ed441fac523081471c067772  LOWW   
105956  258068876  2022-12-31  c9fca302ca2e28acab0eb0bb1b46f11b  LTFM   
105957  258064675  2022-12-31  00f96ad0e382476649574ba044c764fc  EHAM   
105958  258058370  2022-12-31  5f0c222c7f7ceff3fbe75c854cce74c9  UBBB   

                     name_adep country_code_adep  ades          name_ades  \
0       Istanbul Sabiha Gokcen                

In [9]:

import os
from datetime import datetime

# Define the submissions directory and create it if it doesn't exist
submissions_dir = 'submissions'
os.makedirs(submissions_dir, exist_ok=True)

# Define a timestamp for the file name
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

# Save the submission with a timestamp in the filename
submission_file = os.path.join(submissions_dir, f"submission_{timestamp}.csv")

# Assuming submission_set is a DataFrame, save it to CSV
submission_set.to_csv(submission_file, index=False)

print(f"Submission saved to {submission_file}")


Submission saved to submissions/submission_20241017_005132.csv
