In [1]:
import optuna
from lightgbm import LGBMRegressor, early_stopping as lgb_early_stopping
from xgboost import XGBRegressor, callback as xgb_callback
from catboost import CatBoostRegressor
from sklearn.ensemble import VotingRegressor
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import numpy as np 
import pandas as pd


In [3]:
# Load the dataset after the exploratory data analysis
challenge_set_updated = pd.read_csv("./data/challenge_set_updated_v13.csv")
submission_set = pd.read_csv("./data/submission_set.csv")
submission_set_updated = pd.read_csv("./data/submission_set_updated_v13.csv")



In [4]:
# If necessary change this part to test the model before the training process
df = challenge_set_updated.iloc[:,:]
# df = challenge_set_updated.sample(frac=0.001)

# Separating features and target variable
X = df.drop('tow', axis=1)
y = df['tow']



In [5]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Handle categorical columns by Label Encoding
label_encoder = LabelEncoder()

# Specify the columns that are categorical
categorical_cols = ['adep', 'ades', 'aircraft_type', 'wtc', 'airline', 'offblock_season', 
                    'flight_duration_category', 'adep_region', 'ades_region', 'flight_direction', 
                    'Manufacturer', 'Model_FAA', 'Physical_Class_Engine', 'FAA_Weight']

# Convert these categorical columns into numerical form using Label Encoding
for col in categorical_cols:
    X[col] = label_encoder.fit_transform(X[col])

# Now, you can proceed with training LightGBM with this processed data


In [8]:
# Global variable to hold the trained model pipeline
global_model_pipeline = None

# Define the objective function
def objective(trial):
    global global_model_pipeline
    
    # Sample hyperparameters for LightGBM
    lgb_params = {
        'n_estimators': trial.suggest_int('lgb_n_estimators', 50, 1000),
        'learning_rate': trial.suggest_float('lgb_learning_rate', 0.001, 0.5),
        'max_depth': trial.suggest_int('lgb_max_depth', 3, 15),
        'num_leaves': trial.suggest_int('lgb_num_leaves', 31, 1024),
        'reg_alpha': trial.suggest_float('lgb_reg_alpha', 0, 10),
        'reg_lambda': trial.suggest_float('lgb_reg_lambda', 0, 10),
    }

    # Sample hyperparameters for XGBoost
    xgb_params = {
        'n_estimators': trial.suggest_int('xgb_n_estimators', 50, 1000),
        'learning_rate': trial.suggest_float('xgb_learning_rate', 0.001, 0.5),
        'max_depth': trial.suggest_int('xgb_max_depth', 3, 15),
        'subsample': trial.suggest_float('xgb_subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('xgb_colsample_bytree', 0.5, 1.0),
        'reg_alpha': trial.suggest_float('xgb_reg_alpha', 0, 10),
        'reg_lambda': trial.suggest_float('xgb_reg_lambda', 0, 10),
    }

    # Sample hyperparameters for CatBoost
    cat_params = {
        'n_estimators': trial.suggest_int('cat_n_estimators', 50, 1000),
        'learning_rate': trial.suggest_float('cat_learning_rate', 0.001, 0.5),
        'depth': trial.suggest_int('cat_depth', 3, 15),
        'l2_leaf_reg': trial.suggest_float('cat_l2_leaf_reg', 1, 10),
    }

    # Initialize the models with sampled hyperparameters
    lgb_model = LGBMRegressor(**lgb_params)
    xgb_model = XGBRegressor(**xgb_params, objective='reg:squarederror')
    cat_model = CatBoostRegressor(**cat_params, verbose=0)

    # Ensemble the models using VotingRegressor
    ensemble_model = VotingRegressor(
        estimators=[
            ('lgb', lgb_model),
            ('xgb', xgb_model),
            ('cat', cat_model)
        ]
    )

    # Create a pipeline with data scaling and the ensemble model
    model_pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('ensemble', ensemble_model)
    ])

    # Cross-validation for better evaluation
    kf = KFold(n_splits=5, shuffle=True, random_state=42)

    # Metrics
    rmse_list = []
    r2_list = []
    mae_list = []

    for train_index, valid_index in kf.split(X):
        # Use iloc to select by position
        X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
        y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

        # Train the ensemble model with early stopping for each model
        lgb_model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)],
                      callbacks=[lgb_early_stopping(stopping_rounds=50)])
        
        xgb_model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], 
                      verbose=False)
        
        cat_model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], early_stopping_rounds=50)

        # Fit the ensemble
        model_pipeline.fit(X_train, y_train)
        
        # Make predictions
        y_pred = model_pipeline.predict(X_valid)
        
        # Calculate metrics for this fold
        rmse = mean_squared_error(y_valid, y_pred, squared=False)
        r2 = r2_score(y_valid, y_pred)
        mae = mean_absolute_error(y_valid, y_pred)

        rmse_list.append(rmse)
        r2_list.append(r2)
        mae_list.append(mae)

    # Compute average metrics
    avg_rmse = sum(rmse_list) / len(rmse_list)
    avg_r2 = sum(r2_list) / len(r2_list)
    avg_mae = sum(mae_list) / len(mae_list)

    # Save the trained model pipeline globally
    global_model_pipeline = model_pipeline

    return avg_rmse  # Optimize RMSE (could also return R², MAE for report

In [7]:
# Create an Optuna study object and optimize
study = optuna.create_study(direction='minimize')  # 'minimize' for minimizing RMSE
study.optimize(objective, n_trials=100)

# Print the best trial and its hyperparameters
print("Best trial:")
trial = study.best_trial
print(f"  Value (RMSE): {trial.value}")
print(f"  Params: {trial.params}")


[I 2024-10-14 09:38:21,417] A new study created in memory with name: no-name-eff0c396-8ac9-48b9-9268-8f13cabd4536


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.016150 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 16324
[LightGBM] [Info] Number of data points in the train set: 295210, number of used features: 95
[LightGBM] [Info] Start training from score 79542.054059
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[218]	valid_0's l2: 7.27544e+06
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.016930 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 16368
[LightGBM] [Info] Number of data points in the train set: 295210, number of used features: 95
[LightGBM] [Info] Start training from score 79542.054059




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.015232 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 16326
[LightGBM] [Info] Number of data points in the train set: 295210, number of used features: 95
[LightGBM] [Info] Start training from score 79416.998150
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[218]	valid_0's l2: 7.39616e+06
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.015615 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 16361
[LightGBM] [Info] Number of data points in the train set: 295210, number of used features: 95
[LightGBM] [Info] Start training from score 79416.998150




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.035078 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 16311
[LightGBM] [Info] Number of data points in the train set: 295210, number of used features: 95
[LightGBM] [Info] Start training from score 79467.021768
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[217]	valid_0's l2: 7.46558e+06
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.017519 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 16356
[LightGBM] [Info] Number of data points in the train set: 295210, number of used features: 95
[LightGBM] [Info] Start training from score 79467.021768




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.017032 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 16315
[LightGBM] [Info] Number of data points in the train set: 295211, number of used features: 95
[LightGBM] [Info] Start training from score 79452.042408
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[218]	valid_0's l2: 7.27811e+06
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.034702 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 16364
[LightGBM] [Info] Number of data points in the train set: 295211, number of used features: 95
[LightGBM] [Info] Start training from score 79452.042408




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.017944 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 16305
[LightGBM] [Info] Number of data points in the train set: 295211, number of used features: 95
[LightGBM] [Info] Start training from score 79533.169576
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[218]	valid_0's l2: 7.67092e+06
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.015174 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 16365
[LightGBM] [Info] Number of data points in the train set: 295211, number of used features: 95
[LightGBM] [Info] Start training from score 79533.169576


[I 2024-10-14 11:34:12,872] Trial 0 finished with value: 2625.6178862215193 and parameters: {'lgb_n_estimators': 218, 'lgb_learning_rate': 0.2701164252171986, 'lgb_max_depth': 14, 'lgb_num_leaves': 180, 'lgb_reg_alpha': 2.351694146949541, 'lgb_reg_lambda': 2.0897355089392446, 'xgb_n_estimators': 812, 'xgb_learning_rate': 0.22164565163407032, 'xgb_max_depth': 12, 'xgb_subsample': 0.604730686265889, 'xgb_colsample_bytree': 0.9777398111491673, 'xgb_reg_alpha': 5.902510171875793, 'xgb_reg_lambda': 3.0381182193261838, 'cat_n_estimators': 333, 'cat_learning_rate': 0.04278782569786227, 'cat_depth': 13, 'cat_l2_leaf_reg': 7.390329486159146}. Best is trial 0 with value: 2625.6178862215193.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.015062 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 16324
[LightGBM] [Info] Number of data points in the train set: 295210, number of used features: 95
[LightGBM] [Info] Start training from score 79542.054059
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[58]	valid_0's l2: 7.87082e+06
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.015975 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 16368
[LightGBM] [Info] Number of data points in the train set: 295210, number of used features: 95
[LightGBM] [Info] Start training from score 79542.054059




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.015076 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 16326
[LightGBM] [Info] Number of data points in the train set: 295210, number of used features: 95
[LightGBM] [Info] Start training from score 79416.998150
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[93]	valid_0's l2: 8.24779e+06
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.015555 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 16361
[LightGBM] [Info] Number of data points in the train set: 295210, number of used features: 95
[LightGBM] [Info] Start training from score 79416.998150




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.014921 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 16311
[LightGBM] [Info] Number of data points in the train set: 295210, number of used features: 95
[LightGBM] [Info] Start training from score 79467.021768
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[80]	valid_0's l2: 8.08758e+06
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.017490 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 16356
[LightGBM] [Info] Number of data points in the train set: 295210, number of used features: 95
[LightGBM] [Info] Start training from score 79467.021768




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.015234 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 16315
[LightGBM] [Info] Number of data points in the train set: 295211, number of used features: 95
[LightGBM] [Info] Start training from score 79452.042408
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[83]	valid_0's l2: 7.86706e+06
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.016398 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 16364
[LightGBM] [Info] Number of data points in the train set: 295211, number of used features: 95
[LightGBM] [Info] Start training from score 79452.042408




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.015318 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 16305
[LightGBM] [Info] Number of data points in the train set: 295211, number of used features: 95
[LightGBM] [Info] Start training from score 79533.169576
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[100]	valid_0's l2: 8.14774e+06
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.014772 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 16365
[LightGBM] [Info] Number of data points in the train set: 295211, number of used features: 95
[LightGBM] [Info] Start training from score 79533.169576


[I 2024-10-14 13:01:52,278] Trial 1 finished with value: 2584.636256568647 and parameters: {'lgb_n_estimators': 981, 'lgb_learning_rate': 0.36664202139786356, 'lgb_max_depth': 15, 'lgb_num_leaves': 968, 'lgb_reg_alpha': 4.21711344034364, 'lgb_reg_lambda': 8.310877602978064, 'xgb_n_estimators': 917, 'xgb_learning_rate': 0.34605562926738576, 'xgb_max_depth': 7, 'xgb_subsample': 0.852279203062347, 'xgb_colsample_bytree': 0.8103680421720088, 'xgb_reg_alpha': 5.3562296312953634, 'xgb_reg_lambda': 7.930876537792992, 'cat_n_estimators': 467, 'cat_learning_rate': 0.34087350002756595, 'cat_depth': 14, 'cat_l2_leaf_reg': 3.0138675138131212}. Best is trial 1 with value: 2584.636256568647.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.032138 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 16324
[LightGBM] [Info] Number of data points in the train set: 295210, number of used features: 95
[LightGBM] [Info] Start training from score 79542.054059
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[93]	valid_0's l2: 8.14011e+06
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.016121 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 16368
[LightGBM] [Info] Number of data points in the train set: 295210, number of used features: 95
[LightGBM] [Info] Start training from score 79542.054059




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.015445 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 16326
[LightGBM] [Info] Number of data points in the train set: 295210, number of used features: 95
[LightGBM] [Info] Start training from score 79416.998150
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[78]	valid_0's l2: 8.44579e+06
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.015070 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 16361
[LightGBM] [Info] Number of data points in the train set: 295210, number of used features: 95
[LightGBM] [Info] Start training from score 79416.998150




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.015344 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 16311
[LightGBM] [Info] Number of data points in the train set: 295210, number of used features: 95
[LightGBM] [Info] Start training from score 79467.021768
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[90]	valid_0's l2: 8.39616e+06
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.016686 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 16356
[LightGBM] [Info] Number of data points in the train set: 295210, number of used features: 95
[LightGBM] [Info] Start training from score 79467.021768




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.015718 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 16315
[LightGBM] [Info] Number of data points in the train set: 295211, number of used features: 95
[LightGBM] [Info] Start training from score 79452.042408
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[80]	valid_0's l2: 8.32187e+06
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.015014 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 16364
[LightGBM] [Info] Number of data points in the train set: 295211, number of used features: 95
[LightGBM] [Info] Start training from score 79452.042408




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.016439 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 16305
[LightGBM] [Info] Number of data points in the train set: 295211, number of used features: 95
[LightGBM] [Info] Start training from score 79533.169576
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[77]	valid_0's l2: 8.67871e+06


In [None]:
import joblib


model_final = global_model_pipeline

# Save the ensemble model to a file
model_filename = 'ensemble_model.pkl'
joblib.dump(model_final, model_filename)

print(f"Model saved to {model_filename}")


In [None]:
from sklearn.preprocessing import LabelEncoder

submission_set_features = submission_set_updated.iloc[:,:-1]

# Now you can use the model to make predictions
submission_set['tow'] = model_final.predict(submission_set_features)
print(submission_set)


In [None]:
import os
from datetime import datetime

# Define the submissions directory and create it if it doesn't exist
submissions_dir = 'submissions'
os.makedirs(submissions_dir, exist_ok=True)

# Define a timestamp for the file name
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

# Save the submission with a timestamp in the filename
submission_file = os.path.join(submissions_dir, f"submission_{timestamp}.csv")

# Assuming submission_set is a DataFrame, save it to CSV
submission_set.to_csv(submission_file, index=False)

print(f"Submission saved to {submission_file}")
