In [10]:
import optuna
from lightgbm import LGBMRegressor, early_stopping as lgb_early_stopping
from xgboost import XGBRegressor, callback as xgb_callback
from catboost import CatBoostRegressor
from sklearn.ensemble import VotingRegressor
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from lightgbm import early_stopping


In [11]:
import numpy as np 
import pandas as pd


In [12]:
# Load the dataset after the exploratory data analysis
challenge_set_updated = pd.read_csv("./data/challenge_set_updated_v13.csv")
submission_set = pd.read_csv("./data/submission_set.csv")
submission_set_updated = pd.read_csv("./data/submission_set_updated_v13.csv")



In [14]:
# If necessary change this part to test the model before the training process
df = challenge_set_updated.iloc[:,:]
# df = challenge_set_updated.sample(frac=0.001)

# Separating features and target variable
X = df.drop('tow', axis=1)
y = df['tow']



In [15]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Handle categorical columns by Label Encoding
label_encoder = LabelEncoder()

# Specify the columns that are categorical
categorical_cols = ['adep', 'ades', 'aircraft_type', 'wtc', 'airline', 'offblock_season', 
                    'flight_duration_category', 'adep_region', 'ades_region', 'flight_direction', 
                    'Manufacturer', 'Model_FAA', 'Physical_Class_Engine', 'FAA_Weight']

# Convert these categorical columns into numerical form using Label Encoding
for col in categorical_cols:
    X[col] = label_encoder.fit_transform(X[col])

# Now, you can proceed with training LightGBM with this processed data


In [20]:
def objective(trial):
    global global_model_pipeline
    
    # Sample hyperparameters for LightGBM
    lgb_params = {
        'n_estimators': trial.suggest_int('lgb_n_estimators', 50, 1000),
        'learning_rate': trial.suggest_float('lgb_learning_rate', 0.001, 0.5),
        'max_depth': trial.suggest_int('lgb_max_depth', 3, 15),
        'num_leaves': trial.suggest_int('lgb_num_leaves', 31, 1024),
        'reg_alpha': trial.suggest_float('lgb_reg_alpha', 0, 10),
        'reg_lambda': trial.suggest_float('lgb_reg_lambda', 0, 10),
    }

    # Sample hyperparameters for XGBoost
    xgb_params = {
        'n_estimators': trial.suggest_int('xgb_n_estimators', 50, 1000),
        'learning_rate': trial.suggest_float('xgb_learning_rate', 0.001, 0.5),
        'max_depth': trial.suggest_int('xgb_max_depth', 3, 15),
        'subsample': trial.suggest_float('xgb_subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('xgb_colsample_bytree', 0.5, 1.0),
        'reg_alpha': trial.suggest_float('xgb_reg_alpha', 0, 10),
        'reg_lambda': trial.suggest_float('xgb_reg_lambda', 0, 10),
    }

    # Sample hyperparameters for CatBoost
    cat_params = {
        'n_estimators': trial.suggest_int('cat_n_estimators', 50, 1000),
        'learning_rate': trial.suggest_float('cat_learning_rate', 0.001, 0.5),
        'depth': trial.suggest_int('cat_depth', 3, 15),
        'l2_leaf_reg': trial.suggest_float('cat_l2_leaf_reg', 1, 10),
    }

    # Initialize the models with sampled hyperparameters
    lgb_model = LGBMRegressor(**lgb_params)
    xgb_model = XGBRegressor(**xgb_params, objective='reg:squarederror')
    cat_model = CatBoostRegressor(**cat_params, verbose=0)

    # Ensemble the models using VotingRegressor
    ensemble_model = VotingRegressor(estimators=[('lgb', lgb_model), ('xgb', xgb_model), ('cat', cat_model)])

    # Create a pipeline with data scaling and the ensemble model
    model_pipeline = Pipeline([('scaler', StandardScaler()), ('ensemble', ensemble_model)])

    # Cross-validation for better evaluation
    kf = KFold(n_splits=5, shuffle=True, random_state=42)

    # Metrics
    rmse_list, r2_list, mae_list = [], [], []

    for train_index, valid_index in kf.split(X):
        X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
        y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

        # Train the ensemble model with early stopping for each model
        lgb_model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)],
                      callbacks=[early_stopping(stopping_rounds=50, verbose=False)])

        # Fit the model with early stopping
        xgb_model.fit(
            X_train, y_train,
            eval_set=[(X_valid, y_valid)],  # Specify validation set for early stopping
            eval_metric="rmse",             # Metric on which to base early stopping
            early_stopping_rounds=50,       # Stop if validation metric does not improve after 50 rounds
            verbose=True                    # Optional: provide more verbose output
        )

        cat_model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)],
                      early_stopping_rounds=50, verbose=False)

        # Fit the ensemble
        model_pipeline.fit(X_train, y_train)
        
        # Make predictions
        y_pred = model_pipeline.predict(X_valid)
        
        # Calculate metrics for this fold
        rmse = mean_squared_error(y_valid, y_pred, squared=False)
        r2 = r2_score(y_valid, y_pred)
        mae = mean_absolute_error(y_valid, y_pred)

        rmse_list.append(rmse)
        r2_list.append(r2)
        mae_list.append(mae)

    # Compute average metrics
    avg_rmse = sum(rmse_list) / len(rmse_list)
    avg_r2 = sum(r2_list) / len(r2_list)
    avg_mae = sum(mae_list) / len(mae_list)

    # Save the trained model pipeline globally
    global_model_pipeline = model_pipeline

    return avg_rmse  # Optimize RMSE


In [21]:
# Create an Optuna study object and optimize
study = optuna.create_study(direction='minimize')  # 'minimize' for minimizing RMSE
study.optimize(objective, n_trials=100)

# Print the best trial and its hyperparameters
print("Best trial:")
trial = study.best_trial
print(f"  Value (RMSE): {trial.value}")
print(f"  Params: {trial.params}")


[I 2024-10-15 13:37:56,172] A new study created in memory with name: no-name-d5cd0bbc-00fd-4b5a-ac41-dca7be995c65


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.336632 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 16324
[LightGBM] [Info] Number of data points in the train set: 295210, number of used features: 95
[LightGBM] [Info] Start training from score 79542.054059


[W 2024-10-15 14:40:31,177] Trial 0 failed with parameters: {'lgb_n_estimators': 416, 'lgb_learning_rate': 0.17081811652651363, 'lgb_max_depth': 10, 'lgb_num_leaves': 886, 'lgb_reg_alpha': 8.851885914679224, 'lgb_reg_lambda': 8.705036316649705, 'xgb_n_estimators': 651, 'xgb_learning_rate': 0.09604394326770738, 'xgb_max_depth': 11, 'xgb_subsample': 0.806399557076239, 'xgb_colsample_bytree': 0.8243848011868313, 'xgb_reg_alpha': 1.5580381596110682, 'xgb_reg_lambda': 4.16646832958304, 'cat_n_estimators': 824, 'cat_learning_rate': 0.22723851797606925, 'cat_depth': 15, 'cat_l2_leaf_reg': 8.44552474430012} because of the following error: TypeError("XGBModel.fit() got an unexpected keyword argument 'eval_metric'").
Traceback (most recent call last):
  File "/home/carolima/.local/lib/python3.10/site-packages/optuna/study/_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
  File "/tmp/ipykernel_93405/612254732.py", line 59, in objective
    xgb_model.fit(
  File "/home/caro



TypeError: XGBModel.fit() got an unexpected keyword argument 'eval_metric'

In [None]:
import joblib


model_final = global_model_pipeline

# Save the ensemble model to a file
model_filename = 'ensemble_model.pkl'
joblib.dump(model_final, model_filename)

print(f"Model saved to {model_filename}")


In [None]:
from sklearn.preprocessing import LabelEncoder

submission_set_features = submission_set_updated.iloc[:,:-1]

# Now you can use the model to make predictions
submission_set['tow'] = model_final.predict(submission_set_features)
print(submission_set)


In [None]:
import os
from datetime import datetime

# Define the submissions directory and create it if it doesn't exist
submissions_dir = 'submissions'
os.makedirs(submissions_dir, exist_ok=True)

# Define a timestamp for the file name
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

# Save the submission with a timestamp in the filename
submission_file = os.path.join(submissions_dir, f"submission_{timestamp}.csv")

# Assuming submission_set is a DataFrame, save it to CSV
submission_set.to_csv(submission_file, index=False)

print(f"Submission saved to {submission_file}")
