In [1]:
import optuna
from lightgbm import LGBMRegressor, early_stopping as lgb_early_stopping
from xgboost import XGBRegressor, callback as xgb_callback
from catboost import CatBoostRegressor
from sklearn.ensemble import VotingRegressor
from sklearn.model_selection import KFold, train_test_split, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import lightgbm as lgb


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import numpy as np 
import pandas as pd


In [3]:
# Read the CSV inputs produced after the exploratory data analysis:
#   * challenge_set_updated  - labelled data used for training below
#   * submission_set         - frame that receives the predicted 'tow' column
#   * submission_set_updated - features used to predict for the submission
challenge_set_updated = pd.read_csv(filepath_or_buffer="./data/challenge_set_updated_v16.csv")
submission_set = pd.read_csv(filepath_or_buffer="./data/submission_set.csv")
submission_set_updated = pd.read_csv(filepath_or_buffer="./data/submission_set_updated_v16.csv")



In [4]:
# Frame used for training. For a quick smoke test of the notebook, switch to
# the sampled variant on the commented-out line instead of the full data.
df = challenge_set_updated.iloc[:, :]
# df = challenge_set_updated.sample(frac=0.001)

# Target is the 'tow' column; every other column is treated as a feature.
y = df['tow']
X = df.drop(columns='tow')



In [5]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Columns of X that hold categorical values and must become integer codes
# before the gradient-boosting models can consume them.
categorical_cols = ['adep', 'ades', 'aircraft_type', 'wtc', 'airline', 'offblock_season', 
                    'flight_duration_category', 'adep_region', 'ades_region', 'flight_direction', 
                    'Manufacturer', 'Model_FAA', 'Physical_Class_Engine', 'FAA_Weight']

# Keep ONE fitted encoder per column, keyed by column name. Re-fitting a single
# shared encoder (as was done before) throws away every mapping except the last
# one, which makes it impossible to encode other data (e.g. the submission set)
# with the same label -> code mapping used for training.
label_encoders = {}

# Convert these categorical columns into numerical form using Label Encoding
for col in categorical_cols:
    # `label_encoder` is still bound to the last column's encoder after the
    # loop, matching the state the previous implementation left behind.
    label_encoder = LabelEncoder()
    X[col] = label_encoder.fit_transform(X[col])
    label_encoders[col] = label_encoder

# Now, you can proceed with training LightGBM with this processed data


In [6]:
# Global variable to hold the trained model pipeline
global_model_pipeline = None


# Define the objective function
def objective(trial):
    """Optuna objective: cross-validated RMSE of a LightGBM/XGBoost/CatBoost voting ensemble.

    Hyperparameters for the three regressors are sampled from ``trial``, the
    regressors are combined in a ``VotingRegressor`` behind a ``StandardScaler``
    and the resulting pipeline is scored with 5-fold cross-validation on the
    module-level ``X`` / ``y``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample hyperparameters. The fold-averaged R^2 and MAE
        are attached to it as user attributes ``avg_r2`` and ``avg_mae``.

    Returns
    -------
    float
        RMSE averaged over the folds (the value the study minimises).

    Side effects
    ------------
    Overwrites the module-level ``global_model_pipeline`` with this trial's
    pipeline as fitted on the LAST fold. Every trial overwrites it, so after a
    study it holds the most recent trial's model, not necessarily the best one.
    """
    global global_model_pipeline

    # Sample hyperparameters for LightGBM
    lgb_params = {
        'n_estimators': trial.suggest_int('lgb_n_estimators', 50, 1000),
        'learning_rate': trial.suggest_float('lgb_learning_rate', 0.001, 0.5),
        'max_depth': trial.suggest_int('lgb_max_depth', 3, 15),
        'num_leaves': trial.suggest_int('lgb_num_leaves', 31, 1024),
        'reg_alpha': trial.suggest_float('lgb_reg_alpha', 0, 10),
        'reg_lambda': trial.suggest_float('lgb_reg_lambda', 0, 10),
        'device': 'gpu'  # Use GPU
    }

    # Sample hyperparameters for XGBoost
    # NOTE(review): XGBoost >= 2.0 deprecates 'gpu_hist' / 'predictor' in favour
    # of tree_method='hist' with device='cuda' - confirm the installed version.
    xgb_params = {
        'n_estimators': trial.suggest_int('xgb_n_estimators', 50, 1000),
        'learning_rate': trial.suggest_float('xgb_learning_rate', 0.001, 0.5),
        'max_depth': trial.suggest_int('xgb_max_depth', 3, 15),
        'subsample': trial.suggest_float('xgb_subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('xgb_colsample_bytree', 0.5, 1.0),
        'reg_alpha': trial.suggest_float('xgb_reg_alpha', 0, 10),
        'reg_lambda': trial.suggest_float('xgb_reg_lambda', 0, 10),
        'tree_method': 'gpu_hist',  # Use GPU
        'predictor': 'gpu_predictor'  # GPU predictor
    }

    # Sample hyperparameters for CatBoost
    cat_params = {
        'n_estimators': trial.suggest_int('cat_n_estimators', 50, 1000),
        'learning_rate': trial.suggest_float('cat_learning_rate', 0.001, 0.5),
        'depth': trial.suggest_int('cat_depth', 3, 15),
        'l2_leaf_reg': trial.suggest_float('cat_l2_leaf_reg', 1, 10),
        'task_type': 'GPU',  # Use GPU
        'devices': '0:1'  # Specify GPU devices if needed
    }

    # Initialize the models with sampled hyperparameters
    lgb_model = LGBMRegressor(**lgb_params)
    xgb_model = XGBRegressor(**xgb_params, objective='reg:squarederror')
    cat_model = CatBoostRegressor(**cat_params, verbose=0)

    # Ensemble the models using VotingRegressor
    ensemble_model = VotingRegressor(
        estimators=[
            ('lgb', lgb_model),
            ('xgb', xgb_model),
            ('cat', cat_model)
        ]
    )

    # Create a pipeline with data scaling and the ensemble model
    model_pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('ensemble', ensemble_model)
    ])

    # Cross-validation for better evaluation
    kf = KFold(n_splits=5, shuffle=True, random_state=42)

    # Per-fold metrics
    rmse_list = []
    r2_list = []
    mae_list = []

    for train_index, valid_index in kf.split(X):
        # Use iloc to select by position
        X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
        y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

        # Fit the whole pipeline once per fold. VotingRegressor fits *clones*
        # of the estimators it was given, so fitting lgb_model / xgb_model /
        # cat_model individually beforehand (as this loop used to do) never
        # influenced the ensemble; it only trained every model a second time.
        model_pipeline.fit(X_train, y_train)

        # Make predictions
        y_pred = model_pipeline.predict(X_valid)

        # Calculate metrics for this fold. RMSE is taken as sqrt(MSE) rather
        # than via the `squared=False` keyword, which newer scikit-learn
        # releases deprecated and then removed.
        rmse_list.append(float(np.sqrt(mean_squared_error(y_valid, y_pred))))
        r2_list.append(r2_score(y_valid, y_pred))
        mae_list.append(mean_absolute_error(y_valid, y_pred))

    # Compute average metrics
    avg_rmse = sum(rmse_list) / len(rmse_list)
    avg_r2 = sum(r2_list) / len(r2_list)
    avg_mae = sum(mae_list) / len(mae_list)

    # Keep the secondary metrics with the trial so they can be inspected later
    trial.set_user_attr('avg_r2', float(avg_r2))
    trial.set_user_attr('avg_mae', float(avg_mae))

    # Save the trained model pipeline globally
    global_model_pipeline = model_pipeline

    # Optuna needs the objective value; without this return the trial yields
    # None and cannot be recorded as a completed trial.
    return float(avg_rmse)



In [None]:
# Set up an Optuna study that minimises the objective's return value (RMSE)
# and run the search for two trials.
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=2)

# Report the best trial's score and the hyperparameters that produced it
print("Best trial:")
trial = study.best_trial
print(f"  Value (RMSE): {trial.value}", f"  Params: {trial.params}", sep="\n")


[I 2024-10-17 09:57:08,801] A new study created in memory with name: no-name-281c2555-7106-420b-9be6-85ea809e23d9


[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 16495
[LightGBM] [Info] Number of data points in the train set: 295210, number of used features: 103
[LightGBM] [Info] Using GPU Device: NVIDIA RTX A4500, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 92 dense feature groups (25.90 MB) transferred to GPU in 0.039898 secs. 1 sparse feature groups
[LightGBM] [Info] Start training from score 79542.054059
Training until validation scores don't improve for 50 rounds


In [None]:
# Use the best hyperparameters to create models
best_params = study.best_trial.params

# Initialize models with the best parameters.
# Every hyperparameter sampled in `objective` is passed through here; the
# regularisation terms, XGBoost's colsample_bytree and CatBoost's l2_leaf_reg
# used to be dropped, so the final models silently fell back to library
# defaults for them instead of the tuned values.
lgb_model = LGBMRegressor(
    n_estimators=best_params['lgb_n_estimators'],
    learning_rate=best_params['lgb_learning_rate'],
    max_depth=best_params['lgb_max_depth'],
    num_leaves=best_params['lgb_num_leaves'],
    reg_alpha=best_params['lgb_reg_alpha'],
    reg_lambda=best_params['lgb_reg_lambda']
)

xgb_model = XGBRegressor(
    n_estimators=best_params['xgb_n_estimators'],
    learning_rate=best_params['xgb_learning_rate'],
    max_depth=best_params['xgb_max_depth'],
    subsample=best_params['xgb_subsample'],
    colsample_bytree=best_params['xgb_colsample_bytree'],
    reg_alpha=best_params['xgb_reg_alpha'],
    reg_lambda=best_params['xgb_reg_lambda'],
    objective='reg:squarederror'
)

cat_model = CatBoostRegressor(
    n_estimators=best_params['cat_n_estimators'],
    learning_rate=best_params['cat_learning_rate'],
    depth=best_params['cat_depth'],
    l2_leaf_reg=best_params['cat_l2_leaf_reg'],
    verbose=0
)

# Ensemble the models
ensemble_model = VotingRegressor(
    estimators=[
        ('lgb', lgb_model),
        ('xgb', xgb_model),
        ('cat', cat_model)
    ]
)

# Hold out 20% of the data so the tuned ensemble can be scored on rows it has
# not been trained on.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)


# Train the ensemble model on the 80% training split (NOT the entire dataset)
ensemble_model.fit(X_train, y_train)



In [None]:

# Score the tuned ensemble on the hold-out split produced by train_test_split
# (X_val / y_val). The names X_valid / y_valid only ever exist as locals inside
# `objective`, so referencing them here raised NameError on a fresh kernel.
y_pred = ensemble_model.predict(X_val)

r2 = r2_score(y_val, y_pred)
rmse = np.sqrt(mean_squared_error(y_val, y_pred))


print(f"Best Model Performance - R^2 Score: {r2:.4f}, RMSE: {rmse:.4f}")


In [None]:
import joblib


# Persist the ensemble that was rebuilt from the best trial's hyperparameters
# and evaluated above. `global_model_pipeline` is overwritten by every Optuna
# trial, so it holds whatever the LAST trial fitted on its last CV fold, which
# is not necessarily the best configuration.
model_final = ensemble_model

# Save the ensemble model to a file
model_filename = 'ensemble_model.pkl'
joblib.dump(model_final, model_filename)

print(f"Model saved to {model_filename}")


In [None]:
from sklearn.preprocessing import LabelEncoder

# Every column except the last is used as a feature.
# NOTE(review): assumes the last column of submission_set_updated is the
# (unfilled) target and the remaining columns match the training features -
# confirm against the CSV.
# .copy() so the encoding below writes to an independent frame rather than a
# slice of submission_set_updated.
submission_set_features = submission_set_updated.iloc[:, :-1].copy()

# Apply the same label -> integer mapping the training features received.
# `df` still holds the raw (un-encoded) training values, and LabelEncoder
# derives its codes from the sorted set of labels it is fitted on, so fitting
# on df[col] reproduces the training-time mapping for that column.
# NOTE(review): assumes these columns contain raw labels in the submission CSV
# as they do in the challenge set. A label never seen during training makes
# `transform` raise ValueError rather than being encoded silently.
for col in categorical_cols:
    submission_encoder = LabelEncoder().fit(df[col])
    submission_set_features[col] = submission_encoder.transform(submission_set_features[col])

# Now you can use the model to make predictions
submission_set['tow'] = model_final.predict(submission_set_features)
print(submission_set)


In [None]:
import os
from datetime import datetime

# Timestamp (taken at write time) that makes each submission file name unique
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

# Target folder for submissions; created on first use, reused afterwards
submissions_dir = 'submissions'
os.makedirs(name=submissions_dir, exist_ok=True)

# Full path of the CSV to write, e.g. submissions/submission_<timestamp>.csv
submission_file = os.path.join(submissions_dir, f"submission_{timestamp}.csv")

# Write the submission frame without its index column
submission_set.to_csv(path_or_buf=submission_file, index=False)

print(f"Submission saved to {submission_file}")
