In [1]:
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
import optuna
from lightgbm.callback import early_stopping

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Read the post-EDA challenge frame, the raw submission frame and the post-EDA
# submission frame, in that order
challenge_set_updated, submission_set, submission_set_updated = (
    pd.read_csv(csv_path)
    for csv_path in (
        "./data/challenge_set_updated_v13.csv",
        "./data/submission_set.csv",
        "./data/submission_set_updated_v13.csv",
    )
)



In [3]:
# Target is the 'tow' column; every remaining column is kept as a feature
y = challenge_set_updated['tow']
X = challenge_set_updated.drop(columns=['tow'])

In [4]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Feature columns that hold categorical values and are integer-encoded below
categorical_columns = ['adep', 'ades', 'aircraft_type', 'wtc', 'airline', 'offblock_season',
                       'flight_duration_category', 'adep_region', 'ades_region',
                       'flight_direction', 'Manufacturer', 'Model_FAA',
                       'Physical_Class_Engine', 'FAA_Weight']

# One fitted encoder per column, kept so the same label -> code mapping can be
# reapplied to other frames
label_encoders = {}

# Encoding using LabelEncoder.
# The encoder is fitted on the source frame rather than on X itself: X is
# overwritten here, so fitting on X would re-encode the integer codes (as
# strings) and change the mapping every time this cell is re-run.
for col in categorical_columns:
    le = LabelEncoder()
    # Cast to str so every value, including missing ones, is encoded as a plain label
    X[col] = le.fit_transform(challenge_set_updated[col].astype(str))
    label_encoders[col] = le


In [5]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [6]:
# Global variable to hold the best model trained so far across trials
global_model = None

# Define the objective function
def objective(trial):
    """Train one LightGBM regressor for an Optuna trial and return its validation RMSE.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample ``num_leaves``, ``learning_rate``,
        ``min_child_samples``, ``min_data_per_group`` and ``cat_smooth``.

    Returns
    -------
    float
        RMSE of the trained model's predictions on ``X_val`` against ``y_val``.

    Side effects
    ------------
    Rebinds the module-level ``global_model`` to this trial's model when its
    RMSE is lower than the best value of the trials completed so far.
    """
    global global_model

    # Hyperparameters to be tuned by Optuna
    param = {
        'objective': 'regression',
        'metric': 'rmse',
        'boosting_type': 'gbdt',  # You can leave this as 'gbdt' for GPU acceleration
        'verbosity': -1,
        'num_leaves': trial.suggest_int('num_leaves', 20, 100),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'min_data_per_group': trial.suggest_int('min_data_per_group', 50, 150),
        'cat_smooth': trial.suggest_float('cat_smooth', 5, 20),
        'device': 'gpu'  # This enables GPU training
    }

    # Convert DataFrame to LightGBM Dataset, specifying categorical features
    train_data = lgb.Dataset(X_train, label=y_train, categorical_feature=categorical_columns, free_raw_data=False)
    valid_data = lgb.Dataset(X_val, label=y_val, categorical_feature=categorical_columns, free_raw_data=False)

    # Train the model with early stopping
    model = lgb.train(
        param,
        train_data,
        num_boost_round=1000,
        valid_sets=[valid_data],
        callbacks=[lgb.early_stopping(stopping_rounds=50)]
    )

    # Predictions on the validation set
    preds = model.predict(X_val)
    # Take the square root explicitly: the `squared=False` argument of
    # mean_squared_error is deprecated and removed in recent scikit-learn releases
    rmse = float(np.sqrt(mean_squared_error(y_val, preds)))

    # Keep the model only if it beats every previously completed trial;
    # otherwise the global would simply end up holding the last trial's model
    try:
        best_so_far = trial.study.best_value
    except ValueError:
        # Raised when no trial has completed yet
        best_so_far = float('inf')
    if rmse < best_so_far:
        global_model = model

    return rmse


In [None]:
# Create a study object and specify the direction of the optimization.
# The sampler is seeded so the sampled hyperparameters are repeatable across
# kernel restarts, in line with the seeded train/validation split.
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=3)  # Specify the number of trials

# Best trial results
print("Best trial:")
print(study.best_trial.params)
print("Best RMSE:", study.best_value)

[I 2024-10-17 09:56:56,881] A new study created in memory with name: no-name-eeea421d-7891-41ea-8677-17c448742c29


Training until validation scores don't improve for 50 rounds


In [None]:

# Copy the tuned values so the update below does not modify the trial's own dict
best_params = dict(study.best_trial.params)

# Add back the fixed (non-tuned) settings
best_params.update({
    'metric': 'rmse',
    'verbosity': -1,
    'boosting_type': 'gbdt',
    'objective': 'regression'
})

# Refit on the training split with the same validation set and early-stopping
# rule that were used while tuning. Training a fixed 1000 rounds instead would
# produce a model with a different number of trees than the one whose RMSE was
# reported by the study.
bst = lgb.train(
    best_params,
    lgb.Dataset(X_train, label=y_train, categorical_feature=categorical_columns, free_raw_data=False),
    num_boost_round=1000,
    valid_sets=[
        lgb.Dataset(X_val, label=y_val, categorical_feature=categorical_columns, free_raw_data=False)
    ],
    callbacks=[lgb.early_stopping(stopping_rounds=50)],
)

In [None]:
from sklearn.preprocessing import LabelEncoder

# List of columns to encode
categorical_columns = ['adep', 'ades', 'aircraft_type', 'wtc', 'airline', 'offblock_season', 'flight_duration_category',
                       'adep_region', 'ades_region', 'flight_direction', 'Manufacturer', 'Model_FAA',
                       'Physical_Class_Engine', 'FAA_Weight']

# Encoders are fitted on the challenge-set columns (cast to str, as was done for
# the training features) so that a given category maps to the same integer code
# the model was trained on. Fitting fresh encoders on the submission set would
# assign unrelated codes.
label_encoders = {}

# Work on a copy so re-running this cell always starts from the raw submission values
submission_set_encoded = submission_set_updated.copy()

for column in categorical_columns:
    encoder = LabelEncoder()
    encoder.fit(challenge_set_updated[column].astype(str))
    label_encoders[column] = encoder

    # Labels absent from the challenge set are mapped to -1; LightGBM treats
    # negative values in categorical features as missing
    code_by_label = {str(label): code for code, label in enumerate(encoder.classes_)}
    submission_set_encoded[column] = (
        submission_set_updated[column]
        .astype(str)
        .map(code_by_label)
        .fillna(-1)
        .astype(int)
    )

# Select the features by name, in the order used for training, instead of
# relying on the position of the last column
submission_set_features = submission_set_encoded[X_train.columns]
submission_set['tow'] = bst.predict(submission_set_features)

print(submission_set)

In [None]:
import os
from datetime import datetime

# Build an output path whose file name carries the current timestamp
submissions_dir = 'submissions'
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
submission_file = os.path.join(submissions_dir, f"submission_{timestamp}.csv")

# Make sure the target directory exists, then write the frame without its index
os.makedirs(submissions_dir, exist_ok=True)
submission_set.to_csv(submission_file, index=False)

print(f"Submission saved to {submission_file}")
