In [None]:
import time

import numpy as np
import pandas as pd
from catboost import CatBoostClassifier
from catboost.utils import get_gpu_device_count
from sklearn.model_selection import train_test_split, ParameterGrid, KFold, cross_val_score

# Re-import the dataset and prepare features / target for modelling.
df = pd.read_parquet('data.parquet')

# Drop every column prefixed 'ATA' or 'ATD' plus 'Arrival_Delayed'.
# NOTE(review): presumably these are only known after departure and would
# leak the target -- confirm against the data dictionary.
columns_to_drop = [
    col for col in df.columns
    if col.startswith(('ATA', 'ATD')) or col == 'Arrival_Delayed'
]
df.drop(columns=columns_to_drop, inplace=True)

# Rows without a target cannot be used for supervised training. Without this
# step `.cat.codes` below would encode a missing target as the extra class -1.
df.dropna(subset=["Departure_Status"], inplace=True)

# Convert types to reduce memory usage.
float_cols = df.select_dtypes(include='float64').columns
df[float_cols] = df[float_cols].astype('float32')

# Only narrow int64 columns whose values actually fit in int32: an unchecked
# astype('int32') can silently wrap out-of-range values instead of raising.
int32_limits = np.iinfo(np.int32)
int_cols = [
    col for col in df.select_dtypes(include='int64').columns
    if int32_limits.min <= df[col].min() and df[col].max() <= int32_limits.max
]
if int_cols:
    df[int_cols] = df[int_cols].astype('int32')

# Split features and target; the target is encoded as integer category codes.
X = df.drop(columns=["Departure_Status"])
y = df["Departure_Status"].astype('category').cat.codes

# Identify categorical features (object or category dtype columns).
cat_features = X.select_dtypes(include=['object', 'category']).columns.tolist()

# Convert categorical columns to strings and make sure no missing value is
# left, so each categorical column holds plain string labels only.
for col in cat_features:
    X[col] = X[col].astype(str).fillna("nan")

# 60% train; the remaining 40% is halved into 20% test and 20% validation.
xTrain, xTemp, yTrain, yTemp = train_test_split(X, y, test_size=0.4, random_state=42)
xTest, xVal, yTest, yVal = train_test_split(xTemp, yTemp, test_size=0.5, random_state=42)

# Define a parameter grid for CatBoostClassifier.
# Train on GPU only when CatBoost reports at least one usable device;
# otherwise fall back to CPU so the search does not fail on every combination
# on machines without a GPU (or with a CPU-only CatBoost build).
task_type = 'GPU' if get_gpu_device_count() > 0 else 'CPU'
print("CatBoost task_type:", task_type)

param_grid = {
    # Searched hyper-parameters (3 x 3 x 3 = 27 combinations).
    'depth': [6, 8, 10],
    'learning_rate': [0.05, 0.1, 0.2],
    'l2_leaf_reg': [1, 3, 5],
    # Fixed settings: single-element lists so ParameterGrid passes them through.
    'iterations': [300],
    'random_seed': [42],
    'loss_function': ['MultiClass'],
    'eval_metric': ['MultiClass'],
    'task_type': [task_type],
}

# Create a list of all parameter combinations (exhaustive grid search)
grid = list(ParameterGrid(param_grid))
total_combinations = len(grid)
print("Total parameter combinations to evaluate:", total_combinations)

# Cross-validation splitter and running record of the best candidate so far
cv = KFold(n_splits=5, shuffle=True, random_state=42)
best_score, best_params, best_model = -np.inf, None, None

start_time = time.time()

# Score every combination with 5-fold CV accuracy on the training split
for combo_no, params in enumerate(grid, start=1):
    # Fresh classifier for this combination, told which columns are categorical
    candidate = CatBoostClassifier(**params, cat_features=cat_features, verbose=0)

    try:
        fold_scores = cross_val_score(candidate, xTrain, yTrain, cv=cv, scoring='accuracy')
        mean_score = fold_scores.mean()
    except Exception as err:
        # Best effort: report the failure and rank this combination last
        print(f"Combination {combo_no} failed with parameters: {params}")
        print("Error:", err)
        mean_score = -np.inf

    # Strict '>' keeps the earliest combination on ties and never selects a
    # failed one (its score of -inf cannot exceed the initial best_score)
    if mean_score > best_score:
        best_score, best_params, best_model = mean_score, params, candidate

    # ETA = average seconds per finished combination * combinations left
    elapsed = time.time() - start_time
    combos_left = total_combinations - combo_no
    remaining = (elapsed / combo_no) * combos_left
    print(f"Combination {combo_no}/{total_combinations} evaluated. Estimated time remaining: {remaining:.2f} seconds.")

print("\nBest Parameters:", best_params)
print("Best CV Accuracy Score:", best_score)
print("Total grid search time:", time.time() - start_time, "seconds")

# Retrain the best configuration on the full training set.
# If every combination failed during the search (for example because GPU
# training is unavailable), nothing was selected and best_model is still None;
# stop with a clear error instead of an AttributeError on None.
if best_model is None or best_params is None:
    raise RuntimeError(
        "Grid search did not produce a model: every parameter combination failed. "
        "Check the errors printed during the search (e.g. switch task_type to 'CPU')."
    )

# Re-apply the winning parameters to the selected estimator, then fit it on
# the whole training split (including categorical features).
best_model.set_params(**best_params)
best_model.fit(xTrain, yTrain, cat_features=cat_features)

# Save the best model to disk using CatBoost's built-in method
best_model.save_model("best_model.cbm")
print("Best model saved as 'best_model.cbm'")
