# XGBoost Model

In [7]:
import sys
import subprocess
import importlib

# Function to install packages if they are not already installed
def install_and_import(package):
    try:
        importlib.import_module(package)
    except ImportError:
        print(f"Installing {package}...")
        subprocess.check_call([sys.executable, "-m", "pip", "install", package])
    finally:
        globals()[package] = importlib.import_module(package)

# Check and install the required packages
required_packages = [
    'pandas', 'xgboost', 'sklearn', 'numpy', 'tqdm', 
    'datetime', 'pytz', 'json', 'joblib', 'os', 'optuna'
]

for package in required_packages:
    install_and_import(package)

# Imports
import pandas as pd
from xgboost import XGBRegressor, callback
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import r2_score, mean_squared_error
import numpy as np
from tqdm import tqdm
from datetime import datetime
import pytz
import json
import joblib 
import os
import optuna

In [8]:
# Load the dataset after the exploratory data analysis
challenge_set_updated = pd.read_csv("./data/challenge_set_updated_v19.csv")
submission_set = pd.read_csv("./data/final_submission_set.csv")
submission_set_updated = pd.read_csv("./data/submission_set_updated_v19.csv")

In [10]:
def clean_data_better(df, threshold=1e10):
    # Create an explicit copy of the DataFrame to avoid SettingWithCopyWarning
    df = df.copy()
    
    # Replace inf and -inf with NaN using vectorized operations
    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    
    # Mask values above the threshold directly on numeric columns using .loc
    numeric_columns = df.select_dtypes(include=[np.number]).columns
    df.loc[:, numeric_columns] = df.loc[:, numeric_columns].mask(df[numeric_columns].abs() > threshold)
    
    # Fill NaNs using forward fill, then median
    df.loc[:, numeric_columns] = df.loc[:, numeric_columns].fillna(method='ffill').fillna(df[numeric_columns].median())
    
    return df

# Applying the cleaning function
challenge_set_updated = clean_data_better(challenge_set_updated)
submission_set_updated = clean_data_better(submission_set_updated)

  df.loc[:, numeric_columns] = df.loc[:, numeric_columns].fillna(method='ffill').fillna(df[numeric_columns].median())
  df.loc[:, numeric_columns] = df.loc[:, numeric_columns].fillna(method='ffill').fillna(df[numeric_columns].median())


In [11]:
# If necessary change this part to test the model before the training process
df = challenge_set_updated.iloc[:,:]
# df = challenge_set_updated.sample(frac=0.001)

# Separating features and target variable
X = df.drop('tow', axis=1)
y = df['tow']

n_jobs = os.cpu_count() // 2

In [12]:
# Split the data into training and test sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the objective function to be used by Optuna for hyperparameter optimization
def objective(trial):
    # Suggest values for the hyperparameters to be optimized
    params = {
        'objective': 'reg:squarederror',
        'random_state': 42,
        'n_estimators': 1_000_000,  # Set a high value to allow early stopping
        'learning_rate': 0.1, #trial.suggest_float('learning_rate', 1e-3, 0.1, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'gamma': trial.suggest_float('gamma', 1e-3, 1.0, log=True),
        'subsample': 1.0,
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.4, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-4, 1.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-4, 1.0, log=True),
        'n_jobs': n_jobs,
        'eval_metric': 'rmse',
        'early_stopping_rounds': 20  # Define early stopping in the constructor
    }
    
    # Initialize the model with the suggested parameters
    model = XGBRegressor(**params)
    
    # Train the model on the validation set
    model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        verbose=1000
    )
    
    # Make predictions and calculate RMSE on the validation set
    preds = model.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val, preds))
    
    # Store the best_iteration for future use
    trial.set_user_attr("best_iteration", model.best_iteration)
    
    return rmse

# Start the study with Optuna
study = optuna.create_study(direction='minimize')  # Minimize the RMSE
study.optimize(objective, n_trials=10)  # Adjust the number of trials as necessary

# Display the best hyperparameters found
print(f"Best trial: {study.best_trial.params}")

# Get the best number of estimators (from early stopping during hyperparameter optimization)
best_n_estimators = study.best_trial.user_attrs["best_iteration"] + 1

# Train the final model with the best hyperparameters and best n_estimators on full training data
best_params = study.best_trial.params
best_params['n_estimators'] = best_n_estimators  # Use the optimal number of estimators

# Train the final model using the full training set (X, y)
best_model = XGBRegressor(
    **best_params,
    objective='reg:squarederror',
    random_state=42,
    n_jobs=n_jobs,
    early_stopping_rounds=20  # Early stopping in the constructor
)

# Train the model on the full dataset
best_model.fit(
    X, y,
    eval_set=[(X_val, y_val)],
    verbose=500
)

# Evaluate the performance on the val set
y_pred = best_model.predict(X_val)
r2 = r2_score(y_val, y_pred)
rmse = np.sqrt(mean_squared_error(y_val, y_pred))

# Display final performance and hyperparameters
print(f"Best Model Performance - R^2 Score: {r2:.4f}, RMSE: {rmse:.4f}")
print(f"Updated best_params: {best_params}")

[I 2024-10-18 21:29:16,116] A new study created in memory with name: no-name-e93b4735-64cd-4c43-b5ec-1092787314d9


[0]	validation_0-rmse:48109.65634
[498]	validation_0-rmse:2711.99664


[I 2024-10-18 21:31:48,872] Trial 0 finished with value: 2711.8619157249564 and parameters: {'max_depth': 13, 'min_child_weight': 10, 'gamma': 0.003528186188693561, 'colsample_bytree': 0.5875129489623385, 'reg_alpha': 0.015638678910209404, 'reg_lambda': 0.0005504996034221035}. Best is trial 0 with value: 2711.8619157249564.


[0]	validation_0-rmse:48222.50979
[1000]	validation_0-rmse:2996.89555
[2000]	validation_0-rmse:2857.15491
[2019]	validation_0-rmse:2857.26742


[I 2024-10-18 21:36:24,634] Trial 1 finished with value: 2857.08675224402 and parameters: {'max_depth': 4, 'min_child_weight': 1, 'gamma': 0.944860055048793, 'colsample_bytree': 0.4597664622555391, 'reg_alpha': 0.004216085508157351, 'reg_lambda': 0.0005164622181664028}. Best is trial 0 with value: 2711.8619157249564.


[0]	validation_0-rmse:48214.69751
[1000]	validation_0-rmse:2987.18826
[2000]	validation_0-rmse:2848.27456
[2871]	validation_0-rmse:2799.07920


[I 2024-10-18 21:42:58,967] Trial 2 finished with value: 2799.0329522839597 and parameters: {'max_depth': 4, 'min_child_weight': 2, 'gamma': 0.0055466750716694265, 'colsample_bytree': 0.666128058205729, 'reg_alpha': 0.02514097398756596, 'reg_lambda': 0.010163912745091477}. Best is trial 0 with value: 2711.8619157249564.


[0]	validation_0-rmse:48112.53198
[1000]	validation_0-rmse:2693.56715
[1219]	validation_0-rmse:2692.11950


[I 2024-10-18 21:47:51,273] Trial 3 finished with value: 2692.044198112313 and parameters: {'max_depth': 11, 'min_child_weight': 10, 'gamma': 0.030717549440002732, 'colsample_bytree': 0.5939643897159202, 'reg_alpha': 0.0167980422805703, 'reg_lambda': 0.005069980701783285}. Best is trial 3 with value: 2692.044198112313.


[0]	validation_0-rmse:48271.72678
[1000]	validation_0-rmse:3218.08368
[2000]	validation_0-rmse:3030.29802
[3000]	validation_0-rmse:2938.45090
[4000]	validation_0-rmse:2886.71533
[4892]	validation_0-rmse:2854.59137


[I 2024-10-18 21:59:09,382] Trial 4 finished with value: 2854.482032683168 and parameters: {'max_depth': 3, 'min_child_weight': 2, 'gamma': 0.0022064362855165986, 'colsample_bytree': 0.7714179209840213, 'reg_alpha': 0.6265526557684539, 'reg_lambda': 0.0001408668995952831}. Best is trial 3 with value: 2692.044198112313.


[0]	validation_0-rmse:48214.89964
[1000]	validation_0-rmse:2985.67731
[2000]	validation_0-rmse:2845.03305
[3000]	validation_0-rmse:2790.63275
[4000]	validation_0-rmse:2756.89376
[4336]	validation_0-rmse:2747.61291


[I 2024-10-18 22:08:57,893] Trial 5 finished with value: 2747.5181746129524 and parameters: {'max_depth': 4, 'min_child_weight': 1, 'gamma': 0.07424978950481402, 'colsample_bytree': 0.5845348509118424, 'reg_alpha': 0.04738196195793192, 'reg_lambda': 0.0010649997931709775}. Best is trial 3 with value: 2692.044198112313.


[0]	validation_0-rmse:48201.71972
[1000]	validation_0-rmse:2993.70521
[2000]	validation_0-rmse:2852.56533
[3000]	validation_0-rmse:2798.50317
[4000]	validation_0-rmse:2769.08369
[4122]	validation_0-rmse:2766.32500


[I 2024-10-18 22:18:22,555] Trial 6 finished with value: 2766.2862029737134 and parameters: {'max_depth': 4, 'min_child_weight': 2, 'gamma': 0.026417535432027905, 'colsample_bytree': 0.98759927531056, 'reg_alpha': 0.00015509864755035814, 'reg_lambda': 0.0009064493639640551}. Best is trial 3 with value: 2692.044198112313.


[0]	validation_0-rmse:48107.79898
[492]	validation_0-rmse:2714.80078


[I 2024-10-18 22:20:50,634] Trial 7 finished with value: 2714.7785034858402 and parameters: {'max_depth': 12, 'min_child_weight': 5, 'gamma': 0.2599207503726779, 'colsample_bytree': 0.8014379519145451, 'reg_alpha': 0.007832863105360332, 'reg_lambda': 0.003000418559365606}. Best is trial 3 with value: 2692.044198112313.


[0]	validation_0-rmse:48177.42672
[1000]	validation_0-rmse:2866.09134
[2000]	validation_0-rmse:2773.82472
[3000]	validation_0-rmse:2736.26108
[3319]	validation_0-rmse:2731.46695


[I 2024-10-18 22:28:45,971] Trial 8 finished with value: 2731.0288388395757 and parameters: {'max_depth': 5, 'min_child_weight': 1, 'gamma': 0.01420856544122998, 'colsample_bytree': 0.7521548014170082, 'reg_alpha': 0.0014548901590608542, 'reg_lambda': 0.0005717808996170741}. Best is trial 3 with value: 2692.044198112313.


[0]	validation_0-rmse:48106.51192
[660]	validation_0-rmse:2721.88322


[I 2024-10-18 22:32:13,065] Trial 9 finished with value: 2721.8403150520294 and parameters: {'max_depth': 13, 'min_child_weight': 9, 'gamma': 0.0015634412197371387, 'colsample_bytree': 0.896024877403942, 'reg_alpha': 0.035282574539999126, 'reg_lambda': 0.0009149776434229105}. Best is trial 3 with value: 2692.044198112313.


Best trial: {'max_depth': 11, 'min_child_weight': 10, 'gamma': 0.030717549440002732, 'colsample_bytree': 0.5939643897159202, 'reg_alpha': 0.0167980422805703, 'reg_lambda': 0.005069980701783285}
[0]	validation_0-rmse:37498.72436
[500]	validation_0-rmse:301.48700
[1000]	validation_0-rmse:54.47217
[1200]	validation_0-rmse:27.42262
Best Model Performance - R^2 Score: 0.9972, RMSE: 2801.7486
Updated best_params: {'max_depth': 11, 'min_child_weight': 10, 'gamma': 0.030717549440002732, 'colsample_bytree': 0.5939643897159202, 'reg_alpha': 0.0167980422805703, 'reg_lambda': 0.005069980701783285, 'n_estimators': 1201}


In [13]:
# Display evaluation metrics
print(f"Final Model Performance - R^2 Score: {r2:.4f}, RMSE: {rmse:.4f}")

Final Model Performance - R^2 Score: 0.9972, RMSE: 2801.7486


In [14]:
# Save R², RMSE, and hyperparameters
results = {
    'R2': float(r2),
    'RMSE': float(rmse),
    'Best Parameters': {key: (int(value) if isinstance(value, np.integer) else float(value)
                              if isinstance(value, np.floating) else value)
                        for key, value in best_params.items()}
}

# Set timezone to São Paulo (UTC-3)
saopaulo_tz = pytz.timezone('America/Sao_Paulo')
timestamp = datetime.now(saopaulo_tz).strftime('%Y%m%d_%H%M%S')

# Define logs directory, and create them if they don't exist
logs_dir = 'logs'
os.makedirs(logs_dir, exist_ok=True)

# Define file paths within the respective directories
results_file = os.path.join(logs_dir, f'model_results_{timestamp}.txt')

# Save the results to a TXT file
with open(results_file, 'w') as file:
    file.write(f"R2: {results['R2']}\n")
    file.write(f"RMSE: {results['RMSE']}\n")
    file.write("Best Parameters:\n")
    for param, value in results['Best Parameters'].items():
        file.write(f"  {param}: {value}\n")

print(f"Results saved to {results_file}")

Results saved to logs/model_results_20241018_193738.txt


In [15]:
# Define models directory, and create them if they don't exist
models_dir = 'models'
os.makedirs(models_dir, exist_ok=True)

# Update the learning_rate in best_params
best_params['learning_rate'] = 0.005

# Train the final model using the full training+validation+test set with the optimal n_estimators
final_model = XGBRegressor(**best_params, objective='reg:squarederror', random_state=42, n_jobs=n_jobs)

# Train the model on the entire training+validation+set data
final_model.fit(X, y, verbose=True)

print("Final model trained successfully using all available data.")


Final model trained successfully using all available data.


In [16]:
# Define file paths within the respective directories
model_file = os.path.join(models_dir, f'trained_model_{timestamp}.joblib')

# Save the trained model to a file in the models folder
joblib.dump(final_model, model_file)
print(f"Model saved to {model_file}")

Model saved to models/trained_model_20241018_193738.joblib


In [17]:
# Use the final model to predict the `tow` for the submission_set_updated
submission_set_features = submission_set_updated.drop("tow", axis=1)
submission_set['tow'] = final_model.predict(submission_set_features)

submission_set

ValueError: Length of values (264108) does not match length of index (158149)

In [None]:
# Define the submissions directory and create it if it doesn't exist
submissions_dir = 'submissions'
os.makedirs(submissions_dir, exist_ok=True)

# Save the submission with a timestamp in the filename
submission_file = os.path.join(submissions_dir, f"submission_{timestamp}.csv")
submission_set.to_csv(submission_file, index=False)