# XGBoost Model

In [1]:
import pandas as pd
from xgboost import XGBRegressor, callback
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import r2_score, mean_squared_error
import numpy as np
from tqdm import tqdm
from datetime import datetime
import pytz
import json
import joblib 
import os
import optuna

In [2]:
# Read the post-EDA challenge data plus the raw and processed submission sets
challenge_set_updated = pd.read_csv("./data/challenge_set_updated_v7.csv")
submission_set = pd.read_csv("./data/submission_set.csv")
submission_set_updated = pd.read_csv("./data/submission_set_updated_v7.csv")

# Working frame for modelling. Swap the two lines below to run a quick
# smoke test on a small random sample before launching the full training.
df = challenge_set_updated.iloc[:, :]
# df = challenge_set_updated.sample(frac=0.001)

# Target is the 'tow' column; every other column is used as a feature
y = df['tow']
X = df.drop(columns='tow')

# Use half of the available CPUs for training.
# os.cpu_count() can return None when the count is undeterminable, and the
# integer division gives 0 on a single-CPU host, so fall back and clamp to 1.
n_jobs = max(1, (os.cpu_count() or 1) // 2)

In [None]:
# Hold out 20% of the rows as the test set, then carve a further 20% of the
# remaining rows out as the validation set used for early stopping.
X_train_full, X_test, y_train_full, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_full,
    y_train_full,
    test_size=0.2,
    random_state=42,
)

# Objective evaluated by Optuna: one trial == one early-stopped XGBoost fit
def objective(trial):
    """Fit an XGBRegressor with trial-suggested hyperparameters and score it.

    The model is trained on (X_train, y_train) and monitored on (X_val, y_val)
    with RMSE; training stops after 20 rounds without improvement.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        RMSE of the fitted model's predictions on the validation set.
    """
    # Hyperparameters explored by the search (sampled in this order)
    tuned = {
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.1, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'gamma': trial.suggest_float('gamma', 1e-3, 1.0, log=True),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.4, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-4, 1.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-4, 1.0, log=True),
    }

    # Settings shared by every trial
    fixed = {
        'objective': 'reg:squarederror',
        'random_state': 42,
        'n_estimators': 1_000_000,  # Deliberately huge; early stopping decides the real count
        'subsample': 1.0,
        'n_jobs': n_jobs,
        'eval_metric': 'rmse',  # Passed to the constructor rather than fit()
        'early_stopping_rounds': 20,  # Passed to the constructor rather than fit()
    }

    regressor = XGBRegressor(**fixed, **tuned)

    # Validation set drives early stopping; log every 1000 boosting rounds
    regressor.fit(
        X_train,
        y_train,
        eval_set=[(X_val, y_val)],
        verbose=1000,
    )

    # Score the trial on the validation split
    val_predictions = regressor.predict(X_val)
    return np.sqrt(mean_squared_error(y_val, val_predictions))

# Start the study with Optuna.
# The sampler is seeded (same seed as the splits and the model) so that the
# sequence of suggested hyperparameters is repeatable from one run to the next.
study = optuna.create_study(
    direction='minimize',  # Minimize the RMSE
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=50)  # Adjust the number of trials as necessary

# Display the best hyperparameters found
print(f"Best trial: {study.best_trial.params}")

# Refit a model with the best hyperparameters found by the study.
# Work on a copy so later edits to best_params never touch the study's record.
best_params = dict(study.best_trial.params)

# best_trial.params only holds the *suggested* values, so the early-stopping
# settings used inside objective() must be passed again here. Without them the
# fit would try to build all 1,000,000 trees and best_iteration would not
# reflect an early-stopped optimum.
best_model = XGBRegressor(
    **best_params,
    objective='reg:squarederror',
    random_state=42,
    n_estimators=1_000_000,
    n_jobs=n_jobs,
    eval_metric='rmse',
    early_stopping_rounds=20,
)

# Train the model with early stopping monitored on the validation set
best_model.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    verbose=500
)

# Record the number of trees at the best iteration (best_iteration is 0-based)
# so the later refit on all data can use a fixed n_estimators.
best_params['n_estimators'] = best_model.best_iteration + 1

# Evaluate the performance on the held-out test set
y_pred = best_model.predict(X_test)
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print(f"Best Model Performance - R^2 Score: {r2:.4f}, RMSE: {rmse:.4f}")
print(f"Updated best_params: {best_params}")


[I 2024-09-13 03:14:17,805] A new study created in memory with name: no-name-5e934a07-50a3-4200-96ca-372cbbbfbcac


[0]	validation_0-rmse:53028.08490
[1000]	validation_0-rmse:3403.39350
[2000]	validation_0-rmse:3337.55713
[3000]	validation_0-rmse:3312.21298
[4000]	validation_0-rmse:3300.42199
[4605]	validation_0-rmse:3296.37625


[I 2024-09-13 03:30:41,925] Trial 0 finished with value: 3296.3675170364886 and parameters: {'learning_rate': 0.007569165081214165, 'max_depth': 15, 'min_child_weight': 8, 'gamma': 0.10997095295835893, 'colsample_bytree': 0.5415703588078289, 'reg_alpha': 0.7476742564840364, 'reg_lambda': 0.00010521677052091404}. Best is trial 0 with value: 3296.3675170364886.


[0]	validation_0-rmse:52880.97856
[1000]	validation_0-rmse:3377.56768
[2000]	validation_0-rmse:3331.23844
[3000]	validation_0-rmse:3312.38523
[4000]	validation_0-rmse:3303.20366
[5000]	validation_0-rmse:3297.99779
[5973]	validation_0-rmse:3294.88116


[I 2024-09-13 03:53:18,881] Trial 1 finished with value: 3294.869499072892 and parameters: {'learning_rate': 0.010364167332776215, 'max_depth': 15, 'min_child_weight': 1, 'gamma': 0.06250594645395556, 'colsample_bytree': 0.4340437646679536, 'reg_alpha': 0.255983719187635, 'reg_lambda': 0.3780729793233056}. Best is trial 1 with value: 3294.869499072892.


[0]	validation_0-rmse:53345.89784
[1000]	validation_0-rmse:11424.25894
[2000]	validation_0-rmse:4221.59043
[3000]	validation_0-rmse:3551.95650
[4000]	validation_0-rmse:3467.03170
[5000]	validation_0-rmse:3429.22761
[6000]	validation_0-rmse:3404.97780
[7000]	validation_0-rmse:3386.70512
[8000]	validation_0-rmse:3372.67527
[9000]	validation_0-rmse:3363.00969
[10000]	validation_0-rmse:3353.08688
[11000]	validation_0-rmse:3343.99973
[12000]	validation_0-rmse:3336.50548
[13000]	validation_0-rmse:3330.04958
[14000]	validation_0-rmse:3323.77860
[15000]	validation_0-rmse:3319.10531
[16000]	validation_0-rmse:3314.26127
[17000]	validation_0-rmse:3310.64240
[18000]	validation_0-rmse:3307.73600
[19000]	validation_0-rmse:3305.04661
[20000]	validation_0-rmse:3302.34125
[21000]	validation_0-rmse:3300.06149
[22000]	validation_0-rmse:3298.11016
[23000]	validation_0-rmse:3296.17033
[24000]	validation_0-rmse:3294.44265
[24613]	validation_0-rmse:3293.52471


[I 2024-09-13 05:06:44,972] Trial 2 finished with value: 3293.5102830496303 and parameters: {'learning_rate': 0.0016005007015330677, 'max_depth': 14, 'min_child_weight': 10, 'gamma': 0.037227561890735346, 'colsample_bytree': 0.727812048187759, 'reg_alpha': 0.002188640020079764, 'reg_lambda': 0.5346097638951908}. Best is trial 2 with value: 3293.5102830496303.


[0]	validation_0-rmse:52095.17898
[1000]	validation_0-rmse:3331.25029
[1965]	validation_0-rmse:3294.09492


[I 2024-09-13 05:12:10,164] Trial 3 finished with value: 3293.91024157036 and parameters: {'learning_rate': 0.02519800754609529, 'max_depth': 13, 'min_child_weight': 8, 'gamma': 0.5527998546086038, 'colsample_bytree': 0.7057073947645749, 'reg_alpha': 0.129920199645181, 'reg_lambda': 0.6760486690948311}. Best is trial 2 with value: 3293.5102830496303.


[0]	validation_0-rmse:53153.32281
[1000]	validation_0-rmse:3847.44603
[2000]	validation_0-rmse:3651.30986
[3000]	validation_0-rmse:3566.58683
[4000]	validation_0-rmse:3516.49865
[5000]	validation_0-rmse:3484.64908
[6000]	validation_0-rmse:3455.68742
[7000]	validation_0-rmse:3436.71771
[8000]	validation_0-rmse:3422.00349
[9000]	validation_0-rmse:3404.41670
[10000]	validation_0-rmse:3387.96275
[11000]	validation_0-rmse:3376.54604
[12000]	validation_0-rmse:3366.61040
[12949]	validation_0-rmse:3358.94792


[I 2024-09-13 05:35:40,753] Trial 4 finished with value: 3358.918362889335 and parameters: {'learning_rate': 0.005234907385988487, 'max_depth': 9, 'min_child_weight': 7, 'gamma': 0.03546171148099006, 'colsample_bytree': 0.9668096235967286, 'reg_alpha': 0.009000791127987321, 'reg_lambda': 0.5356866892087341}. Best is trial 2 with value: 3293.5102830496303.


[0]	validation_0-rmse:53030.17946
[1000]	validation_0-rmse:4150.81416
[2000]	validation_0-rmse:3945.66771
[3000]	validation_0-rmse:3828.01466
[4000]	validation_0-rmse:3748.76129
[5000]	validation_0-rmse:3693.03785
[6000]	validation_0-rmse:3652.74676
[7000]	validation_0-rmse:3619.67171
[8000]	validation_0-rmse:3591.84717
[9000]	validation_0-rmse:3566.96239
[10000]	validation_0-rmse:3543.44758
[11000]	validation_0-rmse:3522.01171
[12000]	validation_0-rmse:3503.98518
[13000]	validation_0-rmse:3488.79702
[14000]	validation_0-rmse:3475.46483
[15000]	validation_0-rmse:3463.31114
[16000]	validation_0-rmse:3453.47619
[17000]	validation_0-rmse:3443.07646
[18000]	validation_0-rmse:3433.32364
[19000]	validation_0-rmse:3424.76562
[20000]	validation_0-rmse:3417.26046
[21000]	validation_0-rmse:3411.05236
[22000]	validation_0-rmse:3404.84571
[22399]	validation_0-rmse:3402.59022


[I 2024-09-13 06:09:14,210] Trial 5 finished with value: 3402.574852924511 and parameters: {'learning_rate': 0.007614866109937625, 'max_depth': 6, 'min_child_weight': 8, 'gamma': 0.7651236691866169, 'colsample_bytree': 0.5065945799582415, 'reg_alpha': 0.0009879072272989773, 'reg_lambda': 0.06969008303086907}. Best is trial 2 with value: 3293.5102830496303.


[0]	validation_0-rmse:53065.12462
[1000]	validation_0-rmse:3451.22269
[2000]	validation_0-rmse:3370.65961
[3000]	validation_0-rmse:3337.74122
[4000]	validation_0-rmse:3320.13909
[5000]	validation_0-rmse:3309.07125
[5915]	validation_0-rmse:3303.37901


[I 2024-09-13 06:22:46,983] Trial 6 finished with value: 3303.348185484985 and parameters: {'learning_rate': 0.006887061396352272, 'max_depth': 14, 'min_child_weight': 9, 'gamma': 0.002706979973385143, 'colsample_bytree': 0.7494676020515567, 'reg_alpha': 0.005595471412949319, 'reg_lambda': 0.1169694807207387}. Best is trial 2 with value: 3293.5102830496303.


[0]	validation_0-rmse:51398.28809
[1000]	validation_0-rmse:3587.29899
[2000]	validation_0-rmse:3454.93269
[3000]	validation_0-rmse:3397.11684
[4000]	validation_0-rmse:3369.90614
[4964]	validation_0-rmse:3351.01563


[I 2024-09-13 06:30:22,485] Trial 7 finished with value: 3350.8993561890825 and parameters: {'learning_rate': 0.03852235170880481, 'max_depth': 7, 'min_child_weight': 8, 'gamma': 0.3089031744714691, 'colsample_bytree': 0.4085384156777457, 'reg_alpha': 0.0012916714220192755, 'reg_lambda': 0.045833079502469586}. Best is trial 2 with value: 3293.5102830496303.


[0]	validation_0-rmse:51856.49693
[1000]	validation_0-rmse:4414.52211
[2000]	validation_0-rmse:4176.76790
[3000]	validation_0-rmse:4039.39276
[4000]	validation_0-rmse:3955.64432
[5000]	validation_0-rmse:3899.66701
[6000]	validation_0-rmse:3854.53707
[7000]	validation_0-rmse:3818.10727
[8000]	validation_0-rmse:3787.93047
[9000]	validation_0-rmse:3762.81525
[10000]	validation_0-rmse:3741.67545
[11000]	validation_0-rmse:3722.38326
[12000]	validation_0-rmse:3706.15208
[13000]	validation_0-rmse:3690.34502
[14000]	validation_0-rmse:3677.89511
[15000]	validation_0-rmse:3667.09353
[16000]	validation_0-rmse:3655.97781
[17000]	validation_0-rmse:3645.26995
[18000]	validation_0-rmse:3635.56272
[19000]	validation_0-rmse:3625.80621
[20000]	validation_0-rmse:3616.21834
[21000]	validation_0-rmse:3607.55157
[22000]	validation_0-rmse:3599.76433
[22413]	validation_0-rmse:3596.80693


[I 2024-09-13 07:00:33,033] Trial 8 finished with value: 3596.786766932317 and parameters: {'learning_rate': 0.030741653404632835, 'max_depth': 3, 'min_child_weight': 5, 'gamma': 0.021424965698175973, 'colsample_bytree': 0.44850503901891847, 'reg_alpha': 0.4186623722336174, 'reg_lambda': 0.531347639204047}. Best is trial 2 with value: 3293.5102830496303.


[0]	validation_0-rmse:52962.14594
[1000]	validation_0-rmse:3468.79794
[2000]	validation_0-rmse:3380.66513
[3000]	validation_0-rmse:3340.92719
[4000]	validation_0-rmse:3317.60960
[5000]	validation_0-rmse:3303.46589
[6000]	validation_0-rmse:3294.25444
[6618]	validation_0-rmse:3288.95306


[I 2024-09-13 07:14:02,615] Trial 9 finished with value: 3288.939801874509 and parameters: {'learning_rate': 0.008830139045133923, 'max_depth': 12, 'min_child_weight': 6, 'gamma': 0.01730666445354706, 'colsample_bytree': 0.6705039060878218, 'reg_alpha': 0.0002489023279750477, 'reg_lambda': 0.254477859158967}. Best is trial 9 with value: 3288.939801874509.


[0]	validation_0-rmse:48516.26892
[1000]	validation_0-rmse:3356.83102


[I 2024-09-13 07:15:50,987] Trial 10 finished with value: 3356.4471859530745 and parameters: {'learning_rate': 0.09255159460232132, 'max_depth': 10, 'min_child_weight': 4, 'gamma': 0.0044649227557944605, 'colsample_bytree': 0.8722613811041895, 'reg_alpha': 0.00010846811597153689, 'reg_lambda': 0.003221669444056658}. Best is trial 9 with value: 3288.939801874509.


[0]	validation_0-rmse:53364.99828
[1000]	validation_0-rmse:15886.85308
[2000]	validation_0-rmse:5822.99058
[3000]	validation_0-rmse:3882.92448
[4000]	validation_0-rmse:3619.11581
[5000]	validation_0-rmse:3549.86827
[6000]	validation_0-rmse:3509.35487
[7000]	validation_0-rmse:3481.65541
[8000]	validation_0-rmse:3461.04440
[9000]	validation_0-rmse:3444.80100
[10000]	validation_0-rmse:3431.63972
[11000]	validation_0-rmse:3421.28807
[12000]	validation_0-rmse:3411.07831
[13000]	validation_0-rmse:3400.50534
[14000]	validation_0-rmse:3391.46734
[15000]	validation_0-rmse:3383.94381
[16000]	validation_0-rmse:3377.11583
[17000]	validation_0-rmse:3371.01177
[18000]	validation_0-rmse:3365.33382
[19000]	validation_0-rmse:3360.41977
[20000]	validation_0-rmse:3356.44701
[21000]	validation_0-rmse:3352.64354
[22000]	validation_0-rmse:3347.94604
[23000]	validation_0-rmse:3344.56504
[24000]	validation_0-rmse:3340.80461
[25000]	validation_0-rmse:3337.56585
[26000]	validation_0-rmse:3334.37177
[27000]	vali

[I 2024-09-13 08:23:26,489] Trial 11 finished with value: 3318.8839313862754 and parameters: {'learning_rate': 0.0012386680517441778, 'max_depth': 12, 'min_child_weight': 10, 'gamma': 0.009716340807768656, 'colsample_bytree': 0.6377865000676443, 'reg_alpha': 0.0001428558469890162, 'reg_lambda': 0.006764281891331671}. Best is trial 9 with value: 3288.939801874509.


[0]	validation_0-rmse:53364.96537
[1000]	validation_0-rmse:15895.70869
[2000]	validation_0-rmse:5868.27268
[3000]	validation_0-rmse:3944.57396
[4000]	validation_0-rmse:3677.29341
[5000]	validation_0-rmse:3603.31388
[6000]	validation_0-rmse:3562.12038
[7000]	validation_0-rmse:3530.97463
[8000]	validation_0-rmse:3508.30033
[9000]	validation_0-rmse:3492.53487
[10000]	validation_0-rmse:3478.95158
[11000]	validation_0-rmse:3467.99328
[12000]	validation_0-rmse:3458.45913
[13000]	validation_0-rmse:3448.84037
[14000]	validation_0-rmse:3440.21068
[15000]	validation_0-rmse:3431.51352
[16000]	validation_0-rmse:3423.85437
[17000]	validation_0-rmse:3417.37356
[18000]	validation_0-rmse:3411.73976
[19000]	validation_0-rmse:3406.45118
[20000]	validation_0-rmse:3401.62617
[21000]	validation_0-rmse:3396.70903
[22000]	validation_0-rmse:3391.36122
[23000]	validation_0-rmse:3387.48879
[24000]	validation_0-rmse:3383.25795
[25000]	validation_0-rmse:3378.37539
[26000]	validation_0-rmse:3375.12639
[27000]	vali

[I 2024-09-13 09:53:32,193] Trial 12 finished with value: 3331.148438043154 and parameters: {'learning_rate': 0.0012391954223833696, 'max_depth': 11, 'min_child_weight': 3, 'gamma': 0.012318717038533471, 'colsample_bytree': 0.8020871813838039, 'reg_alpha': 0.0008367835099811545, 'reg_lambda': 0.026581056419018298}. Best is trial 9 with value: 3288.939801874509.


[0]	validation_0-rmse:53281.80250
[1000]	validation_0-rmse:4897.57536
[2000]	validation_0-rmse:3564.81110
[3000]	validation_0-rmse:3475.89165
[4000]	validation_0-rmse:3432.11165
[5000]	validation_0-rmse:3406.42391
[6000]	validation_0-rmse:3386.12259
[7000]	validation_0-rmse:3370.44130
[8000]	validation_0-rmse:3357.46677
[9000]	validation_0-rmse:3347.01733
[10000]	validation_0-rmse:3337.23279
[11000]	validation_0-rmse:3330.02285
[12000]	validation_0-rmse:3324.32212
[13000]	validation_0-rmse:3319.80215
[14000]	validation_0-rmse:3316.38339
[15000]	validation_0-rmse:3312.25495
[16000]	validation_0-rmse:3309.21821
[16158]	validation_0-rmse:3308.89570


[I 2024-09-13 10:26:55,274] Trial 13 finished with value: 3308.874359910568 and parameters: {'learning_rate': 0.0028037697665779446, 'max_depth': 12, 'min_child_weight': 6, 'gamma': 0.0012940687187252568, 'colsample_bytree': 0.6323190991378431, 'reg_alpha': 0.04436462408705523, 'reg_lambda': 0.001427818027213409}. Best is trial 9 with value: 3288.939801874509.


[0]	validation_0-rmse:53284.67403
[1000]	validation_0-rmse:5282.65406
[2000]	validation_0-rmse:3829.45696
[3000]	validation_0-rmse:3701.22490
[4000]	validation_0-rmse:3639.15995
[5000]	validation_0-rmse:3592.49959
[6000]	validation_0-rmse:3557.50944
[7000]	validation_0-rmse:3530.16766


In [None]:
# Collect R², RMSE and the hyperparameters, converting NumPy scalars to
# plain Python numbers so they are written in their built-in form
plain_params = {}
for key, value in best_params.items():
    if isinstance(value, np.integer):
        plain_params[key] = int(value)
    elif isinstance(value, np.floating):
        plain_params[key] = float(value)
    else:
        plain_params[key] = value

results = {
    'R2': float(r2),
    'RMSE': float(rmse),
    'Best Parameters': plain_params,
}

# Timestamp in São Paulo local time; reused for the other artefact filenames
saopaulo_tz = pytz.timezone('America/Sao_Paulo')
timestamp = datetime.now(saopaulo_tz).strftime('%Y%m%d_%H%M%S')

# Make sure the logs directory exists
logs_dir = 'logs'
os.makedirs(logs_dir, exist_ok=True)

# Destination of the plain-text report
results_file = os.path.join(logs_dir, f'model_results_{timestamp}.txt')

# Assemble the report line by line, then write it in one go
report_lines = [
    f"R2: {results['R2']}\n",
    f"RMSE: {results['RMSE']}\n",
    "Best Parameters:\n",
]
for param, value in results['Best Parameters'].items():
    report_lines.append(f"  {param}: {value}\n")

with open(results_file, 'w') as out:
    out.writelines(report_lines)

print(f"Results saved to {results_file}")

In [None]:
# Show the R² and RMSE values currently held in `r2` and `rmse`
metrics_line = f"Final Model Performance - R^2 Score: {r2:.4f}, RMSE: {rmse:.4f}"
print(metrics_line)

In [None]:
# Make sure the directory for serialized models exists
models_dir = 'models'
os.makedirs(models_dir, exist_ok=True)

# Build the final estimator from best_params (which by now also carries the
# n_estimators value derived from the early-stopped run)
final_model = XGBRegressor(
    objective='reg:squarederror',
    random_state=42,
    n_jobs=n_jobs,
    **best_params,
)

# Fit on every labelled row: training + validation + test data together
final_model.fit(X, y, verbose=True)

print("Final model trained successfully using all available data.")

In [None]:
# Timestamped destination for the serialized model inside the models folder
model_file = os.path.join(models_dir, f'trained_model_{timestamp}.joblib')

# Persist the fitted estimator with joblib
joblib.dump(value=final_model, filename=model_file)
print(f"Model saved to {model_file}")

In [None]:
# Use the final model to predict the `tow` for the submission_set_updated.
# Features are selected by name, in the same column order the model was
# trained on, instead of assuming the target is the last column of the frame.
# NOTE(review): assumes submission_set and submission_set_updated are
# row-aligned, since predictions are assigned positionally - confirm.
submission_set_features = submission_set_updated[X.columns]
submission_set['tow'] = final_model.predict(submission_set_features)

submission_set

In [None]:
# Summary statistics of the predicted `tow` values
submission_set.loc[:, 'tow'].describe()

In [None]:
# Summary statistics of the `tow` column in the challenge data, for comparison
challenge_set_updated.loc[:, 'tow'].describe()

In [None]:
# Make sure the submissions directory exists
submissions_dir = 'submissions'
os.makedirs(submissions_dir, exist_ok=True)

# Write the submission to a timestamped CSV, without the index column
submission_file = os.path.join(submissions_dir, f"submission_{timestamp}.csv")
submission_set.to_csv(path_or_buf=submission_file, index=False)