# XGBoost Model

In [1]:
import pandas as pd
from xgboost import XGBRegressor, callback
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.preprocessing import RobustScaler
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
import numpy as np
from tqdm import tqdm
from datetime import datetime
import pytz
import json
import joblib 
import os
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.linear_model import LassoCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import RFE

In [2]:
# Load the post-EDA datasets. Paths are named up front so they are easy to
# spot and change:
#   - challenge set: supplies the features and the `tow` target for training
#   - submission set: the table that later receives the predicted `tow`
#   - updated submission set: the feature table the predictions are made from
CHALLENGE_CSV = "./data/challenge_set_updated_v19_xgboost.csv"
SUBMISSION_CSV = "./data/final_submission_set.csv"
SUBMISSION_FEATURES_CSV = "./data/submission_set_updated_v19_xgboost.csv"

challenge_set_updated = pd.read_csv(CHALLENGE_CSV)
submission_set = pd.read_csv(SUBMISSION_CSV)
submission_set_updated = pd.read_csv(SUBMISSION_FEATURES_CSV)

In [3]:
def clean_data_better(df, threshold=1e10):
    """Sanitise the numeric columns of ``df`` in place and return the same frame.

    Processing order:
      1. Replace ``+inf`` / ``-inf`` with NaN across the whole frame.
      2. Replace numeric values whose absolute value exceeds ``threshold``
         with NaN.
      3. Fill the resulting NaNs by forward fill down each numeric column,
         then fill whatever is still missing (e.g. leading NaNs) with that
         column's median.

    The medians are taken after step 2 and before the forward fill, so
    propagated values do not skew them. Non-numeric columns are never passed
    to ``median`` or the fill steps, which keeps the function usable on frames
    that still carry string/categorical columns.

    Note that forward fill copies the value from the previous row, so the
    result depends on the row order of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is modified in place.
    threshold : float, default 1e10
        Numeric values with ``abs(value) > threshold`` are treated as missing.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, after cleaning.
    """
    # Infinite values become NaN so they go through the same fill logic
    df.replace([np.inf, -np.inf], np.nan, inplace=True)

    numeric_columns = df.select_dtypes(include=[np.number]).columns
    if len(numeric_columns) == 0:
        # Nothing numeric to mask or fill
        return df

    # Blank out implausibly large magnitudes
    numeric = df[numeric_columns]
    numeric = numeric.mask(numeric.abs() > threshold)

    # Medians come from the numeric columns only; calling median on the whole
    # frame fails when a non-numeric column is present
    medians = numeric.median()

    # .ffill() replaces the deprecated fillna(method='ffill')
    df[numeric_columns] = numeric.ffill().fillna(medians)

    return df

# Run the cleaner over the training table and the submission feature table,
# rebinding each name to the frame the cleaner returns
challenge_set_updated, submission_set_updated = (
    clean_data_better(challenge_set_updated),
    clean_data_better(submission_set_updated),
)

  df[numeric_columns] = df[numeric_columns].fillna(method='ffill').fillna(df.median())
  df[numeric_columns] = df[numeric_columns].fillna(method='ffill').fillna(df.median())


In [4]:
# Working frame for modelling. Narrow this slice (rows and/or columns) to
# smoke-test the pipeline on a subset before a full training run.
df = challenge_set_updated.iloc[:, :]

# Split into the feature matrix and the regression target (`tow`)
X = df.drop('tow', axis=1)
y = df['tow']

# Use half of the available CPUs for training. os.cpu_count() can return None
# when the count cannot be determined, so fall back to 1 in that case, and
# never go below one worker on a single-core host.
n_jobs = max(1, (os.cpu_count() or 1) // 2)

In [5]:
# Hold out 20% of the rows. The hold-out is used both as the early-stopping
# evaluation set and for the metrics printed at the end of this cell.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fixed hyperparameters for the regressor
# (presumably the outcome of an earlier search — TODO confirm provenance)
best_params = dict(
    subsample=1.0,
    reg_lambda=0.46415888336127775,
    reg_alpha=0.166810053720005,
    min_child_weight=4,
    max_depth=13,
    learning_rate=0.01,
    gamma=0.4444444444444444,
    colsample_bytree=0.6,
)

# Constructor settings layered on top of the hyperparameters. n_estimators is
# set very high on purpose: early stopping (50 rounds, RMSE) decides when
# boosting actually stops.
early_stopping_setup = dict(
    objective='reg:squarederror',
    random_state=42,
    n_estimators=10_000_000,
    n_jobs=n_jobs,
    eval_metric="rmse",
    early_stopping_rounds=50,
)
best_model = XGBRegressor(**best_params, **early_stopping_setup)

# Fit on the training split, monitoring the hold-out split
best_model.fit(
    X_train,
    y_train,
    eval_set=[(X_val, y_val)],
    verbose=100,
)

# Record the tree count picked by early stopping (best_iteration is
# zero-indexed, hence the +1) so the later refit can reuse it
best_params['n_estimators'] = best_model.best_iteration + 1

# Score the early-stopped model on the hold-out split
y_pred = best_model.predict(X_val)
r2 = r2_score(y_val, y_pred)
val_mse = mean_squared_error(y_val, y_pred)
rmse = np.sqrt(val_mse)

print(f"Best Model Performance - R^2 Score: {r2:.4f}, RMSE: {rmse:.4f}")
print(f"Updated best_params: {best_params}")

[0]	validation_0-rmse:52382.48765
[100]	validation_0-rmse:19453.26245
[200]	validation_0-rmse:7646.13051
[300]	validation_0-rmse:3791.96361
[400]	validation_0-rmse:2812.75160
[500]	validation_0-rmse:2604.05764
[600]	validation_0-rmse:2545.64236
[700]	validation_0-rmse:2517.97985
[800]	validation_0-rmse:2501.20859
[900]	validation_0-rmse:2488.38003
[1000]	validation_0-rmse:2477.74009
[1100]	validation_0-rmse:2469.37012
[1200]	validation_0-rmse:2462.34166
[1300]	validation_0-rmse:2456.74261
[1400]	validation_0-rmse:2451.99326
[1500]	validation_0-rmse:2447.97381
[1600]	validation_0-rmse:2444.73808
[1700]	validation_0-rmse:2441.73552
[1800]	validation_0-rmse:2439.02880
[1900]	validation_0-rmse:2436.72697
[2000]	validation_0-rmse:2434.74560
[2100]	validation_0-rmse:2432.92566
[2200]	validation_0-rmse:2431.38957
[2300]	validation_0-rmse:2430.05128
[2400]	validation_0-rmse:2428.84388
[2500]	validation_0-rmse:2427.71702
[2600]	validation_0-rmse:2426.71163
[2700]	validation_0-rmse:2425.92155
[2

In [6]:
def _as_builtin(value):
    """Turn NumPy integer/floating scalars into plain int/float; return anything else unchanged."""
    if isinstance(value, np.integer):
        return int(value)
    if isinstance(value, np.floating):
        return float(value)
    return value


# Gather the hold-out metrics and the hyperparameters as plain Python values
results = {
    'R2': float(r2),
    'RMSE': float(rmse),
    'Best Parameters': {
        name: _as_builtin(setting) for name, setting in best_params.items()
    },
}

# Run timestamp, taken in the São Paulo time zone
saopaulo_tz = pytz.timezone('America/Sao_Paulo')
timestamp = datetime.now(saopaulo_tz).strftime('%Y%m%d_%H%M%S')

# Make sure the logs folder exists
logs_dir = 'logs'
os.makedirs(logs_dir, exist_ok=True)

# Log file for this run, named after the timestamp
results_file = os.path.join(logs_dir, f'model_results_{timestamp}_xgboost.txt')

# Build the report first, then write it out in one go
report_lines = [
    f"R2: {results['R2']}\n",
    f"RMSE: {results['RMSE']}\n",
    "Best Parameters:\n",
]
report_lines += [
    f"  {param}: {value}\n"
    for param, value in results['Best Parameters'].items()
]

with open(results_file, 'w') as file:
    file.writelines(report_lines)

print(f"Results saved to {results_file}")

Results saved to logs/model_results_20241024_175312_xgboost.txt


In [7]:
# Show the hold-out metrics again. r2 and rmse come from the early-stopped
# model's predictions on the validation split, not from a model refit on
# all rows.
final_metrics_line = f"Final Model Performance - R^2 Score: {r2:.4f}, RMSE: {rmse:.4f}"
print(final_metrics_line)

Final Model Performance - R^2 Score: 0.9979, RMSE: 2413.0588


In [8]:
# Make sure the folder for serialized models exists
models_dir = 'models'
os.makedirs(models_dir, exist_ok=True)

# Refit on every labelled row (X, y). best_params already carries the
# n_estimators chosen by early stopping, so this fit uses no evaluation set.
final_model = XGBRegressor(
    **best_params,
    objective='reg:squarederror',
    random_state=42,
    n_jobs=n_jobs,
)
final_model.fit(X, y, verbose=100)

print("Final model trained successfully using all available data.")

Final model trained successfully using all available data.


In [9]:
# Persist the refit model under the models folder. The file name reuses the
# run timestamp, so the model and the results log share the same stamp.
model_filename = f'trained_model_{timestamp}_xgboost.joblib'
model_file = os.path.join(models_dir, model_filename)

joblib.dump(final_model, model_file)
print(f"Model saved to {model_file}")

Model saved to models/trained_model_20241024_175312_xgboost.joblib


In [20]:
# Align the submission feature table with the training table:
#   1. keep only the columns the two tables share
#   2. put them in the training table's column order
# NOTE(review): step 2 selects by the training table's columns, so it
# presumably fails if the submission table lacks one of them — confirm.
training_column_order = challenge_set_updated.columns
common_columns = submission_set_updated.columns.intersection(training_column_order)
submission_set_updated = submission_set_updated[common_columns][training_column_order]

In [21]:
# Score the submission rows: remove the `tow` column from the aligned feature
# table, predict with the refit model, and store the predictions on the
# submission table
submission_set_features = submission_set_updated.drop(columns="tow")
tow_predictions = final_model.predict(submission_set_features)
submission_set['tow'] = tow_predictions

submission_set

Unnamed: 0,flight_id,date,callsign,adep,name_adep,country_code_adep,ades,name_ades,country_code_ades,actual_offblock_time,arrival_time,aircraft_type,wtc,airline,flight_duration,taxiout_time,flown_distance,tow
0,248753821,2022-01-01,3b3de0f3ad0ee192513995c02f7bf7cf,LTFJ,Istanbul Sabiha Gokcen,TR,LFLL,Lyon,FR,2022-01-01T09:44:00Z,2022-01-01T12:48:33Z,B738,M,6351ec1b849adacc0cbb3b1313d8d39b,170,15,1122,70449.234375
1,248753822,2022-01-01,e06dd03d4a879ca37d9e18c1bd7cad16,EBBR,Brussels,BE,KJFK,New York JFK,US,2022-01-01T09:45:00Z,2022-01-01T17:49:51Z,A333,H,bdeeef3a675587d530de70a25d7118d2,470,15,3205,213784.859375
2,248754498,2022-01-01,2d3b1c962c78c4ebeef11bcd51b9e94c,KMIA,Miami,US,EGLL,London Heathrow,GB,2022-01-01T01:52:00Z,2022-01-01T09:55:16Z,B77W,H,5543e4dc327359ffaf5b9c0e6faaf0e1,473,10,3965,224449.812500
3,248763650,2022-01-01,35f7721f68bf85128195547ae38b0f04,EBBR,Brussels,BE,LEAL,Alicante,ES,2022-01-01T12:02:00Z,2022-01-01T14:13:56Z,B738,M,f53c55b5cf0cbb3be755bf50df6fa52d,123,9,802,66894.765625
4,248763651,2022-01-01,eb56918bee9bc5204624186b9bcc4391,LSZH,Zurich,CH,LFPG,Paris Charles de Gaulle,FR,2022-01-01T12:03:00Z,2022-01-01T13:09:44Z,BCS3,M,2d5def0a5a844b343ba1b7cc9cb28fa9,56,11,292,52082.082031
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
158144,258068876,2022-12-31,c9fca302ca2e28acab0eb0bb1b46f11b,LTFM,iGA Istanbul,TR,LSZH,Zurich,CH,2022-12-31T09:25:00Z,2022-12-31T12:24:24Z,A321,M,6351ec1b849adacc0cbb3b1313d8d39b,154,25,988,75024.734375
158145,258064675,2022-12-31,00f96ad0e382476649574ba044c764fc,EHAM,Amsterdam,NL,EDDF,Frankfurt,DE,2022-12-31T10:04:21Z,2022-12-31T10:55:35Z,A320,M,f502877cab405652cf0dd70c2213e730,42,9,240,60749.546875
158146,258065436,2022-12-31,87c552b7f6d9bbd16a66e95df761c7f2,LEBL,Barcelona,ES,KJFK,New York JFK,US,2022-12-31T09:34:00Z,2022-12-31T17:51:22Z,B772,H,5543e4dc327359ffaf5b9c0e6faaf0e1,483,14,3426,195640.140625
158147,258058138,2022-12-31,2cd57e434494606c965bac87c024bda2,LIPE,Bologna,IT,LOWW,Vienna,AT,2022-12-31T09:37:00Z,2022-12-31T10:47:00Z,E195,M,5d407cb11cc29578cc3e292e743f5393,55,15,335,42084.140625


In [27]:
# Make sure the submissions folder exists
submissions_dir = 'submissions'
os.makedirs(submissions_dir, exist_ok=True)

# Write only the id and the predicted `tow`, in a file stamped with the run
# timestamp
submission_filename = f"submission_{timestamp}_xgboost.csv"
submission_file = os.path.join(submissions_dir, submission_filename)
submission_columns = ['flight_id', 'tow']
submission_set[submission_columns].to_csv(submission_file, index=False)
# Optional (disabled): also export the validation-split predictions
# val_file = os.path.join(submissions_dir, f"submission_{timestamp}_xgboost_val.csv")
# pd.DataFrame(y_pred, columns=['tow']).to_csv(val_file, index=False)