# XGBoost Model

In [1]:
import pandas as pd
from xgboost import XGBRegressor, callback
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import r2_score, mean_squared_error
import numpy as np
from tqdm import tqdm
from datetime import datetime
import pytz
import json
import joblib 
import os
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.impute import KNNImputer
from tqdm import tqdm

In [2]:
# Load the dataset after the exploratory data analysis
challenge_set_updated = pd.read_csv("./data/challenge_set_updated_v9.csv")
submission_set = pd.read_csv("./data/submission_set.csv")
submission_set_updated = pd.read_csv("./data/submission_set_updated_v9.csv")

In [3]:
# Function to analyze missing values
def analyze_missing_values(df):
    # Count and percentage of missing values per column
    missing_values = df.isna().sum()
    missing_percentage = (missing_values / len(df)) * 100
    missing_summary = pd.DataFrame({'Missing Values': missing_values, 'Percentage': missing_percentage})
    
    # Display summary of missing values
    print(missing_summary[missing_summary['Missing Values'] > 0])
    
# Analyzing missing values in both datasets
print("Analysis of challenge_set_updated:")
analyze_missing_values(challenge_set_updated)

print("\nAnalysis of submission_set_updated:")
analyze_missing_values(submission_set_updated)

Analysis of challenge_set_updated:
                               Missing Values  Percentage
track_variation_ARR_100                 23528    6.375927
track_variation_DEP_100                 54966   14.895410
track_variation_ENR                     50848   13.779460
average_vertical_rate_ARR_100           23728    6.430126
average_vertical_rate_DEP_100           55412   15.016273
average_vertical_rate_ENR               50852   13.780544
average_airspeed_ARR_100                23996    6.502752
average_airspeed_DEP_100                55471   15.032262
average_airspeed_ENR                    50852   13.780544
groundspeed_ARR_100                     23725    6.429313
groundspeed_DEP_100                     55409   15.015460
groundspeed_ENR                         50850   13.780002
wind_distance_ARR_100                   23528    6.375927
wind_distance_DEP_100                   54966   14.895410
wind_distance_ENR                       50848   13.779460
average_temperature_ARR_100          

In [4]:
# Function to drop columns with more than 16% missing values, except for 'tow' in the submission set
def drop_columns_above_threshold(df, threshold=16, preserve_columns=None):
    if preserve_columns is None:
        preserve_columns = []
    
    missing_percentage = df.isna().mean() * 100
    cols_to_keep = missing_percentage[missing_percentage <= threshold].index.tolist()
    
    # Ensure columns in preserve_columns are kept even if they exceed the threshold
    cols_to_keep.extend([col for col in preserve_columns if col in df.columns])
    
    df = df[cols_to_keep]
    return df

# Applying the function to challenge_set_updated
challenge_set_updated = drop_columns_above_threshold(challenge_set_updated)

# Applying the function to submission_set_updated, keeping 'tow'
submission_set_updated = drop_columns_above_threshold(submission_set_updated, preserve_columns=['tow'])

In [5]:
# Function to handle infinities and large values
def clean_infinities(df):
    """
    Replace infinite values with NaN in the dataframe to handle large or infinite values.
    """
    return df.replace([np.inf, -np.inf], np.nan)

# Function to impute missing values using KNN
def knn_imputation(df, exclude_columns=None, n_neighbors=5):
    """
    Impute missing values using K-Nearest Neighbors (KNN), using only the columns that do not have missing values,
    excluding the specified columns.
    
    Parameters:
    df (pd.DataFrame): The dataframe to impute.
    exclude_columns (list): List of columns to exclude from the imputation (e.g., 'tow').
    n_neighbors (int): Number of neighbors to use for KNN imputation.
    
    Returns:
    pd.DataFrame: Dataframe with imputed values.
    """
    df = df.copy()  # Create a copy of the dataframe to avoid modifying the original

    # Clean infinite values in the dataframe by replacing them with NaN
    df = clean_infinities(df)

    # Exclude specified columns from the imputation process
    if exclude_columns is None:
        exclude_columns = []

    # Ensure 'tow' is in the exclude columns, regardless of its state
    if 'tow' not in exclude_columns:
        exclude_columns.append('tow')

    # Select numeric columns that are not in the exclude list
    numeric_columns = df.select_dtypes(include=[np.number]).drop(columns=exclude_columns, errors='ignore')

    # Identify columns with missing values
    cols_with_missing = numeric_columns.columns[numeric_columns.isnull().any()]

    # Loop through each column that has missing values with a progress bar using tqdm
    for col in tqdm(cols_with_missing, desc="Imputing columns"):
        # Print the column being processed
        print(f"Processing column: {col}")
        
        # Separate rows with and without missing values in the target column
        missing_rows = numeric_columns[col].isnull()
        non_missing_rows = ~missing_rows

        # Find columns that do not have missing values both in non-missing and missing rows
        non_missing_features = numeric_columns.loc[non_missing_rows].dropna(axis=1).columns
        missing_features = numeric_columns.loc[missing_rows, non_missing_features].dropna(axis=1).columns

        # Ensure both training and missing feature sets are aligned and non-empty
        consistent_features = non_missing_features.intersection(missing_features)

        if consistent_features.empty:
            print(f"Skipping column {col}: No consistent features between training and prediction.")
            continue

        # Use only the consistent columns without missing values for KNN imputation
        X_train = df.loc[non_missing_rows, consistent_features]
        X_missing = df.loc[missing_rows, consistent_features]

        # Verifica se há NaNs nas features antes de aplicar o KNN
        if X_train.isnull().values.any():
            print(f"Skipping column {col}: NaNs found in training data.")
            continue

        # Apply KNN Imputer
        imputer = KNNImputer(n_neighbors=n_neighbors)
        combined_data = pd.concat([X_train, X_missing])
        imputed_data = imputer.fit_transform(combined_data)

        # Replace the original missing values with imputed values
        df.loc[combined_data.index, consistent_features] = imputed_data

    return df

# Applying the KNN imputation on challenge_set_updated
challenge_set_updated = knn_imputation(challenge_set_updated, exclude_columns=['tow'], n_neighbors=5)
# challenge_set_updated.to_csv("./data/challenge_set_updated_imputed.csv", index=False)

# Applying the KNN imputation on submission_set_updated
submission_set_updated = knn_imputation(submission_set_updated, exclude_columns=['tow'], n_neighbors=5)
# submission_set_updated.to_csv("./data/submission_set_updated_imputed.csv", index=False)

Imputing columns:   0%|          | 0/27 [00:00<?, ?it/s]

Processing column: track_variation_ARR_100


Imputing columns:   4%|▎         | 1/27 [01:57<51:04, 117.85s/it]

Processing column: track_variation_DEP_100


Imputing columns:   7%|▋         | 2/27 [02:06<22:27, 53.90s/it] 

Processing column: track_variation_ENR


Imputing columns:  11%|█         | 3/27 [02:15<13:12, 33.01s/it]

Processing column: average_vertical_rate_ARR_100


Imputing columns:  15%|█▍        | 4/27 [02:23<08:52, 23.15s/it]

Processing column: average_vertical_rate_DEP_100


Imputing columns:  19%|█▊        | 5/27 [02:31<06:30, 17.77s/it]

Processing column: average_vertical_rate_ENR


Imputing columns:  22%|██▏       | 6/27 [02:39<05:04, 14.52s/it]

Processing column: average_airspeed_ARR_100


Imputing columns:  26%|██▌       | 7/27 [02:47<04:08, 12.40s/it]

Processing column: average_airspeed_DEP_100


Imputing columns:  30%|██▉       | 8/27 [02:56<03:31, 11.12s/it]

Processing column: average_airspeed_ENR


Imputing columns:  33%|███▎      | 9/27 [03:04<03:03, 10.20s/it]

Processing column: groundspeed_ARR_100


Imputing columns:  37%|███▋      | 10/27 [03:12<02:41,  9.52s/it]

Processing column: groundspeed_DEP_100


Imputing columns:  41%|████      | 11/27 [03:20<02:26,  9.13s/it]

Processing column: groundspeed_ENR


Imputing columns:  44%|████▍     | 12/27 [03:28<02:12,  8.83s/it]

Processing column: wind_distance_ARR_100


Imputing columns:  48%|████▊     | 13/27 [03:36<02:00,  8.60s/it]

Processing column: wind_distance_DEP_100


Imputing columns:  52%|█████▏    | 14/27 [03:44<01:50,  8.46s/it]

Processing column: wind_distance_ENR


Imputing columns:  56%|█████▌    | 15/27 [03:53<01:40,  8.38s/it]

Processing column: average_temperature_ARR_100


Imputing columns:  59%|█████▉    | 16/27 [04:01<01:31,  8.31s/it]

Processing column: average_temperature_DEP_100


Imputing columns:  63%|██████▎   | 17/27 [04:09<01:22,  8.28s/it]

Processing column: average_temperature_ENR


Imputing columns:  67%|██████▋   | 18/27 [04:17<01:14,  8.28s/it]

Processing column: average_humidity_ARR_100


Imputing columns:  70%|███████   | 19/27 [04:25<01:05,  8.20s/it]

Processing column: average_humidity_DEP_100


Imputing columns:  74%|███████▍  | 20/27 [04:33<00:57,  8.19s/it]

Processing column: average_humidity_ENR


Imputing columns:  78%|███████▊  | 21/27 [04:41<00:49,  8.18s/it]

Processing column: specific_energy_DEP_100


Imputing columns:  81%|████████▏ | 22/27 [04:50<00:40,  8.19s/it]

Processing column: specific_energy_ENR


Imputing columns:  85%|████████▌ | 23/27 [04:58<00:32,  8.20s/it]

Processing column: flown_distance_ARR_100


Imputing columns:  89%|████████▉ | 24/27 [05:06<00:24,  8.13s/it]

Processing column: flown_distance_DEP_100


Imputing columns:  93%|█████████▎| 25/27 [05:14<00:16,  8.16s/it]

Processing column: flown_distance_ENR


Imputing columns:  96%|█████████▋| 26/27 [05:22<00:08,  8.18s/it]

Processing column: specific_energy


Imputing columns: 100%|██████████| 27/27 [05:30<00:00, 12.25s/it]
Imputing columns:   0%|          | 0/27 [00:00<?, ?it/s]

Processing column: track_variation_ARR_100


Imputing columns:   4%|▎         | 1/27 [00:29<12:35, 29.06s/it]

Processing column: track_variation_DEP_100


Imputing columns:   7%|▋         | 2/27 [00:31<05:36, 13.47s/it]

Processing column: track_variation_ENR


Imputing columns:  11%|█         | 3/27 [00:33<03:20,  8.34s/it]

Processing column: average_vertical_rate_ARR_100


Imputing columns:  15%|█▍        | 4/27 [00:36<02:16,  5.94s/it]

Processing column: average_vertical_rate_DEP_100


Imputing columns:  19%|█▊        | 5/27 [00:38<01:41,  4.60s/it]

Processing column: average_vertical_rate_ENR


Imputing columns:  22%|██▏       | 6/27 [00:40<01:19,  3.80s/it]

Processing column: average_airspeed_ARR_100


Imputing columns:  26%|██▌       | 7/27 [00:42<01:06,  3.31s/it]

Processing column: average_airspeed_DEP_100


Imputing columns:  30%|██▉       | 8/27 [00:45<00:56,  2.97s/it]

Processing column: average_airspeed_ENR


Imputing columns:  33%|███▎      | 9/27 [00:47<00:49,  2.74s/it]

Processing column: groundspeed_ARR_100


Imputing columns:  37%|███▋      | 10/27 [00:49<00:44,  2.60s/it]

Processing column: groundspeed_DEP_100


Imputing columns:  41%|████      | 11/27 [00:51<00:39,  2.49s/it]

Processing column: groundspeed_ENR


Imputing columns:  44%|████▍     | 12/27 [00:54<00:36,  2.41s/it]

Processing column: wind_distance_ARR_100


Imputing columns:  48%|████▊     | 13/27 [00:56<00:33,  2.37s/it]

Processing column: wind_distance_DEP_100


Imputing columns:  52%|█████▏    | 14/27 [00:58<00:30,  2.33s/it]

Processing column: wind_distance_ENR


Imputing columns:  56%|█████▌    | 15/27 [01:00<00:27,  2.31s/it]

Processing column: average_temperature_ARR_100


Imputing columns:  59%|█████▉    | 16/27 [01:03<00:25,  2.32s/it]

Processing column: average_temperature_DEP_100


Imputing columns:  63%|██████▎   | 17/27 [01:05<00:23,  2.30s/it]

Processing column: average_temperature_ENR


Imputing columns:  67%|██████▋   | 18/27 [01:07<00:20,  2.30s/it]

Processing column: average_humidity_ARR_100


Imputing columns:  70%|███████   | 19/27 [01:10<00:18,  2.29s/it]

Processing column: average_humidity_DEP_100


Imputing columns:  74%|███████▍  | 20/27 [01:12<00:15,  2.28s/it]

Processing column: average_humidity_ENR


Imputing columns:  78%|███████▊  | 21/27 [01:14<00:13,  2.27s/it]

Processing column: specific_energy_DEP_100


Imputing columns:  81%|████████▏ | 22/27 [01:16<00:11,  2.26s/it]

Processing column: specific_energy_ENR


Imputing columns:  85%|████████▌ | 23/27 [01:19<00:09,  2.26s/it]

Processing column: flown_distance_ARR_100


Imputing columns:  89%|████████▉ | 24/27 [01:21<00:06,  2.27s/it]

Processing column: flown_distance_DEP_100


Imputing columns:  93%|█████████▎| 25/27 [01:23<00:04,  2.26s/it]

Processing column: flown_distance_ENR


Imputing columns:  96%|█████████▋| 26/27 [01:25<00:02,  2.26s/it]

Processing column: specific_energy


Imputing columns: 100%|██████████| 27/27 [01:28<00:00,  3.26s/it]


In [6]:
# If necessary change this part to test the model before the training process
df = challenge_set_updated.iloc[:,:]

# Separating features and target variable
X = df.drop('tow', axis=1)
y = df['tow']

n_jobs = os.cpu_count() - 1

In [7]:
# Split the data into training and test sets
X_train_full, X_test, y_train_full, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Further split the training data into training and validation sets for early stopping
X_train, X_val, y_train, y_val = train_test_split(X_train_full, y_train_full, test_size=0.2, random_state=42)

# Define the best parameters provided
best_params = {
    'subsample': 1.0,
    'reg_lambda': 0.46415888336127775,
    'reg_alpha': 0.166810053720005,
    'min_child_weight': 4,
    'max_depth': 13,
    'learning_rate': 0.008,
    'gamma': 0.4444444444444444,
    'colsample_bytree': 0.6
}

# Initialize the XGBoost model with the provided best parameters
best_model = XGBRegressor(
    **best_params,
    objective='reg:squarederror',
    random_state=42,
    n_estimators=10_000_000,  # Set a high value to allow early stopping to find the best n_estimators
    n_jobs=n_jobs,
    eval_metric="rmse",  # Set eval_metric in the constructor
    early_stopping_rounds=20,  # Set early_stopping_rounds in the constructor
    # enable_categorical=True  # Enable categorical feature handling
)

# Train the model on the training data with early stopping using the validation set
best_model.fit(X_train, y_train, verbose=50, eval_set=[(X_val, y_val)])

# Update best_params with the best number of estimators found during early stopping
best_params['n_estimators'] = best_model.best_iteration + 1  # +1 because best_iteration is zero-indexed

# Evaluate the final model on the test set
y_pred = best_model.predict(X_test)
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print(f"Best Model Performance - R^2 Score: {r2:.4f}, RMSE: {rmse:.4f}")
print(f"Updated best_params: {best_params}")

[0]	validation_0-rmse:53006.27714
[50]	validation_0-rmse:35616.97985
[100]	validation_0-rmse:24019.71245
[150]	validation_0-rmse:16323.17530
[200]	validation_0-rmse:11254.95525
[250]	validation_0-rmse:7974.48395
[300]	validation_0-rmse:5922.21594
[350]	validation_0-rmse:4695.10843
[400]	validation_0-rmse:4005.54282
[450]	validation_0-rmse:3636.49691
[500]	validation_0-rmse:3445.29560
[550]	validation_0-rmse:3343.26518
[600]	validation_0-rmse:3284.48316
[650]	validation_0-rmse:3248.48205
[700]	validation_0-rmse:3225.04844
[750]	validation_0-rmse:3209.46269
[800]	validation_0-rmse:3194.57403
[850]	validation_0-rmse:3182.41555
[900]	validation_0-rmse:3172.03515
[950]	validation_0-rmse:3163.74053
[1000]	validation_0-rmse:3155.94142
[1050]	validation_0-rmse:3148.49154
[1100]	validation_0-rmse:3142.54755
[1150]	validation_0-rmse:3137.03179
[1200]	validation_0-rmse:3131.90869
[1250]	validation_0-rmse:3127.32468
[1300]	validation_0-rmse:3123.41842
[1350]	validation_0-rmse:3119.19458
[1400]	val

In [8]:
# Save R², RMSE, and hyperparameters
results = {
    'R2': float(r2),
    'RMSE': float(rmse),
    'Best Parameters': {key: (int(value) if isinstance(value, np.integer) else float(value)
                              if isinstance(value, np.floating) else value)
                        for key, value in best_params.items()}
}

# Set timezone to São Paulo (UTC-3)
saopaulo_tz = pytz.timezone('America/Sao_Paulo')
timestamp = datetime.now(saopaulo_tz).strftime('%Y%m%d_%H%M%S')

# Define logs directory, and create them if they don't exist
logs_dir = 'logs'
os.makedirs(logs_dir, exist_ok=True)

# Define file paths within the respective directories
results_file = os.path.join(logs_dir, f'model_results_{timestamp}.txt')

# Save the results to a TXT file
with open(results_file, 'w') as file:
    file.write(f"R2: {results['R2']}\n")
    file.write(f"RMSE: {results['RMSE']}\n")
    file.write("Best Parameters:\n")
    for param, value in results['Best Parameters'].items():
        file.write(f"  {param}: {value}\n")

print(f"Results saved to {results_file}")

Results saved to logs/model_results_20240918_194727.txt


In [9]:
# Display evaluation metrics
print(f"Final Model Performance - R^2 Score: {r2:.4f}, RMSE: {rmse:.4f}")

Final Model Performance - R^2 Score: 0.9968, RMSE: 2973.0258


In [10]:
# Define models directory, and create them if they don't exist
models_dir = 'models'
os.makedirs(models_dir, exist_ok=True)

# Train the final model using the full training+validation+test set with the optimal n_estimators
final_model = XGBRegressor(**best_params, objective='reg:squarederror', random_state=42, n_jobs=n_jobs)

# Train the model on the entire training+validation+set data
final_model.fit(X, y, verbose=100)

print("Final model trained successfully using all available data.")

Final model trained successfully using all available data.


In [11]:
# Define file paths within the respective directories
model_file = os.path.join(models_dir, f'trained_model_{timestamp}.joblib')

# Save the trained model to a file in the models folder
joblib.dump(final_model, model_file)
print(f"Model saved to {model_file}")

Model saved to models/trained_model_20240918_194727.joblib


In [12]:
# Use the final model to predict the `tow` for the submission_set_updated
submission_set_features = submission_set_updated.iloc[:,:-1]
submission_set['tow'] = final_model.predict(submission_set_features)

submission_set

Unnamed: 0,flight_id,date,callsign,adep,name_adep,country_code_adep,ades,name_ades,country_code_ades,actual_offblock_time,arrival_time,aircraft_type,wtc,airline,flight_duration,taxiout_time,flown_distance,tow
0,248753821,2022-01-01,3b3de0f3ad0ee192513995c02f7bf7cf,LTFJ,Istanbul Sabiha Gokcen,TR,LFLL,Lyon,FR,2022-01-01T09:44:00Z,2022-01-01T12:48:33Z,B738,M,6351ec1b849adacc0cbb3b1313d8d39b,170,15,1122,68857.656250
1,248753822,2022-01-01,e06dd03d4a879ca37d9e18c1bd7cad16,EBBR,Brussels,BE,KJFK,New York JFK,US,2022-01-01T09:45:00Z,2022-01-01T17:49:51Z,A333,H,bdeeef3a675587d530de70a25d7118d2,470,15,3205,215556.250000
2,248754498,2022-01-01,2d3b1c962c78c4ebeef11bcd51b9e94c,KMIA,Miami,US,EGLL,London Heathrow,GB,2022-01-01T01:52:00Z,2022-01-01T09:55:16Z,B77W,H,5543e4dc327359ffaf5b9c0e6faaf0e1,473,10,3965,223199.718750
3,248757623,2022-01-01,81564432d3ee97c4bdf4cd8f006753dc,EGCN,Doncaster Sheffield,GB,LEAL,Alicante,ES,2022-01-01T08:20:00Z,2022-01-01T11:06:08Z,B38M,M,3922524069809ac4326134429751e26f,156,10,986,63746.378906
4,248763603,2022-01-01,84be079d7e660db105d91f600b4b3d59,EIDW,Dublin,IE,LFLL,Lyon,FR,2022-01-01T11:01:00Z,2022-01-01T13:00:43Z,A320,M,a73f82288988b79be490c6322f4c32ed,105,15,686,63619.421875
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
105954,258066302,2022-12-31,2d3b4446c4d05a25196a9d52cab936fb,LTFJ,Istanbul Sabiha Gokcen,TR,EKCH,Copenhagen,DK,2022-12-31T09:36:00Z,2022-12-31T13:12:17Z,B38M,M,6351ec1b849adacc0cbb3b1313d8d39b,201,15,1199,68013.453125
105955,258068609,2022-12-31,253fd692ed441fac523081471c067772,LOWW,Vienna,AT,KIAD,Washington Dulles,US,2022-12-31T09:49:00Z,2022-12-31T19:38:26Z,B763,H,5d407cb11cc29578cc3e292e743f5393,575,14,3937,174963.531250
105956,258068876,2022-12-31,c9fca302ca2e28acab0eb0bb1b46f11b,LTFM,iGA Istanbul,TR,LSZH,Zurich,CH,2022-12-31T09:25:00Z,2022-12-31T12:24:24Z,A321,M,6351ec1b849adacc0cbb3b1313d8d39b,154,25,988,74369.757812
105957,258064675,2022-12-31,00f96ad0e382476649574ba044c764fc,EHAM,Amsterdam,NL,EDDF,Frankfurt,DE,2022-12-31T10:04:21Z,2022-12-31T10:55:35Z,A320,M,f502877cab405652cf0dd70c2213e730,42,9,240,61011.003906


In [13]:
# Define the submissions directory and create it if it doesn't exist
submissions_dir = 'submissions'
os.makedirs(submissions_dir, exist_ok=True)

# Save the submission with a timestamp in the filename
submission_file = os.path.join(submissions_dir, f"submission_{timestamp}.csv")
submission_set.to_csv(submission_file, index=False)