# Lightgbm Model

In [2]:
import pandas as pd
from xgboost import XGBRegressor, callback
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.preprocessing import RobustScaler
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
import numpy as np
from tqdm import tqdm
from datetime import datetime
import pytz
import json
import joblib 
import os
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.linear_model import LassoCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import RFE
from lightgbm import LGBMRegressor


In [3]:
# Load the dataset after the exploratory data analysis
challenge_set_updated = pd.read_csv("./data/challenge_set_updated_v18.csv")
submission_set = pd.read_csv("./data/final_submission_set.csv")
submission_set_updated = pd.read_csv("./data/submission_set_updated_v18.csv")

In [4]:
challenge_set_updated.shape

(369013, 215)

In [5]:
submission_set_updated.shape

(158149, 215)

In [6]:
# Function to drop columns with more than 40% missing values, except for 'tow' in the submission set
def drop_columns_above_threshold(df, threshold=40, preserve_columns=None):
    if preserve_columns is None:
        preserve_columns = []
    
    missing_percentage = df.isna().mean() * 100
    cols_to_keep = missing_percentage[missing_percentage <= threshold].index.tolist()
    
    # Ensure columns in preserve_columns are kept even if they exceed the threshold
    cols_to_keep.extend([col for col in preserve_columns if col in df.columns])
    
    df = df[cols_to_keep]
    return df

# Applying the function to challenge_set_updated
challenge_set_updated = drop_columns_above_threshold(challenge_set_updated)

# Applying the function to submission_set_updated, keeping 'tow'
submission_set_updated = drop_columns_above_threshold(submission_set_updated, preserve_columns=['tow'])

In [7]:


def clean_data_better(df, threshold=1e10):
    # Replace inf and -inf with NaN using vectorized operations
    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    
    # Identify numeric and non-numeric columns
    numeric_columns = df.select_dtypes(include=[np.number]).columns
    non_numeric_columns = df.select_dtypes(exclude=[np.number]).columns
    
    # Mask values above the threshold in numeric columns
    df[numeric_columns] = df[numeric_columns].mask(df[numeric_columns].abs() > threshold)
    
    # Fill NaNs in numeric columns with forward fill, then median
    df[numeric_columns] = df[numeric_columns].fillna(method='ffill').fillna(df[numeric_columns].median())
    
    # Fill NaNs in non-numeric columns using forward fill only
    df[non_numeric_columns] = df[non_numeric_columns].fillna(method='ffill')
    
    return df

# Applying the improved cleaning function
challenge_set_updated = clean_data_better(challenge_set_updated)
submission_set_updated = clean_data_better(submission_set_updated)


  df[numeric_columns] = df[numeric_columns].fillna(method='ffill').fillna(df[numeric_columns].median())
  df[non_numeric_columns] = df[non_numeric_columns].fillna(method='ffill')
  df[numeric_columns] = df[numeric_columns].fillna(method='ffill').fillna(df[numeric_columns].median())
  df[non_numeric_columns] = df[non_numeric_columns].fillna(method='ffill')


In [8]:
submission_set_updated.shape

(158149, 151)

In [10]:
challenge_set_updated.shape

(369013, 209)

In [11]:
# Find the common columns between both datasets
common_columns = submission_set_updated.columns.intersection(challenge_set_updated.columns)

# Filter challenge_set_updated to only include these common columns
challenge_set_updated = challenge_set_updated[common_columns]

# Display the shape of the filtered challenge_set_updated to confirm the changes
print(challenge_set_updated.shape)


(369013, 151)


In [12]:
# If necessary change this part to test the model before the training process
df = challenge_set_updated.iloc[:,:]

# Separating features and target variable
X = df.drop('tow', axis=1)
y = df['tow']

n_jobs = os.cpu_count() // 2

In [13]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Assuming df is your DataFrame
categorical_columns = ['adep', 'ades', 'aircraft_type', 'wtc', 'airline', 'offblock_season',
                       'flight_duration_category', 'adep_region', 'ades_region',
                       'flight_direction', 'Manufacturer', 'Model_FAA',
                       'Physical_Class_Engine', 'FAA_Weight',  'offblock_weekday_name', 'arrival_season', 'arrival_weekday_name']

# Encoding using LabelEncoder
for col in categorical_columns:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col].astype(str))  # Ensure data is string type before encoding


In [14]:
from lightgbm import LGBMRegressor
import pandas as pd
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split
import numpy as np

# Split the data into training and test sets
X_train_full, X_test, y_train_full, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Further split the training data into training and validation sets for early stopping
X_train, X_val, y_train, y_val = train_test_split(X_train_full, y_train_full, test_size=0.2, random_state=42)

# Define the best parameters for LightGBM with GPU support
best_params = {
    'boosting_type': 'gbdt',           # Gradient boosting decision tree
    'device': 'gpu',                   # Use GPU for training
    'tree_method': 'gpu_hist',         # Use GPU-optimized histogram method
    'subsample': 1.0,
    'lambda_l2': 0.46415888336127775,  # reg_lambda in LightGBM
    'lambda_l1': 0.166810053720005,    # reg_alpha in LightGBM
    'min_child_weight': 4,             # This corresponds to min_data_in_leaf in LightGBM
    'max_depth': 13,
    'learning_rate': 0.01,
    'colsample_bytree': 0.6            # same as feature_fraction in LightGBM
}

# Initialize the LightGBM model with the best parameters and GPU support
best_model = LGBMRegressor(
    **best_params,
    objective='regression',
    random_state=42,
    n_estimators=10_000_000,  # Set a high value to allow early stopping to find the best n_estimators
    n_jobs=n_jobs,
    metric="rmse",            # Use RMSE as the evaluation metric
    early_stopping_rounds=20  # Early stopping based on validation performance
)

# Train the model on the training data with early stopping using the validation set
best_model.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)])

# Update best_params with the best number of estimators found during early stopping
best_params['n_estimators'] = best_model.best_iteration_

# Evaluate the final model on the test set
y_pred = best_model.predict(X_test)
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print(f"Best Model Performance - R^2 Score: {r2:.4f}, RMSE: {rmse:.4f}")
print(f"Updated best_params: {best_params}")


[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 22101
[LightGBM] [Info] Number of data points in the train set: 236168, number of used features: 147
[LightGBM] [Info] Using GPU Device: NVIDIA RTX A4500, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 134 dense feature groups (30.63 MB) transferred to GPU in 0.009082 secs. 1 sparse feature groups
[LightGBM] [Info] Start training from score 79525.199947
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[24786]	valid_0's rmse: 2688.01
Best Model Performance - R^2 Score: 0.9975, RMSE: 2636.6377
Updated best_params: {'boosting_type': 'gbdt', 'device': 'gpu', 'tree_method': 'gpu_hist', 'subsample': 1.0, 'lambda_l2': 0.46415888336127775, 'lambda_l1': 0.166810053720005, 'min_child_weight': 4, 'max_depth': 13, 'learning_rate'

In [15]:
import pandas as pd

# Assuming X_val and y_val are NumPy arrays or pandas DataFrames
# Combine X_val and y_val into a single DataFrame
val_data = pd.DataFrame(X_val)
val_data['tow'] = y_val

# Save the validation dataset to a CSV file
val_data.to_csv('output_data/validation_dataset.csv', index=False)

print("Validation dataset saved as 'validation_dataset.csv'")


  val_data['tow'] = y_val


Validation dataset saved as 'validation_dataset.csv'


In [17]:
# Save R², RMSE, and hyperparameters
results = {
    'R2': float(r2),
    'RMSE': float(rmse),
    'Best Parameters': {key: (int(value) if isinstance(value, np.integer) else float(value)
                              if isinstance(value, np.floating) else value)
                        for key, value in best_params.items()}
}

# Set timezone to São Paulo (UTC-3)
saopaulo_tz = pytz.timezone('America/Sao_Paulo')
timestamp = datetime.now(saopaulo_tz).strftime('%Y%m%d_%H%M%S')

# Define logs directory, and create them if they don't exist
logs_dir = 'logs'
os.makedirs(logs_dir, exist_ok=True)

# Define file paths within the respective directories
results_file = os.path.join(logs_dir, f'model_results_{timestamp}.txt')

# Save the results to a TXT file
with open(results_file, 'w') as file:
    file.write(f"R2: {results['R2']}\n")
    file.write(f"RMSE: {results['RMSE']}\n")
    file.write("Best Parameters:\n")
    for param, value in results['Best Parameters'].items():
        file.write(f"  {param}: {value}\n")

print(f"Results saved to {results_file}")

Results saved to logs/model_results_20241021_111520.txt


In [18]:
# Display evaluation metrics
print(f"Final Model Performance - R^2 Score: {r2:.4f}, RMSE: {rmse:.4f}")

Final Model Performance - R^2 Score: 0.9975, RMSE: 2636.6377


In [19]:
# Define models directory, and create them if they don't exist
models_dir = 'models'
os.makedirs(models_dir, exist_ok=True)

final_model = LGBMRegressor(**best_params, objective='regression', random_state=42, n_jobs=n_jobs)

# Train the model on the entire training+validation+set data
final_model.fit(X, y)

print("Final model trained successfully using all available data.")

[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 22111
[LightGBM] [Info] Number of data points in the train set: 369013, number of used features: 147
[LightGBM] [Info] Using GPU Device: NVIDIA RTX A4500, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 134 dense feature groups (47.86 MB) transferred to GPU in 0.014501 secs. 1 sparse feature groups
[LightGBM] [Info] Start training from score 79482.257206
Final model trained successfully using all available data.


In [21]:
# Define file paths within the respective directories
model_file = os.path.join(models_dir, f'trained_model_{timestamp}.joblib')

# Save the trained model to a file in the models folder
joblib.dump(final_model, model_file)
print(f"Model saved to {model_file}")

Model saved to models/trained_model_20241021_111520.joblib


In [22]:
submission_set_updated.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,158139,158140,158141,158142,158143,158144,158145,158146,158147,158148
adep,LTFJ,EBBR,KMIA,EBBR,LSZH,EGCN,EIDW,LTFJ,EGLL,LEMG,...,KSFO,LOWW,LTFJ,LOWW,LOWW,LTFM,EHAM,LEBL,LIPE,UBBB
ades,LFLL,KJFK,EGLL,LEAL,LFPG,LEAL,LFLL,LTAC,EIDW,EBAW,...,EIDW,KEWR,EKCH,LTFM,KIAD,LSZH,EDDF,KJFK,LOWW,LTFM
aircraft_type,B738,A333,B77W,B738,BCS3,B38M,A320,B738,A320,E190,...,A333,B772,B38M,A21N,B763,A321,A320,B772,E195,B738
wtc,M,H,H,M,M,M,M,M,M,M,...,H,H,M,M,H,M,M,H,M,M
airline,6351ec1b849adacc0cbb3b1313d8d39b,bdeeef3a675587d530de70a25d7118d2,5543e4dc327359ffaf5b9c0e6faaf0e1,f53c55b5cf0cbb3be755bf50df6fa52d,2d5def0a5a844b343ba1b7cc9cb28fa9,3922524069809ac4326134429751e26f,a73f82288988b79be490c6322f4c32ed,6351ec1b849adacc0cbb3b1313d8d39b,a73f82288988b79be490c6322f4c32ed,f53c55b5cf0cbb3be755bf50df6fa52d,...,a73f82288988b79be490c6322f4c32ed,5d407cb11cc29578cc3e292e743f5393,6351ec1b849adacc0cbb3b1313d8d39b,6351ec1b849adacc0cbb3b1313d8d39b,5d407cb11cc29578cc3e292e743f5393,6351ec1b849adacc0cbb3b1313d8d39b,f502877cab405652cf0dd70c2213e730,5543e4dc327359ffaf5b9c0e6faaf0e1,5d407cb11cc29578cc3e292e743f5393,6351ec1b849adacc0cbb3b1313d8d39b
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
bearing,293.477205,291.395141,43.036806,197.753476,293.398537,178.644825,131.790949,104.173834,301.052709,20.934088,...,33.907846,300.276494,329.024194,123.86017,300.682568,300.085737,132.448927,296.50592,43.672437,279.85055
elevation_gradient,-0.030154,-0.009004,0.003095,-0.008914,-0.655061,-0.006508,0.149765,1.983765,0.10915,-0.002268,...,0.008554,-0.025929,-0.150159,0.115989,-0.012142,0.061473,0.311654,0.0,0.255442,0.179499
adep_geo_cluster,11,6,12,6,2,13,0,11,13,7,...,4,16,11,16,16,11,6,19,2,10
ades_geo_cluster,17,1,13,19,6,19,17,11,0,6,...,0,1,9,11,1,2,2,1,16,11


In [23]:
submission_set_features = submission_set_updated.iloc[:,:-1]


In [24]:


label_encoder = LabelEncoder()
for col in ['adep', 'ades', 'aircraft_type', 'wtc', 'airline', 'offblock_season',
                       'flight_duration_category', 'adep_region', 'ades_region',
                       'flight_direction', 'Manufacturer', 'Model_FAA',
                       'Physical_Class_Engine', 'FAA_Weight',  'offblock_weekday_name', 'arrival_season', 'arrival_weekday_name']:
    submission_set_features[col] = label_encoder.fit_transform(submission_set_features[col].astype(str))


In [25]:

submission_set['tow'] = final_model.predict(submission_set_features)

submission_set



Unnamed: 0,flight_id,date,callsign,adep,name_adep,country_code_adep,ades,name_ades,country_code_ades,actual_offblock_time,arrival_time,aircraft_type,wtc,airline,flight_duration,taxiout_time,flown_distance,tow
0,248753821,2022-01-01,3b3de0f3ad0ee192513995c02f7bf7cf,LTFJ,Istanbul Sabiha Gokcen,TR,LFLL,Lyon,FR,2022-01-01T09:44:00Z,2022-01-01T12:48:33Z,B738,M,6351ec1b849adacc0cbb3b1313d8d39b,170,15,1122,69874.872397
1,248753822,2022-01-01,e06dd03d4a879ca37d9e18c1bd7cad16,EBBR,Brussels,BE,KJFK,New York JFK,US,2022-01-01T09:45:00Z,2022-01-01T17:49:51Z,A333,H,bdeeef3a675587d530de70a25d7118d2,470,15,3205,213279.552158
2,248754498,2022-01-01,2d3b1c962c78c4ebeef11bcd51b9e94c,KMIA,Miami,US,EGLL,London Heathrow,GB,2022-01-01T01:52:00Z,2022-01-01T09:55:16Z,B77W,H,5543e4dc327359ffaf5b9c0e6faaf0e1,473,10,3965,243532.032550
3,248763650,2022-01-01,35f7721f68bf85128195547ae38b0f04,EBBR,Brussels,BE,LEAL,Alicante,ES,2022-01-01T12:02:00Z,2022-01-01T14:13:56Z,B738,M,f53c55b5cf0cbb3be755bf50df6fa52d,123,9,802,66969.218602
4,248763651,2022-01-01,eb56918bee9bc5204624186b9bcc4391,LSZH,Zurich,CH,LFPG,Paris Charles de Gaulle,FR,2022-01-01T12:03:00Z,2022-01-01T13:09:44Z,BCS3,M,2d5def0a5a844b343ba1b7cc9cb28fa9,56,11,292,51918.728223
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
158144,258068876,2022-12-31,c9fca302ca2e28acab0eb0bb1b46f11b,LTFM,iGA Istanbul,TR,LSZH,Zurich,CH,2022-12-31T09:25:00Z,2022-12-31T12:24:24Z,A321,M,6351ec1b849adacc0cbb3b1313d8d39b,154,25,988,72757.490004
158145,258064675,2022-12-31,00f96ad0e382476649574ba044c764fc,EHAM,Amsterdam,NL,EDDF,Frankfurt,DE,2022-12-31T10:04:21Z,2022-12-31T10:55:35Z,A320,M,f502877cab405652cf0dd70c2213e730,42,9,240,59933.913445
158146,258065436,2022-12-31,87c552b7f6d9bbd16a66e95df761c7f2,LEBL,Barcelona,ES,KJFK,New York JFK,US,2022-12-31T09:34:00Z,2022-12-31T17:51:22Z,B772,H,5543e4dc327359ffaf5b9c0e6faaf0e1,483,14,3426,208962.066726
158147,258058138,2022-12-31,2cd57e434494606c965bac87c024bda2,LIPE,Bologna,IT,LOWW,Vienna,AT,2022-12-31T09:37:00Z,2022-12-31T10:47:00Z,E195,M,5d407cb11cc29578cc3e292e743f5393,55,15,335,41458.776777


In [26]:
# Define the submissions directory and create it if it doesn't exist
submissions_dir = 'submissions'
os.makedirs(submissions_dir, exist_ok=True)

# Save the submission with a timestamp in the filename
submission_file = os.path.join(submissions_dir, f"submission_{timestamp}.csv")
submission_set.to_csv(submission_file, index=False)