In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import r2_score, mean_squared_error
from tqdm import tqdm
from datetime import datetime
import pytz
import json
import joblib 
import os
import optuna

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the dataset after the exploratory data analysis
challenge_set_updated = pd.read_csv("./data/challenge_set_updated_v9_median.csv")
submission_set = pd.read_csv("./data/submission_set.csv")
submission_set_updated = pd.read_csv("./data/submission_set_updated_v9_median.csv")



In [3]:
# If necessary change this part to test the model before the training process
df = challenge_set_updated.iloc[:,:]
# df = challenge_set_updated.sample(frac=0.001)

# Separating features and target variable
X = df.drop('tow', axis=1)
y = df['tow']



In [4]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Handle categorical columns by Label Encoding
label_encoder = LabelEncoder()

# Specify the columns that are categorical
categorical_cols = ['adep', 'ades', 'aircraft_type', 'wtc', 'airline', 'offblock_season', 
                    'flight_duration_category', 'adep_region', 'ades_region', 'flight_direction', 
                    'Manufacturer', 'Model_FAA', 'Physical_Class_Engine', 'FAA_Weight']

# Convert these categorical columns into numerical form using Label Encoding
for col in categorical_cols:
    X[col] = label_encoder.fit_transform(X[col])

# Now, you can proceed with training LightGBM with this processed data


In [5]:
import lightgbm as lgb
from lightgbm import early_stopping, log_evaluation
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
import optuna

# Split the data into training and test sets
X_train_full, X_test, y_train_full, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train_full, y_train_full, test_size=0.2, random_state=42)


In [8]:
from sklearn.ensemble import VotingRegressor
import joblib  # or pickle

# Load the saved models
lgb_model = joblib.load('lgb_model.pkl')  # Adjust the file path as needed
xgb_model = joblib.load('xgb_model.pkl')  # Adjust the file path as needed
cat_model = joblib.load('cat_model.pkl')  # Adjust the file path as needed

# Ensemble the pre-trained models
ensemble_model = VotingRegressor(
    estimators=[
        ('lgb', lgb_model),
        ('xgb', xgb_model),
        ('cat', cat_model)
    ]
)


# Train the ensemble model on the entire dataset
ensemble_model.fit(X, y)



In [12]:

# Make predictions
y_pred = ensemble_model.predict(X_test)

r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))


print(f"Best Model Performance - R^2 Score: {r2:.4f}, RMSE: {rmse:.4f}")


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010003 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 12511
[LightGBM] [Info] Number of data points in the train set: 235756, number of used features: 80
[LightGBM] [Info] Start training from score 79499.123553
Best Model Performance - R^2 Score: 0.9964, RMSE: 3197.4635


In [17]:
import joblib

# Save the ensemble model to a file
model_filename = 'ensemble_model.pkl'
joblib.dump(ensemble_model, model_filename)

print(f"Model saved to {model_filename}")


Model saved to ensemble_model.pkl


In [26]:
from sklearn.preprocessing import LabelEncoder

submission_set_features = submission_set_updated.iloc[:,:-1]


# Now you can use the model to make predictions
submission_set['tow'] = ensemble_model.predict(submission_set_features)
print(submission_set)


        flight_id        date                          callsign  adep  \
0       248753821  2022-01-01  3b3de0f3ad0ee192513995c02f7bf7cf  LTFJ   
1       248753822  2022-01-01  e06dd03d4a879ca37d9e18c1bd7cad16  EBBR   
2       248754498  2022-01-01  2d3b1c962c78c4ebeef11bcd51b9e94c  KMIA   
3       248757623  2022-01-01  81564432d3ee97c4bdf4cd8f006753dc  EGCN   
4       248763603  2022-01-01  84be079d7e660db105d91f600b4b3d59  EIDW   
...           ...         ...                               ...   ...   
105786  258035188  2022-12-30  17d94ade55650fe95373f0b91ac01514  EPWA   
105787  258035195  2022-12-30  c233a13ca55f946fdd1b13c444163764  LTFM   
105788  258035230  2022-12-30  98bcbcc3e6db32d491c58262ab782f14  ESSA   
105789  258035474  2022-12-30  71c9f60bd1c3f375bf365916d323f1a4  EGLL   
105790  258035327  2022-12-30  2b62a10e5da30da7dadd1c930f1acd86  LSZH   

                     name_adep country_code_adep  ades        name_ades  \
0       Istanbul Sabiha Gokcen                TR

In [27]:
import os
from datetime import datetime

# Define the submissions directory and create it if it doesn't exist
submissions_dir = 'submissions'
os.makedirs(submissions_dir, exist_ok=True)

# Define a timestamp for the file name
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

# Save the submission with a timestamp in the filename
submission_file = os.path.join(submissions_dir, f"submission_{timestamp}.csv")

# Assuming submission_set is a DataFrame, save it to CSV
submission_set.to_csv(submission_file, index=False)

print(f"Submission saved to {submission_file}")


Submission saved to submissions/submission_20241007_180401.csv
