In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import r2_score, mean_squared_error
from tqdm import tqdm
from datetime import datetime
import pytz
import json
import joblib 
import os
import optuna

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Read the three CSV inputs used by this notebook (the original note says the
# "updated" files come out of the exploratory data analysis step):
#   challenge_set_updated  - rows used below to build the features and `tow` target
#   submission_set         - frame that later receives the predicted `tow`
#   submission_set_updated - feature rows scored for the submission
challenge_set_updated, submission_set, submission_set_updated = (
    pd.read_csv(csv_path)
    for csv_path in (
        "./data/challenge_set_updated_v14.csv",
        "./data/submission_set.csv",
        "./data/submission_set_updated_v14.csv",
    )
)



In [3]:
# Working frame for training. The full challenge set is used here; for a quick
# smoke test of the pipeline, swap in a small random sample instead, e.g.
#   df = challenge_set_updated.sample(frac=0.001)
df = challenge_set_updated.iloc[:, :]

# Target is the `tow` column; every other column is treated as a feature.
y = df['tow']
X = df.drop(columns='tow')



In [12]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Label-encode the categorical feature columns of X, keeping one fitted encoder
# per column so the exact category -> integer mapping used for training can be
# inspected or re-applied later.

# Specify the columns that are categorical
categorical_cols = ['aircraft_type', 'wtc', 'airline', 
                    'flight_duration_category', 'adep_region', 'ades_region', 'flight_direction', 
                    'Physical_Class_Engine', 'FAA_Weight', 'Main_Gear_Config']

# A single shared LabelEncoder would be re-fitted on every column and only
# remember the last one, so a fresh encoder is created for each column and
# stored under the column name.
train_label_encoders = {}
for col in categorical_cols:
    label_encoder = LabelEncoder()
    # Fit on the untouched column in `df` (X was derived from df by dropping
    # 'tow') instead of on X itself: re-running this cell then never fits an
    # encoder on values that were already replaced by integer codes.
    X[col] = label_encoder.fit_transform(df[col])
    train_label_encoders[col] = label_encoder



In [13]:
import lightgbm as lgb
from lightgbm import early_stopping, log_evaluation
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
import optuna

# Two-stage split with a fixed seed:
#   1) hold out 20% of all rows as the test set,
#   2) hold out 20% of the remaining rows as a validation set.
# NOTE(review): in the cells shown here X_val / y_val are created but the
# ensemble is fitted on X_train and scored on X_test only.
X_train_full, X_test, y_train_full, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_full,
    y_train_full,
    test_size=0.2,
    random_state=42,
)


In [14]:
import xgboost as xgb
from catboost import CatBoostRegressor


# The three gradient-boosting regressors that make up the ensemble. All of them
# share the same number of trees and the same random seed.
common_params = dict(n_estimators=100, random_state=42)

lgb_model = lgb.LGBMRegressor(**common_params)
xgb_model = xgb.XGBRegressor(objective='reg:squarederror', **common_params)
# verbose=0 is passed so CatBoost does not print per-iteration output.
catboost_model = CatBoostRegressor(verbose=0, **common_params)


In [15]:
from sklearn.ensemble import VotingRegressor
# Combine the three base regressors in a VotingRegressor. No weights are
# passed, so the library's default weighting applies.
base_estimators = [
    ('lgb', lgb_model),
    ('xgb', xgb_model),
    ('catboost', catboost_model),
]
ensemble_model = VotingRegressor(estimators=base_estimators)


In [16]:
# Fit the ensemble on the training split only.
ensemble_model.fit(X_train, y_train)

# Score the held-out test split.
y_pred = ensemble_model.predict(X_test)

# RMSE is the square root of the mean squared error, so it is expressed in the
# same units as `tow`; R^2 is reported next to it.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(f"Best Model Performance - R^2 Score: {r2:.4f}, RMSE: {rmse:.4f}")


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011844 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 17210
[LightGBM] [Info] Number of data points in the train set: 236168, number of used features: 101
[LightGBM] [Info] Start training from score 79525.199947
Best Model Performance - R^2 Score: 0.9968, RMSE: 3012.0332


In [None]:
import joblib

# Persist the fitted ensemble with joblib. The file name is relative, so the
# file is written to the current working directory.
model_filename = 'ensemble_model.pkl'
joblib.dump(ensemble_model, filename=model_filename)

print(f"Model saved to {model_filename}")


In [None]:
from sklearn.preprocessing import LabelEncoder

# Build the feature matrix for the submission rows: every column of
# submission_set_updated except the last one.
# NOTE(review): presumably the last column is the (empty) `tow` target and the
# remaining columns match the training features in name and order - confirm.
submission_set_features = submission_set_updated.iloc[:, :-1]

# Drop rows with missing feature values; the assert below then fails loudly if
# any row was actually dropped, because every submission row needs a prediction.
submission_set_features = submission_set_features.dropna()

assert len(submission_set_features) == len(submission_set), "Mismatch in lengths"

# Align submission_set with submission_set_features
submission_set = submission_set.loc[submission_set_features.index]

# Identify columns with 'object' type that need to be converted
object_columns = submission_set_features.select_dtypes(include=['object']).columns

# Encode the categorical columns with the SAME category -> integer mapping the
# model was trained on. Fitting new encoders on the submission data would
# assign codes from the submission's own set of categories, which need not
# match the training codes and would silently corrupt the predictions.
label_encoders = {}
for col in object_columns:
    le = LabelEncoder()
    # `df` still holds the raw (un-encoded) training column, so fitting on it
    # reproduces the mapping used when the training features were encoded.
    le.fit(df[col])
    code_by_label = {label: code for code, label in enumerate(le.classes_)}

    encoded = submission_set_features[col].map(code_by_label)

    # Rows with NaN were dropped above, so a NaN here can only mean the
    # category never occurred in the training data. Encode those as -1 instead
    # of failing, and report how many rows are affected.
    unseen = encoded.isna()
    if unseen.any():
        print(
            f"Warning: {int(unseen.sum())} rows in '{col}' have categories "
            "not seen in training; encoded as -1"
        )

    submission_set_features[col] = encoded.fillna(-1).astype(int)
    label_encoders[col] = le

# Ensure all columns are of numeric type
assert submission_set_features.dtypes.apply(lambda x: np.issubdtype(x, np.number)).all()

# Now you can use the model to make predictions
submission_set['tow'] = ensemble_model.predict(submission_set_features)

# Show a preview rather than printing the whole frame.
submission_set.head()


In [None]:
# Score the prepared submission feature matrix with the fitted ensemble and
# store the result in the `tow` column of submission_set.
predicted_tow = ensemble_model.predict(submission_set_features)
submission_set['tow'] = predicted_tow

# Bare last expression so the notebook renders the frame.
submission_set

In [None]:
import os
from datetime import datetime

# Write the submission frame to submissions/submission_<YYYYmmdd_HHMMSS>.csv.

# Timestamp from the local clock (one-second resolution) used in the file name.
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

# Make sure the output folder exists; exist_ok=True makes this a no-op when it
# is already there.
submissions_dir = 'submissions'
os.makedirs(submissions_dir, exist_ok=True)

submission_file = os.path.join(submissions_dir, f"submission_{timestamp}.csv")

# index=False: only the frame's columns are written, not its row index.
submission_set.to_csv(submission_file, index=False)

print(f"Submission saved to {submission_file}")
