In [1]:
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
import optuna
from lightgbm.callback import early_stopping

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Read the CSVs prepared by the exploratory data analysis step:
#   - submission_set:          the submission rows; the predicted 'tow'
#                              column is written into this frame later on
#   - submission_set_updated:  the "updated" (v13) submission frame that the
#                              model input is taken from
#   - challenge_set_updated:   the "updated" (v13) challenge frame, which
#                              contains the 'tow' target used for training
submission_set = pd.read_csv("./data/submission_set.csv")
submission_set_updated = pd.read_csv("./data/submission_set_updated_v13.csv")
challenge_set_updated = pd.read_csv("./data/challenge_set_updated_v13.csv")



In [3]:
# 'tow' is the regression target; every other column is used as a feature.
# drop() returns a new frame, so challenge_set_updated itself is left intact.
y = challenge_set_updated['tow']
X = challenge_set_updated.drop(columns=['tow'])

In [4]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Columns of X that hold categorical values. They are integer-coded here and
# later declared to LightGBM as categorical features.
categorical_columns = ['adep', 'ades', 'aircraft_type', 'wtc', 'airline', 'offblock_season',
                       'flight_duration_category', 'adep_region', 'ades_region',
                       'flight_direction', 'Manufacturer', 'Model_FAA',
                       'Physical_Class_Engine', 'FAA_Weight']

# Keep every fitted encoder (column name -> LabelEncoder) so the exact
# string -> integer mapping used for training can be inspected or reapplied.
train_label_encoders = {}

# Encode from the untouched challenge_set_updated frame rather than from X.
# Reading from X would make this cell non-idempotent: on a second run the
# already-encoded integers would be cast to strings ("0", "1", "10", ...) and
# re-encoded in string sort order, silently permuting the codes.
for col in categorical_columns:
    le = LabelEncoder()
    # Cast to str first so mixed types and missing values encode consistently.
    X[col] = le.fit_transform(challenge_set_updated[col].astype(str))
    train_label_encoders[col] = le


In [5]:
# Hold out 20% of the rows for validation; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [6]:
# Wrap both splits in LightGBM Datasets. The two share the same construction
# options: the label-encoded columns are declared categorical and the raw
# data is kept alive (free_raw_data=False).
dataset_options = {
    'categorical_feature': categorical_columns,
    'free_raw_data': False,
}
train_data = lgb.Dataset(X_train, label=y_train, **dataset_options)
valid_data = lgb.Dataset(X_val, label=y_val, **dataset_options)

# Booster hyper-parameters.
params = dict(
    objective='regression',
    metric='rmse',
    boosting_type='gbdt',
    verbose=-1,
    num_leaves=31,
    learning_rate=0.05,
    min_data_per_group=100,  # guards against overfitting on categorical data
    cat_smooth=10,           # smoothing factor for categorical feature influence
)

# Train for at most 1000 rounds, stopping early once the validation score has
# not improved for 50 consecutive rounds.
# NOTE(review): in the recorded output of this cell early stopping never
# triggered (best iteration was round 1000), so the round cap, not the
# callback, ended training in that run.
stop_when_stalled = early_stopping(stopping_rounds=50)
bst = lgb.train(
    params=params,
    train_set=train_data,
    num_boost_round=1000,
    valid_sets=[valid_data],
    callbacks=[stop_when_stalled],
)

Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's rmse: 2692.03


In [7]:
# Score the held-out validation split with the trained booster.
val_pred = bst.predict(X_val)

# RMSE is the square root of the mean squared error.
mse = mean_squared_error(y_val, val_pred)
rmse = np.sqrt(mse)

# Report the result.
print(f"The RMSE of the validation set is: {rmse}")

The RMSE of the validation set is: 2692.0330361958218


In [8]:
from sklearn.preprocessing import LabelEncoder

# Categorical columns that were label-encoded before training. The submission
# set must be encoded with the SAME string -> integer mapping; fitting fresh
# encoders on the submission data would assign different codes to the same
# category and feed the model scrambled categorical features.
categorical_columns = ['adep', 'ades', 'aircraft_type', 'wtc', 'airline', 'offblock_season', 'flight_duration_category',
                       'adep_region', 'ades_region', 'flight_direction', 'Manufacturer', 'Model_FAA',
                       'Physical_Class_Engine', 'FAA_Weight']

# Column name -> encoder fitted on the training (challenge) data.
label_encoders = {}

# Rebuild the training-time mapping from the raw challenge set. LabelEncoder
# numbers classes in sorted order, so fitting on the same string-cast column
# yields the same codes that were used when the model was trained.
#
# NOTE: this mutates submission_set_updated in place, so it must run on the
# freshly loaded frame. Running the cell a second time would find no known
# labels and map everything to -1 (the unseen-label report below makes that
# visible).
for column in categorical_columns:
    encoder = LabelEncoder()
    encoder.fit(challenge_set_updated[column].astype(str))
    label_encoders[column] = encoder

    code_by_label = {label: code for code, label in enumerate(encoder.classes_)}
    mapped = submission_set_updated[column].astype(str).map(code_by_label)

    # Labels absent from the training data have no code. Encode them as -1,
    # which LightGBM treats as a missing value for categorical features,
    # instead of letting LabelEncoder.transform raise on them.
    unseen_count = int(mapped.isna().sum())
    if unseen_count:
        print(f"{column}: {unseen_count} rows with categories not seen in training (encoded as -1)")
    submission_set_updated[column] = mapped.fillna(-1).astype(int)


# Features are every column except the last one, selected by position.
# NOTE(review): presumably the last column is the empty 'tow' placeholder and
# the remaining columns are in the same order as the training features --
# TODO confirm against the training frame's columns.
submission_set_features = submission_set_updated.iloc[:, :-1]
submission_set['tow'] = bst.predict(submission_set_features)

print(submission_set)

        flight_id        date                          callsign  adep  \
0       248753821  2022-01-01  3b3de0f3ad0ee192513995c02f7bf7cf  LTFJ   
1       248753822  2022-01-01  e06dd03d4a879ca37d9e18c1bd7cad16  EBBR   
2       248754498  2022-01-01  2d3b1c962c78c4ebeef11bcd51b9e94c  KMIA   
3       248757623  2022-01-01  81564432d3ee97c4bdf4cd8f006753dc  EGCN   
4       248763603  2022-01-01  84be079d7e660db105d91f600b4b3d59  EIDW   
...           ...         ...                               ...   ...   
105954  258066302  2022-12-31  2d3b4446c4d05a25196a9d52cab936fb  LTFJ   
105955  258068609  2022-12-31  253fd692ed441fac523081471c067772  LOWW   
105956  258068876  2022-12-31  c9fca302ca2e28acab0eb0bb1b46f11b  LTFM   
105957  258064675  2022-12-31  00f96ad0e382476649574ba044c764fc  EHAM   
105958  258058370  2022-12-31  5f0c222c7f7ceff3fbe75c854cce74c9  UBBB   

                     name_adep country_code_adep  ades          name_ades  \
0       Istanbul Sabiha Gokcen                

In [9]:
import os
from datetime import datetime

# Build the output path: submissions/submission_<YYYYmmdd_HHMMSS>.csv.
# The timestamp in the name gives each run its own file.
submissions_dir = 'submissions'
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
submission_file = os.path.join(submissions_dir, f"submission_{timestamp}.csv")

# Make sure the target directory exists, then write the frame without its index.
os.makedirs(submissions_dir, exist_ok=True)
submission_set.to_csv(submission_file, index=False)

print(f"Submission saved to {submission_file}")


Submission saved to submissions/submission_20241015_205815.csv
