In [7]:
import os
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
import xgboost as xgb
from scipy.stats import randint, uniform

In [8]:
# Directory (relative to the working directory) that receives the selected
# model and its preprocessing artifacts.
MODEL_DIR = "models"
# exist_ok=True keeps this cell re-runnable once the directory already exists.
os.makedirs(MODEL_DIR, exist_ok=True)

In [10]:
# Location of the training CSV. The original machine-specific path is kept as
# the default so existing runs behave the same; set FLIGHT_DELAY_TRAIN_CSV to
# run the notebook elsewhere without editing this cell.
data_path = os.environ.get("FLIGHT_DELAY_TRAIN_CSV", "D:/Avs/ICBT/CI/system3/train.csv")
train_df = pd.read_csv(data_path)

In [11]:
# Split DepTime into two helper columns: the quotient by 100 is stored as the
# hour and the remainder as the minute.
train_dep_time = train_df['DepTime']
train_df = train_df.assign(
    DepHour=train_dep_time // 100,
    DepMinute=train_dep_time % 100,
)

def create_time_slot(hour):
    """Return the departure time-slot label for an hour value.

    Slots use an inclusive lower bound and an exclusive upper bound:
    6-10 -> 'Morning', 10-14 -> 'Midday', 14-18 -> 'Afternoon',
    18-22 -> 'Evening'. Any other value (including NaN, for which every
    comparison is false) yields 'Night'.
    """
    slot_upper_bounds = (
        (10, 'Morning'),
        (14, 'Midday'),
        (18, 'Afternoon'),
        (22, 'Evening'),
    )
    if hour >= 6:
        # Bounds are ascending, so the first upper bound above `hour` wins.
        for upper_bound, label in slot_upper_bounds:
            if hour < upper_bound:
                return label
    return 'Night'

# Label every row with its departure slot, then discard the two helper columns
# (DepMinute is never read; DepHour is only needed to derive the slot).
train_df['DepTimeSlot'] = train_df['DepHour'].map(create_time_slot)
train_df = train_df.drop(columns=['DepHour', 'DepMinute'])

In [5]:
# Columns that are one-hot encoded; every remaining column is passed through.
categorical_cols = ['Month', 'DayofMonth', 'DayOfWeek', 'UniqueCarrier', 'Origin', 'Dest', 'DepTimeSlot']

# handle_unknown='ignore' makes categories unseen during fit encode as all-zero
# rows at transform time instead of raising.
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
encoded_train = encoder.fit_transform(train_df[categorical_cols])

# pd.concat(axis=1) aligns rows by index label, so the encoded frame must carry
# train_df's index; a fresh RangeIndex would silently yield NaN-padded rows
# whenever train_df's index is not 0..n-1.
encoded_train_df = pd.DataFrame(
    encoded_train,
    columns=encoder.get_feature_names_out(categorical_cols),
    index=train_df.index,
)

# Swap the raw categorical columns for their encoded counterparts without
# mutating the frame in place.
train_df = pd.concat([train_df.drop(columns=categorical_cols), encoded_train_df], axis=1)

In [6]:
# Encode the target as integers ('Y' -> 1, 'N' -> 0). Any other label maps to
# NaN, which the integer cast rejects; the same happens if this cell is re-run
# on the already-encoded column.
delay_label_to_int = {'Y': 1, 'N': 0}
train_df['dep_delayed_15min'] = (
    train_df['dep_delayed_15min']
    .map(delay_label_to_int)
    .astype(int)
)

In [7]:
# Separate the target from the features, then hold out 20% of the rows for
# validation. Stratifying on y keeps the class ratio equal in both parts.
y = train_df['dep_delayed_15min']
X = train_df.drop(columns=['dep_delayed_15min'])
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [8]:
# Fit the scaler on the training split only, then apply the same transform to
# both splits so no validation statistics leak into the fit.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_val = scaler.transform(X_val)

In [10]:
# Search space for the random forest. scipy's randint(low, high) samples
# integers from low (inclusive) up to high (exclusive).
param_dist_rf = dict(
    n_estimators=randint(100, 500),
    max_depth=randint(5, 20),
    min_samples_split=randint(2, 10),
    min_samples_leaf=randint(1, 4),
)

# 20 sampled configurations, each scored by 5-fold cross-validated ROC AUC,
# using all available cores.
rf_model = RandomForestClassifier(random_state=42)
random_search_rf = RandomizedSearchCV(
    rf_model,
    param_distributions=param_dist_rf,
    n_iter=20,
    cv=5,
    scoring='roc_auc',
    n_jobs=-1,
    random_state=42,
)
random_search_rf.fit(X_train, y_train)

# best_estimator_ is the winning configuration taken from the fitted search.
best_rf_model = random_search_rf.best_estimator_
print("Best RandomForest Parameters:", random_search_rf.best_params_)

Best RandomForest Parameters: {'max_depth': 19, 'min_samples_leaf': 3, 'min_samples_split': 5, 'n_estimators': 363}


In [11]:
# Persist the tuned random forest. The relative path resolves against the
# current working directory, not MODEL_DIR.
rf_checkpoint_path = "best_rf_model.joblib"
joblib.dump(best_rf_model, rf_checkpoint_path)
print("Random Forest model saved successfully!")

Random Forest model saved successfully!


In [12]:
# Read the random forest checkpoint back from the working directory.
# NOTE(review): loaded_rf_model is not referenced by any later visible cell;
# confirm whether this round-trip is still needed.
rf_checkpoint_to_load = "best_rf_model.joblib"
loaded_rf_model = joblib.load(rf_checkpoint_to_load)
print("Random Forest model loaded successfully!")

Random Forest model loaded successfully!


## Hyperparameter Tuning - XGBoost

In [13]:
# Search space for XGBoost. scipy's uniform(loc, scale) samples from
# [loc, loc + scale], so learning_rate spans 0.01-0.31 and the two sampling
# ratios span 0.6-1.0.
param_dist_xgb = dict(
    n_estimators=randint(100, 500),
    max_depth=randint(3, 10),
    learning_rate=uniform(0.01, 0.3),
    subsample=uniform(0.6, 0.4),
    colsample_bytree=uniform(0.6, 0.4),
)

# use_label_encoder=False is intentionally not passed (per the original note,
# it was removed for XGBoost v1.3+).
xgb_model = xgb.XGBClassifier(random_state=42, eval_metric='logloss')

# Same protocol as the random forest search: 20 draws, 5-fold CV, ROC AUC.
random_search_xgb = RandomizedSearchCV(
    xgb_model,
    param_distributions=param_dist_xgb,
    n_iter=20,
    cv=5,
    scoring='roc_auc',
    n_jobs=-1,
    random_state=42,
)
random_search_xgb.fit(X_train, y_train)

best_xgb_model = random_search_xgb.best_estimator_
print("Best XGBoost Parameters:", random_search_xgb.best_params_)

Best XGBoost Parameters: {'colsample_bytree': 0.9140703845572055, 'learning_rate': 0.06990213464750791, 'max_depth': 9, 'n_estimators': 343, 'subsample': 0.836965827544817}


In [14]:
def _validation_scores(model):
    """Return the second predict_proba column (label 1) on the validation split."""
    return model.predict_proba(X_val)[:, 1]


# Score each tuned model on the held-out validation split.
y_pred_rf = _validation_scores(best_rf_model)
roc_auc_rf = roc_auc_score(y_val, y_pred_rf)
print("Validation ROC AUC (RandomForest):", roc_auc_rf)

y_pred_xgb = _validation_scores(best_xgb_model)
roc_auc_xgb = roc_auc_score(y_val, y_pred_xgb)
print("Validation ROC AUC (XGBoost):", roc_auc_xgb)

Validation ROC AUC (RandomForest): 0.7227903694086082
Validation ROC AUC (XGBoost): 0.7585504663830318


In [15]:
# Keep whichever tuned model has the strictly higher validation ROC AUC;
# a tie falls back to the random forest.
xgb_is_better = roc_auc_xgb > roc_auc_rf
best_model = best_xgb_model if xgb_is_better else best_rf_model
model_basename = (
    'flight_delay_model_xgb_tuned.joblib'
    if xgb_is_better
    else 'flight_delay_model_rf_tuned.joblib'
)
model_filename = os.path.join(MODEL_DIR, model_basename)

joblib.dump(best_model, model_filename)

# Store everything inference needs next to the model: the fitted scaler, the
# fitted encoder, and the feature column order used for training.
for artifact, artifact_basename in (
    (scaler, 'standard_scaler_tuned.joblib'),
    (encoder, 'one_hot_encoder_tuned.joblib'),
    (X.columns.tolist(), 'model_columns_tuned.joblib'),
):
    joblib.dump(artifact, os.path.join(MODEL_DIR, artifact_basename))

print(f"Best model saved as: {model_filename}")

Best model saved as: models\flight_delay_model_xgb_tuned.joblib


In [4]:
# Reload the artifacts written by the model-selection cell, so inference uses
# the model, scaler, encoder and column order produced by this run. The
# previous un-suffixed file names in the working directory are never written by
# this notebook, so they were either missing or stale.
# NOTE: joblib.load unpickles arbitrary objects; only load files you trust.
loaded_model = joblib.load(model_filename)
loaded_scaler = joblib.load(os.path.join(MODEL_DIR, 'standard_scaler_tuned.joblib'))
loaded_encoder = joblib.load(os.path.join(MODEL_DIR, 'one_hot_encoder_tuned.joblib'))
loaded_columns = joblib.load(os.path.join(MODEL_DIR, 'model_columns_tuned.joblib'))

In [12]:
# Location of the test CSV. The original machine-specific path is kept as the
# default; set FLIGHT_DELAY_TEST_CSV to override it without editing this cell.
# NOTE(review): the default lives under "system2" while the training CSV
# default is under "system3" - confirm both come from the same dataset.
test_data_path = os.environ.get("FLIGHT_DELAY_TEST_CSV", "D:/Avs/ICBT/CI/system2/test.csv")
test_df = pd.read_csv(test_data_path)

In [13]:
# Same helper columns as for the training frame: quotient by 100 as the hour,
# remainder as the minute.
test_dep_time = test_df['DepTime']
test_df['DepHour'] = test_dep_time.floordiv(100)
test_df['DepMinute'] = test_dep_time.mod(100)

In [14]:
# NOTE(review): this re-defines create_time_slot, which the training section
# already defines with the same boundaries; keep the two in sync or define it
# once.
def create_time_slot(hour):
    """Map an hour value to its departure slot label.

    6-10 -> 'Morning', 10-14 -> 'Midday', 14-18 -> 'Afternoon',
    18-22 -> 'Evening' (lower bound inclusive, upper bound exclusive);
    anything else, NaN included, -> 'Night'.
    """
    if not (6 <= hour < 22):
        return 'Night'
    if hour < 10:
        return 'Morning'
    if hour < 14:
        return 'Midday'
    if hour < 18:
        return 'Afternoon'
    return 'Evening'

# Derive the slot label per row, then remove the helper columns that were only
# needed to compute it.
test_df['DepTimeSlot'] = test_df['DepHour'].map(create_time_slot)
test_df = test_df.drop(columns=['DepHour', 'DepMinute'])


In [15]:
# Columns to one-hot encode with the already-fitted encoder (transform only,
# no refit on test data).
categorical_cols = ['Month', 'DayofMonth', 'DayOfWeek', 'UniqueCarrier', 'Origin', 'Dest', 'DepTimeSlot']
encoded_test = loaded_encoder.transform(test_df[categorical_cols])

# pd.concat(axis=1) aligns rows by index label, so the encoded frame must reuse
# test_df's index; a fresh RangeIndex would silently misalign rows (NaN
# padding) whenever test_df's index is not 0..n-1.
encoded_test_df = pd.DataFrame(
    encoded_test,
    columns=loaded_encoder.get_feature_names_out(categorical_cols),
    index=test_df.index,
)

# Replace the raw categorical columns with their encoded counterparts.
test_df = pd.concat([test_df.drop(columns=categorical_cols), encoded_test_df], axis=1)

In [16]:
# Restrict and reorder the test frame to the loaded column list; a column that
# is missing from test_df raises KeyError.
# Presumably loaded_columns is the training feature order - verify against the
# artifact being loaded.
test_df = test_df.loc[:, loaded_columns]

In [17]:
# Apply the previously fitted scaler to the test features (transform only, so
# no statistics are re-estimated from the test data).
scaled_test = loaded_scaler.transform(test_df)

In [18]:
# predict_proba returns one column per class; keep the second column as the
# per-row score for the submission.
class_probabilities = loaded_model.predict_proba(scaled_test)
predictions = class_probabilities[:, 1]

In [21]:
# Build the submission table: a 0-based running id plus the predicted score,
# written without the DataFrame index.
submission_df = pd.DataFrame(
    {
        'id': range(len(predictions)),
        'dep_delayed_15min': predictions,
    }
)
submission_df.to_csv('submission.csv', index=False)

print('submission.csv created!')

submission.csv created!
