In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
import xgboost as xgb
import numpy as np
from sklearn.metrics import accuracy_score

In [2]:
# Load the modelling dataset from a CSV file resolved relative to the
# current working directory.
df = pd.read_csv('final_data2.csv')

In [3]:
# Separate the label from the predictors: 'delay' is the target, and every
# other column of `df` is used as a feature.
y = df['delay']
X = df.drop(columns=['delay'])

# Hold out 20% of the rows for final evaluation; the fixed seed keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [4]:
# Base XGBoost classifier whose hyperparameters are tuned below.
# n_jobs=1 keeps each individual fit single-threaded: the search itself is
# parallelised across candidates/folds (n_jobs=-1 below), and letting every
# worker also spawn one thread per core causes thread contention that slows
# both levels down.
xgb_model = xgb.XGBClassifier(random_state=42, n_jobs=1)

# Candidate values sampled by the randomized search
# (3 options for each of 6 parameters -> 729 possible combinations).
param_dist = {
    'n_estimators': [100, 200, 500],  # Number of trees
    'learning_rate': [0.01, 0.05, 0.1],  # Learning rate
    'max_depth': [3, 6, 10],  # Depth of trees
    'min_child_weight': [1, 5, 10],  # Minimum sum of instance weight in a child
    'subsample': [0.7, 0.8, 1.0],  # Fraction of samples for fitting
    'colsample_bytree': [0.7, 0.8, 1.0],  # Fraction of features for each tree
}

# Randomized search: evaluates only n_iter sampled combinations instead of
# the full grid, each scored with 3-fold cross-validation.
random_search = RandomizedSearchCV(
    estimator=xgb_model,
    param_distributions=param_dist,
    n_iter=20,  # Number of sampled candidates (reduce to make it faster)
    cv=3,  # Number of cross-validation folds
    random_state=42,  # Makes the sampled candidates reproducible
    n_jobs=-1,  # Run the candidate/fold fits in parallel on all CPU cores
    verbose=1
)

# Run the hyperparameter search on the training split only.
random_search.fit(X_train, y_train)

# Keep both the winning parameter combination and the estimator that the
# search exposes as its best model.
best_params = random_search.best_params_
best_xgb = random_search.best_estimator_
print("Best Parameters:", best_params)

# Score the selected model on the held-out test split.
y_pred = best_xgb.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"XGBoost Accuracy: {accuracy:.4f}")


Fitting 3 folds for each of 20 candidates, totalling 60 fits
Best Parameters: {'subsample': 1.0, 'n_estimators': 200, 'min_child_weight': 1, 'max_depth': 10, 'learning_rate': 0.1, 'colsample_bytree': 0.8}
XGBoost Accuracy: 0.8228


In [5]:
# Importance scores reported by the tuned model, one per training column.
importances = best_xgb.feature_importances_

# Pair each feature name with its score and rank from most to least
# important (the original positional index is kept as the row label).
feature_importance_df = pd.DataFrame(
    {'feature': X_train.columns, 'importance': importances}
).sort_values(by='importance', ascending=False)

# Print the complete ranking, not just the first rows: lifting the row limit
# stops pandas from truncating the output.
# NOTE: this changes the display option globally for the rest of the session.
pd.set_option('display.max_rows', None)

print(feature_importance_df)


                                               feature  importance
26                                   city_visibility_y    0.054387
7                                             distance    0.031172
30                                   travel_time_hours    0.027126
35                              day_of_month_departure    0.026905
97   city_description_y_Moderate or heavy snow showers    0.023016
52                       city_description_x_Light snow    0.021920
72                            city_description_x_Sunny    0.021076
64                city_description_x_Patchy heavy snow    0.019726
87                city_description_y_Light rain shower    0.016647
50                       city_description_x_Light rain    0.016248
99                    city_description_y_Moderate rain    0.015943
113                           city_description_y_Sunny    0.015815
68                city_description_x_Patchy light snow    0.015579
108               city_description_y_Patchy light snow    0.01

: 