# Model Inference

#### Imports

In [1]:
import joblib
import pandas as pd

#### Load Model and Columns

In [2]:
# ---- Load Trained Model ----
# The path is relative, so it resolves against the notebook's working directory.
# joblib.load unpickles the file; only point it at model artifacts you trust.
model_path = "../models/delay_predictor.pkl"
model = joblib.load(model_path)
print("✅ Model loaded successfully")

✅ Model loaded successfully


In [3]:
# ---- Load Training Columns ----
# Rebuild the feature layout from the processed dataset so inference inputs
# can be aligned to it (presumably mirrors the training preprocessing — verify
# against the training notebook).
categorical_columns = ["FestivalImpact", "Weather", "TimeOfDay"]
non_feature_columns = ["DelayMinutes", "Date", "StationName", "StationCode", "ArrivalTime", "DepartureTime"]

# One-hot encode the categorical fields, dropping the first level of each.
training_data = pd.get_dummies(
    pd.read_csv("../datasets/processed/final_clean_dataset.csv"),
    columns=categorical_columns,
    drop_first=True,
)

# Everything except the target and the identifier/time columns is a feature.
X_train_columns = training_data.drop(columns=non_feature_columns).columns
print(f"Feature columns loaded: {len(X_train_columns)}")

Feature columns loaded: 11


#### Prediction Function

In [4]:
# ---- Prediction Function ----
def predict_delay(input_data, model=model, feature_columns=X_train_columns):
    """
    Predict delay for a single input sample.

    The sample is wrapped in a one-row DataFrame, one-hot encoded with
    ``pd.get_dummies``, aligned to ``feature_columns`` and passed to
    ``model.predict``.

    Parameters:
        input_data (dict): Raw feature values keyed by column name. Example:
            {
                "Distance": 500,
                "Weather": "Rain",
                "FestivalImpact": "FestivalEve",
                "TimeOfDay": "Day"
            }
        model: Trained RandomForest model; only its ``predict`` method is
            called here.
        feature_columns: Columns used during training, in the order the
            model expects them.

    Returns:
        float: Predicted delay in minutes

    Notes:
        - Any entry of ``feature_columns`` that the encoded sample does not
          contain is filled with 0. This covers one-hot levels that are not
          active for this sample, but it also silently zero-fills any
          feature the caller forgot to supply.
        - Encoded columns that are not in ``feature_columns`` are discarded
          without warning, e.g. a misspelled key or a category value for
          which ``feature_columns`` has no matching dummy column.
        - ``pd.get_dummies`` is called without ``columns=``, so it encodes
          only object/string/category-typed values. Numeric features must be
          passed as numbers, not as strings.
    """
    # Wrap the single sample so pandas treats it as one row.
    sample_df = pd.DataFrame([input_data])

    # One-hot encode categorical variables.
    encoded_df = pd.get_dummies(sample_df)

    # Align with the training columns in one step: missing columns are added
    # with 0, extra columns are dropped, and the training order is enforced.
    aligned_df = encoded_df.reindex(columns=feature_columns, fill_value=0)

    # model.predict returns one value per row; there is exactly one row.
    return model.predict(aligned_df)[0]


#### Testing

In [5]:
# ---- Example Input ----
# A single sample with one numeric field and three categorical fields.
sample_input = dict(
    Distance=500,
    Weather="Rain",
    FestivalImpact="FestivalEve",
    TimeOfDay="Day",
)

predicted_delay = predict_delay(sample_input)
print(f"Predicted Delay: {predicted_delay:.2f} minutes")

Predicted Delay: 41.13 minutes


In [6]:
# ---- Batch Prediction ----
# Each record is scored independently through predict_delay, one call per row.
batch_data = [
    {"Distance": 250, "Weather": "Clear", "FestivalImpact": "None", "TimeOfDay": "Morning"},
    {"Distance": 1200, "Weather": "Fog", "FestivalImpact": "FestivalDay", "TimeOfDay": "Evening"}
]

batch_df = pd.DataFrame(batch_data)

# Turn every row back into a plain dict and collect the predictions in row order.
row_predictions = [predict_delay(record) for record in batch_df.to_dict(orient="records")]
batch_df["PredictedDelay"] = row_predictions
display(batch_df)


Unnamed: 0,Distance,Weather,FestivalImpact,TimeOfDay,PredictedDelay
0,250,Clear,,Morning,13.630467
1,1200,Fog,FestivalDay,Evening,70.886075
