# Model Inference

#### Imports

In [2]:
import joblib
import pandas as pd
import os
import sys
import sys, os
sys.path.append(os.path.abspath(".."))
from data_fetching.fetch_external_data import get_weather, get_festival_impact

#### Load Model and Columns

In [3]:
# ---- Load Trained Model ----
# Deserialize the trained delay model from disk; it is bound to the
# notebook-level name `model`.
# NOTE(review): joblib.load unpickles the file, so only load artifacts that
# come from a trusted source.
model_path = "../models/delay_predictor.pkl"
model = joblib.load(model_path)
print("✅ Model loaded successfully")

✅ Model loaded successfully


In [4]:
# ---- Load Training Columns ----
# Rebuild the feature-column index from the processed dataset so that
# inference inputs can be aligned to it (same names, same order).
# NOTE(review): presumably this mirrors the preprocessing done at training
# time (same dummy columns, same drop_first setting) - confirm against the
# training notebook.
categorical_columns = ["FestivalImpact", "Weather", "TimeOfDay"]
non_feature_columns = [
    "DelayMinutes",
    "Date",
    "StationName",
    "StationCode",
    "ArrivalTime",
    "DepartureTime",
]

training_data = pd.get_dummies(
    pd.read_csv("../datasets/processed/final_clean_dataset.csv"),
    columns=categorical_columns,
    drop_first=True,
)
X_train_columns = training_data.drop(columns=non_feature_columns).columns
print(f"Feature columns loaded: {len(X_train_columns)}")

Feature columns loaded: 11


#### Prediction Function

In [None]:
def _parse_departure_time(departure_time_str):
    """Parse a departure time written as ``HH:MM:SS`` or, failing that, ``HH:MM``.

    Raises:
        ValueError: if the value matches neither format.
    """
    try:
        return pd.to_datetime(departure_time_str, format="%H:%M:%S")
    except ValueError:
        return pd.to_datetime(departure_time_str, format="%H:%M")


def _time_of_day(hour):
    """Map an hour of the day to the ``TimeOfDay`` category fed to the model.

    NOTE(review): these boundaries must match whatever produced the
    ``TimeOfDay`` column of the training dataset - confirm.
    """
    if 4 <= hour < 9:
        return "Morning"
    if 9 <= hour < 18:
        return "Day"
    if 18 <= hour < 22:
        return "Evening"
    return "Night"


def predict_delay(train_number, date):
    """
    Predict the delay (in minutes) for a train number on a given date.

    Route information is read from ``master_routes.csv``; weather and festival
    impact are obtained from ``get_weather`` and ``get_festival_impact``. The
    resulting single-row frame is one-hot encoded, aligned to the notebook-level
    ``X_train_columns`` and passed to the notebook-level ``model``.

    Args:
        train_number: Train identifier; compared as a string against the
            ``TrainNumber`` column, so ``12305`` and ``"12305"`` are equivalent.
        date: Anything ``pd.to_datetime`` accepts (e.g. ``"2024-08-15"``).

    Returns:
        The first element of ``model.predict`` for the prepared row.

    Raises:
        ValueError: if the train number is not in the routes data, if the first
            route row has no departure time, or if that departure time matches
            neither ``HH:MM:SS`` nor ``HH:MM``.
    """
    # --- Load route data to get distance ---
    routes = pd.read_csv("../datasets/processed/master_routes.csv")
    train_data = routes[routes["TrainNumber"].astype(str) == str(train_number)]
    if train_data.empty:
        raise ValueError(f"Train number {train_number} not found in routes data")

    distance = train_data["Distance"].max()
    city_name = train_data["StationName"].iloc[0]  # first route row is the reference station
    departure_time_str = train_data["DepartureTime"].iloc[0]

    # A missing value (NaN from the CSV) would parse to NaT, whose ``hour`` is
    # NaN; every range comparison on NaN is False, so the row would silently be
    # labelled "Night". Fail loudly instead, before any external call is made.
    if pd.isna(departure_time_str):
        raise ValueError(
            f"Train number {train_number} has no departure time in routes data"
        )
    time_of_day = _time_of_day(_parse_departure_time(departure_time_str).hour)

    # --- Fetch weather and festival data (date parsed once, shared by both) ---
    travel_date = pd.to_datetime(date).date()
    weather = get_weather(city_name, travel_date)
    festival = get_festival_impact(travel_date) or "None"

    # --- Prepare input dataframe ---
    input_df = pd.DataFrame([{
        "Distance": distance,
        "Weather": weather,
        "FestivalImpact": festival,
        "TimeOfDay": time_of_day
    }])

    # --- One-hot encode & align columns ---
    # reindex keeps exactly the training columns in training order: dummy
    # columns the model never saw are dropped, and training columns absent from
    # this row are filled with 0.
    # NOTE(review): any training feature not built above is zero-filled too -
    # confirm that is what the model expects.
    input_df = pd.get_dummies(input_df).reindex(columns=X_train_columns, fill_value=0)

    # --- Predict ---
    return model.predict(input_df)[0]


#### Testing

In [7]:
# Smoke test: run the full prediction path for one sample train and date.
sample_train_number, sample_date = "12305", "2024-08-15"
predicted_delay = predict_delay(sample_train_number, sample_date)
print(f"Predicted Delay: {predicted_delay:.2f} minutes")

Predicted Delay: 26.01 minutes
