# Inference Pipeline
Predicts the delay for the next hour from the delays observed in the last three 30-minute intervals.

### Imports

In [None]:
from datetime import date
import hopsworks
import os
import pandas as pd
from pathlib import Path
import sys
import joblib
import time

# Resolve the repository root so project-relative paths (and the `src` package)
# work no matter which subdirectory the notebook was started in.
root_dir = Path().absolute()
# Strip a trailing 'pipeline' component and then a trailing 'src' component, if present
if root_dir.name == 'pipeline':
    root_dir = root_dir.parent
if root_dir.name == 'src':
    root_dir = root_dir.parent
root_dir = str(root_dir)

os.chdir(root_dir)
# Put the repository root on sys.path explicitly so the `src` import below does
# not depend on how the kernel happened to set up the import path.
if root_dir not in sys.path:
    sys.path.append(root_dir)
print(f"Root dir: {Path.cwd()}")

# Import only the helper this notebook uses rather than a wildcard import,
# so it is clear where the name comes from and nothing else is shadowed.
from src.data_utils.plots import plot_metro_delay_predictions

Root dir: /Users/serkan/ID2223-project


### Connect to Hopsworks

In [None]:
# Enter the project name if the project in Hopsworks is not your main project;
# set project_name to None to log in to the main project instead.
project_name = 'metro_delay_prediction'
max_retries = 3

# Log in and fetch the feature store handle, retrying on any failure.
# Produces `project` and `fs` for the following cells.
last_error = None
for attempt in range(1, max_retries + 1):
    try:
        if project_name:
            project = hopsworks.login(project=project_name)
        else:
            project = hopsworks.login()
        fs = project.get_feature_store()
        break
    except Exception as e:
        # Keep the exception: the name `e` is unbound once the except block ends
        last_error = e
        # Only announce a retry and wait when another attempt will follow
        if attempt < max_retries:
            print(f'Error {e}, retrying in 1 second')
            time.sleep(1)
else:
    # All attempts failed: chain the last error so the root cause is in the traceback
    raise RuntimeError('Failed to connect to hopsworks') from last_error

2026-01-03 19:07:55,725 INFO: Initializing external client
2026-01-03 19:07:55,726 INFO: Base URL: https://c.app.hopsworks.ai:443
To ensure compatibility please install the latest bug fix release matching the minor version of your backend (4.2) by running 'pip install hopsworks==4.2.*'







2026-01-03 19:07:57,541 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1330333


### Retrieve Model from Model Registry and load model + artifacts

In [None]:
# Download the registered model and load it together with its encoders,
# retrying on any failure. Produces `xgb_model`, `line_encoder`, `day_encoder`.
max_retries = 3
last_error = None
for attempt in range(1, max_retries + 1):
    try:
        mr = project.get_model_registry()
        retrieved_model = mr.get_model(name='xgb_regressor', version=3)

        model_dir = retrieved_model.download()

        # NOTE(review): joblib.load unpickles these files, which can execute
        # arbitrary code; only load artifacts from a registry you trust.
        xgb_model = joblib.load(f'{model_dir}/model.joblib')
        line_encoder = joblib.load(f'{model_dir}/line_encoder.pkl')
        day_encoder = joblib.load(f'{model_dir}/day_encoder.pkl')
        break
    except Exception as e:
        # Keep the exception: the name `e` is unbound once the except block ends
        last_error = e
        # Only announce a retry and wait when another attempt will follow
        if attempt < max_retries:
            print(f'Error {e}, retrying in 1 second')
            time.sleep(1)
else:
    # All attempts failed: chain the last error so the root cause is in the traceback
    raise RuntimeError('Failed to get model and artifacts.') from last_error

Downloading: 0.000%|          | 0/229483 elapsed<00:00 remaining<?

Downloading model artifact (0 dirs, 1 files)... 

Downloading: 0.000%|          | 0/517 elapsed<00:00 remaining<?

Downloading model artifact (0 dirs, 2 files)... 

Downloading: 0.000%|          | 0/579 elapsed<00:00 remaining<?

Downloading model artifact (0 dirs, 3 files)... DONE

### Fetch features and take most recent data

In [None]:
# Read the delay feature group with online=True, retrying on any
# failure. Produces `features_df` for the following cells.
max_retries = 3
last_error = None
for attempt in range(1, max_retries + 1):
    try:
        features_fg = fs.get_feature_group(
            name="delay_features_fg",
            version=1
        )
        features_df = features_fg.read(online=True)
        break
    except Exception as e:
        # Keep the exception: the name `e` is unbound once the except block ends
        last_error = e
        # Only announce a retry and wait when another attempt will follow
        if attempt < max_retries:
            print(f'Error {e}, retrying in 1 second')
            time.sleep(1)
else:
    # All attempts failed: chain the last error so the root cause is in the traceback
    raise RuntimeError('Failed to get features') from last_error

In [5]:
# Order the rows chronologically, then keep the last (most recent) row of each line
features_by_time = features_df.sort_values("timestamp")
latest_per_line = features_by_time.groupby("line").tail(1)
latest_df = latest_per_line.reset_index(drop=True)
print(latest_df)

            timestamp        timestamp_str          line    delay_60  \
0 2026-01-03 18:39:00  2026-01-03T18:39:00   Röda linjen   -8.040000   
1 2026-01-03 18:39:00  2026-01-03T18:39:00    Blå linjen  -28.500000   
2 2026-01-03 18:39:00  2026-01-03T18:39:00  Gröna linjen  751.384615   

     delay_30  delay_current  
0  -15.000000     -22.000000  
1  -30.833333     -31.666667  
2  224.384615      80.566667  


### Encode non-numerical features and set up data for inference

In [6]:
# Derive the weekday name from the timestamp, then add an encoded column for
# the weekday and for the line using the encoders loaded with the model.
latest_df["day"] = latest_df["timestamp"].dt.day_name()
for source_col, encoder in (("day", day_encoder), ("line", line_encoder)):
    latest_df[f"{source_col}_encoded"] = encoder.transform(latest_df[source_col])
print(latest_df.head())





            timestamp        timestamp_str          line    delay_60  \
0 2026-01-03 18:39:00  2026-01-03T18:39:00   Röda linjen   -8.040000   
1 2026-01-03 18:39:00  2026-01-03T18:39:00    Blå linjen  -28.500000   
2 2026-01-03 18:39:00  2026-01-03T18:39:00  Gröna linjen  751.384615   

     delay_30  delay_current       day  day_encoded  line_encoded  
0  -15.000000     -22.000000  Saturday            2             2  
1  -30.833333     -31.666667  Saturday            2             0  
2  224.384615      80.566667  Saturday            2             1  


In [7]:
# Columns handed to the model: the two encoded categoricals followed by the
# three delay features.
model_columns = ["line_encoded", "day_encoded", "delay_60", "delay_30", "delay_current"]
delay_columns = model_columns[2:]

inference_df = latest_df.loc[:, model_columns].copy()
# Rows missing any delay feature are excluded from the model input
X = inference_df.dropna(subset=delay_columns).reset_index(drop=True)
print(X)

   line_encoded  day_encoded    delay_60    delay_30  delay_current
0             2            2   -8.040000  -15.000000     -22.000000
1             0            2  -28.500000  -30.833333     -31.666667
2             1            2  751.384615  224.384615      80.566667


### Make predictions

In [8]:
# Predict with the model loaded from the model registry, using the rows of X
# as input; `preds` is consumed by the next cell when building the result.
preds = xgb_model.predict(X)
print(preds)

[-22.617893 -35.29938   41.85341 ]


In [9]:
# Build the monitoring frame from exactly the rows that were scored.
# The model input drops rows with a missing delay feature, so the predictions
# must be paired with the same filtered rows; assigning them to the unfiltered
# frame fails with a length mismatch as soon as one line has incomplete data.
delay_cols = ["delay_60", "delay_30", "delay_current"]
scored_df = latest_df.dropna(subset=delay_cols).reset_index(drop=True)
if len(scored_df) != len(preds):
    raise ValueError(
        f"Got {len(preds)} predictions for {len(scored_df)} complete feature rows; "
        "re-run the feature preparation and prediction cells."
    )

result = pd.DataFrame({
    "line": line_encoder.inverse_transform(scored_df["line_encoded"]),
    "timestamp": scored_df["timestamp"],
    # String form of the timestamp, used as part of the primary key on upload
    "timestamp_pk": scored_df["timestamp"].dt.strftime("%Y-%m-%dT%H:%M:%S"),
    "prediction": preds,
    # Delay observed at prediction time, stored alongside the prediction
    "delay_hind": scored_df["delay_current"],
})
print(result)



           line           timestamp         timestamp_pk  prediction  \
0   Röda linjen 2026-01-03 18:39:00  2026-01-03T18:39:00  -22.617893   
1    Blå linjen 2026-01-03 18:39:00  2026-01-03T18:39:00  -35.299381   
2  Gröna linjen 2026-01-03 18:39:00  2026-01-03T18:39:00   41.853409   

   delay_hind  
0  -22.000000  
1  -31.666667  
2   80.566667  


### Upload predictions to Hopsworks

In [None]:
# Get (or create) the monitoring feature group and insert the new predictions,
# retrying on any failure. A retry repeats both the lookup and the insert.
max_retries = 3
last_error = None
for attempt in range(1, max_retries + 1):
    try:
        monitor_fg = fs.get_or_create_feature_group(
            name='sl_prediction',
            description='SL metro lines prediction monitoring',
            version=1,
            primary_key=['line', 'timestamp_pk'],
            event_time='timestamp',
        )
        monitor_fg.insert(result, wait=True)
        break
    except Exception as e:
        # Keep the exception: the name `e` is unbound once the except block ends
        last_error = e
        # Only announce a retry and wait when another attempt will follow
        if attempt < max_retries:
            print(f'Error {e}, retrying in 1 second')
            time.sleep(1)
else:
    # All attempts failed: chain the last error so the root cause is in the traceback
    raise RuntimeError('Failed to insert to monitor feature group') from last_error

### Get all predictions and update plots

In [None]:
# Read back every stored prediction for plotting, retrying on any failure.
# Produces `predictions_df` for the plotting cell.
max_retries = 3
last_error = None
for attempt in range(1, max_retries + 1):
    try:
        predictions_fg = fs.get_feature_group(
            name="sl_prediction",
            version=1
        )
        predictions_df = predictions_fg.read()
        break
    except Exception as e:
        # Keep the exception: the name `e` is unbound once the except block ends
        last_error = e
        # Only announce a retry and wait when another attempt will follow
        if attempt < max_retries:
            print(f'Error {e}, retrying in 1 second')
            time.sleep(1)
else:
    # All attempts failed: chain the last error so the root cause is in the traceback
    raise RuntimeError('Failed to read predictions feature group') from last_error

Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (1.12s) 


In [None]:
# Regenerate both plot variants: first with hindcast=True, then with hindcast=False
images_path = Path('docs/images/')
for hindcast in (True, False):
    plot_metro_delay_predictions(predictions_df, images_path, hindcast=hindcast)


