# Inference Pipeline
Predicts delays for the next hour given the delays from the last three 30-minute intervals.

### Imports

In [12]:
from datetime import date
import hopsworks
import os
import pandas as pd
from pathlib import Path
import sys
import joblib
import time

# Start from the directory the notebook was launched in and climb out of a
# trailing 'pipeline' directory and then a trailing 'src' directory, if present.
launch_dir = Path().absolute()
for nested_dir in ('pipeline', 'src'):
    if launch_dir.name == nested_dir:
        launch_dir = launch_dir.parent
root_dir = str(launch_dir)

# Make the resolved root the working directory and report it.
os.chdir(root_dir)
print(f"Root dir: {Path.cwd()}")

from src.data_utils.plots import *

Root dir: /Users/serkan/ID2223-project


### Connect to Hopsworks

In [13]:
# Name of the Hopsworks project to log in to. Set it to None if the project is
# your main project, in which case login() is called without a project name.
project_name = 'metro_delay_prediction'
max_retries = 3

# Only pass `project` when a name is configured.
login_kwargs = {'project': project_name} if project_name else {}

# Python unbinds `e` when the except block exits, so remember the last failure
# explicitly in order to chain it onto the final error.
last_error = None
for attempt in range(1, max_retries + 1):
    try:
        project = hopsworks.login(**login_kwargs)
        fs = project.get_feature_store()
        break
    except Exception as e:
        last_error = e
        if attempt < max_retries:
            print(f'Error {e}, retrying in 1 second')
            time.sleep(1)
        else:
            # No further attempt follows: do not sleep or announce a retry.
            print(f'Error {e}, giving up after {max_retries} attempts')
else:
    # Chain the last failure so the traceback shows the underlying cause.
    raise RuntimeError('Failed to connect to hopsworks') from last_error

2026-01-05 16:19:12,155 INFO: Closing external client and cleaning up certificates.
Connection closed.
2026-01-05 16:19:12,160 INFO: Initializing external client
2026-01-05 16:19:12,160 INFO: Base URL: https://c.app.hopsworks.ai:443
To ensure compatibility please install the latest bug fix release matching the minor version of your backend (4.2) by running 'pip install hopsworks==4.2.*'







2026-01-05 16:19:14,054 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1330333


### Retrieve Model from Model Registry and load model + artifacts

In [14]:
max_retries = 3

# Python unbinds `e` when the except block exits, so remember the last failure
# explicitly in order to chain it onto the final error.
last_error = None
for attempt in range(1, max_retries + 1):
    try:
        mr = project.get_model_registry()
        retrieved_model = mr.get_model(name='MLP', version=1)

        # Download the registered model's files; the returned location is
        # used as the directory holding the model and both encoders.
        model_dir = retrieved_model.download()
        artifact_dir = Path(model_dir)

        # NOTE(review): joblib.load unpickles its input, which can execute
        # arbitrary code. Only load artifacts from a registry you trust.
        MLP = joblib.load(artifact_dir / 'model.joblib')
        line_encoder = joblib.load(artifact_dir / 'line_encoder.pkl')
        day_encoder = joblib.load(artifact_dir / 'day_encoder.pkl')
        break
    except Exception as e:
        last_error = e
        if attempt < max_retries:
            print(f'Error {e}, retrying in 1 second')
            time.sleep(1)
        else:
            # No further attempt follows: do not sleep or announce a retry.
            print(f'Error {e}, giving up after {max_retries} attempts')
else:
    # Chain the last failure so the traceback shows the underlying cause.
    raise RuntimeError('Failed to get model and artifacts.') from last_error

Downloading: 0.000%|          | 0/125071 elapsed<00:00 remaining<?

Downloading model artifact (0 dirs, 1 files)... 

Downloading: 0.000%|          | 0/517 elapsed<00:00 remaining<?

Downloading model artifact (0 dirs, 2 files)... 

Downloading: 0.000%|          | 0/21293 elapsed<00:00 remaining<?

Downloading model artifact (1 dirs, 3 files)... 

Downloading: 0.000%|          | 0/579 elapsed<00:00 remaining<?

Downloading model artifact (1 dirs, 4 files)... DONE

### Fetch features and take most recent data

In [15]:
max_retries = 3

# Python unbinds `e` when the except block exits, so remember the last failure
# explicitly in order to chain it onto the final error.
last_error = None
for attempt in range(1, max_retries + 1):
    try:
        features_fg = fs.get_feature_group(
            name="features_fg",
            version=1
        )
        # Read the feature group with online=True into a DataFrame.
        features_df = features_fg.read(online=True)
        break
    except Exception as e:
        last_error = e
        if attempt < max_retries:
            print(f'Error {e}, retrying in 1 second')
            time.sleep(1)
        else:
            # No further attempt follows: do not sleep or announce a retry.
            print(f'Error {e}, giving up after {max_retries} attempts')
else:
    # Chain the last failure so the traceback shows the underlying cause.
    raise RuntimeError('Failed to get features') from last_error

In [16]:
# Keep only the newest row of every metro line: order the rows by timestamp,
# take the final row of each `line` group, then renumber the index from zero.
chronological_df = features_df.sort_values("timestamp")
newest_per_line = chronological_df.groupby("line").tail(1)
latest_df = newest_per_line.reset_index(drop=True)
print(latest_df)

            timestamp        timestamp_str          line   delay_60  \
0 2026-01-04 15:42:00  2026-01-04T15:42:00    Blå linjen -38.000000   
1 2026-01-04 15:42:00  2026-01-04T15:42:00  Gröna linjen  53.451613   
2 2026-01-04 15:42:00  2026-01-04T15:42:00   Röda linjen  -1.200000   

    delay_30  delay_current  
0 -24.916667         -32.25  
1  54.903226          51.10  
2  -8.200000         -13.92  


### Encode non-numerical features and set up data for inference

In [17]:
# Add the weekday name of each timestamp, then add the encoded weekday and the
# encoded line produced by the two encoders' transform() calls.
weekday_names = latest_df["timestamp"].dt.day_name()
latest_df["day"] = weekday_names
for encoded_col, encoder, source_col in (
    ("day_encoded", day_encoder, "day"),
    ("line_encoded", line_encoder, "line"),
):
    latest_df[encoded_col] = encoder.transform(latest_df[source_col])
print(latest_df.head())





            timestamp        timestamp_str          line   delay_60  \
0 2026-01-04 15:42:00  2026-01-04T15:42:00    Blå linjen -38.000000   
1 2026-01-04 15:42:00  2026-01-04T15:42:00  Gröna linjen  53.451613   
2 2026-01-04 15:42:00  2026-01-04T15:42:00   Röda linjen  -1.200000   

    delay_30  delay_current     day  day_encoded  line_encoded  
0 -24.916667         -32.25  Sunday            3             0  
1  54.903226          51.10  Sunday            3             1  
2  -8.200000         -13.92  Sunday            3             2  


In [18]:
# Columns handed to the model: the two encoded categoricals followed by the
# three delay features.
delay_features = ["delay_60", "delay_30", "delay_current"]
model_columns = ["line_encoded", "day_encoded", *delay_features]

# Work on an independent copy so later edits do not touch latest_df.
inference_df = latest_df[model_columns].copy()

# Rows with any missing delay feature are excluded from the model input, and
# the remaining rows are renumbered from zero.
complete_rows = inference_df.dropna(subset=delay_features)
X = complete_rows.reset_index(drop=True)
print(X)

   line_encoded  day_encoded   delay_60   delay_30  delay_current
0             0            3 -38.000000 -24.916667         -32.25
1             1            3  53.451613  54.903226          51.10
2             2            3  -1.200000  -8.200000         -13.92


### Make predictions

In [19]:
# Run the loaded model on the prepared input, then cast its output to float32.
raw_preds = MLP.predict(X)
preds = raw_preds.astype("float32")
print(preds)




[-30.356695  51.729263  -9.12046 ]


In [None]:
# `preds` was computed on X, i.e. on the rows of inference_df that have all
# three delay features present. Assigning it to the unfiltered inference_df
# raises a length-mismatch ValueError as soon as one row was dropped, so the
# result is assembled from the scored rows only.
delay_features = ["delay_60", "delay_30", "delay_current"]

# dropna keeps the original index labels, so index-aligned assignments from
# latest_df below still pick the matching rows. The copy avoids chained-
# assignment warnings when columns are added.
scored_df = inference_df.dropna(subset=delay_features).copy()
if len(scored_df) != len(preds):
    raise ValueError(
        f'Got {len(preds)} predictions for {len(scored_df)} scored rows; '
        'rebuild X and preds before assembling the result'
    )

scored_df["line"] = line_encoder.inverse_transform(scored_df["line_encoded"])
# preds is a plain array: it is assigned by position, in the same row order as X.
scored_df["prediction"] = preds
scored_df["timestamp"] = latest_df["timestamp"]
# String form of the timestamp, used as part of the primary key on upload.
scored_df["timestamp_pk"] = scored_df["timestamp"].dt.strftime("%Y-%m-%dT%H:%M:%S")
scored_df["delay_hind"] = latest_df["delay_current"]

result = scored_df[
    ["line", "timestamp", "timestamp_pk", "prediction", "delay_hind"]
].reset_index(drop=True)
print(result)

### Upload predictions to Hopsworks

In [None]:
max_retries = 3

# Python unbinds `e` when the except block exits, so remember the last failure
# explicitly in order to chain it onto the final error.
last_error = None
for attempt in range(1, max_retries + 1):
    try:
        # Fetch the monitoring feature group, creating it if it does not exist.
        monitor_fg = fs.get_or_create_feature_group(
            name='sl_prediction',
            description='SL metro lines prediction monitoring',
            version=1,
            primary_key=['line', 'timestamp_pk'],
            event_time='timestamp',
        )
        monitor_fg.insert(result, wait=True)
        break
    except Exception as e:
        last_error = e
        if attempt < max_retries:
            print(f'Error {e}, retrying in 1 second')
            time.sleep(1)
        else:
            # No further attempt follows: do not sleep or announce a retry.
            print(f'Error {e}, giving up after {max_retries} attempts')
else:
    # Chain the last failure so the traceback shows the underlying cause.
    raise RuntimeError('Failed to insert to monitor feature group') from last_error

### Get all predictions and update plots

In [None]:
max_retries = 3

# Python unbinds `e` when the except block exits, so remember the last failure
# explicitly in order to chain it onto the final error.
last_error = None
for attempt in range(1, max_retries + 1):
    try:
        predictions_fg = fs.get_feature_group(
            name="sl_prediction",
            version=1
        )
        # Read the whole prediction feature group into a DataFrame.
        predictions_df = predictions_fg.read()
        break
    except Exception as e:
        last_error = e
        if attempt < max_retries:
            print(f'Error {e}, retrying in 1 second')
            time.sleep(1)
        else:
            # No further attempt follows: do not sleep or announce a retry.
            print(f'Error {e}, giving up after {max_retries} attempts')
else:
    # Chain the last failure so the traceback shows the underlying cause.
    raise RuntimeError('Failed to read predictions feature group') from last_error

In [None]:
# Call the plotting helper twice with the same data and target directory:
# first with hindcast=True, then with hindcast=False.
images_path = Path('docs/images/')
for hindcast_flag in (True, False):
    plot_metro_delay_predictions(predictions_df, images_path, hindcast=hindcast_flag)