# Inference Pipeline
Predicts delays for the next hour given the delays from the last three 30-minute intervals.

### Imports

In [None]:
from datetime import date
import hopsworks
import os
import pandas as pd
from pathlib import Path
import sys
import joblib

# Start from the directory the notebook was launched in.
launch_dir = Path().absolute()
# Strip a trailing 'pipeline' component, then a trailing 'src' component,
# so the working directory ends up above them.
for nested_dir in ('pipeline', 'src'):
    if launch_dir.name == nested_dir:
        launch_dir = launch_dir.parent
root_dir = str(launch_dir)

os.chdir(root_dir)
print(f"Root dir: {Path.cwd()}")

### Connect to Hopsworks

In [None]:
# Name of the Hopsworks project to log in to. Set it to None to log in
# without naming a project (use this when the project is your main project).
project_name = 'metro_delay_prediction'
project = (
    hopsworks.login(project=project_name)
    if project_name
    else hopsworks.login()
)

fs = project.get_feature_store()

### Retrieve Model from Model Registry and load model + artifacts

In [None]:
mr = project.get_model_registry()
retrieved_model = mr.get_model(name='xgb_regressor', version=2)

# The value returned by download() is used below as the directory that
# contains the model file and the two encoder files.
model_dir = retrieved_model.download()


def _load_artifact(filename):
    """Load one joblib-serialized file from the downloaded model directory."""
    return joblib.load(f'{model_dir}/{filename}')


xgb_model = _load_artifact('model.joblib')
line_encoder = _load_artifact('line_encoder.pkl')
day_encoder = _load_artifact('day_encoder.pkl')

### Fetch features and take most recent data

In [None]:
# Look up version 1 of the delay feature group.
features_fg = fs.get_feature_group(name="delay_features_fg", version=1)

# Read the feature group with online=True.
# NOTE(review): presumably this reads from the online feature store -- confirm.
features_df = features_fg.read(online=True)

In [None]:
# Order rows by timestamp, then keep the final row of each "line" group,
# i.e. the row with the largest timestamp for that line.
chronological = features_df.sort_values("timestamp")
newest_per_line = chronological.groupby("line").tail(1)
latest_df = newest_per_line.reset_index(drop=True)
print(latest_df)

### Encode non-numerical features and set up data for inference

In [None]:
# Derive the weekday name from each row's timestamp.
latest_df["day"] = latest_df["timestamp"].dt.day_name()

# Encode "day" and then "line" with the encoders loaded alongside the model,
# storing the results as "day_encoded" and "line_encoded".
for source_column, encoder in (("day", day_encoder), ("line", line_encoder)):
    latest_df[f"{source_column}_encoded"] = encoder.transform(latest_df[source_column])

print(latest_df.head())

In [None]:
# Columns selected as model input (same columns and order as before).
feature_columns = ["line_encoded", "day_encoded", "delay_60", "delay_30", "delay_current"]
# A row is only usable for inference when all three delay readings are present.
delay_columns = ["delay_60", "delay_30", "delay_current"]

# Drop incomplete rows from inference_df itself, not only from X. The
# predictions are later assigned to inference_df["prediction"], which fails
# with a length mismatch if inference_df still holds rows that were removed
# from X. The original index is kept on inference_df so that the later
# index-aligned assignment of latest_df["timestamp"] picks the right rows.
inference_df = latest_df[feature_columns].dropna(subset=delay_columns).copy()

# X holds the same rows in the same order, with a fresh 0..n-1 index.
X = inference_df.reset_index(drop=True)
print(X)

### Make predictions

In [None]:
# Run the loaded model on the prepared feature frame X. The result is
# stored in inference_df["prediction"] in a later cell.
preds = xgb_model.predict(X)
print(preds)

In [None]:
# Map the encoded line values back to their original labels.
decoded_lines = line_encoder.inverse_transform(inference_df["line_encoded"])
inference_df["line"] = decoded_lines
inference_df["prediction"] = preds

# Take the timestamp from latest_df (assignment aligns on the index), then
# derive a string version of it that is used as part of the primary key.
inference_df["timestamp"] = latest_df["timestamp"]
timestamp_text = inference_df["timestamp"].dt.strftime("%Y-%m-%dT%H:%M:%S")
inference_df["timestamp_pk"] = timestamp_text

# Keep only the columns that are uploaded.
output_columns = ["line", "timestamp", "timestamp_pk", "prediction"]
result = inference_df[output_columns]
print(result)

### Upload predictions to Hopsworks

In [None]:
# Settings of the feature group that receives the predictions.
monitor_fg_spec = {
    'name': 'sl_prediction',
    'version': 1,
    'description': 'SL metro lines prediction monitoring',
    'primary_key': ['line', 'timestamp_pk'],
    'event_time': 'timestamp',
}
monitor_fg = fs.get_or_create_feature_group(**monitor_fg_spec)

# NOTE(review): wait=True presumably blocks until the insert has finished --
# confirm against the Hopsworks documentation.
monitor_fg.insert(result, wait=True)