# Training Pipeline
Selects features, creates the training data, then trains and saves the model.

## Imports

In [None]:
from datetime import datetime, timedelta
import pandas as pd
import matplotlib.pyplot as plt
from xgboost import XGBRegressor, plot_importance
from sklearn.metrics import mean_squared_error, r2_score
import hopsworks
import json

In [None]:
import os
from pathlib import Path
import sys
from dotenv import load_dotenv

# Locate the project root: start from the current working directory and step
# up one level for each of 'pipeline' and 'src' when the notebook was launched
# from inside them (checked in that order, at most once each).
root_dir = Path.cwd()
if root_dir.name == 'pipeline':
    root_dir = root_dir.parent
if root_dir.name == 'src':
    root_dir = root_dir.parent
# Keep the final value as a plain string.
root_dir = str(root_dir)

# Run the rest of the notebook from the project root.
os.chdir(root_dir)
print(f"Root dir: {Path.cwd()}")

from datetime import date, timedelta
from src.data_utils.filter import *
from src.data_utils.ingest import *

# Load variables from a local .env file (if present) before reading the key,
# so a key defined only in .env is picked up as well.
load_dotenv()
# hopsworks_key is None when HOPSWORKS_API_KEY is not set anywhere.
hopsworks_key = os.getenv('HOPSWORKS_API_KEY')

## Connect to Hopsworks Feature Store

In [None]:
# Make sure the key read earlier is visible through os.environ before login.
# NOTE(review): hopsworks_key was itself read from the environment, so this
# assignment looks redundant — confirm before removing.
if hopsworks_key is not None:
    os.environ['HOPSWORKS_API_KEY'] = hopsworks_key

# Log in and get a handle to the project's feature store.
# NOTE(review): login() is called without arguments; presumably it picks up
# HOPSWORKS_API_KEY from the environment — confirm against the Hopsworks docs.
project = hopsworks.login()
fs = project.get_feature_store()

# Set up secrets here
# NOTE(review): the name `secrets` would shadow the stdlib `secrets` module if
# that module is ever imported in this notebook.
secrets = hopsworks.get_secrets_api()
# ...
# ...
# ...

In [None]:
# TODO: Retrieve feature groups here (after feature engineering step)

# air_quality_fg = fs.get_feature_group(
#     name='air_quality',
#     version=1,
# )
# weather_fg = fs.get_feature_group(
#     name='weather',
#     version=1,
# )

## Feature View Creation and Retrieval

In [None]:
# selected_features = feature_group_variable.select()... 
# feature_view = fs.get_or_create_feature_view(
# ...,
# ...,
#)

## Split the training data into train / test data sets

In [None]:
# TODO: Set up preliminaries such as dates / hours for train and test

# X_train, X_test, y_train, y_test = feature_view.train_test_split(test_start=test_start)

In [None]:
# X_train # For inspection

## Modeling

In [None]:
# Create the regressor with no explicit hyperparameters (library defaults).
# It stays unfitted until the TODO below is completed; later cells save and
# register this same object.
xgb_regressor = XGBRegressor()

# TODO: Fit model to data
# xgb_regressor.fit(X_features, y_train)

In [None]:
# TODO: Predict target values on the test set
# y_pred = xgb_regressor.predict(X_test_features)

# TODO: calculate MSE
# mse = mean_squared_error(y_test.iloc[:,0], y_pred)
# print("MSE:", mse)

# TODO: calculate R squared using sklearn
# r2 = r2_score(y_test.iloc[:,0], y_pred)
# print("R squared:", r2)

In [None]:
# Local directory that holds the saved model, plus an images/ subdirectory.
# Both names stay plain strings because later cells build paths from them
# with string concatenation.
model_dir = "metro_delay_prediction_model"
images_dir = model_dir + "/images"

# makedirs creates model_dir and images_dir in a single call; exist_ok=True
# makes the cell safe to re-run and avoids the check-then-create race of a
# separate os.path.exists() test.
os.makedirs(images_dir, exist_ok=True)

## Plotting here

In [None]:
# TODO: add plots for hindcasts, feature importance etc here

## Model Registry

In [None]:
# Write the regressor to model.json inside the local model directory.
xgb_regressor.save_model(f"{model_dir}/model.json")

# Evaluation metrics, stringified, keyed by display name.
# NOTE(review): mse and r2 come from the evaluation cell, whose computations
# are still commented-out TODOs — fill those in before running this cell.
res_dict = {
    metric_name: str(metric_value)
    for metric_name, metric_value in (("MSE", mse), ("R squared", r2))
}

In [None]:
# Get a handle to the project's model registry.
mr = project.get_model_registry()

# Describe the registry entry: res_dict supplies the metrics and feature_view
# is passed along with the model.
# NOTE(review): in the visible notebook, feature_view is only assigned in a
# cell that is still a commented-out TODO — complete that cell first.
mdp_model = mr.python.create_model(
    name="metro_delay_prediction_model",
    metrics=res_dict,
    feature_view=feature_view,
    description="metro delay predictor"
)

# Save the entry from the local model directory.
# NOTE(review): presumably this uploads the directory contents (model.json,
# images/) to the registry — confirm against the Hopsworks docs.
mdp_model.save(model_dir)