# Interface for Pipeline Execution 
## (Simulated Data from SARIMAX Process)

## 1) Import Models, Metrics, Paths, and Functions

In [1]:
from utils.helpers import csv_reader
from utils.paths import *
from pipeline.run_pipeline import run_pipeline


from models import MODELS
from metrics import METRICS


Loading paths...
Loading helper functions...
Loading data transformers...
Loading models...
Loading metrics...


In [2]:
# print(METRICS)
# print(MODELS)

## 2) Select Input Data 

In [3]:
# Read input data: load the 'noisy_simdata' dataset from SIMDATA_DIR
# (SIMDATA_DIR is presumably provided by the `utils.paths` star-import — confirm)
df = csv_reader(SIMDATA_DIR, 'noisy_simdata')
# Print the first rows as a quick sanity check of the loaded columns
print(df.head())

                    y          x1         x2          x3
Date                                                    
2004-01-01  50.840469  427.595799  55.337904  900.325291
2004-02-01  52.871538  434.062163  54.959155  900.775888
2004-03-01  53.769316  453.264284  56.470633  899.510058
2004-04-01  57.672973  459.367523  56.704233  903.524834
2004-05-01  57.182051  462.354356  61.557907  905.071762


In [4]:
# ## Using the EUR-USD Exchange Rate data
# 
# df = csv_reader(TESTDATA_DIR, 'eurusd', columns=['datetime', 'bid_close'])
# import sys
# import warnings
# # warnings( sys.__stdout__())
# 
# #sys.jupyter_stdout = sys.__stdout__
# print("hey")
# print(df.head())
# 
# # For testing the pipeline, I use a small subset for now:
# df = df.iloc[:10000, :]

## 3) Run Pipeline 
##    ... on Simulated Data

In [5]:
# Run the full pipeline on the loaded data with every registered model and
# metric; the results are collected in `output_dict`.
output_dict = run_pipeline(
    df=df,
    models=MODELS,
    metrics=METRICS,
    # filtering the first 10 years of data
    start="2004-01-01",
    end="2013-12-31",
    verbose=True,
)

[2024-03-07 03:03] Starting  Pipeline...[0m
[33;1m
== Pipeline Step 1: Data Preprocessing ==
[0m
[33;1mSearching for time information...[0m
[33;1mDates found in 'index' column![0m
[33;1mInferred frequency: month start[0m
[33;1mData from goes from 2004-01-01 to 2013-12-01, resulting in 120 observations.
[0m
[33;1mSelecting target and covariates...[0m
[33;1mTarget: y[0m
[33;1mCovariates: x1, x2, x3[0m
[33;1m
Data Insight:
[0m
[33;1m                    y          x1         x2          x3
Date                                                    
2004-01-01  50.840469  427.595799  55.337904  900.325291
2004-02-01  52.871538  434.062163  54.959155  900.775888
2004-03-01  53.769316  453.264284  56.470633  899.510058
2004-04-01  57.672973  459.367523  56.704233  903.524834
2004-05-01  57.182051  462.354356  61.557907  905.071762[0m
[33;1m[Time elapsed: 00s]
[0m
[33;1m
== Pipeline Step 2: Individual Models' Predictions ==
[0m
[33;1mSplitting data (train/test ratio: 30/

## 4) Show Ranking Table

In [6]:
import pandas as pd

# Display floats with two decimal places (reset again at the end of this cell)
pd.options.display.float_format = '{:.2f}'.format
# Format performance metric values
def format_numeric(val):
    """Return floats as a string with three decimals; pass anything else through.

    Args:
        val: A single table cell value.

    Returns:
        ``val`` rendered with three decimal places as ``str`` when it is a
        ``float`` instance, otherwise ``val`` itself, unchanged.
    """
    # Non-float cells (ints, strings, ...) are returned untouched
    if not isinstance(val, float):
        return val
    return f'{val:.3f}'
# Apply the cell formatter to every entry of the metrics ranking table.
# DataFrame.applymap was deprecated in pandas 2.1 in favour of DataFrame.map,
# so use the new name when the installed pandas provides it and fall back to
# applymap on older versions.
ranking_table = output_dict['metrics ranking']
if hasattr(ranking_table, 'map'):
    formatted_metrics = ranking_table.map(format_numeric)
else:
    formatted_metrics = ranking_table.applymap(format_numeric)

# Render the styled table with the index hidden
display(formatted_metrics.style.hide())
# Put the float display option back to the pandas default
pd.reset_option('display.float_format')

Model,MAE,RMSE,MAPE,sMAPE,MAE Ranking,RMSE Ranking,MAPE Ranking,sMAPE Ranking
Weighted Ensemble: Simple,1.961,2.595,0.027,1.839,1,2,1,1
Weighted Ensemble: Inverse Variance,1.967,2.596,0.027,1.845,3,3,2,3
Weighted Ensemble: Inverse RMSE,1.962,2.579,0.027,1.844,2,1,3,2
Exponential Smoothing,2.059,2.668,0.029,1.934,4,4,4,4
AutoSARIMA,2.087,2.689,0.029,1.953,5,5,5,5
Naive,2.095,2.717,0.029,1.972,6,6,6,6
Naive (drift),2.122,2.718,0.03,1.995,7,7,7,7
Meta Ensemble: RandomForest,2.195,2.771,0.031,2.058,8,8,8,8
Meta Ensemble: Ridge,2.217,2.882,0.031,2.087,9,9,9,9
Weighted Ensemble: Inverse Error Covariance,2.233,2.958,0.031,2.103,10,10,10,10
