## TOC:
* [Datasets](#datasets-bullet)
* [Logging Settings](#logging-bullet)
* [Request](#requests-bullet)
* [Forecast](#forecast-bullet)
* [Save Results](#results-bullet)

Additional to train api testing notebook features, this notebook contains 
- train test split for evaluation,
- forecasting to see results,
- metrics for forecasted period

In [1]:
import json
import yaml
import requests
import getpass
import os
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

sns.set()

## Datasets <a class="anchor" id="datasets-bullet"></a>

In [30]:
testing_datasets = [
    {
        "file_name": "../datasets/retail/retail_sales.csv",
        "format": "csv",
        "time_col_index": 0,
        "target_col_index": -1,
        "exovar": False,
    },
    {
        "file_name": "../datasets/finance/Returns_short_interest_data_train.csv",
        "format": "csv",
        "time_col_index": 0,
        "target_col_index": -1,
        "exovar": False,
    },
]

In [31]:
TEST_PERIOD=7

In [32]:
def prepare_dataset_forecast(dataset_info, test_period=TEST_PERIOD):
    if dataset_info["format"] == "csv":
        train_df = pd.read_csv(dataset_info["file_name"])
        time_col = train_df.columns[dataset_info["time_col_index"]]
        target_col = train_df.columns[dataset_info["target_col_index"]]
        train_df[time_col] = train_df[time_col].astype(str)
        train_df[target_col] = train_df[target_col].astype(float)
        train_data = []
        for value in train_df.iloc[:, [dataset_info["time_col_index"],
                                       dataset_info["target_col_index"]]].values:
            train_data.append(list(value))
    return train_data[:-test_period], train_data[-test_period:]

## Logging Settings <a class="anchor" id="logging-bullet"></a>

In [33]:
LOG_RESULTS = True

In [34]:
user_name = getpass.getuser()

In [35]:
output_file_name = "./test_logs/test_logs_forecast_{}.csv".format(user_name)
print(output_file_name)

column_names = ["file_name", "model_request", "user", "time", "response", 
                "type", "metrics", "model", "test_data"]
log_df = pd.DataFrame(columns=column_names)

./test_logs/test_logs_forecast_studio-lab-user.csv


## Request <a class="anchor" id="requests-bullet"></a>

Define Model Request (optional)

In [36]:
# Customized model request
model_request = {
    "type": "meta_lr",  # 'meta_wa'
    "scorers": ["smape", "mape"],
    "params": {
        "preprocessors": [
            {"type": "dartsimputer", "params": {"strategy": "mean"}},
            # {'type': 'simpleimputer', 'params': {'strategy': 'mean'}},
            {"type": "minmaxscaler"},
        ],
        "base_models": [
            {"type": "darts_naive"},
            {"type": "darts_seasonalnaive"},
            {"type": "darts_autotheta"},
            {"type": "darts_autoets"},
            {"type": "darts_autoarima"},
            #{"type": "darts_tbats"},
            #{"type": "darts_linearregression"},
            # {'type': 'darts_lightgbm'},
            # {'type': 'darts_rnn'},
        ],
    },
}

In [37]:
with open("url.yaml", "r") as file:
    url_dict = yaml.safe_load(file)

# URL to our SYBIL AWS service
protocol = url_dict["protocol"]
host = url_dict["host"]
port = url_dict["port"]
endpoint = "train"

url = "%s://%s:%s/%s" % (protocol, host, str(port), endpoint)

In [38]:
for dataset in testing_datasets:
    train_data, test_data = prepare_dataset_forecast(dataset)
    api_json = {
        "data": train_data,
        "model": model_request,  # (optional) can be commented out
    }
    start_time = time.time()
    response = requests.post(url, json=api_json)
    exc_time = time.time() - start_time
    model_req = api_json["model"] if "model" in api_json.keys() else "default"
    new_row = pd.DataFrame(
        [
            {
                "file_name": dataset["file_name"],
                "model_request": model_req,
                "user": user_name,
                "time": exc_time,
                "response": response.status_code,
                "type": response.json()['type'],
                "metrics": response.json()['metrics'],
                "model": response.json()['model'],
                "test_data": test_data
            }
        ]
    )
    log_df = pd.concat([log_df, new_row], ignore_index=True)
    print(response)

<Response [200]>
<Response [200]>


In [39]:
log_df

Unnamed: 0,file_name,model_request,user,time,response,type,metrics,model,test_data
0,../datasets/retail/retail_sales.csv,"{'type': 'meta_lr', 'scorers': ['smape', 'mape...",studio-lab-user,4.940552,200,meta_lr,"[{'type': 'smape', 'value': 38.12746439829832}...",AgEBCEcMAQBADAEAW6UAABgAAABQpQAAFRcAAD+AAGVsaZ...,"[[2015-11-01, 444507.0], [2015-12-01, 518253.0..."
1,../datasets/finance/Returns_short_interest_dat...,"{'type': 'meta_lr', 'scorers': ['smape', 'mape...",studio-lab-user,13.911756,200,meta_lr,"[{'type': 'smape', 'value': 199.99999677569406...",AgEBCFM2AgBQNgIA3m8BABgAAADXbwEA3C0AAD+AAGVsaZ...,"[[2002-06-01, -0.0727656666666666], [2002-07-0..."


## Forecast <a class="anchor" id="forecast-bullet"></a>

In [40]:
responses = []
for idx, row in log_df.iterrows():
    dates = list(np.array(row['test_data'])[:,0])
    model = row['model']
    api_json = {
                    'model': model,
                    'data': dates
                }
    endpoint = 'forecast'
    url = '%s://%s:%s/%s' % (protocol, host, str(port), endpoint)
    response = requests.post(url, json=api_json)
    responses.append(response.json()['data'])

In [41]:
log_df['forecast_data'] = responses

In [42]:
# Comparison of the results
for idx, row in log_df.iterrows():
    comparison_df = pd.DataFrame({'test':list(np.array(row['test_data'])[:,1]),
                'pred':list(np.array(row['forecast_data'])[:,1])})
    display(comparison_df)

Unnamed: 0,test,pred
0,444507.0,538812.0810625734
1,518253.0,554122.3903617455
2,400928.0,535718.6125380868
3,413554.0,543128.8328519018
4,460093.0,558538.4791210323
5,450935.0,539992.9729526193
6,471421.0,547443.67965084


Unnamed: 0,test,pred
0,-0.0727656666666666,2855912.2869539745
1,-0.075387,2855912.291698833
2,0.0057109999999999,2855912.2750650398
3,-0.1103773333333333,2855912.3082263693
4,0.0872823333333333,2855912.297117535
5,0.057994,2855912.28504902
6,-0.0598256666666666,2855912.2891600262


In [43]:
comparison_df['test']

0    -0.0727656666666666
1              -0.075387
2     0.0057109999999999
3    -0.1103773333333333
4     0.0872823333333333
5               0.057994
6    -0.0598256666666666
Name: test, dtype: object

## Save Results <a class="anchor" id="results-bullet"></a>

In [44]:
# Logs
log_df

Unnamed: 0,file_name,model_request,user,time,response,type,metrics,model,test_data,forecast_data
0,../datasets/retail/retail_sales.csv,"{'type': 'meta_lr', 'scorers': ['smape', 'mape...",studio-lab-user,4.940552,200,meta_lr,"[{'type': 'smape', 'value': 38.12746439829832}...",AgEBCEcMAQBADAEAW6UAABgAAABQpQAAFRcAAD+AAGVsaZ...,"[[2015-11-01, 444507.0], [2015-12-01, 518253.0...","[[2015-10-31T00:00:00, 538812.0810625734], [20..."
1,../datasets/finance/Returns_short_interest_dat...,"{'type': 'meta_lr', 'scorers': ['smape', 'mape...",studio-lab-user,13.911756,200,meta_lr,"[{'type': 'smape', 'value': 199.99999677569406...",AgEBCFM2AgBQNgIA3m8BABgAAADXbwEA3C0AAD+AAGVsaZ...,"[[2002-06-01, -0.0727656666666666], [2002-07-0...","[[2002-05-31T00:00:00, 2855912.2869539745], [2..."


In [45]:
if LOG_RESULTS:
    if os.path.exists(output_file_name):
        existing_log_df = pd.read_csv(output_file_name)
        updated_df = pd.concat([existing_log_df, log_df], ignore_index=True)
    else:
        updated_df = log_df
    updated_df.to_csv(output_file_name, index=False)

In [46]:
log_df = pd.read_csv(output_file_name)

In [47]:
log_df

Unnamed: 0,file_name,model_request,user,time,response,type,metrics,model,test_data,forecast_data
0,https://github.com/ourownstory/neuralprophet-d...,"{'type': 'meta_lr', 'scorers': ['smape', 'mape...",studio-lab-user,65.887619,200,meta_lr,"[{'type': 'smape', 'value': 4.667966174549589}...",AgEBCNHWAADQ1gAAUXcAABgAAABMdwAASg4AAD+AAGVsaZ...,"[['1960-06-01', 535.0], ['1960-07-01', 622.0],...","[['1960-05-31T00:00:00', 536.6447784284949], [..."
1,../datasets/air_quality/BeijingPM25_0.csv,"{'type': 'meta_lr', 'scorers': ['smape', 'mape...",studio-lab-user,1156.983846,200,meta_lr,"[{'type': 'smape', 'value': 200.0}, {'type': '...",AgEBCPQYcwAAABAAvUxHADAAAACatg4A9bkBALXbHQBiih...,"[['2014-12-31 17:00:00', 9.0], ['2014-12-31 18...","[['2014-12-31T17:00:00', 244.06093488531084], ..."
2,../datasets/energy/elecdemand_dataset.csv,"{'type': 'meta_lr', 'scorers': ['smape', 'mape...",studio-lab-user,275.278771,200,meta_lr,"[{'type': 'smape', 'value': 200.0}, {'type': '...",AgEBCJJFMAAAABAAGWQlAOBjHgCjng8AtkUAACAAAACSRQ...,"[['2014-12-31 20:30:00', 3.8734], ['2014-12-31...","[['2014-12-31T20:30:00', -502.8264662468482], ..."
3,../datasets/climate/temp_anom_monthly.csv,"{'type': 'meta_lr', 'scorers': ['smape', 'mape...",studio-lab-user,105.654048,200,meta_lr,"[{'type': 'smape', 'value': 200.0}, {'type': '...",AgEBCC2dBQAonQUAXqcDABgAAABVpwMAkmgAAD+AAGVsaZ...,"[['2022-12-01', 0.84], ['2023-01-01', 0.87], [...","[['2022-12-02T00:00:00', -236716.56937655865],..."
4,../datasets/climate/temp_anom_w_forcing.csv,"{'type': 'meta_lr', 'scorers': ['smape', 'mape...",studio-lab-user,12.695486,200,meta_lr,"[{'type': 'smape', 'value': 178.41811091755022...",AgEBCJOtAACQrQAAU4UAABgAAABMhQAAuxAAAD+AAGVsaZ...,"[['2006', 0.66], ['2007', 0.65], ['2008', 0.55...","[['2006-01-02T00:00:00', 89717453190.78818], [..."
5,../datasets/climate/yosemite_temps.csv,"{'type': 'meta_lr', 'scorers': ['smape', 'mape...",studio-lab-user,228.408271,200,meta_lr,"[{'type': 'smape', 'value': 199.99982946699566...",AgEBCI6TMwAAABAA7uofAAu8DwBdiBUAspMDACAAAACOkw...,"[['2017-07-04 23:30:00', 43.6], ['2017-07-04 2...","[['2017-07-04T23:30:00', -47712.19258963689], ..."
6,../datasets/retail/retail_sales.csv,"{'type': 'meta_lr', 'scorers': ['smape', 'mape...",studio-lab-user,4.940552,200,meta_lr,"[{'type': 'smape', 'value': 38.12746439829832}...",AgEBCEcMAQBADAEAW6UAABgAAABQpQAAFRcAAD+AAGVsaZ...,"[['2015-11-01', 444507.0], ['2015-12-01', 5182...","[['2015-10-31T00:00:00', 538812.0810625734], [..."
7,../datasets/finance/Returns_short_interest_dat...,"{'type': 'meta_lr', 'scorers': ['smape', 'mape...",studio-lab-user,13.911756,200,meta_lr,"[{'type': 'smape', 'value': 199.99999677569406...",AgEBCFM2AgBQNgIA3m8BABgAAADXbwEA3C0AAD+AAGVsaZ...,"[['2002-06-01', -0.0727656666666666], ['2002-0...","[['2002-05-31T00:00:00', 2855912.2869539745], ..."


In [28]:
from ast import literal_eval

In [29]:
# Comparison of the results
for idx, row in log_df.iterrows():
    comparison_df = pd.DataFrame({'test':list(np.array(literal_eval(row['test_data']))[:,1]),
                'pred':list(np.array(literal_eval(row['forecast_data']))[:,1])})
    display(comparison_df)

Unnamed: 0,test,pred
0,535.0,536.6447784284949
1,622.0,624.7126156353609
2,606.0,640.3087357562652
3,508.0,528.9142194897604
4,461.0,470.3129232763376
5,390.0,425.8023794460936
6,432.0,472.8051842789944


Unnamed: 0,test,pred
0,9.0,244.06093488531084
1,10.0,244.48628522951088
2,8.0,244.63454529401204
3,10.0,244.8708786939524
4,10.0,245.1138124924792
5,8.0,245.36014990794823
6,12.0,245.59873940475293


Unnamed: 0,test,pred
0,3.8734,-502.8264662468482
1,3.7916,-502.3340654989552
2,3.7248,-502.07583139561393
3,3.7619,-501.8132927970073
4,3.8094,-501.6797112133754
5,4.1359,-501.5344886645544
6,4.217,-501.457731425804


Unnamed: 0,test,pred
0,0.84,-236716.56937655865
1,0.87,-236716.75797421116
2,1.0,-236717.21293754716
3,1.23,-236717.3857011199
4,0.98,-236717.53365027648
5,0.96,-236717.61220792425
6,1.05,-236717.4635986198


Unnamed: 0,test,pred
0,0.66,89717453190.78818
1,0.65,89717453191.64856
2,0.55,89717453192.40302
3,0.66,89717453192.7938
4,0.73,89717453191.89508
5,0.62,89717453192.43274
6,0.66,89717453192.7128


Unnamed: 0,test,pred
0,43.6,-47712.19258963689
1,43.3,-47582.104079052806
2,42.8,-47554.03632862121
3,43.0,-47452.84592115879
4,42.1,-47441.376500979066
5,42.1,-47433.12783724442
6,41.4,-47400.16464453563
