# Example to train a model
Using the openstf tasks

In [1]:
import pandas as pd
import IPython
from openstef.pipeline.train_model import train_model_pipeline
from openstef.pipeline.create_forecast import create_forecast_pipeline



In [2]:
# define properties of training/prediction. We call this a 'prediction_job'
pj=dict(id=287,
        model='xgb',
        quantiles=[10,30,50,70,90],
        hyper_params={}, # Note, this should become optional
        feature_names=None, # Note, this should become optional
       )

# Load input data
input_data = pd.read_csv('data/get_model_input_pid_287.csv', index_col='index', parse_dates=True)

# Split in training and forecasting data
train_data = input_data.iloc[:-200,:] # everything except last 200 rows (~ 48 hours)
to_forecast_data = input_data.iloc[:-200,:] # last 200 rows


In [3]:
train_data.head()

Unnamed: 0_level_0,load,APX,clouds,radiation,temp,winddeg,windspeed,windspeed_100m,pressure,humidity,...,sjv_E1A,sjv_E1B,sjv_E1C,sjv_E2A,sjv_E2B,sjv_E3A,sjv_E3B,sjv_E3C,sjv_E3D,sjv_E4A
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-10-02 09:45:00+00:00,3.31,,,,,,,,,,...,,,,,,,,,,
2020-10-02 10:00:00+00:00,2.62,34.0,99.758911,1552899.0,16.449036,154.711456,3.527778,9.349441,99453.476562,0.68624,...,3.1e-05,3e-05,2.9e-05,3.3e-05,3.2e-05,6.1e-05,4.8e-05,4.8e-05,3.1e-05,0.0
2020-10-02 10:15:00+00:00,0.796667,34.0,99.819193,1575618.0,16.400948,157.491554,3.557639,9.232026,99416.363281,0.68378,...,3.2e-05,3e-05,2.9e-05,3.3e-05,3.2e-05,6e-05,4.8e-05,4.8e-05,3.1e-05,0.0
2020-10-02 10:30:00+00:00,0.3,34.0,99.879475,1598338.0,16.352859,160.271652,3.5875,9.114612,99379.25,0.681319,...,3.2e-05,3.1e-05,2.9e-05,3.3e-05,3.1e-05,5.8e-05,4.8e-05,4.8e-05,3.1e-05,0.0
2020-10-02 10:45:00+00:00,1.773333,34.0,99.939756,1594736.0,16.304771,163.05175,3.617361,8.997197,99342.136719,0.678859,...,3.2e-05,3e-05,2.9e-05,3.2e-05,3.1e-05,5.7e-05,4.8e-05,4.8e-05,3.1e-05,0.0


In [4]:
to_forecast_data.head()

Unnamed: 0_level_0,load,APX,clouds,radiation,temp,winddeg,windspeed,windspeed_100m,pressure,humidity,...,sjv_E1A,sjv_E1B,sjv_E1C,sjv_E2A,sjv_E2B,sjv_E3A,sjv_E3B,sjv_E3C,sjv_E3D,sjv_E4A
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-10-02 09:45:00+00:00,3.31,,,,,,,,,,...,,,,,,,,,,
2020-10-02 10:00:00+00:00,2.62,34.0,99.758911,1552899.0,16.449036,154.711456,3.527778,9.349441,99453.476562,0.68624,...,3.1e-05,3e-05,2.9e-05,3.3e-05,3.2e-05,6.1e-05,4.8e-05,4.8e-05,3.1e-05,0.0
2020-10-02 10:15:00+00:00,0.796667,34.0,99.819193,1575618.0,16.400948,157.491554,3.557639,9.232026,99416.363281,0.68378,...,3.2e-05,3e-05,2.9e-05,3.3e-05,3.2e-05,6e-05,4.8e-05,4.8e-05,3.1e-05,0.0
2020-10-02 10:30:00+00:00,0.3,34.0,99.879475,1598338.0,16.352859,160.271652,3.5875,9.114612,99379.25,0.681319,...,3.2e-05,3.1e-05,2.9e-05,3.3e-05,3.1e-05,5.8e-05,4.8e-05,4.8e-05,3.1e-05,0.0
2020-10-02 10:45:00+00:00,1.773333,34.0,99.939756,1594736.0,16.304771,163.05175,3.617361,8.997197,99342.136719,0.678859,...,3.2e-05,3e-05,2.9e-05,3.2e-05,3.1e-05,5.7e-05,4.8e-05,4.8e-05,3.1e-05,0.0


# Train a model
Train a model using the high-level pipeline. Store the model and reports on training proces in ./trained_models

In [6]:
train_model_pipeline(
    pj,
    train_data,
    check_old_model_age=False,
    trained_models_folder='./trained_models',
    )

2022-02-09 17:32.30 [debug    ] MLflow path at init= file:///C:/repos/openstf-offline-example/examples/trained_models/mlruns


2022/02/09 17:32:30 INFO mlflow.tracking.fluent: Experiment with name '287' does not exist. Creating a new experiment.


2022-02-09 17:32.30 [info     ] No previous model found in MLflow pid=287
2022-02-09 17:32.31 [info     ] Found 2 values of constant load (repeated values), converted to NaN value. cleansing_step=repeated_values frac_values=0.00017666283897182228 num_values=2 pj_id=287
2022-02-09 17:32.31 [info     ] Removed 2 NaN values           num_removed_values=2
2022-02-09 17:32.37 [info     ] No previous model found in MLflow pid=287




2022-02-09 17:32.42 [info     ] Model saved with MLflow        pid=287
2022-02-09 17:32.44 [info     ] logged figures to MLflow
2022-02-09 17:32.44 [debug    ] MLflow path after saving= file:///C:/repos/openstf-offline-example/examples/trained_models/mlruns
2022-02-09 17:32.45 [info     ] Stored report to disk: trained_models\287


You can find the trained model in ./trained_models, along with reports on the training process

In [15]:
## Inspect local files
IPython.display.HTML(f"<iframe src=./trained_models/{pj['id']}/Predictor0.25.html width=800 height=400></iframe>"
                     f"<iframe src=./trained_models/{pj['id']}/Predictor47.0.html width=800 height=400></iframe>"
                     f"<iframe src=./trained_models/{pj['id']}/weight_plot.html width=800 height=400></iframe>")