In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from xgboost import XGBRegressor

import sys
# add the parent folder to the module search path (for this session only; system variables are not changed)
sys.path.append("..")

%matplotlib inline

In [2]:
import mlflow

from modeling.config import EXPERIMENT_NAME
# Read the MLflow tracking URI from the local ".mlflow_uri" file. The context
# manager closes the file handle as soon as the value has been read instead of
# leaving it open until garbage collection.
with open(".mlflow_uri") as uri_file:
    TRACKING_URI = uri_file.read().strip()

# NOTE: everything inside the triple-quoted string below is disabled MLflow
# tracking code; being a bare string literal, none of it is executed. Because
# the string is the last expression of the cell, the notebook echoes its repr
# as the cell output.
# NOTE(review): the block cannot be re-enabled as-is -- the line starting with
# "or parameters of the model" is plain text (not a comment), and the
# `altitude_*` names are not defined in the visible part of this notebook;
# they look like leftovers from another project -- verify before reuse.
"""
# setting the MLFlow connection and experiment
mlflow.set_tracking_uri(TRACKING_URI)
mlflow.set_experiment(EXPERIMENT_NAME)
mlflow.start_run()
run = mlflow.active_run()

or parameters of the model (fit_intercept for Linear Regression model)
mlflow_params = {
      "altitude_low_meters_mean": altitude_low_meters_mean,
      "altitude_high_meters_mean": altitude_high_meters_mean,
      "altitude_mean_log_mean": altitude_mean_log_mean,
      "fit_intercept": True,
  }


# logging params to mlflow
#mlflow.log_params(params)
# setting tags
mlflow.set_tag("model", "XGBoost")
mlflow.set_tag("features", "imbalance price")
# logging metrics
# mlflow.log_metric("train-" + "RMSE", rmse_train)
# mlflow.log_metric("test-" + "RMSE", rmse_test)
# mlflow.log_metric("train-" + "R2", rsquared_train)
# mlflow.log_metric("test-" + "R2", rsquared_test)
# end run
# mlflow.end_run()
"""

'\n# setting the MLFlow connection and experiment\nmlflow.set_tracking_uri(TRACKING_URI)\nmlflow.set_experiment(EXPERIMENT_NAME)\nmlflow.start_run()\nrun = mlflow.active_run()\n\nor parameters of the model (fit_intercept for Linear Regression model)\nmlflow_params = {\n      "altitude_low_meters_mean": altitude_low_meters_mean,\n      "altitude_high_meters_mean": altitude_high_meters_mean,\n      "altitude_mean_log_mean": altitude_mean_log_mean,\n      "fit_intercept": True,\n  }\n\n\n# logging params to mlflow\n#mlflow.log_params(params)\n# setting tags\nmlflow.set_tag("model", "XGBoost")\nmlflow.set_tag("features", "imbalance price")\n# logging metrics\n# mlflow.log_metric("train-" + "RMSE", rmse_train)\n# mlflow.log_metric("test-" + "RMSE", rmse_test)\n# mlflow.log_metric("train-" + "R2", rsquared_train)\n# mlflow.log_metric("test-" + "R2", rsquared_test)\n# end run\n# mlflow.end_run()\n'

In [3]:
# Load the training split from disk and preview its first five rows.
train = pd.read_csv(filepath_or_buffer='../data/train.csv')
train.head(5)

Unnamed: 0,datetime,1
0,2013-12-31 23:00:00,917.118
1,2014-01-01 00:00:00,1264.266
2,2014-01-01 01:00:00,746.81
3,2014-01-01 02:00:00,-15.512
4,2014-01-01 03:00:00,-36.904


In [4]:
# Load the test split from disk and preview its first five rows.
test = pd.read_csv(filepath_or_buffer='../data/test.csv')
test.head(5)

Unnamed: 0,datetime,1
0,2019-03-05 06:00:00,-349.899
1,2019-03-05 07:00:00,372.963
2,2019-03-05 08:00:00,371.4125
3,2019-03-05 09:00:00,817.3485
4,2019-03-05 10:00:00,638.5965


In [5]:
# Add the `shift1` column to the training set: shift(-1) moves column '1' up by
# one row, so each row carries the value of the following row. The final row
# has no successor and therefore ends up with a missing `shift1`.
train = train.assign(shift1=train['1'].shift(periods=-1))
train.tail(5)

Unnamed: 0,datetime,1,shift1
45338,2019-03-05 01:00:00,147.278,-28.272
45339,2019-03-05 02:00:00,-28.272,-201.291
45340,2019-03-05 03:00:00,-201.291,83.71
45341,2019-03-05 04:00:00,83.71,-102.683
45342,2019-03-05 05:00:00,-102.683,


In [6]:
# Add the `shift1` column to the test set: shift(-1) moves column '1' up by
# one row, so each row carries the value of the following row. The final row
# has no successor and therefore ends up with a missing `shift1`.
test = test.assign(shift1=test['1'].shift(periods=-1))
test.tail(5)

Unnamed: 0,datetime,1,shift1
19428,2021-05-22 18:00:00,-1.051667,14.628167
19429,2021-05-22 19:00:00,14.628167,30.308
19430,2021-05-22 20:00:00,30.308,164.239
19431,2021-05-22 21:00:00,164.239,298.17
19432,2021-05-22 22:00:00,298.17,


In [7]:
# Discard every training row that contains a missing value; after the shift
# this removes the final row, whose `shift1` entry is empty.
train = train.dropna(axis=0, how='any')
train.tail(5)

Unnamed: 0,datetime,1,shift1
45337,2019-03-05 00:00:00,-328.8805,147.278
45338,2019-03-05 01:00:00,147.278,-28.272
45339,2019-03-05 02:00:00,-28.272,-201.291
45340,2019-03-05 03:00:00,-201.291,83.71
45341,2019-03-05 04:00:00,83.71,-102.683


In [8]:
# Discard every test row that contains a missing value; after the shift
# this removes the final row, whose `shift1` entry is empty.
test = test.dropna(axis=0, how='any')
test.tail(5)

Unnamed: 0,datetime,1,shift1
19427,2021-05-22 17:00:00,-16.7315,-1.051667
19428,2021-05-22 18:00:00,-1.051667,14.628167
19429,2021-05-22 19:00:00,14.628167,30.308
19430,2021-05-22 20:00:00,30.308,164.239
19431,2021-05-22 21:00:00,164.239,298.17


In [9]:
# Drop the timestamp column from the training set so the task can be handled
# as an ordinary (non-time-series) regression problem.
train = train.drop(columns=['datetime'])
train.head(5)

Unnamed: 0,1,shift1
0,917.118,1264.266
1,1264.266,746.81
2,746.81,-15.512
3,-15.512,-36.904
4,-36.904,360.392


In [10]:
# Drop the timestamp column from the test set so the task can be handled
# as an ordinary (non-time-series) regression problem.
test = test.drop(columns=['datetime'])
test.head(5)

Unnamed: 0,1,shift1
0,-349.899,372.963
1,372.963,371.4125
2,371.4125,817.3485
3,817.3485,638.5965
4,638.5965,-206.986


In [11]:
# fit an xgboost model and make a one step prediction
def xgboost_forecast(train, testX, n_estimators=1000):
    """Fit an XGBoost regressor on ``train`` and predict one observation.

    A new model is trained on every call.

    Parameters
    ----------
    train : 2-D array-like
        Training rows. Every column except the last is used as an input
        feature; the last column is the regression target.
    testX : scalar or 1-D array-like
        Feature values of the single observation to predict. A bare scalar
        is accepted when there is exactly one feature column.
    n_estimators : int, default 1000
        Forwarded to ``XGBRegressor(n_estimators=...)``; the default keeps
        the previously hard-coded value.

    Returns
    -------
    The first (and only) element of the array returned by ``model.predict``.
    """
    # transform list (or DataFrame) into an array
    train = np.asarray(train)
    # split into input and output columns
    trainX, trainy = train[:, :-1], train[:, -1]
    # fit model
    model = XGBRegressor(objective='reg:squarederror', n_estimators=n_estimators)
    model.fit(trainX, trainy)
    # Hand predict() an explicit (1, n_features) ndarray rather than a nested
    # Python list: this does not rely on the installed xgboost accepting list
    # input, and reshape(1, -1) also turns a bare scalar into a 1x1 row.
    test_row = np.asarray(testX).reshape(1, -1)
    # make a one-step prediction
    yhat = model.predict(test_row)
    return yhat[0]