In [1]:
import sys
import numpy as np
import holidays
sys.path.append('../')
from model import data_preparation as prep

# Get Some Data

In [2]:
# Create a connection through the project's data_preparation module and use it
# to load the prepared dictionary that every later cell works on.
# NOTE(review): the meaning of the second argument (20) is not visible in this
# notebook — presumably a limit on how many entries are loaded; confirm in
# model/data_preparation.
con = prep.create_connection()
prepared_data = prep.prepare_data(con,20)

In [3]:
# Show the full prepared dictionary: one entry per integer key, each holding
# metadata, train/test timestamps and the train/test time series.
prepared_data

{274: {'abahn': False,
  'brand': 'HEM',
  'bstr': True,
  'county': '13073',
  'sstr': False,
  'state': 'Mecklenburg-Vorpommern',
  'test_stamps': array(['2017-03-03T19:00:00.000000000', '2017-03-03T20:00:00.000000000',
         '2017-03-03T21:00:00.000000000', ...,
         '2017-09-17T22:00:00.000000000', '2017-09-17T23:00:00.000000000',
         '2017-09-18T00:00:00.000000000'], dtype='datetime64[ns]'),
  'time_series': {'test': array([1289, 1289, 1289, ..., 1289, 1289, 1399]),
   'train': array([1349, 1349, 1349, ..., 1289, 1289, 1289])},
  'train_stamps': array(['2015-01-01T00:00:00.000000000', '2015-01-01T01:00:00.000000000',
         '2015-01-01T02:00:00.000000000', ...,
         '2017-03-03T16:00:00.000000000', '2017-03-03T17:00:00.000000000',
         '2017-03-03T18:00:00.000000000'], dtype='datetime64[ns]')},
 1466: {'abahn': True,
  'brand': 'SHELL',
  'bstr': False,
  'county': '03353',
  'sstr': False,
  'state': 'Niedersachsen',
  'test_stamps': array(['2017-03-03T19:00

# Remove Trend
## with np.log

In [8]:
# Replace every entry's train and test series with its natural logarithm,
# reshaped to a single column.
# NOTE: this overwrites prepared_data in place, so running the cell a second
# time applies the log twice.
for entry in prepared_data.values():
    series_by_split = entry["time_series"]
    for split in ("test", "train"):
        series_by_split[split] = np.log(series_by_split[split].reshape(-1, 1))
    

# Using partial autocorrelation

In [9]:
from statsmodels.tsa.stattools import pacf

# Partial autocorrelation (OLS estimate, lags 0..10) of the training series of
# the first entry only; `keys` is reused by the feature-building cell below.
keys = list(prepared_data)
x_pacf = pacf(
    prepared_data[keys[0]]['time_series']['train'],
    nlags=10,
    method='ols',
)

In [10]:
x_pacf  # lag 1 has by far the largest partial autocorrelation (about 0.94 in the recorded output)

array([ 1.        ,  0.93588039, -0.04030626, -0.07638168, -0.0299094 ,
       -0.05777687, -0.03972393, -0.00227531,  0.25327688,  0.13011483,
        0.06040027])

# Arrange Data to have List of training instances

In [None]:
def _build_feature_rows(entry_key, entry, prices, stamps, vacations):
    """Return one feature row per (price, timestamp) sample of a single entry.

    Train and test rows are produced by this one function so that both share
    the same column order:
    price, timestamp, vacation flag, public-holiday flag, encoded state,
    entry key, brand hash, county, abahn, bstr, sstr.

    Args:
        entry_key: Key of the entry in ``prepared_data``.
        entry: The ``prepared_data`` value for that key.
        prices: Series values; each element is read with ``[0]`` (the series
            were reshaped to a single column in the log-transform cell).
        stamps: ``numpy.datetime64`` timestamps aligned with ``prices``.
        vacations: Result of ``prep.get_vacations``; passed through unchanged
            to ``prep.is_timestamp_in_vacations``.

    Returns:
        A list of 11-element lists, one per sample.
    """
    state = entry["state"]
    # Loop-invariant for one entry, so built once instead of once per sample.
    state_holidays = holidays.DE(state=prep.get_state_shortcut(state))
    # NOTE(review): hash() of a str is randomised per interpreter process
    # unless PYTHONHASHSEED is fixed, so the brand feature is only consistent
    # within a single kernel session.
    static_features = [
        prep.encode_state(state),
        entry_key,
        hash(entry["brand"]),
        entry["county"],
        int(entry["abahn"]),
        int(entry["bstr"]),
        int(entry["sstr"]),
    ]

    rows = []
    for price, stamp in zip(prices, stamps):
        # NOTE(review): the raw datetime64 stamp is kept as a feature; a later
        # cell casts the whole matrix with astype(float) — confirm that this
        # conversion behaves as intended.
        dynamic_features = [
            float(price[0]),  # builtin float: the np.float alias was removed from NumPy
            stamp,
            prep.is_timestamp_in_vacations(state, stamp, vacations),
            str(stamp.astype('datetime64[D]')) in state_holidays,
        ]
        rows.append(dynamic_features + static_features)
    return rows


x_train_adjusted = {}
x_test_adjusted = {}
y_train_adjusted = {}
y_test_adjusted = {}

vacations = prep.get_vacations(con)
for k in keys:
    # Lag-1 setup: the input at position i is paired with the value at i + 1.
    x_train = prepared_data[k]["time_series"]["train"][0:-2]
    ts_train = prepared_data[k]["train_stamps"][0:-2]
    y_train_adjusted[k] = prepared_data[k]["time_series"]["train"][1:-1]
    x_test = prepared_data[k]["time_series"]["test"][0:-2]
    ts_test = prepared_data[k]["test_stamps"][0:-2]
    y_test_adjusted[k] = prepared_data[k]["time_series"]["test"][1:-1]

    x_train_adjusted[k] = _build_feature_rows(k, prepared_data[k], x_train, ts_train, vacations)
    x_test_adjusted[k] = _build_feature_rows(k, prepared_data[k], x_test, ts_test, vacations)

In [None]:
def _stack_by_key(per_key):
    """Stack the per-key sample blocks into one 2-D array, in sorted key order.

    The number of columns is taken from the data itself rather than being
    hard-coded, and keys may contribute different numbers of rows.

    Args:
        per_key: Dict mapping a key to a 2-D block of samples (a list of rows
            or an array with one row per sample).

    Returns:
        A single array with the blocks concatenated along the first axis.
    """
    return np.concatenate([np.asarray(per_key[k]) for k in sorted(per_key)], axis=0)


# NOTE: each name is rebound from a dict to an array, so this cell cannot be
# re-run without first re-running the feature-building cell.
x_train_adjusted = _stack_by_key(x_train_adjusted)
x_test_adjusted = _stack_by_key(x_test_adjusted)
y_test_adjusted = _stack_by_key(y_test_adjusted)
y_train_adjusted = _stack_by_key(y_train_adjusted)


# Normalize price by the training maximum

In [None]:
# Scale the price column (column 0) of all four arrays by the maximum of the
# training targets. The feature matrices are cast to float first.
# NOTE: the scaling is applied in place, so re-running this cell scales again.
maxValue = y_train_adjusted[:, 0].max()
x_train_adjusted = x_train_adjusted.astype(float)
x_test_adjusted = x_test_adjusted.astype(float)

price_scale = 1 / maxValue
for scaled_array in (x_train_adjusted, x_test_adjusted, y_test_adjusted, y_train_adjusted):
    scaled_array[:, 0] *= price_scale

In [None]:
# Inspect the first training feature row after scaling.
x_train_adjusted[0]

In [None]:
# Re-wrap both feature matrices as fresh NumPy arrays.
x_train_adjusted, x_test_adjusted = np.array(x_train_adjusted), np.array(x_test_adjusted)
# Disabled: adding a trailing singleton axis to the feature matrices.
#x_train_adjusted = x_train_adjusted.reshape(x_train_adjusted.shape + (1, ))
#x_test_adjusted = x_test_adjusted.reshape(x_test_adjusted.shape + (1, ))

# fitting lasso regression model

Lasso seems to fit our problem best with a very small penalty value (the cell below uses alpha = 1e-7).

In [None]:
from sklearn import linear_model
import matplotlib.pyplot as plt

# Fit a LassoLars model on the training rows and print its score on the
# held-out rows.
clf = linear_model.LassoLars(alpha=1e-7)
clf.fit(x_train_adjusted, y_train_adjusted)
print(clf.score(x_test_adjusted, y_test_adjusted))

# Undo the max-scaling and then the log transform before plotting, so both
# curves are on the original price scale.
actual_prices = np.exp(y_test_adjusted * maxValue)
predicted_prices = np.exp(clf.predict(x_test_adjusted) * maxValue)
plt.plot(actual_prices)
plt.plot(predicted_prices)
plt.show()

In [None]:
# Inspect the scaled test feature matrix.
x_test_adjusted

In [None]:

from sklearn.metrics import mean_squared_error

# Mean squared error on the original price scale. Both the targets and the
# predictions are first multiplied by maxValue (undoing the scaling) and then
# exponentiated (undoing the log transform). The rescaling must be applied to
# the model's output, not to its input features.
mean_squared_error(
    np.exp(y_test_adjusted * maxValue),
    np.exp(clf.predict(x_test_adjusted) * maxValue),
)