<a href="https://colab.research.google.com/github/Schmutzz/PV_Forecast_Platone/blob/master/platone_LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Colab only: mount Google Drive at /content/drive; later cells read and write
# files below 'drive/MyDrive/data/'.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd
import sqlite3
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import numpy as np
import tensorflow as tf
from tensorflow import keras
from keras import Model
from keras.layers import LSTM, Dense, Dropout, Input, concatenate
from keras.callbacks import EarlyStopping
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import RandomizedSearchCV, TimeSeriesSplit
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from tqdm import tqdm
import matplotlib.pyplot as plt
import random
from datetime import datetime

# Open the SQLite database holding all input tables.
# The path is relative to the working directory (the mounted Drive folder).
db_path = "drive/MyDrive/data/input_data.db"
conn = sqlite3.connect(database=db_path)

In [None]:
def parser(x):
    """Parse a '%Y-%m-%d %H:%M:%S' timestamp string (not referenced by the code in this cell)."""
    return datetime.strptime(x, '%Y-%m-%d %H:%M:%S')


# --- weather-station history: read, index by timestamp, drop incomplete rows ---
station_sql = ('SELECT Date, solar_radiation, pressureTrend, windspeedAvg, avg_wind_dir '
               'FROM wunderground_historical_43_long')
df_station43_long = pd.read_sql_query(station_sql, conn, index_col='Date', parse_dates=['Date'])
df_station43_long.dropna(inplace=True)
# (resampling of the station data happens in a later cell)

# --- PV power table: drop incomplete rows, then put on a 5-minute grid ---
pv_sql = 'SELECT Timestamp, pvpower_instant FROM mb_pvpro_15min'
df_mb_15 = (
    pd.read_sql_query(pv_sql, conn, index_col='Timestamp', parse_dates=['Timestamp'])
    .dropna()
    .resample('5Min').mean()
    .interpolate(method='linear')
)

# --- cloud-cover table: same treatment as the PV power table ---
clouds_sql = ('SELECT Timestamp, lowclouds, midclouds, highclouds, totalcloudcover '
              'FROM mb_clouds')
df_mb_clouds = (
    pd.read_sql_query(clouds_sql, conn, index_col='Timestamp', parse_dates=['Timestamp'])
    .dropna()
    .resample('5Min').mean()
    .interpolate(method='linear')
)

# Narrow the dtypes to reduce RAM usage (column order is left untouched).
df_station43_long = df_station43_long.astype({
    'solar_radiation': np.float32,
    'avg_wind_dir': np.int16,
    'windspeedAvg': np.int8,
    'pressureTrend': np.float16,
})

# Calendar features derived from the timestamp index, appended in this exact order:
# day, month, hour, minute.
station_index = df_station43_long.index
for part in ('day', 'month', 'hour', 'minute'):
    df_station43_long[part] = getattr(station_index, part).astype(np.int8)
# df_station43_long['precipTotal'] = df_station43_long['precipTotal'].astype(np.float16)

In [None]:
# fig = px.scatter(df_station43_long, y='solar_radiation')
# fig.show()

handle missing data (01.01.2021-09.05.2021 and 26.05.2021-29.11.2022)

In [None]:
# Put the station data on a regular 5-minute grid; bins without any reading are
# filled by linear interpolation between their neighbours.
df_station43_long = df_station43_long.resample('5Min').mean().interpolate(method='linear')

# The calendar columns are not measurements: averaging/interpolating them yields
# meaningless values inside gaps (e.g. an "hour" interpolated between 23 and 0).
# Rebuild them from the regular index instead. Assigning to existing columns
# keeps the column order that later cells rely on.
_station_grid = df_station43_long.index
df_station43_long['day'] = _station_grid.day.astype(np.int8)
df_station43_long['month'] = _station_grid.month.astype(np.int8)
df_station43_long['hour'] = _station_grid.hour.astype(np.int8)
df_station43_long['minute'] = _station_grid.minute.astype(np.int8)

In [None]:
# Join station data, PV power and cloud cover on the timestamps present in all
# three frames (inner join on the index). Column order: station columns first,
# then pvpower_instant, then the four cloud columns.
data_2 = (
    df_station43_long
    .merge(df_mb_15, how='inner', left_index=True, right_index=True)
    .merge(df_mb_clouds, how='inner', left_index=True, right_index=True)
)

# remove negative values for scaling
for column in ('solar_radiation', 'pvpower_instant'):
    data_2.loc[data_2[column] < 0, column] = 0

# del df_station43_long

# data_1 = data_1.resample('5Min').mean().interpolate(method='quadratic')
# data_2 = data_2.resample('5Min').mean().interpolate(method='quadratic')

# validation split ~10%
"""split_1 = pd.to_datetime('20.04.2021', format='%d.%m.%Y')
split_2 = pd.to_datetime('10.10.2022', format='%d.%m.%Y')

train_1 = data_1.loc[:split_1]
test_1 = data_1.loc[split_1:]

train_2 = data_2.loc[:split_2]
test_2 = data_2.loc[split_2:]

del data_1, data_2"""

In [None]:
# test = data_1.to_numpy().T[0].reshape(-1, 1)
# data_1.shape[-1]

In [None]:
# Scale every column to [-1, 1] with a MinMaxScaler, as LSTM has a default tanh
# activation function. One scaler per column is kept so single features can be
# inverse-transformed individually later on.

# Convert the frame once (shape: n_columns x n_rows) instead of once per column.
_columns_2 = data_2.to_numpy().T

scalers_2 = [MinMaxScaler(feature_range=(-1, 1)).fit(column.reshape(-1, 1))
             for column in _columns_2]

# Result has shape (n_columns, n_rows); only the trailing length-1 axis added by
# the (-1, 1) reshape is removed.
data_2_scaled = np.array([scaler.transform(column.reshape(-1, 1))
                          for scaler, column in zip(scalers_2, _columns_2)]).squeeze(axis=-1)

"""
scaler_solar = MinMaxScaler(feature_range=(-1,1)).fit(data_2.to_numpy()[0].reshape(-1, 1))
scaler_pressure = MinMaxScaler(feature_range=(-1,1)).fit(data_2.to_numpy()[1].reshape(-1, 1))
scaler_day = MinMaxScaler(feature_range=(-1,1)).fit(data_2.to_numpy()[2].reshape(-1, 1))
scaler_month = MinMaxScaler(feature_range=(-1,1)).fit(data_2.to_numpy()[3].reshape(-1, 1))
scaler_hour = MinMaxScaler(feature_range=(-1,1)).fit(data_2.to_numpy()[4].reshape(-1, 1))
scaler_minute = MinMaxScaler(feature_range=(-1,1)).fit(data_2.to_numpy()[5].reshape(-1, 1))
"""
"""
test_scaled = scaler.transform(test_1.to_numpy())
train_2_scaled = scaler.transform(train_2.to_numpy())
test_2_scaled = scaler.transform(test_2.to_numpy())
del train_1, test_1, train_2, test_2
"""

In [None]:
# Window sizes for the sliding-window samples, counted in 5-minute steps
timesteps_input = 12*24  # 12 five-minute intervals * 24 hours = 1 day of history
timesteps_prediction = 12*6  # 12 five-minute intervals * 6 hours = prediction horizon

def create_dataset(dataset, steps_in=timesteps_input, steps_pred=timesteps_prediction,
                   n_forecast_features=5):
    """
    Cut a 2-D array of shape (features, timesteps) into overlapping samples.

    For every window start ``i`` (stride 1) three slices are taken:

    * history:  all rows except the last ``n_forecast_features`` ones,
      columns ``i .. i + steps_in``
    * forecast: the last ``n_forecast_features`` rows,
      columns ``i + steps_in .. i + steps_in + steps_pred``
    * target:   row 0, same columns as the forecast slice

    Parameters
    ----------
    dataset : np.ndarray
        2-D array, rows are features and columns are time steps.
    steps_in : int
        Number of time steps in each history window.
    steps_pred : int
        Number of time steps in each forecast/target window.
    n_forecast_features : int
        Number of trailing rows treated as forecast features (must be >= 1).

    Returns
    -------
    tuple of np.ndarray
        ``(X_hist, X_pred, y)`` with shapes
        ``(samples, n_hist_features, steps_in)``,
        ``(samples, n_forecast_features, steps_pred)`` and
        ``(samples, steps_pred)``. When ``dataset`` is too short for a single
        window, ``samples`` is 0 and the arrays are empty but correctly shaped.
    """
    print(dataset.shape)

    hist_rows = dataset[:-n_forecast_features]
    forecast_rows = dataset[-n_forecast_features:]
    target_row = dataset[0]

    # A window needs steps_in + steps_pred columns. Slice ends are exclusive, so
    # the window that ends exactly at the last column is still valid.
    n_windows = dataset.shape[1] - steps_in - steps_pred + 1
    if n_windows <= 0:
        return (np.empty((0, hist_rows.shape[0], steps_in), dtype=dataset.dtype),
                np.empty((0, forecast_rows.shape[0], steps_pred), dtype=dataset.dtype),
                np.empty((0, steps_pred), dtype=dataset.dtype))

    X_hist, X_pred, y = [], [], []
    for start in tqdm(range(n_windows)):
        pred_start = start + steps_in
        pred_end = pred_start + steps_pred
        X_hist.append(hist_rows[:, start:pred_start])
        X_pred.append(forecast_rows[:, pred_start:pred_end])
        y.append(target_row[pred_start:pred_end])

    return np.array(X_hist), np.array(X_pred), np.array(y)

In [None]:
"""
test1 = np.zeros((5, 10))
test2 = np.ones((3, 5))
np.append(test1, test2)
"""

In [None]:
# Build the sliding-window samples for the network from the scaled data:
# historical inputs, forecast inputs and targets.
windows_2 = create_dataset(data_2_scaled)
X_hist_2, X_pred_2, y_2 = windows_2

# combine all training data
# save data to dataset 2 for RAM reasons
"""
X_train_2 = np.append(X_train_1, X_train_2, axis=0)
del X_train_1
y_train_2 = np.append(y_train_1, y_train_2, axis=0)
del y_train_1
"""

In [None]:
"""
# create testing data for NN
X_test_1, y_test_1 = create_dataset(test_1_scaled)
X_test_2, y_test_2 = create_dataset(test_2_scaled)
# del test_1_scaled, test_2_scaled

# combine all test data
X_test_2 = np.append(X_test_1, X_test_2, axis=0)
del X_test_1
y_test_2 = np.append(y_test_1, y_test_2, axis=0)
del y_test_1
"""

In [None]:
"""
X_train, X_test, y_train, y_test = X_train_2, X_test_2, y_train_2, y_test_2
del X_train_2, X_test_2, y_train_2, y_test_2
"""
# print(X_1.shape)
# print(y_1.shape)
# Show the shapes of the three sample arrays, one per line.
print(X_hist_2.shape, X_pred_2.shape, y_2.shape, sep='\n')

Create LSTM Model

In [None]:
# Two-branch LSTM model: one branch reads the historical station window, the
# other the forecast window; both are merged and mapped to the target horizon.

# Permute is only needed in this cell, hence the local import.
from keras.layers import Permute

# Hyper-parameters. NOTE: units_1 .. units_dense are only referenced by the
# legacy Sequential sketch kept in the string literal further down; the
# two-branch model below uses literal layer sizes.
units_1 = 96
units_2 = 64
units_3 = 64
units_dense = 32
dropout = 0.05
epochs = 40
val_split = 0.1
optimizer = 'adam'
file_name = 'wg_multi_6hour_branches_mb_forecast_L96rs_L96_D64_C_D64_better_scaling'

# multiple inputs from https://pyimagesearch.com/2019/02/04/keras-multiple-inputs-and-mixed-data/
# The sample arrays are laid out as (samples, features, timesteps), so the model
# inputs keep that layout and callers can feed the arrays unchanged.
input_hist = Input(shape=(X_hist_2.shape[1], X_hist_2.shape[2]))
input_pred = Input(shape=(X_pred_2.shape[1], X_pred_2.shape[2]))

# branch for historical Weather Underground data
# LSTM layers need [samples, timesteps, features]; swap the two non-batch axes
# first, otherwise the recurrence runs over the feature axis instead of time.
x = Permute((2, 1))(input_hist)
x = LSTM(96, dropout=dropout, return_sequences=True)(x)
x = LSTM(96)(x)
x = Dense(64, activation='relu')(x)
x = Model(inputs=input_hist, outputs=x)

# branch for meteoblue forecast data (same axis swap as above)
y = Permute((2, 1))(input_pred)
y = LSTM(96, dropout=dropout, return_sequences=True)(y)
y = LSTM(96)(y)
y = Dense(64, activation='relu')(y)
y = Model(inputs=input_pred, outputs=y)

# combine branches
combined = concatenate([x.output, y.output])

z = Dense(64, activation='relu')(combined)
# z = LSTM(64)(z)
z = Dense(y_2.shape[1])(z)  # one output per predicted time step
model = Model(inputs=[x.input, y.input], outputs=z)

# Keep the best model (lowest validation loss) on disk and stop once the
# validation loss has not improved for 10 epochs, restoring the best weights.
checkpoint_filepath = 'drive/MyDrive/data/{}_cp'.format(file_name)
mcp_save = keras.callbacks.ModelCheckpoint(checkpoint_filepath, save_best_only=True, monitor='val_loss', mode='min')
early_stopping = EarlyStopping(patience=10, monitor='val_loss', restore_best_weights=True)

"""
model = keras.Sequential()
model.add(LSTM(units=units_1, dropout=dropout, return_sequences=True,
                input_shape=(X_hist_2.shape[1], X_hist_2.shape[2])))
model.add(LSTM(units=units_2, dropout=dropout, return_sequences=True))
model.add(LSTM(units=units_3, dropout=dropout))
model.add(Dense(units=units_dense))
model.add(Dense(units=y_2.shape[1]))
"""

model.compile(optimizer=optimizer, loss='mean_squared_error')
# summary() prints by itself and returns None, so it is not wrapped in print().
model.summary()

# history_1 = model.fit(X_1, y_1, validation_split=val_split, batch_size=192,
#             epochs=epochs, verbose=1, callbacks=[early_stopping])

history_2 = model.fit([X_hist_2, X_pred_2], y_2, validation_split=val_split, batch_size=96,
            epochs=epochs, verbose=1, callbacks=[mcp_save, early_stopping])

# TODO: try a larger batch size

In [None]:
# Learning curves: training and validation loss per epoch (epochs start at 1).
loss, val_loss = (history_2.history[key] for key in ("loss", "val_loss"))
epoch = np.arange(len(val_loss)) + 1

fig = plt.figure(figsize=(12, 8))
for curve in (loss, val_loss):
    plt.plot(epoch, curve)
plt.show()

In [None]:
# Persist the trained model under the run's file name on Drive.
model_path = 'drive/MyDrive/data/{}'.format(file_name)
model.save(model_path)

In [None]:
# model = keras.models.load_model('drive/MyDrive/data/{}'.format(file_name))

In [None]:
# Plot one random sample whose last input step falls into the relevant hours
# (7am - 3pm), in scaled and in real units, and print the MSE of the model
# prediction and of the first forecast feature against the expected values.

# Scalers are ordered like the dataset rows: station features first, the five
# forecast features last.
hist_scalers = scalers_2[:-5]
pred_scalers = scalers_2[-5:]

# Hour of the last input step of every sample (row -2 of the history window).
# Selecting among the valid indices up front avoids running the model on
# rejected samples and cannot loop forever when no sample qualifies.
last_hours = np.rint(
    hist_scalers[-2].inverse_transform(X_hist_2[:, -2, -1].reshape(-1, 1))
).ravel()
candidates = np.flatnonzero((last_hours >= 7) & (last_hours < 15))
if candidates.size == 0:
    raise ValueError('no sample with its last input step between 07:00 and 15:00')

r = int(candidates[random.randrange(candidates.size)])
x_hist = X_hist_2[r]
x_hist_exp = np.expand_dims(x_hist, 0)
x_pred = X_pred_2[r]
x_pred_exp = np.expand_dims(x_pred, 0)
expected = y_2[r]
prediction = model([x_hist_exp, x_pred_exp], training=False)[0]
# Convert the tensor once; plotting and metrics below work on the NumPy array.
prediction_scaled = prediction.numpy()

# Undo the scaling feature by feature (each entry has shape (timesteps, 1)).
x_hist_real = [scaler.inverse_transform(row.reshape(-1, 1)) for scaler, row in zip(hist_scalers, x_hist)]
x_pred_real = [scaler.inverse_transform(row.reshape(-1, 1)) for scaler, row in zip(pred_scalers, x_pred)]
expected_real = scalers_2[0].inverse_transform(expected.reshape(-1, 1))
prediction_real = scalers_2[0].inverse_transform(prediction_scaled.reshape(-1, 1))

# Calendar values of the last input step (rows -4..-1: day, month, hour, minute).
# Explicit scalar indexing plus rounding avoids the deprecated int() conversion
# of one-element arrays and truncation of values such as 6.9999.
last_day, last_month, last_hour, last_minute = (
    int(round(float(column[-1, 0]))) for column in x_hist_real[-4:]
)

"""
fig = make_subplots(rows=1, cols=2, subplot_titles=('Scaled', 'Real'))

fig.add_trace(go.Scatter(y=expected, name='expected'), 1, 1)
fig.add_trace(go.Scatter(y=prediction, name='prediction'), 1, 1)
fig.add_trace(go.Scatter(y=expected_real.flatten(), name='expected'), 1, 2)
fig.add_trace(go.Scatter(y=prediction_real.flatten(), name='prediction'), 1, 2)
fig.show()
"""

fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(18, 9))
# fig = plt.figure(figsize=(12, 8))
axes[0].set_title('Scaled Values - Date: {:02d}.-{:02d}. {:02d}:{:02d}'.format(
    last_day, last_month, last_hour, last_minute))
axes[0].plot(expected, label='Expected')
axes[0].plot(prediction_scaled, label='Prediction')
axes[0].plot(x_pred[0], label='MeteoBlue PV-Forecast')
axes[0].plot(x_pred[-1], label='MeteoBlue Cloud-Forecast')
axes[0].legend()

print('MSE MeteoBlue: {:.4f}'.format(mean_squared_error(expected, x_pred[0])))
print('MSE Prediction: {:.4f}'.format(mean_squared_error(expected, prediction_scaled)))
print('\n')

# fig = plt.figure(figsize=(12, 8))
axes[1].set_title('Real Values - Date: {:02d}.-{:02d}. {:02d}:{:02d}'.format(
    last_day, last_month, last_hour, last_minute))
axes[1].plot(expected_real, label='Expected')
axes[1].plot(prediction_real, label='Prediction')
axes[1].plot(x_pred_real[0], label='MeteoBlue Forecast')
# axes[1].plot(x_pred_real[-1], label='MeteoBlue Cloud-Forecast')
axes[1].legend()

print('MSE MeteoBlue: {:.4f}'.format(mean_squared_error(expected_real, x_pred_real[0])))
print('MSE Prediction: {:.4f}'.format(mean_squared_error(expected_real, prediction_real)))
print('\n')

# plt.show() instead of fig.show(): the latter only warns on non-GUI (inline) backends.
plt.show()

In [None]:
# Per-column minimum of the merged, unscaled data (shown as the cell output).
data_2.min()

In [None]:
# Per-column maximum of the merged, unscaled data (shown as the cell output).
data_2.max()