In [3]:
import tensorflow as tf
import tensorflow_probability as tfp
import pandas as pd 

from tensorflow import keras
from keras.layers import Input, Dense, LSTM, Dropout, TimeDistributed, Lambda
from datetime import datetime, timedelta
from numpy import array


In [4]:
# 定义模型
class DeepAR(tf.keras.models.Model):
    def __init__(self, lstm_units, n_steps_in, n_steps_out, n_features):
        super().__init__()

        self.lstm = tf.keras.layers.LSTM(lstm_units, return_sequences=True, return_state=True, input_shape=(n_steps_in, n_features))
        self.dense_mu = tf.keras.layers.Dense(1) 
        self.dense_sigma = tf.keras.layers.Dense(1, activation='softplus')

    def call(self, inputs, initial_state=None):
        outputs, state_h, state_c = self.lstm(inputs, initial_state=initial_state)

        mu = self.dense_mu(outputs)
        sigma = self.dense_sigma(outputs)
        state = [state_h, state_c]

        return [mu, sigma, state]

def log_gaussian_loss(mu, sigma, y_true):
    """
    Gaussian loss function
    """
    return -tf.reduce_sum(tfp.distributions.Normal(loc=mu, scale=sigma).log_prob(y_true))


In [5]:
# 定义工具函数
def split_sequence(sequence, n_steps_in, n_steps_out):
    X, y = list(), list()
    for i in range(len(sequence)):
        end_ix = i + n_steps_in
        out_end_ix = end_ix + n_steps_out
        if out_end_ix > len(sequence):
            break
        seq_x, seq_y = sequence[i:end_ix], sequence[end_ix:out_end_ix]
        X.append(seq_x)
        y.append(seq_y)
    return array(X), array(y)

In [8]:
# 设置超参数
n_steps_in = 30
n_steps_out = 1
n_features = 1

In [7]:
# 加载数据
data_df = pd.read_csv("../../../2.1.datasets/Twitter_volume_AMZN.csv", header=0, index_col=0)
print("type:{}, shape:{}, min_index:{}, max_index:{}".format(type(data_df), data_df.shape, data_df.index.min(), data_df.index.max()))
data_df.head(5)

type:<class 'pandas.core.frame.DataFrame'>, shape:(15831, 1), min_index:2015-02-26 21:42:53, max_index:2015-04-22 20:52:53


Unnamed: 0_level_0,value
timestamp,Unnamed: 1_level_1
2015-02-26 21:42:53,57
2015-02-26 21:47:53,43
2015-02-26 21:52:53,55
2015-02-26 21:57:53,64
2015-02-26 22:02:53,93


In [10]:
# 处理数据
test_cutoff_date = datetime.strftime(datetime.strptime(data_df.index.max(), '%Y-%m-%d %H:%M:%S')- timedelta(days=50), '%Y-%m-%d %H:%M:%S'
train_df = data_df[data_df.index <= test_cutoff_date]
test_df = data_df[data_df.index > test_cutoff_date]

training_mean = train_df.mean()
training_std = train_df.std()
training_value_df = (train_df - training_mean) / training_std 

test_mean = test_df.mean()
test_std = test_df.std()
test_value_df = (test_df - test_mean) / test_std

X, y = split_sequence(training_value_df, n_steps_in, n_steps_out)
X = X.reshape((X.shape[0], X.shape[1], n_features))
y = y.reshape((y.shape[0], y.shape[1], n_features))

test_X, test_y = split_sequence(test_value_df, n_steps_in, n_steps_out)
test_X = test_X.reshape((test_X.shape[0], test_X.shape[1], n_features))

SyntaxError: invalid syntax (488661589.py, line 3)

In [None]:
LSTM_UNITS = 128
EPOCHS = 100

model = DeepAR(LSTM_UNITS, n_steps_in, n_steps_out, n_features)

# optmizer
optimizer = tf.keras.optimizers.Adam()


# metric
rmse = tf.keras.metrics.RootMeanSquaredError()

def train_step(x, y):
    with tf.GradientTape() as tape:
        mu, sigma, _ = model(x)
        loss = log_gaussian_loss(mu, sigma, y) # Forward Learning
    # backword
    grads = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))
    rmse(y, mu)

for epoch in range(EPOCHS):
    train_step(X, y)
    print('Epoch %d, RMSE %.4f' % (epoch + 1, rmse.result().numpy()))
    rmse.reset_states()

In [None]:
# 샘플링된 값들을 예측값으로 반환.
pred = model.predict(test_X)

In [None]:
pred[0].shape

# median => 실제 예측값.
# 5% 퍼센타일 => lower bound
# 95% 퍼센타일 => upper bound

In [None]:
test_y.shape # 라벨의 shape

In [None]:
import numpy as np
lower_bound = []
upper_bound = []
median = []
for step_pred in pred[0]:
                           # 정규화된 값을 원래대로.
  lb = (np.quantile(step_pred, 0.05) + test_mean) * test_std
  ub = (np.quantile(step_pred, 0.95) + test_mean) * test_std
  med = (np.quantile(step_pred, 0.5) + test_mean) * test_std
  lower_bound.append(lb)
  upper_bound.append(ub)
  median.append(med)

In [None]:
import matplotlib.pyplot as plt
fig = plt.figure(figsize = (12,6))
ax = fig.add_subplot(111)


df['Close'].plot(ax=ax)
ax.vlines('2020-08-22', 0, 1000, linestyle='--', color='r', label='forecast boundary')
ax.fill_between(df_test[30:].index, lower_bound, upper_bound, color='b', alpha=0.1, label='95% Confidence Interval')
ax.plot(df_test[30:].index, median, label='Prediction')
ax.legend(loc='upper left')
# plt.suptitle(f"ARIMA {optimal[0][0]} Prediction Result (r2 score: {r2}")
# plt.show()