In [18]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
    print('Select the Runtime → "Change runtime type" menu to enable a GPU accelerator, ')
    print('and then re-execute this cell.')
else:
    print(gpu_info)

Sat Oct 17 15:13:34 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 455.23.05    Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   34C    P0    25W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                 ERR! |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [1]:
# metrics
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.preprocessing import MinMaxScaler

# high-language sequential modeling
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras import optimizers

# misc
from pandas import DataFrame
from pandas import Series
from pandas import concat
from pandas import read_csv
from pandas import datetime
from math import sqrt
import matplotlib
matplotlib.use('Agg')
from matplotlib import pyplot
import numpy as np
import matplotlib.pyplot as plt

# statistic stationarity
from statsmodels.tsa.stattools import adfuller

# progress bar
from tqdm import tqdm
from tqdm.notebook import tnrange

  from pandas import datetime


In [2]:
source_file = ["dataset/Indonesia Stock Exchange Composite Index Historical Data.csv"]

In [21]:
# data log debugger
data_log = {}

In [22]:
def timeseries_to_supervised(data, lag=1):
		df = DataFrame(data)
		columns = [df.shift(i) for i in range(1, lag+1)]
		columns.append(df)
		df = concat(columns, axis=1)
		df = df.drop(0)
		return df

In [23]:
def difference(dataset, interval=1):
	diff = list()
	for i in range(interval, len(dataset)):
			value = dataset[i] - dataset[i - interval]
			diff.append(value)
	return Series(diff)

In [24]:
def inverse_difference(history, yhat, interval=1):
	return yhat + history[-interval]

In [25]:
def scale(train, test):
	# fit scaler
	scaler = MinMaxScaler(feature_range=(0, 1))
	scaler = scaler.fit(train)
	train = train.reshape(train.shape[0], train.shape[1])
	train_scaled = scaler.transform(train)
	test = test.reshape(test.shape[0], test.shape[1])
	test_scaled = scaler.transform(test)
	return scaler, train_scaled, test_scaled

In [26]:
def invert_scale(scaler, X, yhat):
	new_row = [x for x in X] + [yhat]
	array = np.array(new_row)
	array = array.reshape(1, len(array))
	inverted = scaler.inverse_transform(array)
	return inverted[0, -1]

In [27]:
# evaluate the model on a dataset, returns RMSE in transformed units
def evaluate(model, raw_data, scaled_dataset, scaler, offset, batch_size):
    # separate
    X, y = scaled_dataset[:,0:-1], scaled_dataset[:,-1]
    # reshape
    reshaped = X.reshape(len(X), 1, 1)
    # forecast dataset
    output = model.predict(reshaped, batch_size=batch_size)
    # invert data transforms on forecast
    predictions = list()
    for i in range(len(output)):
        yhat = output[i,0]
        # invert scaling
        yhat = invert_scale(scaler, X[i], yhat)
        # invert differencing
        yhat = yhat + raw_data[i]
        # store forecast
        predictions.append(yhat)
    # report performance
    rmse = sqrt(mean_squared_error(raw_data[1:], predictions))
    mae = mean_absolute_error(raw_data[1:], predictions)
    r_squared = r2_score(raw_data[1:], predictions)
    pd_test = DataFrame(raw_data[1:])
    pd_test.to_csv('data_test.csv')
    pd_predictions = DataFrame(predictions)
    pd_predictions.to_csv('data_predictions.csv')

    return rmse, mae, r_squared

In [28]:
"""
Description of fit_lstm()
Trains and returns an LSTM model. 
Takes the training dataset in a supervised learning format, a batch size, a number of epochs, and a number of neurons.
"""
def fit_lstm(train, test, raw, scaler, batch_size, nb_epoch, neurons, timesteps, repeatation):
    X, y = train[:, 0:-1], train[:, -1]
    X = X.reshape(X.shape[0], timesteps, X.shape[1])
    data_log['model_lstm_X'] = X

    # building model
    model = Sequential()
    model.add(LSTM(neurons, activation='tanh', batch_input_shape=(batch_size, X.shape[1], X.shape[2]), stateful=True))
 
    model.add(Dense(1))
    sgd = optimizers.SGD(lr=0.001, momentum=0.0, nesterov=False)
    model.compile(loss='mean_squared_error', optimizer='sgd')

    train_rmse, test_rmse = list(), list()
    train_mae, test_mae = list(), list()
    train_r_squared, test_r_squared = list(), list()
    for i in tnrange(nb_epoch, desc = 'Fit LSTM'):
        #fit model
        model.fit(X, y, epochs=1, batch_size=batch_size, verbose=0, shuffle=False)

        # evaluate model on train data
        raw_train = raw[-(len(train)+len(test)+1):-len(test)]
        rmse, mae, r_squared = evaluate(model, raw_train, train, scaler, 0, batch_size)
        train_rmse.append(rmse)
        train_mae.append(mae)
        train_r_squared.append(r_squared)

        # evaluate model on test data
        raw_test = raw[-(len(test)+1):]
        rmse, mae, r_squared = evaluate(model, raw_test, test, scaler, 0, batch_size)
        test_rmse.append(rmse)
        test_mae.append(mae)
        test_r_squared.append(r_squared)

        model.reset_states()
    
    # logging pake dict global
    data_log['train_rmse_' + str(repeatation)] = train_rmse
    data_log['test_rmse_' + str(repeatation)] = test_rmse
    data_log['train_mae_' + str(repeatation)] = train_mae
    data_log['test_mae_' + str(repeatation)] = test_mae
    data_log['train_r_squared_' + str(repeatation)] = train_r_squared
    data_log['test_r_squared_' + str(repeatation)] = train_r_squared

    return model

In [29]:
def plot_1(series, predictions, file_name):
    pyplot.figure(figsize=(12,7))
    pyplot.plot(series['Price'], 'green', color='blue', label='LSTM Training Data')
    pyplot.plot(series.index[-300:], predictions, color='green', marker='o', linestyle='dashed', label='LSTM Predicted Price')
    pyplot.plot(series.index[-300:], series['Price'][-300:], color='red', label='Actual Price')
    pyplot.title(file_name[36:])
    pyplot.xlabel('Dates')
    pyplot.ylabel('Prices')
    #plt.xticks(np.arange(0,len(series), 300), series.index[0:len(series):300])
    pyplot.legend()
    pyplot.savefig('LSTM_' + file_name[36:] + '_Full.png')

In [30]:
def plot_2(series, predictions, file_name):
    pyplot.figure(figsize=(12,7))
    pyplot.plot(series.index[-300:], predictions, color='green', marker='o', linestyle='dashed', label='LSTM Predicted Price')
    pyplot.plot(series.index[-300:], series['Price'][-300:], color='red', label='Actual Price')
    pyplot.xticks(np.arange(1486,len(series), 60), series['Price'][1486:len(series):60])
    pyplot.title(file_name[36:])
    pyplot.xlabel('Dates')
    pyplot.ylabel('Prices')
    pyplot.legend()
    pyplot.savefig('LSTM_' + file_name[36:] + '_Prediction Result.png')

In [31]:
def do_trial(repeats, series, epochs, batch_size, neurons, timesteps, file_name):
    # get difference
    diff_values = difference(series['Price'].values, 1)
    data_log['diff_values'] = diff_values

    # convert to supervised
    supervised = timeseries_to_supervised(diff_values, timesteps)
    data_log['supervised'] = supervised

    # pick values only
    supervised_values = supervised.values[timesteps:,:]
    data_log['supervised_values'] = supervised_values

    train, test = supervised_values[0:-300], supervised_values[-300:]

    # scale data
    scaler, train_scaled, test_scaled = scale(train, test)
    data_log['train_scaled'] = train_scaled
    data_log['test_scaled'] = test_scaled

    error_scores = list()
    for repeatation in tnrange(repeats, desc = 'Main Lane'):
        train_trimmed = train_scaled[2:, :]
        data_log['train_trimmed'] = train_trimmed

        raw_values = series['Price'].values
        lstm_model = fit_lstm(train_trimmed, test_scaled, raw_values, scaler, batch_size, epochs, neurons, timesteps, repeatation)
        
        train_reshaped = train_trimmed[:, 0].reshape(len(train_trimmed), 1, 1)
        data_log['train_reshaped'] = train_reshaped

        # train
        lstm_model.predict(train_reshaped, batch_size=batch_size)

        test_reshaped = test_scaled[:,0:-1]
        test_reshaped = test_reshaped.reshape(len(test_reshaped), 1, 1)
        data_log['test_reshaped'] = test_reshaped

        # test
        output = lstm_model.predict(test_reshaped, batch_size=batch_size)
        data_log['output'] = output

        predictions = list()

        # convert prediction result to unscaled data
        for i in range(len(output)):
            yhat = output[i,0]
            X = test_scaled[i, 0:-1]
            yhat = invert_scale(scaler, X, yhat)
            yhat = inverse_difference(raw_values, yhat, len(test_scaled)+1-i)
            predictions.append(yhat)

        # rmse after prediction
        rmse = sqrt(mean_squared_error(raw_values[-300:], predictions))
        print('%d) Test RMSE: %.3f' % (repeatation+1, rmse))
        error_scores.append(rmse)
    pyplot.savefig('Error convergence_' + file_name[36:] + '.png')
    data_log['error_scores'] = error_scores
    data_log['lstm_model_summary'] = lstm_model.summary()
    data_log['predictions'] = predictions

    return error_scores, predictions

In [32]:
def return_scaled_data(repeats, series, epochs, batch_size, neurons, timesteps):
    raw_values = series['Price'].values
    diff_values = difference(raw_values, 1)
    supervised = timeseries_to_supervised(diff_values, timesteps)
    supervised_values = supervised.values[timesteps:,:]
    train, test = supervised_values[0:-300], supervised_values[-300:]
    scaler, train_scaled, test_scaled = scale(train, test)

    return train_scaled, test_scaled

In [33]:
def stationarity_info(series, file_address):
    results = adfuller(series)
    print(file_address[36:])   # google colab address only
    print('ADF Statistic: %f' % results[0])
    print('p-value: %f' % results[1])
    print('Critical Values:')
    values = []
    for key, value in results[4].items():
        print('\t%s: %.3f' % (key, value))
        values.append(value)
    if(results[0] > max(values)):
        print("Non-stationary and most likely random walk.")
    else:
        print("Stationary")
    print("")

In [34]:
if __name__ == "__main__":
    for file_name in source_file:
        series = read_csv(file_name, header=0, parse_dates=[0], index_col=0, squeeze=True)
        repeats = 5
        results = DataFrame()
        epochs = 1000
        batch_size = 4
        neurons = 1
        timesteps = 1

        # get stationarity information
        stationarity_info(series['Price'].values, file_name)

        # main lane
        results[0], predictions = do_trial(repeats, series, epochs, batch_size, neurons, timesteps, file_name)
        print(results.describe())
        results.boxplot()
        pyplot.savefig('boxplot_' + file_name[36:] + '.png')

        plot_1(series, predictions, file_name)
        plot_2(series, predictions, file_name)

ex Historical Data.csv
ADF Statistic: -1.031595
p-value: 0.741567
Critical Values:
	1%: -3.436
	5%: -2.864
	10%: -2.568
Non-stationary and most likely random walk.



HBox(children=(FloatProgress(value=0.0, description='Main Lane', max=5.0, style=ProgressStyle(description_widt…

HBox(children=(FloatProgress(value=0.0, description='Fit LSTM', max=1000.0, style=ProgressStyle(description_wi…


1) Test RMSE: 45.740


HBox(children=(FloatProgress(value=0.0, description='Fit LSTM', max=1000.0, style=ProgressStyle(description_wi…


2) Test RMSE: 45.819


HBox(children=(FloatProgress(value=0.0, description='Fit LSTM', max=1000.0, style=ProgressStyle(description_wi…


3) Test RMSE: 45.766


HBox(children=(FloatProgress(value=0.0, description='Fit LSTM', max=1000.0, style=ProgressStyle(description_wi…


4) Test RMSE: 45.784


HBox(children=(FloatProgress(value=0.0, description='Fit LSTM', max=1000.0, style=ProgressStyle(description_wi…


5) Test RMSE: 45.794

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_4 (LSTM)                (4, 1)                    12        
_________________________________________________________________
dense_4 (Dense)              (4, 1)                    2         
Total params: 14
Trainable params: 14
Non-trainable params: 0
_________________________________________________________________
               0
count   5.000000
mean   45.780485
std     0.029670
min    45.739752
25%    45.765757
50%    45.784376
75%    45.794017
max    45.818522


### Debugging

In [35]:
#pyplot.figure(figsize=(12,7))
#pyplot.plot(data_log['train_rmse_'], color='blue', label='Train RMSE')
#pyplot.plot(data_log['test_rmse_'], color='orange', label='Test RMSE')
#pyplot.savefig('rmse_plot_overtime.png')