In [None]:
import pandas as pd
import numpy as np
from scipy import stats

import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler

from keras.layers import Input, Dense, Dropout
from keras.models import Model, Sequential
from keras import regularizers

In [None]:
# Load data
# NOTE(review): unpickling can execute code embedded in the file, so these
# pickles must come from a trusted source.
stocks_data = pd.read_pickle('data/nasdaq100_6y.pkl')
index_data = pd.read_pickle('data/nasdaq100_index_6y.pkl')

# Working aliases read by the cells below (same objects, nothing is copied)
data_assets, data_index = stocks_data, index_data

# Column labels of the stocks table, kept to label the plots later on
assets_names = data_assets.columns.values

print("Stocks data (time series) shape: {shape}".format(shape=data_assets.shape))
print("Index data (time series) shape: {shape}".format(shape=data_index.shape))

# Preview the first rows of the stocks table as the cell output
data_assets.head()

In [None]:
# Split data
# The first 80% of the rows form the training split and the remaining rows
# the test split; rows keep their original order (no shuffling).
n_train = int(len(data_assets) * 0.8)

# Stocks data: cut the underlying array once, at row n_train
X_train, X_test = np.split(data_assets.values, [n_train])

# Index data: same cut point, so both splits stay row-aligned with the stocks
index_train, index_test = data_index[:n_train], data_index[n_train:]

In [None]:
# Normalize data
# Each scaler learns its per-column min/max from the training split only and
# then applies that same mapping to the test split. Re-fitting on the test
# split would rescale it with statistics taken from the held-out period.
# Consequence: test values outside the training range map outside [0, 1].

# Stocks data
scaler = MinMaxScaler(feature_range=(0, 1))
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Index data
# MinMaxScaler expects a 2-D (n_samples, 1) input. Converting with
# np.asarray(...).reshape(-1, 1) works whether the index is held in a pandas
# object or a NumPy array, and keeps this cell re-runnable.
scaler_index = MinMaxScaler(feature_range=(0, 1))
index_train = scaler_index.fit_transform(np.asarray(index_train).reshape(-1, 1))
index_test = scaler_index.transform(np.asarray(index_test).reshape(-1, 1))

In [None]:
## Autoencoder - Keras

# Network hyperparameters
n_inputs = X_train.shape[1]  # one input/output unit per column of X_train

# Training hyperparameters
epochs = 50
batch_size = 1  # one sample per gradient step

# Define model
# The input tensor is called `input_layer` so that Python's builtin `input()`
# is not shadowed for the rest of the kernel session.
input_layer = Input(shape=(n_inputs,))
# Encoder layer: 4-unit bottleneck. The L1 activity penalty (10e-5 == 1e-4)
# is what makes this a *sparse* autoencoder. No `input_shape` is passed here:
# the shape is already fixed by `input_layer`.
encoded = Dense(4, activation='relu', activity_regularizer=regularizers.l1(10e-5))(input_layer)
# Decoder layer: maps the code back to n_inputs values; sigmoid bounds the
# reconstruction to (0, 1).
decoded = Dense(n_inputs, activation='sigmoid')(encoded)

# Encoder: input -> code (shares its layers with the autoencoder below)
encoder = Model(input_layer, encoded)

# Autoencoder: input -> code -> reconstruction
model = Model(input_layer, decoded)

In [None]:
# Compile autoencoder
# Adam optimizer, minimising the mean squared reconstruction error
model.compile(optimizer='adam', loss='mse')

# Print the layer-by-layer architecture of the compiled model
model.summary()

In [None]:
# Fit the model
# Input and target are both X_train: the network is trained to reproduce its
# own input. `history` keeps the object returned by fit().
fit_options = dict(epochs=epochs, batch_size=batch_size, shuffle=True, verbose=1)
history = model.fit(X_train, X_train, **fit_options)

In [None]:
#from keras.models import model_from_json
#
## Save model
## serialize model to JSON
#model_json = model.to_json()
#with open("models/sparseAE_model.json", "w") as json_file:
#    json_file.write(model_json)
## serialize weights to HDF5
#model.save_weights("models/sparseAE_model.h5")

In [None]:
## Load model
## load json and create model
#json_file = open('models/sparseAE_model.json', 'r')
#loaded_model_json = json_file.read()
#json_file.close()
#model = model_from_json(loaded_model_json)
## load weights into new model
#model.load_weights("models/sparseAE_model.h5")

In [None]:
# Obtain reconstruction of the stocks
X_train_pred, X_test_pred = model.predict(X_train), model.predict(X_test)

# Mean squared reconstruction error per column (axis=0 averages over rows),
# then averaged over all columns for the printed summary figure.
error = ((X_train - X_train_pred) ** 2).mean(axis=0)
print('Training MSE: %.8f' % error.mean())

error_test = ((X_test - X_test_pred) ** 2).mean(axis=0)
print('Testing MSE: %.8f' % error_test.mean())

In [None]:
# Sort stocks by reconstruction error (increasing order)
# `ind` holds the column positions ordered from lowest to highest training
# error; it is reused below to pick and label stocks.
ind = np.argsort(error)
sort_error, sort_assets_names = error[ind], assets_names[ind]

In [None]:
# Barplot
# Horizontal bars for the (up to) 20 stocks with the lowest training
# reconstruction error, in increasing order of error.
n_bars = min(20, len(error))
plt.figure()
# Bars are placed 2 units apart so the tick labels do not crowd each other
plt.barh(2*np.arange(n_bars), error[ind[:n_bars]], tick_label=assets_names[ind[:n_bars]])
plt.xlabel('MSE')
plt.show()
#plt.savefig('images/sparseAE_MSEbar.eps', bbox_inches='tight')

In [None]:
# Plot stock
# Overlay one stock's scaled training series with its autoencoder
# reconstruction. `i` is the rank in the error ordering (0 = lowest error).
i = 0
stock_col = ind[i]
stock_name = assets_names[stock_col]

plt.figure()
for series, suffix in ((X_train, ' Stock'), (X_train_pred, ' AE')):
    plt.plot(series[:, stock_col], label=stock_name + suffix)
#plt.plot(index_train, label='Nasdaq100')
plt.legend()
plt.xlabel('Time (days)')
plt.ylabel('Normalized price')
plt.show()
#plt.savefig('images/sparseAE_' + assets_names[ind[i]] + '.eps', bbox_inches='tight')

In [None]:
# Identify stocks
# Keep the n stocks with the lowest training reconstruction error.
n = 5
selected_cols = ind[:n]

# NOTE(review): the portfolio is assembled from the autoencoder's
# *reconstructions* of the selected stocks, not from their scaled prices —
# confirm this is the intended tracking series.
portfolio_train = X_train_pred[:, selected_cols]
portfolio_test = X_test_pred[:, selected_cols]

# Create portfolio in-sample: equal-weight average across the selected stocks
tracked_index_insample = portfolio_train.mean(axis=1)

# Create portfolio out-sample: same average on the test reconstructions
tracked_index_outofsample = portfolio_test.mean(axis=1)

In [None]:
# In-sample
# Scaled index against the tracking portfolio over the training period.
fig, ax = plt.subplots()
ax.plot(index_train, label='Nasdaq100 Index')
ax.plot(tracked_index_insample, label='Tracked Index')
ax.legend()
ax.set_xlabel('Time (days)')
ax.set_ylabel('Normalized price')
plt.show()
#plt.savefig('images/sparseAE_insample.png', bbox_inches='tight')

In [None]:
# Correlation coefficient (in-sample)
# corrcoef returns the 2x2 correlation matrix of the two series; entry [0, 1]
# is the correlation between the index and the tracking portfolio.
corr_matrix_train = np.corrcoef(index_train.squeeze(), tracked_index_insample)
corr_train = corr_matrix_train[0, 1]
print('Correlation coefficient (in-sample): %.8f' % corr_train)

In [None]:
# Plot tracked index (out-of-sample)
# Scaled index against the tracking portfolio over the test period.
fig, ax = plt.subplots()
ax.plot(index_test, label='Nasdaq100 Index')
ax.plot(tracked_index_outofsample, label='Tracked Index')
ax.legend()
ax.set_xlabel('Time (days)')
ax.set_ylabel('Normalized price')
plt.show()
#plt.savefig('images/sparseAE_outofsample.png', bbox_inches='tight')

In [None]:
# Correlation coefficient (out-of-sample)
# Entry [0, 1] of the 2x2 correlation matrix: index vs. tracking portfolio
# on the test split.
corr_matrix_test = np.corrcoef(index_test.squeeze(), tracked_index_outofsample)
corr_test = corr_matrix_test[0, 1]
print('Correlation coefficient: %.8f' % corr_test)

In [None]:
# Predict code values
# Run the test split through the encoder half only to get the code layer's
# activations.
code_values = encoder.predict(X_test)

# Compute mean over both axes, i.e. a single scalar for the whole code matrix
mean_code = code_values.mean(axis=(0, 1))

print('Mean of code values for sparse AE: %.8f' % mean_code)

In [None]:
# Compute mean for Vanilla AE
# The code values are read from a .npy file that this notebook does not
# create — presumably saved by a separate vanilla-AE run; verify it exists.
code_vanillaAE = np.load('models/vanillaAE_code.npy')

# Average over every element so the result is a single scalar whatever the
# array's dimensionality. A per-column mean (axis=0) of a 2-D code matrix
# would be a vector, which the '%.8f' format below cannot print.
mean_code_vanillaAE = np.mean(code_vanillaAE)

print('Mean of code values for vanilla AE: %.8f' % mean_code_vanillaAE)

In [None]:
import seaborn as sns
from scipy.stats import norm

# Density Plot - Sparse AE code Vs. Vanilla AE code
# For each set of code values only a fitted normal curve is drawn
# (histogram and KDE are both switched off).
# NOTE(review): sns.distplot is deprecated in recent seaborn releases —
# confirm the installed version still provides it.
density_specs = (
    (code_vanillaAE, "green", 'Vanilla AE'),
    (code_values, "red", 'Sparse AE'),
)

plt.figure()
for codes, curve_color, curve_label in density_specs:
    sns.distplot(codes, hist=False, fit=norm, kde=False,
                 fit_kws={"color": curve_color}, label=curve_label)
plt.legend()
plt.show()
#plt.savefig('images/sparseAE_code.png', bbox_inches='tight')