In [1]:
#Here we attempt to replicate research findings from Persio and Honchar from "Artificial Neural Networks Approach to the Forecast of Stock Market Price Movements" (2016)
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None)
import os
import yfinance as yf
from sklearn.preprocessing import StandardScaler

In [66]:
#Data Ingestion
look_back = 30
all_data = pd.DataFrame()

#Replicate paper's data: ~ 16k data points from S&P 500 index
stock_data = yf.download('^GSPC', start='1950-01-01', end='2016-12-31')
stock_data.columns = stock_data.columns.droplevel(1)
stock_data.columns.name = None

prices = stock_data
prices['Date'] = prices.index
prices = prices.dropna()

#Cast numeric data to type float
prices[['Open', 'High', 'Low', 'Close']] = prices[['Open', 'High', 'Low', 'Close']].astype(float)
prices['Volume'] = prices['Volume'].astype(int)

#Create percentage change column for each col to normalize data.
prices[['Open_pc', 'High_pc', 'Low_pc', 'Close_pc', 'Volume_pc']] = prices[['Open', 'High', 'Low', 'Close', 'Volume']].pct_change()

#Take the previous 30 days of price data (only for close for this paper)
for num in range(0,look_back):
  col_name = 'Close_pc' + '_' + str(num)
  prices[col_name] = prices['Close_pc'].shift(num+1)


#Keep cols on date, movement class, previous -lookback period- days
cols_to_keep = []
cols_to_keep = ['Date'] + [col for col in prices.columns if 'Close_pc' in col]
prices_pattern = prices[cols_to_keep]
all_data = pd.concat([all_data, prices_pattern], ignore_index=True)

all_data = all_data.dropna()
all_data['Movement_Class'] = np.where(all_data['Close_pc']>0, 'Positive', 'Negative')
print(len(all_data))
print(all_data.head(10))

  stock_data = yf.download('^GSPC', start='1950-01-01', end='2016-12-31')
[*********************100%***********************]  1 of 1 completed


16828
         Date  Close_pc  Close_pc_0  Close_pc_1  Close_pc_2  Close_pc_3  \
31 1950-02-16 -0.004103    0.000000   -0.010441   -0.002315    0.004067   
32 1950-02-17  0.009417   -0.004103    0.000000   -0.010441   -0.002315   
33 1950-02-20  0.002916    0.009417   -0.004103    0.000000   -0.010441   
34 1950-02-21 -0.001744    0.002916    0.009417   -0.004103    0.000000   
35 1950-02-23  0.002330   -0.001744    0.002916    0.009417   -0.004103   
36 1950-02-24  0.004067    0.002330   -0.001744    0.002916    0.009417   
37 1950-02-27  0.000000    0.004067    0.002330   -0.001744    0.002916   
38 1950-02-28 -0.003472    0.000000    0.004067    0.002330   -0.001744   
39 1950-03-01  0.001161   -0.003472    0.000000    0.004067    0.002330   
40 1950-03-02 -0.000580    0.001161   -0.003472    0.000000    0.004067   

    Close_pc_4  Close_pc_5  Close_pc_6  Close_pc_7  Close_pc_8  Close_pc_9  \
31   -0.001161   -0.005196    0.001735    0.003482    0.010557    0.000000   
32    0.0040

In [69]:
#MLP
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input, Flatten, Dropout
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import regularizers
from sklearn.preprocessing import StandardScaler

#Normalize data - zero mean, unit variance
scaler = StandardScaler()
features_scaled = scaler.fit_transform(all_data.iloc[:,1:31][::-1])
x = features_scaled

#Encode label axes
y_raw = all_data['Movement_Class'].values
encoder = LabelEncoder()
y_encoded = encoder.fit_transform(y_raw)
y = to_categorical(y_encoded)

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=99, stratify=y)
#MLP model:2 hidden layers with 500, 250 neurons respectively, with a dropout after first layer
model_MLP = Sequential()
model_MLP.add(Input(shape=(30,)))

model_MLP.add(Dense(500, activation = 'relu' ))
model_MLP.add(Dropout(0.5))
model_MLP.add(Dense(250, activation = 'relu'))
model_MLP.add(Dense(2, activation='softmax'))


model_MLP.compile(optimizer='Adam', loss='categorical_crossentropy', metrics=['accuracy'])
model_MLP.fit(X_train, y_train, epochs=10, batch_size=32)

# Evaluate on test data
test_loss, test_accuracy = model_MLP.evaluate(X_test, y_test)
print(f"Test Loss: {test_loss}, Test Accuracy: {test_accuracy}")


Epoch 1/10
[1m421/421[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step - accuracy: 0.5123 - loss: 0.7172
Epoch 2/10
[1m421/421[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.5339 - loss: 0.6895
Epoch 3/10
[1m421/421[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - accuracy: 0.5393 - loss: 0.6866
Epoch 4/10
[1m421/421[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - accuracy: 0.5528 - loss: 0.6835
Epoch 5/10
[1m421/421[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 7ms/step - accuracy: 0.5677 - loss: 0.6786
Epoch 6/10
[1m421/421[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4ms/step - accuracy: 0.5684 - loss: 0.6758
Epoch 7/10
[1m421/421[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.5769 - loss: 0.6735
Epoch 8/10
[1m421/421[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.5833 - loss: 0.6685
Epoch 9/10
[1m421/421[0m [32m━━━━━━━━

In [70]:
#MLP eval
from sklearn.metrics import classification_report

y_test_true_labels = np.argmax(y_test, axis=1)
y_pred = model_MLP.predict(X_test)
y_pred_classes = y_pred.argmax(axis=1)

print(classification_report(y_test_true_labels, y_pred_classes))
print("0 = Price decrease, 1 = Price increase")

[1m106/106[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
              precision    recall  f1-score   support

           0       0.49      0.41      0.45      1586
           1       0.54      0.62      0.58      1780

    accuracy                           0.52      3366
   macro avg       0.52      0.52      0.51      3366
weighted avg       0.52      0.52      0.52      3366

0 = Price decrease, 1 = Price increase


In [73]:
#CNN
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout, Input

#CNN Model:
model_CNN = Sequential()
model_CNN.add(Input(shape=(30,1)))
# 1st conv layer
model_CNN.add(Conv1D(filters=64, kernel_size=3, strides=1, activation='relu'))
model_CNN.add(MaxPooling1D(pool_size=2))
# Second Conv Layer
model_CNN.add(Conv1D(filters=64, kernel_size=3, strides=1, activation='relu'))
model_CNN.add(MaxPooling1D(pool_size=2))
model_CNN.add(Dropout(0.1))
#MLP layer - 2 layers with dropout after the first
model_CNN.add(Flatten())
model_CNN.add(Dense(100, activation='relu'))
model_CNN.add(Dropout(0.3))
model_CNN.add(Dense(50, activation='relu'))
model_CNN.add(Dense(2, activation='softmax'))

model_CNN.compile(optimizer='Adam', loss='categorical_crossentropy', metrics=['accuracy'])
model_CNN.fit(X_train, y_train, epochs=10, batch_size=32)

loss_cnn, acc_cnn = model_CNN.evaluate(X_test, y_test)
print(f"Test Loss: {loss_cnn:.4f}, Test Accuracy: {acc_cnn:.4f}")

Epoch 1/10
[1m421/421[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 6ms/step - accuracy: 0.5042 - loss: 0.6955
Epoch 2/10
[1m421/421[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 6ms/step - accuracy: 0.5267 - loss: 0.6922
Epoch 3/10
[1m421/421[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 8ms/step - accuracy: 0.5294 - loss: 0.6915
Epoch 4/10
[1m421/421[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 5ms/step - accuracy: 0.5303 - loss: 0.6913
Epoch 5/10
[1m421/421[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step - accuracy: 0.5269 - loss: 0.6910
Epoch 6/10
[1m421/421[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - accuracy: 0.5261 - loss: 0.6912
Epoch 7/10
[1m421/421[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 8ms/step - accuracy: 0.5305 - loss: 0.6900
Epoch 8/10
[1m421/421[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - accuracy: 0.5320 - loss: 0.6893
Epoch 9/10
[1m421/421[0m [32m━━━━━━━━

In [72]:
#CNN eval
y_pred_CNN = model_CNN.predict(X_test)
y_pred_classes_CNN = y_pred_CNN.argmax(axis=1)

print(classification_report(y_test_true_labels, y_pred_classes_CNN))
print("0 = Price decrease, 1 = Price increase")

[1m106/106[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step
              precision    recall  f1-score   support

           0       0.43      0.04      0.08      1586
           1       0.53      0.95      0.68      1780

    accuracy                           0.52      3366
   macro avg       0.48      0.50      0.38      3366
weighted avg       0.48      0.52      0.39      3366

0 = Price decrease, 1 = Price increase
