In [1]:
# Widen the notebook container to 80% of the browser window.
# `display` and `HTML` come from the public `IPython.display` module; importing
# them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:80% !important; }</style>"))

In [2]:
import tensorflow as tf
print(tf.__version__)
import keras
print(keras.__version__)

# Enable on-demand GPU memory allocation for every GPU TensorFlow reports.
# Iterating over the list (instead of indexing element 0) also keeps this cell
# from raising IndexError on a machine where no GPU is visible.
physical_devices = tf.config.list_physical_devices('GPU')
for gpu in physical_devices:
    tf.config.experimental.set_memory_growth(gpu, enable=True)
if not physical_devices:
    print('No GPU detected - TensorFlow will run on the CPU.')

2.4.0
2.4.3


In [3]:
# Print the local devices reported by TensorFlow.
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 10388158626482253103
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 8855182400
locality {
  bus_id: 1
  links {
  }
}
incarnation: 2881845996792935534
physical_device_desc: "device: 0, name: GeForce RTX 3080, pci bus id: 0000:01:00.0, compute capability: 8.6"
]


In [4]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import MinMaxScaler

# Show up to 50 columns and 400 rows when a DataFrame is displayed.
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_rows', 400)

In [5]:
# Display names of the companies.
# NOTE(review): this list is not used by the loading loop below, and its order
# is not guaranteed to match the order in which the files are loaded.
stock_list = ['Alior Bank', 'Allegro', 'Asseco', 'CCC', 'CD Projekt', 'Cyfrowy Polsat', 'Dino Polska', 'JSW', 'KGHM', 'Lotos', 'LPP', 'Orange Polska', 'PEKAO', 'PGE', 'PGNiG', 'PKN Orlen', 'PKO BP', 'PZU', 'Santander', 'Tauron']

directory = "WiG20 data/"
stock_data_list = []

# os.listdir() returns names in arbitrary order, so they are sorted to make the
# order of stock_data_list reproducible (later cells hold out the tail of the
# concatenated data, i.e. the stock loaded last). Entries that are not CSV
# files, such as a '.ipynb_checkpoints' folder, are skipped.
for filename in sorted(os.listdir(directory)):
    if not filename.lower().endswith('.csv'):
        continue
    file_path = os.path.join(directory, filename)
    stock_data = pd.read_csv(file_path)
    # Translate the Polish column headers to English.
    stock_data = stock_data.rename(columns={'Data': 'Date', 'Otwarcie': 'Open', 'Najwyzszy': 'Highest', 'Najnizszy': 'Lowest', 'Zamkniecie': 'Close', 'Wolumen': 'Volume'})
    stock_data['Date'] = pd.to_datetime(stock_data.Date)
    stock_data_list.append(stock_data)
    print('Loaded file: ' + filename)

Loaded file: acp_d.csv
Loaded file: ale_d.csv
Loaded file: alr_d.csv
Loaded file: ccc_d.csv
Loaded file: cdr_d.csv
Loaded file: cps_d.csv
Loaded file: dnp_d.csv
Loaded file: jsw_d.csv
Loaded file: kgh_d.csv
Loaded file: lpp_d.csv
Loaded file: lts_d.csv
Loaded file: opl_d.csv
Loaded file: peo_d.csv
Loaded file: pge_d.csv
Loaded file: pgn_d.csv
Loaded file: pkn_d.csv
Loaded file: pko_d.csv
Loaded file: pzu_d.csv
Loaded file: san_d.csv
Loaded file: tpe_d.csv


In [6]:
# Replace each stock's frame with a version that has no rows containing NaN.
for position, frame in enumerate(stock_data_list):
    stock_data_list[position] = frame.dropna()

In [7]:
def calculate_technical_indicators(data, rsi_period=10, so_period=14, so_d_period=4, tema_period=10, cgi_period=20, wpi_period=14):
    """Return a copy of ``data`` extended with technical-indicator columns.

    ``data`` must contain the columns 'Close', 'Highest' and 'Lowest'. The
    input frame itself is not modified.

    Parameters
    ----------
    data : pandas.DataFrame
        Price data, one row per session.
    rsi_period : int
        Span of the exponential averages used for RSI.
    so_period : int
        Number of sessions considered for the stochastic oscillator K.
    so_d_period : int
        Number of sessions in the moving average of K (column 'D').
    tema_period : int
        Span of the exponential averages used for TEMA.
    cgi_period : int
        Number of sessions considered for the Commodity Channel Index
        (the parameter keeps its historical name; the output column is 'CCI').
    wpi_period : int
        Number of sessions considered for Williams' Percent Range.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` with the added columns MovingAverage4/7/20, RSI, ROC,
        K, D, MACD, MACD_Signal, MACD_Histogram, PPO, TEMA, CCI and
        Percent_Range. Leading rows contain NaN wherever a rolling window is
        not yet full. K, PPO and Percent_Range are fractions (not multiplied
        by 100).
    """
    # Work on a copy so the caller's frame does not silently gain columns.
    data = data.copy()
    close, high, low = data['Close'], data['Highest'], data['Lowest']

    # Simple moving averages of the close over 4, 7 and 20 sessions
    data['MovingAverage4'] = close.rolling(4).mean()
    data['MovingAverage7'] = close.rolling(7).mean()
    data['MovingAverage20'] = close.rolling(20).mean()

    # Relative Strength Index RSI
    price_change = close.diff()
    increases = price_change.clip(lower=0)
    decreases = (-price_change).clip(lower=0)
    roll_increase = increases.ewm(span=rsi_period).mean()
    roll_decrease = decreases.ewm(span=rsi_period).mean()
    RS = roll_increase / roll_decrease
    data['RSI'] = 100 - (100 / (1 + RS))

    # Rate of Change ROC (one-session percentage change)
    data['ROC'] = close.pct_change()

    # Stochastic Oscillator K: position of the close inside the low/high range
    # of the last so_period sessions
    lowest_low = low.rolling(so_period).min()
    highest_high = high.rolling(so_period).max()
    data['K'] = (close - lowest_low) / (highest_high - lowest_low)

    # Moving average of the Stochastic Oscillator D
    data['D'] = data['K'].rolling(so_d_period).mean()

    # Moving Average Convergence / Divergence MACD.
    # `span=` is passed by keyword: the first positional argument of ewm() is
    # `com`, which would yield a much longer effective period.
    EMA_26 = close.ewm(span=26, adjust=False).mean()
    EMA_12 = close.ewm(span=12, adjust=False).mean()
    data['MACD'] = EMA_12 - EMA_26

    # MACD Signal Line
    data['MACD_Signal'] = data['MACD'].ewm(span=9, adjust=False).mean()

    # MACD histogram
    data['MACD_Histogram'] = data['MACD'] - data['MACD_Signal']

    # Percentage Price Oscillator PPO
    data['PPO'] = (EMA_12 - EMA_26) / EMA_26

    # Triple Exponential Moving Average TEMA = 3*EMA1 - 3*EMA2 + EMA3
    ema_single = close.ewm(span=tema_period, adjust=False).mean()
    ema_double = ema_single.ewm(span=tema_period, adjust=False).mean()
    ema_triple = ema_double.ewm(span=tema_period, adjust=False).mean()
    data['TEMA'] = 3 * ema_single - 3 * ema_double + ema_triple

    # Commodity Channel Index CCI
    typical_price = (high + low + close) / 3
    MA = typical_price.rolling(cgi_period).mean()
    # Mean absolute deviation of the typical price from the mean of the same window
    mean_deviation = typical_price.rolling(cgi_period).apply(
        lambda window: np.abs(window - window.mean()).mean(), raw=True)
    data['CCI'] = (typical_price - MA) / (0.015 * mean_deviation)

    # Williams' Percent Range
    window_high = high.rolling(wpi_period).max()
    window_low = low.rolling(wpi_period).min()
    data['Percent_Range'] = (window_high - close) / (window_high - window_low)

    return data

In [8]:
def scale_data(data):
    """Min-max scale all columns of ``data`` and fit a second scaler on the price/volume columns.

    Returns a tuple ``(scaled_data, X_scaler, Y_scaler)``:
    ``scaled_data`` is the array produced by fitting ``X_scaler`` on every
    column of ``data`` (one array column per frame column, in frame order);
    ``Y_scaler`` is fitted only on the columns whose name is one of the
    price/volume names listed below, so it can later invert predictions of
    those columns.
    """
    output_names = ('Open', 'Close', 'Highest', 'Lowest', 'Volume', 'Otwarcie', 'Najwyzszy', 'Najnizszy', 'Zamkniecie', 'Wolumen')

    feature_series = [data[name] for name in data.columns]
    output_series = [data[name] for name in data.columns if name in output_names]

    # Transposing the list of columns gives one row per frame row.
    dataset = np.transpose(feature_series)
    output_dataset = np.transpose(output_series)

    X_scaler = MinMaxScaler(feature_range=(0,1))
    scaled_data = X_scaler.fit_transform(dataset)

    # Only the fitted state of the output scaler is needed, not the transformed values.
    Y_scaler = MinMaxScaler(feature_range=(0,1))
    Y_scaler.fit(output_dataset)

    return scaled_data, X_scaler, Y_scaler

In [9]:
def prepare_input_and_output(data, number_of_sessions=60, number_of_outputs=5):
    """Cut a 2-D array into sliding input windows and their next-row targets.

    Parameters
    ----------
    data : 2-D array (rows x columns)
        Source array; rows are taken in order.
    number_of_sessions : int
        Number of consecutive previous rows that form one input window.
    number_of_outputs : int
        Number of leading columns of the row following each window that form
        the target. Defaults to 5, the value that was previously hard-coded.

    Returns
    -------
    (X, Y) : tuple of lists
        ``X[k]`` is ``data[k:k + number_of_sessions, :]`` and ``Y[k]`` is
        ``data[k + number_of_sessions, :number_of_outputs]``. Both lists are
        empty when ``data`` has no more than ``number_of_sessions`` rows.

    Raises
    ------
    ValueError
        If ``number_of_sessions`` is smaller than 1.
    """
    if number_of_sessions < 1:
        raise ValueError('number_of_sessions must be at least 1')

    X = []
    Y = []
    for i in range(number_of_sessions, data.shape[0]):
        X.append(data[i-number_of_sessions:i, :])
        Y.append(data[i, :number_of_outputs])
    return X, Y

In [10]:
# The results are evaluated on closing prices, which give the most information about the usefulness of the model

def evaluate_results(X_valid, Y_valid, Y_scalers, predictions, scaler_index=-1):
    """Un-scale predictions and targets, print error metrics for the close price and return both side by side.

    Parameters
    ----------
    X_valid : unused
        Kept only so existing calls keep working.
    Y_valid : array of shape (n, 5)
        Scaled real outputs of the validation samples.
    Y_scalers : sequence of fitted scalers
        One output scaler per stock, in the order the stocks were concatenated.
    predictions : array of shape (n, 5)
        Scaled model outputs for the same samples.
    scaler_index : int
        Position in ``Y_scalers`` of the scaler used to invert the scaling.
        Defaults to -1 (the last stock) because the notebook holds out the
        tail of the concatenated, unshuffled samples for validation, and that
        tail belongs to the stock processed last. The first scaler was used
        before, which belongs to a different stock.

    Returns
    -------
    pandas.DataFrame
        Columns ``<name>_real`` and ``<name>_predicted`` for the five outputs
        plus ``Close_difference`` and ``Close_difference_percent``.
    """
    # NOTE(review): these labels are assigned by position. They are only right
    # if the first five columns of the scaled data are in exactly this order;
    # confirm against the column order of the source CSV files.
    output_names = ['Open', 'Close', 'Highest', 'Lowest', 'Volume']

    Y_scaler = Y_scalers[scaler_index]
    predictions = Y_scaler.inverse_transform(predictions)
    real_output = Y_scaler.inverse_transform(Y_valid)

    predicted_data = pd.DataFrame(predictions, columns=[name + '_predicted' for name in output_names])
    real_data = pd.DataFrame(real_output, columns=[name + '_real' for name in output_names])
    predictions = pd.concat([real_data, predicted_data], axis=1)
    print(predictions[['Close_real', 'Close_predicted']].tail(20))

    predictions['Close_difference'] = abs(predictions['Close_real'] - predictions['Close_predicted'])
    predictions['Close_difference_percent'] = predictions['Close_difference'] / predictions['Close_real'] * 100

    # Naive baseline: the forecast for a session is the previous session's close.
    # shift(1) moves values forward by one row; shift(-1) would use the *next*
    # close as the "forecast". The first row has no previous close and is
    # ignored by mean().
    previous_close = predictions['Close_real'].shift(1)
    naive_difference = abs(predictions['Close_real'] - previous_close)
    Naive_forcast_MAPE = (naive_difference / predictions['Close_real'] * 100).mean()
    predictions_MAPE = predictions['Close_difference_percent'].mean()

    print('Naive forcast MAE: ' + str(round(naive_difference.mean(),2)))
    print('Predictions MAE: ' + str(round(predictions['Close_difference'].mean(),2)))
    print('Naive forcast MAPE: ' + str(round(Naive_forcast_MAPE,2)) + '%')
    print('Predictions MAPE: ' + str(round(predictions_MAPE,2)) + '%')

    return predictions

# Learning based on prices and volume only

In [11]:
# Scale every stock on its own; the scalers are stored at the same list
# position as the scaled data they belong to.
scaled_stocks_basic = []
X_scalers_basic = []
Y_scalers_basic = []

for stock_frame in stock_data_list:
    # The Date column is not a model input, so it is removed before scaling.
    scaled_stock, X_scaler, Y_scaler = scale_data(stock_frame.drop(columns='Date'))

    scaled_stocks_basic.append(scaled_stock)
    X_scalers_basic.append(X_scaler)
    Y_scalers_basic.append(Y_scaler)

In [12]:
# Per-stock window arrays are kept separately, and all windows are also
# concatenated (in stock order) into one training set.
X_all_basic = []
Y_all_basic = []
scaled_X_list_basic = []
scaled_Y_list_basic = []

for stock in scaled_stocks_basic:
    X, Y = prepare_input_and_output(stock, 60)
    scaled_X_list_basic.append(np.array(X))
    scaled_Y_list_basic.append(np.array(Y))
    X_all_basic.extend(X)
    Y_all_basic.extend(Y)

X_all_basic = np.array(X_all_basic)
Y_all_basic = np.array(Y_all_basic)
print(f'Shape of input matrix: {X_all_basic.shape}')
print(f'Shape of output matrix: {Y_all_basic.shape}')

Shape of input matrix: (76328, 60, 5)
Shape of output matrix: (76328, 5)


In [13]:
from sklearn.model_selection import train_test_split

# Hold out exactly the windows of the stock processed last. With shuffle=False
# the validation set is the tail of the concatenated samples, so deriving the
# size from the data (instead of the hard-coded 2667) keeps the split aligned
# with a single stock even when the input files change.
validation_size_basic = len(scaled_Y_list_basic[-1])
X_train_basic, X_valid_basic, Y_train_basic, Y_valid_basic = train_test_split(X_all_basic, Y_all_basic, test_size=validation_size_basic, shuffle=False)

,

''

In [14]:
from tensorflow.keras import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
import matplotlib.pyplot as plt

In [15]:
# One LSTM layer followed by a ReLU dense layer and a linear output layer with
# one unit per predicted column.
model_basic = Sequential([
    LSTM(64, input_shape=[X_train_basic.shape[1], X_train_basic.shape[2]]),
    Dense(100, activation='relu'),
    Dense(Y_train_basic.shape[1]),
])

model_basic.summary()

model_basic.compile(loss='mse', optimizer='adam')

history = model_basic.fit(X_train_basic, Y_train_basic, epochs=15, validation_data=(X_valid_basic, Y_valid_basic))

# Training loss and validation loss per epoch
for history_key, curve_label in (('loss', 'train'), ('val_loss', 'test')):
    plt.plot(history.history[history_key], label=curve_label)
plt.legend()
plt.show()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 64)                17920     
_________________________________________________________________
dense (Dense)                (None, 100)               6500      
_________________________________________________________________
dense_1 (Dense)              (None, 5)                 505       
Total params: 24,925
Trainable params: 24,925
Non-trainable params: 0
_________________________________________________________________
Epoch 1/15


UnknownError:    Fail to find the dnn implementation.
	 [[{{node CudnnRNN}}]]
	 [[sequential/lstm/PartitionedCall]] [Op:__inference_train_function_2960]

Function call stack:
train_function -> train_function -> train_function


In [None]:
# Model outputs (still scaled) for the validation windows.
predictions_basic = model_basic.predict(X_valid_basic)

# evaluate_results prints the error metrics and returns a DataFrame; note that
# the name predictions_basic is rebound from the raw array to that DataFrame.
predictions_basic = evaluate_results(X_valid_basic, Y_valid_basic, Y_scalers_basic, predictions_basic) # returns dataset

In [None]:
# Real versus predicted closing price over the validation sessions.
fig, ax = plt.subplots(figsize=(25,10))
ax.set_title('Porównanie rzeczywistych i przewidywanych cen Tauron SA', fontsize=50)
ax.set_xlabel('Nr sesji', fontsize=30)
ax.set_ylabel('Cena zamknięcia (zł)', fontsize=30)
for column in ('Close_real', 'Close_predicted'):
    ax.plot(predictions_basic[column])
ax.legend(['Cena rzeczywista',  'Cena przewidywana'], loc='lower right', fontsize=30)
plt.show()

# Learning based on prices, volume and technical indicators

In [None]:
# Import the display *function*: `from IPython import display` would rebind the
# name `display` to the module and break any later bare display(...) call.
from IPython.display import display
stock_data_list_ta = []

for stock_frame in stock_data_list:
    # A copy is passed so the raw frames in stock_data_list keep their original
    # columns and this cell can be re-run safely.
    technical_indicators = calculate_technical_indicators(stock_frame.copy(), rsi_period=60, so_period=5, so_d_period=3, tema_period=4, cgi_period=4, wpi_period=4)
    # moving averages return NaN when the considered period is greater than available data, these rows need to be dropped
    stock_data_list_ta.append(technical_indicators.dropna())

display(stock_data_list_ta[0].head(100))

In [None]:
# Scale every indicator-enriched stock on its own; scalers share the list
# position of the scaled data they belong to.
scaled_stocks_ta = []
X_scalers_ta = []
Y_scalers_ta = []

for stock_frame_ta in stock_data_list_ta:
    # The Date column is not a model input, so it is removed before scaling.
    scaled_stock_ta, X_scaler_ta, Y_scaler_ta = scale_data(stock_frame_ta.drop(columns='Date'))
    scaled_stocks_ta.append(scaled_stock_ta)
    X_scalers_ta.append(X_scaler_ta)
    Y_scalers_ta.append(Y_scaler_ta)

In [None]:
# Per-stock window arrays are kept separately, and all windows are also
# concatenated (in stock order) into one training set.
X_all_ta = []
Y_all_ta = []
scaled_X_list_ta = []
scaled_Y_list_ta = []

for stock in scaled_stocks_ta:
    X, Y = prepare_input_and_output(stock)
    scaled_X_list_ta.append(np.array(X))
    scaled_Y_list_ta.append(np.array(Y))
    X_all_ta.extend(X)
    Y_all_ta.extend(Y)

X_all_ta = np.array(X_all_ta)
Y_all_ta = np.array(Y_all_ta)
print(f'Shape of input matrix: {X_all_ta.shape}')
print(f'Shape of output matrix: {Y_all_ta.shape}')

In [None]:
from sklearn.model_selection import train_test_split

# Hold out exactly the windows of the stock processed last. Rows with NaN
# indicators were dropped from every stock, so the per-stock window counts
# differ from the price-only data set; a hard-coded size would make the
# validation tail spill into the preceding stock, which was scaled with a
# different scaler.
validation_size_ta = len(scaled_Y_list_ta[-1])
X_train_ta, X_valid_ta, Y_train_ta, Y_valid_ta = train_test_split(X_all_ta, Y_all_ta, test_size=validation_size_ta, shuffle=False)

print('Shape of training input matrix: ' + str(X_train_ta.shape))
print('Shape of training output matrix: ' + str(Y_train_ta.shape))
print('Shape of validation input matrix: ' + str(X_valid_ta.shape))
print('Shape of validation output matrix: ' + str(Y_valid_ta.shape))

In [None]:
from tensorflow.keras import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
import matplotlib.pyplot as plt

In [None]:
# One LSTM layer followed by a ReLU dense layer and a linear output layer with
# one unit per predicted column.
model_ta = Sequential([
    LSTM(64, input_shape=[X_train_ta.shape[1], X_train_ta.shape[2]]),
    Dense(100, activation='relu'),
    Dense(Y_train_ta.shape[1]),
])

model_ta.summary()

model_ta.compile(loss='mse', optimizer='adam')

history_ta = model_ta.fit(X_train_ta, Y_train_ta, epochs=15, validation_data=(X_valid_ta, Y_valid_ta))

# Training loss and validation loss per epoch
for history_key, curve_label in (('loss', 'train'), ('val_loss', 'test')):
    plt.plot(history_ta.history[history_key], label=curve_label)
plt.legend()
plt.show()

In [None]:
# Model outputs (still scaled) for the validation windows.
predictions_ta = model_ta.predict(X_valid_ta)

# evaluate_results prints the error metrics and returns a DataFrame; the name
# predictions_ta is rebound from the raw array to that DataFrame.
predictions_ta = evaluate_results(X_valid_ta, Y_valid_ta, Y_scalers_ta, predictions_ta)

In [None]:
# Real versus predicted closing price over the validation sessions
# (model trained with technical indicators).
fig, ax = plt.subplots(figsize=(25,10))
ax.set_title('Porównanie rzeczywistych i przewidywanych z użyciem \nwskaźników technicznych cen Tauron SA', fontsize=50)
ax.set_xlabel('Nr sesji', fontsize=30)
ax.set_ylabel('Cena zamknięcia (zł)', fontsize=30)
for column in ('Close_real', 'Close_predicted'):
    ax.plot(predictions_ta[column])
ax.legend(['Cena rzeczywista',  'Cena przewidywana'], loc='lower right', fontsize=30)
plt.show()

# Learning based on price, volume and fundamental indicators

In [None]:
# Read the combined data set from CSV and preview its first rows.
fundamental_data = pd.read_csv('Testing_data/WiG20_full_data.csv')

fundamental_data.head()

**First, we use only the stocks that have every fundamental indicator. Some indicators, such as Wartość księgowa Grahama (one among many), do not apply to the stocks 'ALR', 'PEO', 'PGN', 'PKO' and 'PZU', so we drop those stocks (the code below also leaves out 'SPL').**

In [None]:
scaled_stocks_fa = []
X_scalers_fa = []
Y_scalers_fa = []

# Stocks left out of the fundamental-indicator experiment.
excluded_stocks = {'ALR', 'PEO', 'PGN', 'PKO', 'PZU', 'SPL'}
# Stock held out for validation. The split made later takes the tail of the
# concatenated samples, so this stock has to be processed last.
validation_stock = 'TPE'

# Iterate in order of first appearance in the file. Converting a set to a list
# (as done before) gives an order that can change between kernel restarts,
# which made the validation stock and its scaler unpredictable.
available_stocks = [stock for stock in fundamental_data['Spółka'].unique() if stock not in excluded_stocks]
stocks_fa = [stock for stock in available_stocks if stock != validation_stock]
if validation_stock in available_stocks:
    stocks_fa.append(validation_stock)

for stock in stocks_fa:
    # Date, stock and quarters is dropped as it isn't considered for learning
    scaled_stock, X_scaler, Y_scaler = scale_data(fundamental_data.loc[fundamental_data['Spółka'] == stock].drop(columns=['Spółka', 'Data', 'Kwartały']))

    scaled_stocks_fa.append(scaled_stock)
    X_scalers_fa.append(X_scaler)
    Y_scalers_fa.append(Y_scaler)

In [None]:
# Show the distinct values of the 'Spółka' column.
fundamental_data['Spółka'].unique()

In [None]:
# Per-stock window arrays are kept separately, and all windows are also
# concatenated (in stock order) into one training set.
X_all_fa = []
Y_all_fa = []
scaled_X_list_fa = []
scaled_Y_list_fa = []

for stock in scaled_stocks_fa:
    X, Y = prepare_input_and_output(stock, 60)
    scaled_X_list_fa.append(np.array(X))
    scaled_Y_list_fa.append(np.array(Y))
    X_all_fa.extend(X)
    Y_all_fa.extend(Y)

X_all_fa = np.array(X_all_fa)
Y_all_fa = np.array(Y_all_fa)
print(f'Shape of input matrix: {X_all_fa.shape}')
print(f'Shape of output matrix: {Y_all_fa.shape}')

In [None]:
# Number of raw rows whose 'Spółka' value is 'TPE'.
# NOTE(review): this counts rows, not windows - prepare_input_and_output yields
# one sample per row *after* the first number_of_sessions rows, so the number
# of TPE samples is smaller than this value.
TPE_records = fundamental_data.loc[fundamental_data['Spółka'] == 'TPE'].shape[0]
print('TPE records: ' + str(TPE_records))

In [None]:

# One stock's data is used for validation: the stock processed last in
# scaled_stocks_fa (intended to be TPE). The hold-out size is the number of
# *windows* built for that stock. The raw record count is larger by the window
# length, so using it pulled samples of the preceding stock into the
# validation set.
validation_size_fa = len(scaled_Y_list_fa[-1])
X_train_fa, X_valid_fa, Y_train_fa, Y_valid_fa = train_test_split(X_all_fa, Y_all_fa, test_size=validation_size_fa, shuffle=False)

print('Shape of training input matrix: ' + str(X_train_fa.shape))
print('Shape of training output matrix: ' + str(Y_train_fa.shape))
print('Shape of validation input matrix: ' + str(X_valid_fa.shape))
print('Shape of validation output matrix: ' + str(Y_valid_fa.shape))

In [None]:
# One LSTM layer followed by a ReLU dense layer and a linear output layer with
# one unit per predicted column.
model_fa = Sequential([
    LSTM(64, input_shape=[X_train_fa.shape[1], X_train_fa.shape[2]]),
    Dense(100, activation='relu'),
    Dense(Y_train_fa.shape[1]),
])

model_fa.summary()

model_fa.compile(loss='mse', optimizer='adam')

history_fa = model_fa.fit(X_train_fa, Y_train_fa, epochs=15, validation_data=(X_valid_fa, Y_valid_fa))

# Training loss and validation loss per epoch
for history_key, curve_label in (('loss', 'train'), ('val_loss', 'test')):
    plt.plot(history_fa.history[history_key], label=curve_label)
plt.legend()
plt.show()

In [None]:
# Model outputs (still scaled) for the validation windows.
predictions_fa = model_fa.predict(X_valid_fa)

# evaluate_results prints the error metrics and returns a DataFrame; the name
# predictions_fa is rebound from the raw array to that DataFrame.
predictions_fa = evaluate_results(X_valid_fa, Y_valid_fa, Y_scalers_fa, predictions_fa)

In [None]:
# Real versus predicted closing price over the validation sessions
# (model trained with fundamental indicators).
fig, ax = plt.subplots(figsize=(25,10))
ax.set_title('Porównanie rzeczywistych i przewidywanych z użyciem \nwskaźników fundamentalnych cen Tauron SA', fontsize=50)
ax.set_xlabel('Nr sesji', fontsize=30)
ax.set_ylabel('Cena zamknięcia (zł)', fontsize=30)
for column in ('Close_real', 'Close_predicted'):
    ax.plot(predictions_fa[column])
ax.legend(['Cena rzeczywista',  'Cena przewidywana'], loc='lower right', fontsize=30)
plt.show()