In [None]:
# import Libraries 

import numpy as np
import pandas as pd
import hvplot.pandas

# set randomiser seed
from numpy.random import seed
seed(1)
from tensorflow import random
random.set_seed(2)

from sklearn.preprocessing import MinMaxScaler


from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout


In [35]:
ticker_list = ['XLE', 'XLF', 'XLU', 'XLI', 'GDX', 'XLK', 'XLV', 'XLY', 'XLP', 'XLB', 'XOP', 'IYR', 'XHB', 'ITB', 'VNQ', 'GDXJ', 'IYE', 'OIH', 'XME', 'XRT', 'SMH', 'IBB', 'KBE', 'KRE', 'XTL']

In [None]:
ticker_list = ['XLE', 'XLF', 'XLU', 'XLI', 'GDX', 'XLK', 'XLV', 'XLY', 'XLP', 'XLB', 'XOP', 'IYR', 'XHB', 'ITB', 'VNQ', 'GDXJ', 'IYE', 'OIH', 'XME', 'XRT', 'SMH', 'IBB', 'KBE', 'KRE', 'XTL']



# Load CSV ticker data (this is an alternate to the API to keep data consistant)

df_ticker_data = pd.read_csv("../Data/EFT_optimised_signals.csv",
    index_col =[0],
    parse_dates = True,
    header = [0,1]
)


# Replace 'buy' and 'sell' with scalar '1' and '-1' 
df_ticker_data.replace(['buy', 'sell'],[1, -1] ,inplace=True)

df_ticker_data.dropna(inplace = True)

# Check df
df_ticker_data.head()

In [None]:
# this function inputs a data frame, window size and target column name
# this returns a rolling nested list of the features, and a list of the targets

def window_data_chunk(df, window, target_col_name):
    X = []
    y = []
    for i in range(len(df) - window - 1):
        
        df_features = df.drop(axis = 'columns', labels = target_col_name)
        features = df_features.iloc[i:(i + window), :]
        target = df[target_col_name].iloc[i + window]
        X.append(features)
        y.append(target)
        
    return np.array(X), np.array(y).reshape(-1, 1)

In [None]:
Out = X.reshape(2,-1) 

# X[1][1] + X[1][2]


test_train = X.reshape((X.shape[0], X.shape[1]*X.shape[2]))




In [None]:
test_train.shape

In [None]:
test_train[0][170]

In [None]:
split = int(0.7 * len(X))
X_train = X[: split]
X_test = X[split:]
y_train = y[: split]
y_test = y[split:]

In [None]:

# get ticker data
df_input_data = df_ticker_data['XLE']


X, y = window_data_chunk(df_input_data, 20, 'signal')

# X = df_input_data.drop(axis = 'columns', labels = 'signal')
# y = df_input_data['signal']


# Test and train Split

split = int(0.7 * len(X))
X_train = X[: split]
X_test = X[split:]
y_train = y[: split]
y_test = y[split:]


X.reshape((X.shape[0], X.shape[1] * X.shape[2], 1))

X_train = X_train.reshape((X_train.shape[0], X_train.shape[1]*X_train.shape[2]))
X_test  = X_test.reshape((X_test.shape[0], X_test.shape[1] * X_test.shape[2]))


# Create the MinMaxScaler() instance
scaler = MinMaxScaler()

# Fit the MinMaxScaler object with the training feature data X_train
scaler.fit(X_train)

# Scale the features training and testing sets
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)


# Reshape the features for the model
X_train = X_train.reshape((X_train.shape[0], X_train.shape[1], 1))
X_test = X_test.reshape((X_test.shape[0], X_test.shape[1], 1))



# Train the model

model = Sequential()

number_units = 5
dropout_fraction = 0.2

# Layer 1
model.add(LSTM(units=number_units, return_sequences=True, input_shape=(X_train.shape[1], 1)))
model.add(Dropout(dropout_fraction))
# Layer 2
model.add(LSTM(units=number_units, return_sequences=True))
model.add(Dropout(dropout_fraction))
# Layer 3
model.add(LSTM(units=number_units))
model.add(Dropout(dropout_fraction))
# Output layer
model.add(Dense(1))

model.compile(optimizer="adam", loss="mean_squared_error")




In [None]:
X_train.shape

In [None]:
model.summary()

In [None]:
model.fit(X_train, y_train, epochs=2, shuffle=False, batch_size=1, verbose=1)

## Evaluation

In [None]:
model.evaluate(X_test, y_test)

In [None]:
predicted = model.predict(X_test)

In [None]:
predicted

In [None]:
# Create a DataFrame of Real and Predicted values
df_results = pd.DataFrame({
    "Real": y_test.ravel(),
    "Predicted": predicted.ravel()
}, index = df_input_data.index[-len(y_test): ])
df_results.head(50)

In [None]:
df_results.plot()

In [None]:
df_test = df_ticker_data['XLE'].copy




In [None]:
df_test.iloc[5 : (5 + 5), : ]

df_test['signal'].iloc[(5 + 5), ]




In [None]:
df_test['signal'].iloc[(5 + 5)]

In [None]:
df_test.rolling(10).agg(list)

In [None]:
!pip install scikeras

In [None]:
# https://machinelearningmastery.com/binary-classification-tutorial-with-the-keras-deep-learning-library/

# Binary Classification with Sonar Dataset: Standardized Larger
import pandas as pd
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from scikeras.wrappers import KerasClassifier # this is new libray
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline


In [None]:

df_ticker_data = pd.read_csv("../Data/EFT_optimised_signals.csv",
    index_col =[0],
    parse_dates = True,
    header = [0,1]
)


# Replace 'buy' and 'sell' with scalar '1' and '-1' 
df_input_data = df_ticker_data.replace(['buy', 'sell'],[1, -1])
df_input_data = df_input_data['XTL']

df_input_data.dropna(inplace = True)







# load dataset
# dataframe = read_csv("sonar.csv", header=None)
# dataset = dataframe.values

# # split into input (X) and output (Y) variables
# X = dataset[:,0:60].astype(float)
# Y = dataset[:,60]

X = df_input_data.drop(axis = 'columns', labels = 'signal')
y = df_input_data['signal']



# encode class values as integers
encoder = LabelEncoder()
encoder.fit(y)
encoded_Y = encoder.transform(y)

# larger model
def create_larger():
    # create model
    model = Sequential()
    model.add(Dense(60, input_shape=(9,), activation='relu'))
    model.add(Dense(30, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    
    # Compile model
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

estimators = []
estimators.append(('standardize', StandardScaler()))
estimators.append(('mlp', KerasClassifier(model=create_larger, epochs=100, batch_size=5, verbose=0)))
pipeline = Pipeline(estimators)
kfold = StratifiedKFold(n_splits=10, shuffle=True)
results = cross_val_score(pipeline, X, encoded_Y, cv=kfold)
print("Larger: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

In [None]:
predicted = KerasClassifier.predict(X)

In [None]:
X

In [110]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from imblearn.ensemble import BalancedRandomForestClassifier
from imblearn.ensemble import EasyEnsembleClassifier
from imblearn.metrics import classification_report_imbalanced


from sklearn.preprocessing import StandardScaler


In [111]:
# Load the data
# Load the ticker data with returns optimised signals and engineered features

file_path = Path('../Data/ETF_data_including_engineered_features_and_signals.csv')
df_ticker_data = pd.read_csv(file_path,
                index_col =[0],
                parse_dates = True,
                header = [0,1]
)

# Clean the Data
df_ticker_data.dropna(inplace = True)



# Preview the data
df_ticker_data

Unnamed: 0_level_0,GDX,GDX,GDX,GDX,GDX,GDX,GDX,GDX,GDX,GDX,...,XTL,XTL,XTL,XTL,XTL,XTL,XTL,XTL,XTL,XTL
Unnamed: 0_level_1,ATR,CCI,Close,Dividends,EMA_long,EMA_short,High,Low,MACD_FAST_PERIOD,MACD_SIGNAL_PERIOD,...,High,Low,MACD_FAST_PERIOD,MACD_SIGNAL_PERIOD,MACD_SLOW_PERIOD,Open,RSI,Stock Splits,Volume,signal
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2017-08-24,0.368656,98.534755,22.368999,0.0,21.453932,22.227395,22.464799,22.206142,0.310016,0.005471,...,65.667716,65.340773,-0.598293,-0.072055,-0.526237,65.527598,49.539976,0,3100,sell
2017-08-25,0.368326,110.642995,22.426479,0.0,21.492071,22.293757,22.598917,22.234881,0.316105,0.009248,...,65.695747,65.490243,-0.518054,0.006547,-0.524601,65.527608,50.652395,0,4500,sell
2017-08-28,0.400865,253.007051,23.240768,0.0,21.560647,22.609427,23.250349,22.570177,0.382230,0.060299,...,65.611660,65.359454,-0.467170,0.045945,-0.513115,65.480885,49.009664,0,27300,sell
2017-08-29,0.413973,249.027857,23.394045,0.0,21.632545,22.870966,23.700600,23.116228,0.441909,0.095982,...,65.630361,65.004503,-0.410056,0.082447,-0.492503,65.004503,50.169953,0,15900,sell
2017-08-30,0.404247,151.287553,23.144968,0.0,21.691856,22.962300,23.374885,23.116228,0.463761,0.094267,...,65.854540,65.434195,-0.351694,0.112647,-0.464341,65.434195,51.069135,0,3900,sell
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-06-14,1.183511,-241.562304,29.700001,0.0,33.463692,30.969219,30.580000,29.400000,-0.798083,-0.055026,...,76.448074,75.839601,-1.196869,-0.659358,-0.537511,76.448074,32.171472,0,700,sell
2022-06-15,1.190403,-161.876171,30.190001,0.0,33.335312,30.709479,30.620001,29.340000,-0.853849,-0.088633,...,77.754791,76.487975,-1.369379,-0.665494,-0.703885,76.487975,38.570940,0,2500,sell
2022-06-16,1.215374,-103.174728,30.760000,0.0,33.234320,30.726320,30.990000,29.450001,-0.842339,-0.061699,...,75.839594,73.994229,-1.787327,-0.866753,-0.920573,75.839594,32.071075,0,2400,sell
2022-06-17,1.174991,-83.022222,30.389999,0.0,33.122778,30.614213,30.790001,30.139999,-0.853238,-0.058078,...,76.318399,75.500453,-1.929695,-0.807298,-1.122398,75.500453,38.256364,0,2900,sell


In [129]:
# set up Record Keeping dataFrames
df_random_forrest_results = pd.DataFrame(columns=[])
dict_random_forrest_results = pd.DataFrame(columns=[])

#evaluate each ticker with random forrest

# Loop through each ticker in the ticker list
for ticker in ticker_list:
    
    # set up features and Targets for analysis
    X = df_ticker_data[ticker].drop("signal", axis = 'columns')
    y['signal'] = df_ticker_data[ticker, 'signal']
    
    # Record the balance of our target values
    df_random_forrest_results.loc['sell_count', ticker]  = (y['signal'].value_counts()['sell'] / (y['signal'].value_counts()['sell'] + y['signal'].value_counts()['buy'])) * 100
    df_random_forrest_results.loc['buy_count', ticker]  = (y['signal'].value_counts()['buy'] / (y['signal'].value_counts()['sell'] + y['signal'].value_counts()['buy'])) * 100
    
    # Split the X and y into X_train, X_test, y_train, y_test
    X_train, X_test, y_train, y_test = train_test_split(X, y['signal'].values, random_state=10)
    
    # scale data, set up scaler
    scaler = StandardScaler()
    
    # scale X data, note, y is classified and can not be scaled
    X_scaler = scaler.fit(X_train)
    
    X_train = X_scaler.transform(X_train)
    X_test = X_scaler.transform(X_test)
    
    # Resample the training data with the BalancedRandomForestClassifier
    model = BalancedRandomForestClassifier(random_state=1)
    model.fit(X_train, y_train)
    
    # Calculated the balanced accuracy score
    y_pred = model.predict(X_test)
    df_random_forrest_results.loc['balanced_accuracy_score', ticker]  = balanced_accuracy_score(y_test, y_pred)
    
    
    # Captue the confusion matrix (note, convert to % for easy comparison)
    temp_CM = confusion_matrix(y_test, y_pred)
    df_random_forrest_results.loc['True_Buys %', ticker]   = (temp_CM[0,0]/(temp_CM[0,0] + temp_CM[0,1] + temp_CM[1,0] + temp_CM[1,1])) * 100
    df_random_forrest_results.loc['False_Buys %', ticker]  = (temp_CM[0,1]/(temp_CM[0,0] + temp_CM[0,1] + temp_CM[1,0] + temp_CM[1,1])) * 100
    df_random_forrest_results.loc['True_Sells %', ticker]  = (temp_CM[1,1]/(temp_CM[0,0] + temp_CM[0,1] + temp_CM[1,0] + temp_CM[1,1])) * 100
    df_random_forrest_results.loc['False_Sells %', ticker] = (temp_CM[1,0]/(temp_CM[0,0] + temp_CM[0,1] + temp_CM[1,0] + temp_CM[1,1])) * 100
    
    # capture classification report metrics
    classification_string = classification_report_imbalanced(y_test, y_pred)
    df_random_forrest_results.loc['Classification_Report', ticker] = classification_string
    df_random_forrest_results.loc['Buy_Precision', ticker] = float(classification_string[102:106])
    df_random_forrest_results.loc['Sell_Precision', ticker] = float(classification_string[185:189])
    df_random_forrest_results.loc['Buy_Recall', ticker] = float(classification_string[112:116])
    df_random_forrest_results.loc['Sell_Recall', ticker] = float(classification_string[195:199])
    
    # Capture Feature importances
    importances = pd.Series(model.feature_importances_, index=X.columns).sort_values(ascending=False)
    df_random_forrest_results.loc['1st_Important_Feature', ticker] = pd.Series.to_dict(importances)
    
    
    
    # Capture y_test, y_pred, full_predic
    df_random_forrest_results.loc['y_test', ticker] = y_test
    df_random_forrest_results.loc['y_predict', ticker] = y_pred
    
    full_pred = model.predict(X) # capture the predictions on the whole data set to be able to compare the trading results with the optimised results
    df_random_forrest_results.loc['full_predict', ticker] = full_pred
       
    
    
    
    # Complete trading backtest
    
    # Test Model pridictions against the full duration for comparison
    df_temp = pd.DataFrame(columns=['Portfolio_Value'])

    #impliment buy/sell fee
    fee = 0.01 #(1% fee)

    # set starting conditions
    start_fund = 1000
    fund = start_fund
    stock = 0
    status = 'short'

    # cycle through all rows
    for index, row in df1.iterrows():

        # buy conditions and action
        if status == 'short' and row['signal'] == 'buy':
            stock = np.floor(fund/(row['Close']*(1+fee)))
            fund = round(fund - (row['Close']*stock), 2)
            status = 'long'

        #sell conditions and actions
        if status == 'long' and row['signal'] == 'sell':
            fund = round(fund + ((row['Close'] * stock)*(1-fee)), 2)
            stock = 0
            status = 'short'



        #record the total portfolio value        
        df_temp.loc[index, ('Portfolio_Value')] = fund + ((row['Close'] * stock)*(1-fee))

    # calculate profit or loss as % return (positive or negative)
    # remaining funds + sale of any stock held less starting value, divide by staring value to get % change
    final_value = fund + ((row['Close'] * stock)*(1-fee))
    pct_return = round((final_value - start_fund)/start_fund,5) * 100

    df_random_forrest_results.loc['%_return', ticker] = pct_return
    df_random_forrest_results.loc['Portfolio_Value', ticker] = df_temp



ValueError: Incompatible indexer with Series

In [132]:
dict_record = pd.Series.to_dict(importances)

In [139]:
importances.index[1]

'RSI'

In [51]:
# Check the balance of our target values
y['signal'].value_counts()

buy     726
sell    494
Name: signal, dtype: int64

In [7]:
# Split the X and y into X_train, X_test, y_train, y_test
X_train, X_test, y_train, y_test = train_test_split(X, y.values, random_state=10)

In [8]:
# Create the StandardScaler instance
# YOUR CODE HERE
from sklearn.preprocessing import StandardScaler
# YOUR CODE HERE
scaler = StandardScaler()

In [9]:
# Fit the Standard Scaler with the training data
# When fitting scaling functions, only train on the training dataset
# YOUR CODE HERE

X_scaler = scaler.fit(X_train)

In [10]:
# Scale the training and testing data
# YOUR CODE HERE

X_train = X_scaler.transform(X_train)
X_test = X_scaler.transform(X_test)

In [11]:
# Resample the training data with the BalancedRandomForestClassifier
# YOUR CODE HERE
model = BalancedRandomForestClassifier(random_state=1)
model.fit(X_train, y_train)

BalancedRandomForestClassifier(random_state=1)

In [12]:
# Calculated the balanced accuracy score
# YOUR CODE HERE
y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.8723112128146453

In [67]:
# Display the confusion matrix
# YOUR CODE HERE

confusion_matrix(y_test, y_pred)#[0,1] = 30 = False Buy, 1,0 = False Sell 1,1 =99 True Sell   0,0 = 160 = True Buys,

array([[160,  30],
       [ 16,  99]], dtype=int64)

In [84]:
# Print the imbalanced classification report
# YOUR CODE HERE
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

        buy       0.91      0.84      0.86      0.87      0.85      0.72       190
       sell       0.77      0.86      0.84      0.81      0.85      0.73       115

avg / total       0.86      0.85      0.85      0.85      0.85      0.72       305



In [101]:
classification_report_imbalanced(y_test, y_pred) #[195:199] = rec sell #[185:189] = pre sell # [112:116] = rec buy # [102:106] = Pre buy

'0.86'

In [15]:
# List the features sorted in descending order by feature importance
# YOUR CODE HERE
importances = pd.Series(model.feature_importances_, index=X.columns).sort_values(ascending=False)
importances.nlargest(20)

RSI                   0.176871
CCI                   0.134457
MACD_FAST_PERIOD      0.130047
MACD_SLOW_PERIOD      0.098928
MACD_SIGNAL_PERIOD    0.079092
EMA_long              0.075556
ATR                   0.058509
EMA_short             0.047363
Close                 0.044135
High                  0.041662
Low                   0.041465
Open                  0.036423
Volume                0.033522
Dividends             0.001969
Stock Splits          0.000000
dtype: float64

### Easy Ensemble Classifier

<bound method Series.keys of RSI                   0.176871
CCI                   0.134457
MACD_FAST_PERIOD      0.130047
MACD_SLOW_PERIOD      0.098928
MACD_SIGNAL_PERIOD    0.079092
EMA_long              0.075556
ATR                   0.058509
EMA_short             0.047363
Close                 0.044135
High                  0.041662
Low                   0.041465
Open                  0.036423
Volume                0.033522
Dividends             0.001969
Stock Splits          0.000000
dtype: float64>

In [16]:
# Train the Classifier
# YOUR CODE HERE
model = EasyEnsembleClassifier(random_state=1)
model.fit(X_train, y_train)


EasyEnsembleClassifier(random_state=1)

In [17]:
# Calculated the balanced accuracy score
# YOUR CODE HERE

y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.851487414187643

In [18]:
# Display the confusion matrix
# YOUR CODE HERE

confusion_matrix(y_test, y_pred)

array([[160,  30],
       [ 16,  99]], dtype=int64)

In [19]:
# Print the imbalanced classification report
# YOUR CODE HERE
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

        buy       0.91      0.84      0.86      0.87      0.85      0.72       190
       sell       0.77      0.86      0.84      0.81      0.85      0.73       115

avg / total       0.86      0.85      0.85      0.85      0.85      0.72       305



In [20]:
full_pred = model.predict(X)

In [21]:
## returns test

In [22]:
df1['signal'] = full_pred

In [45]:
# Test Model pridictions against the full duration for comparison

df_temp = pd.DataFrame(columns=['Portfolio_Value'])

# complete trading backtest
fee = 0.01 #(1% fee)

# set starting conditions
start_fund = 10000
fund = start_fund
stock = 0
status = 'short'

# cycle through all rows
for index, row in df1.iterrows():

    # buy conditions and action
    if status == 'short' and row['signal'] == 'buy':

        stock = np.floor(fund/(row['Close']*(1+fee)))

        fund = round(fund - (row['Close']*stock), 2)

        status = 'long'

    #sell conditions and actions
    if status == 'long' and row['signal'] == 'sell':

        fund = round(fund + ((row['Close'] * stock)*(1-fee)), 2)

        stock = 0

        status = 'short'

        
        
    #record the total portfolio value        
    df_temp.loc[index, ('Portfolio_Value')] = fund + ((row['Close'] * stock)*(1-fee))
    
# calculate profit or loss as % return (positive or negative)
# remaining funds + sale of any stock held less starting value, divide by staring value to get % change
final_value = fund + ((row['Close'] * stock)*(1-fee))
pct_return = round((final_value - start_fund)/start_fund,5) * 100

print(pct_return)
print(df_temp)

-25.417
           Portfolio_Value
2017-08-24     9901.037389
2017-08-25     9924.777275
2017-08-28     9891.260867
2017-08-29       9913.6059
2017-08-30     9930.362393
...                    ...
2022-06-23          7497.4
2022-06-24     7632.336716
2022-06-27     7650.018343
2022-06-28     7469.481716
2022-06-29      7458.31497

[1220 rows x 1 columns]


In [109]:
df_temp

Unnamed: 0,Portfolio_Value
2017-08-24,9901.037389
2017-08-25,9924.777275
2017-08-28,9891.260867
2017-08-29,9913.6059
2017-08-30,9930.362393
...,...
2022-06-23,7497.4
2022-06-24,7632.336716
2022-06-27,7650.018343
2022-06-28,7469.481716


In [123]:
test = X

In [133]:
df_random_forrest_results.loc['%_return', 'XLE'] = dict_record

ValueError: Incompatible indexer with Series

In [127]:
df_random_forrest_results

Unnamed: 0,XLE
sell_count,39.456343
buy_count,60.543657
balanced_accuracy_score,0.880598
True_Buys %,57.565789
False_Buys %,7.565789
True_Sells %,30.592105
False_Sells %,4.276316
Classification_Report,pre rec spe ...
Buy_Precision,0.93
Sell_Precision,0.8


In [137]:
importances.index

Index(['MACD_FAST_PERIOD', 'RSI', 'MACD_SLOW_PERIOD', 'CCI',
       'MACD_SIGNAL_PERIOD', 'ATR', 'EMA_long', 'EMA_short', 'Volume', 'Low',
       'Close', 'High', 'Open', 'Dividends', 'Stock Splits'],
      dtype='object')