In [None]:
# import Libraries 

import numpy as np
import pandas as pd
import hvplot.pandas

# set randomiser seed
from numpy.random import seed
seed(1)
from tensorflow import random
random.set_seed(2)

from sklearn.preprocessing import MinMaxScaler


from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout


In [None]:
ticker_list = ['XLE', 'XLF', 'XLU', 'XLI', 'GDX', 'XLK', 'XLV', 'XLY', 'XLP', 'XLB', 'XOP', 'IYR', 'XHB', 'ITB', 'VNQ', 'GDXJ', 'IYE', 'OIH', 'XME', 'XRT', 'SMH', 'IBB', 'KBE', 'KRE', 'XTL']

In [None]:
# ETF tickers covered by the signals file
ticker_list = [
    'XLE', 'XLF', 'XLU', 'XLI', 'GDX', 'XLK', 'XLV', 'XLY', 'XLP', 'XLB',
    'XOP', 'IYR', 'XHB', 'ITB', 'VNQ', 'GDXJ', 'IYE', 'OIH', 'XME', 'XRT',
    'SMH', 'IBB', 'KBE', 'KRE', 'XTL',
]

# Read the ticker data from CSV (an alternative to the API that keeps the data
# consistent between runs). The first column is the date index and the column
# header has two levels. 'buy'/'sell' are encoded as the scalars 1/-1 and rows
# with missing values are dropped.
df_ticker_data = (
    pd.read_csv(
        "../Data/EFT_optimised_signals.csv",
        index_col=[0],
        parse_dates=True,
        header=[0, 1],
    )
    .replace(['buy', 'sell'], [1, -1])
    .dropna()
)

# Quick look at the cleaned frame
df_ticker_data.head()

In [None]:
# this function inputs a data frame, window size and target column name
# this returns a rolling nested list of the features, and a list of the targets

def window_data_chunk(df, window, target_col_name):
    """Build rolling feature windows and their next-row targets.

    For every start row ``i`` the features are the ``window`` consecutive rows
    ``i .. i + window - 1`` of all columns except ``target_col_name``; the
    target is the value of ``target_col_name`` at row ``i + window``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame containing the feature columns and the target column.
    window : int
        Number of consecutive rows in each feature window.
    target_col_name : label
        Name of the target column; it is excluded from the features.

    Returns
    -------
    X : numpy.ndarray of shape (n_samples, window, n_features)
    y : numpy.ndarray of shape (n_samples, 1)
        ``n_samples`` is ``max(len(df) - window, 0)``, so the final row of
        ``df`` is used as the last target.
    """
    # Split features/target once; this does not depend on the loop index.
    feature_values = df.drop(axis='columns', labels=target_col_name).to_numpy()
    target_values = df[target_col_name].to_numpy()

    # Every start row i with i + window <= len(df) - 1 yields a sample.
    n_samples = max(len(df) - window, 0)
    if n_samples == 0:
        # Not enough rows for a single window: return correctly shaped empties.
        empty_X = np.empty((0, window, feature_values.shape[1]), dtype=feature_values.dtype)
        empty_y = np.empty((0, 1), dtype=target_values.dtype)
        return empty_X, empty_y

    X = np.stack([feature_values[i:i + window] for i in range(n_samples)])
    y = target_values[window:window + n_samples].reshape(-1, 1)
    return X, y

In [None]:
# NOTE: X must already exist (it is produced by window_data_chunk in a later
# cell), so on a fresh kernel that cell has to run first.

# Scratch check: collapse X into two rows. The result is kept under a regular
# name because `Out` is IPython's output-history dict and must not be rebound.
X_two_rows = X.reshape(2, -1)

# Flatten the two trailing axes of X so every sample becomes a single row
test_train = X.reshape((X.shape[0], X.shape[1] * X.shape[2]))




In [None]:
test_train.shape

In [None]:
test_train[0][170]

In [None]:
# Ordered 70/30 split: the first 70% of the samples train, the remainder test
split = int(0.7 * len(X))
X_train, X_test = X[:split], X[split:]
y_train, y_test = y[:split], y[split:]

In [None]:

# Features and target for a single ticker (XLE)
df_input_data = df_ticker_data['XLE']

# Rolling 20-row feature windows; each target is the 'signal' value of the row
# that follows its window
X, y = window_data_chunk(df_input_data, 20, 'signal')

# Ordered train/test split: the first 70% of the samples train, the rest test
split = int(0.7 * len(X))
X_train, X_test = X[:split], X[split:]
y_train, y_test = y[:split], y[split:]

# Flatten the two trailing axes of every sample into one row so the scaler
# receives 2-D input. (A former `X.reshape(...)` call here discarded its
# result and had no effect, so it has been removed.)
X_train = X_train.reshape((X_train.shape[0], X_train.shape[1] * X_train.shape[2]))
X_test = X_test.reshape((X_test.shape[0], X_test.shape[1] * X_test.shape[2]))

# Fit the MinMaxScaler on the training features only, then scale both sets
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Reshape to 3-D for the LSTM: every flattened value becomes one step of
# width 1, matching input_shape=(X_train.shape[1], 1) below
X_train = X_train.reshape((X_train.shape[0], X_train.shape[1], 1))
X_test = X_test.reshape((X_test.shape[0], X_test.shape[1], 1))

# Build and compile the model (training happens in a later cell):
# three stacked LSTM layers, each followed by dropout, then one dense output unit
model = Sequential()

number_units = 5
dropout_fraction = 0.2

# Layer 1
model.add(LSTM(units=number_units, return_sequences=True, input_shape=(X_train.shape[1], 1)))
model.add(Dropout(dropout_fraction))
# Layer 2
model.add(LSTM(units=number_units, return_sequences=True))
model.add(Dropout(dropout_fraction))
# Layer 3
model.add(LSTM(units=number_units))
model.add(Dropout(dropout_fraction))
# Output layer
model.add(Dense(1))

model.compile(optimizer="adam", loss="mean_squared_error")




In [None]:
X_train.shape

In [None]:
model.summary()

In [None]:
model.fit(X_train, y_train, epochs=2, shuffle=False, batch_size=1, verbose=1)

## Evaluation

In [None]:
model.evaluate(X_test, y_test)

In [None]:
predicted = model.predict(X_test)

In [None]:
predicted

In [None]:
# Real vs. predicted values side by side, labelled with the last len(y_test)
# index entries of the input frame
df_results = pd.DataFrame(
    data={
        "Real": y_test.ravel(),
        "Predicted": predicted.ravel(),
    },
    index=df_input_data.index[-len(y_test):],
)
df_results.head(50)

In [None]:
df_results.plot()

In [None]:
# Independent copy of the XLE columns for experimenting. `.copy()` must be
# called: without the parentheses df_test is the bound method, not a DataFrame,
# and the `.iloc` lookups in the following cells fail.
df_test = df_ticker_data['XLE'].copy()




In [None]:
df_test.iloc[5 : (5 + 5), : ]

df_test['signal'].iloc[(5 + 5), ]




In [None]:
df_test['signal'].iloc[(5 + 5)]

In [None]:
df_test.rolling(10).agg(list)

In [None]:
!pip install scikeras

In [None]:
# https://machinelearningmastery.com/binary-classification-tutorial-with-the-keras-deep-learning-library/

# Binary Classification with Sonar Dataset: Standardized Larger
import pandas as pd
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from scikeras.wrappers import KerasClassifier # this is new libray
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline


In [None]:

# Load the signals file: dates in the first column, two-level column header
df_ticker_data = pd.read_csv("../Data/EFT_optimised_signals.csv",
    index_col =[0],
    parse_dates = True,
    header = [0,1]
)

# Encode 'buy'/'sell' as the scalars 1/-1, keep only the XTL columns and drop
# incomplete rows. Chained (no inplace=True) so that a slice of another frame
# is never mutated in place.
df_input_data = df_ticker_data.replace(['buy', 'sell'], [1, -1])['XTL'].dropna()

# split into input (X) and output (y) variables
X = df_input_data.drop(axis = 'columns', labels = 'signal')
y = df_input_data['signal']

# encode class values as integers
encoder = LabelEncoder()
encoder.fit(y)
encoded_Y = encoder.transform(y)


# larger model
def create_larger(n_features=9):
    """Build and compile the dense binary classifier.

    Architecture: Dense(60, relu) -> Dense(30, relu) -> Dense(1, sigmoid),
    compiled with binary cross-entropy loss, the adam optimizer and an
    accuracy metric.

    Parameters
    ----------
    n_features : int, default 9
        Width of the input layer (number of feature columns).

    Returns
    -------
    The compiled Sequential model.
    """
    # create model
    model = Sequential()
    model.add(Dense(60, input_shape=(n_features,), activation='relu'))
    model.add(Dense(30, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))

    # Compile model
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model


# Standardise the features, then fit the Keras model. The input width is taken
# from X (routed to create_larger via the `model__` prefix) instead of being
# hard-coded.
estimators = [
    ('standardize', StandardScaler()),
    ('mlp', KerasClassifier(model=create_larger, model__n_features=X.shape[1],
                            epochs=100, batch_size=5, verbose=0)),
]
pipeline = Pipeline(estimators)

# Fixed random_state so the shuffled folds (and the reported score) are reproducible
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)
results = cross_val_score(pipeline, X, encoded_Y, cv=kfold)
print("Larger: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

In [None]:
# predict() has to be called on a fitted estimator instance, not on the
# KerasClassifier class. cross_val_score only fits clones of the pipeline, so
# fit the pipeline (scaler + MLP) on the full data before predicting.
pipeline.fit(X, encoded_Y)
predicted = pipeline.predict(X)

In [None]:
X

In [1]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from imblearn.ensemble import BalancedRandomForestClassifier
from imblearn.ensemble import EasyEnsembleClassifier
from imblearn.metrics import classification_report_imbalanced


from sklearn.preprocessing import StandardScaler

import json

In [2]:
# Ticker data with return-optimised signals and engineered features.
# First column is the date index; the column header has two levels.
file_path = Path('../Data/ETF_data_including_engineered_features_and_signals.csv')
df_ticker_data = pd.read_csv(
    file_path,
    index_col=[0],
    parse_dates=True,
    header=[0, 1],
).dropna()  # clean the data: drop rows with missing values

# Tickers to evaluate
ticker_list = [
    'XLE', 'XLF', 'XLU', 'XLI', 'GDX', 'XLK', 'XLV', 'XLY', 'XLP', 'XLB',
    'XOP', 'IYR', 'XHB', 'ITB', 'VNQ', 'GDXJ', 'IYE', 'OIH', 'XME', 'XRT',
    'SMH', 'IBB', 'KBE', 'KRE', 'XTL',
]

# Preview the data
df_ticker_data

Unnamed: 0_level_0,GDX,GDX,GDX,GDX,GDX,GDX,GDX,GDX,GDX,GDX,...,XTL,XTL,XTL,XTL,XTL,XTL,XTL,XTL,XTL,XTL
Unnamed: 0_level_1,ATR,CCI,Close,Dividends,EMA_long,EMA_short,High,Low,MACD_FAST_PERIOD,MACD_SIGNAL_PERIOD,...,High,Low,MACD_FAST_PERIOD,MACD_SIGNAL_PERIOD,MACD_SLOW_PERIOD,Open,RSI,Stock Splits,Volume,signal
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2011-03-16,1.470913,-144.549573,50.820278,0.0,52.584950,52.275360,52.393461,50.305082,0.324209,-0.807016,...,43.691024,42.860203,-0.454148,-0.461295,0.007147,43.691024,35.160164,0.0,594600.0,sell
2011-03-17,1.425647,-115.696023,51.261875,0.0,52.520409,51.937532,51.491873,50.654682,0.145749,-0.788380,...,43.254192,43.117149,-0.538300,-0.436358,-0.101942,43.254192,37.642249,0.0,1700.0,sell
2011-03-18,1.412528,-64.804886,52.126659,0.0,52.501202,52.000574,52.503855,51.593067,0.073255,-0.688700,...,43.069145,42.803391,-0.618339,-0.413117,-0.205222,43.069145,36.169708,0.0,7400.0,sell
2011-03-21,1.402975,-14.156992,53.377853,0.0,52.543966,52.459667,53.405452,52.632660,0.115433,-0.517217,...,43.326326,43.214880,-0.639807,-0.347668,-0.292139,43.309180,40.410325,0.0,1000.0,sell
2011-03-22,1.377019,7.678927,53.515850,0.0,52.591375,52.811728,53.883848,52.844257,0.158172,-0.379583,...,43.309187,43.086295,-0.658226,-0.292870,-0.365356,43.086295,39.672380,0.0,4800.0,sell
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-06-15,1.190403,-161.876171,30.190001,0.0,32.985746,30.709479,30.620001,29.340000,-0.853849,-0.088633,...,77.754791,76.487975,-1.369379,-0.665494,-0.703885,76.487975,38.570940,0.0,2500.0,sell
2022-06-16,1.215374,-103.174728,30.760000,0.0,32.877173,30.726320,30.990000,29.450001,-0.842339,-0.061699,...,75.839594,73.994229,-1.787327,-0.866753,-0.920573,75.839594,32.071075,0.0,2400.0,sell
2022-06-17,1.174991,-83.022222,30.389999,0.0,32.755847,30.614213,30.790001,30.139999,-0.853238,-0.058078,...,76.318399,75.500453,-1.929695,-0.807298,-1.122398,75.500453,38.256364,0.0,2900.0,sell
2022-06-21,1.145348,-64.559803,30.500000,0.0,32.645806,30.576142,30.950001,30.190001,-0.843279,-0.038495,...,77.870003,77.550003,-1.886745,-0.611478,-1.275267,77.550003,42.770727,0.0,3100.0,sell


In [51]:
def _ratio(numerator, denominator):
    """Return numerator / denominator rounded to 2 decimals, or 0.0 if the denominator is 0."""
    return round(float(numerator) / float(denominator), 2) if denominator else 0.0


def _run_backtest(signals, closes, fee=0.01, start_fund=1000):
    """Simulate an all-in / all-out trading strategy driven by 'buy'/'sell' signals.

    Starts holding cash only (status 'short'). On a 'buy' signal while in cash
    it buys as many whole units as the cash covers including the fee; on a
    'sell' signal while holding it sells everything. The proportional fee is
    charged on purchases and on sales.

    Parameters
    ----------
    signals : iterable of str
        One 'buy' / 'sell' signal per row.
    closes : iterable of float
        Closing price per row, aligned with ``signals``.
    fee : float
        Proportional trading fee (0.01 == 1%).
    start_fund : float
        Starting cash.

    Returns
    -------
    (portfolio_values, pct_return)
        ``portfolio_values`` holds, per row, cash plus holdings valued at that
        row's close net of the sell fee; ``pct_return`` is the percentage
        change of the final value against ``start_fund``.
    """
    fund = start_fund
    stock = 0
    status = 'short'
    portfolio_values = []

    for signal, close in zip(signals, closes):
        if status == 'short' and signal == 'buy':
            # Position size leaves room for the fee ...
            stock = np.floor(fund / (close * (1 + fee)))
            # ... and the fee is actually deducted (previously only the bare
            # purchase price was taken from the fund, so buying was free).
            fund = round(fund - (close * stock) * (1 + fee), 2)
            status = 'long'
        elif status == 'long' and signal == 'sell':
            fund = round(fund + (close * stock) * (1 - fee), 2)
            stock = 0
            status = 'short'

        # record the total portfolio value (holdings valued net of the sell fee)
        portfolio_values.append(float(fund + (close * stock) * (1 - fee)))

    # Remaining funds plus the sale value of any stock still held; falls back
    # to the starting fund when there were no rows at all.
    final_value = portfolio_values[-1] if portfolio_values else start_fund
    pct_return = round((final_value - start_fund) / start_fund, 5) * 100
    return portfolio_values, pct_return


# Backtest settings
fee = 0.01          # 1% fee per trade
start_fund = 1000

# Record keeping: scalar metrics per ticker are collected in plain dicts and
# turned into a DataFrame once at the end (rows = metrics, columns = tickers);
# list-like items that do not fit in a DataFrame cell go into the nested dict.
ticker_records = {}
dict_random_forrest_results = {}

progress_count = 0

# Evaluate each ticker with a balanced random forest
for ticker in ticker_list:
    progress_count += 1
    print(f"Progress: {progress_count} / {len(ticker_list)}")

    record = {}
    ticker_details = {}

    # set up features and targets for analysis
    X = df_ticker_data[ticker].drop("signal", axis = 'columns')
    y = pd.DataFrame()
    y['signal'] = df_ticker_data[ticker, 'signal']

    # Record the balance of the target values (as % of buy + sell rows)
    signal_counts = y['signal'].value_counts()
    n_sell = signal_counts.get('sell', 0)
    n_buy = signal_counts.get('buy', 0)
    record['sell_count'] = (n_sell / (n_sell + n_buy)) * 100
    record['buy_count'] = (n_buy / (n_sell + n_buy)) * 100

    # Split the X and y into X_train, X_test, y_train, y_test
    X_train, X_test, y_train, y_test = train_test_split(X, y['signal'].values, random_state=10)

    # Scale the features; the scaler is fitted on the training data only.
    # y holds class labels and is not scaled.
    scaler = StandardScaler()
    X_scaler = scaler.fit(X_train)
    X_train = X_scaler.transform(X_train)
    X_test = X_scaler.transform(X_test)

    # Fit the BalancedRandomForestClassifier on the scaled training data
    model = BalancedRandomForestClassifier(random_state=1)
    model.fit(X_train, y_train)

    # Balanced accuracy on the test set
    y_pred = model.predict(X_test)
    record['balanced_accuracy_score'] = balanced_accuracy_score(y_test, y_pred)

    # Confusion matrix as % of all test rows. With labels=['buy', 'sell'] the
    # matrix is always 2x2: rows are the actual class, columns the predicted one.
    temp_CM = confusion_matrix(y_test, y_pred, labels=['buy', 'sell'])
    cm_total = temp_CM.sum()
    record['True_Buys %'] = (temp_CM[0, 0] / cm_total) * 100
    # NOTE(review): the two "False" rows keep their historical meaning:
    # 'False_Buys %' is actual buy / predicted sell, 'False_Sells %' is actual
    # sell / predicted buy. Confirm this naming is what consumers expect.
    record['False_Buys %'] = (temp_CM[0, 1] / cm_total) * 100
    record['True_Sells %'] = (temp_CM[1, 1] / cm_total) * 100
    record['False_Sells %'] = (temp_CM[1, 0] / cm_total) * 100

    # Classification report (kept as text) plus per-class precision and recall.
    # Precision/recall are derived from the confusion matrix, rounded to the
    # report's 2 decimals, instead of being sliced out of the report string at
    # fixed character offsets.
    classification_string = classification_report_imbalanced(y_test, y_pred)
    record['Classification_Report'] = classification_string
    record['Buy_Precision'] = _ratio(temp_CM[0, 0], temp_CM[:, 0].sum())
    record['Sell_Precision'] = _ratio(temp_CM[1, 1], temp_CM[:, 1].sum())
    record['Buy_Recall'] = _ratio(temp_CM[0, 0], temp_CM[0, :].sum())
    record['Sell_Recall'] = _ratio(temp_CM[1, 1], temp_CM[1, :].sum())

    # Feature importances, most important first
    importances = pd.Series(model.feature_importances_, index=X.columns).sort_values(ascending=False)
    record['1st_Important_Feature'] = importances.index[0]
    record['2nd_Important_Feature'] = importances.index[1]
    record['3rd_Important_Feature'] = importances.index[2]
    ticker_details['importances'] = importances.to_dict()

    # Test labels and predictions
    ticker_details['y_test'] = list(y_test)
    ticker_details['y_pred'] = list(y_pred)

    # Predictions over the whole data set, to compare trading results with the
    # optimised signals. The model was trained on scaled features, so X is
    # passed through the same fitted scaler before predicting.
    full_pred = model.predict(X_scaler.transform(X))
    ticker_details['full_pred'] = list(full_pred)

    # Trading backtest of the model predictions over the full duration
    list_portfolio_value, pct_return = _run_backtest(
        full_pred,
        df_ticker_data[ticker, 'Close'].values,
        fee=fee,
        start_fund=start_fund,
    )
    record['BackTest % Return'] = pct_return
    ticker_details['Portfolio_Value'] = list_portfolio_value

    ticker_records[ticker] = record
    dict_random_forrest_results[ticker] = ticker_details

# One column per ticker, one row per recorded metric
df_random_forrest_results = pd.DataFrame(ticker_records)








Progress: 1 / 25
Progress: 2 / 25
Progress: 3 / 25
Progress: 4 / 25
Progress: 5 / 25
Progress: 6 / 25
Progress: 7 / 25
Progress: 8 / 25
Progress: 9 / 25
Progress: 10 / 25
Progress: 11 / 25
Progress: 12 / 25
Progress: 13 / 25
Progress: 14 / 25
Progress: 15 / 25
Progress: 16 / 25
Progress: 17 / 25
Progress: 18 / 25
Progress: 19 / 25
Progress: 20 / 25
Progress: 21 / 25
Progress: 22 / 25
Progress: 23 / 25
Progress: 24 / 25
Progress: 25 / 25


In [52]:
# Save the results table to its own file. The previous target path was
# '../Data/ETF_data_including_engineered_features_and_signals.csv', i.e. the
# input dataset this analysis reads, which the results would have overwritten.
df_random_forrest_results.to_csv('../Data/random_forrest_results.csv')

# Persist the nested per-ticker records as JSON text (.txt copy)
with open('../Data/dict_random_forrest_results.txt', 'w') as convert_file:
    convert_file.write(json.dumps(dict_random_forrest_results))

# Same records as a .json file
with open('../Data/dict_random_forrest_results.json', 'w') as fp:
    json.dump(dict_random_forrest_results, fp)

In [50]:
type(dict_random_forrest_results['XLE']['Portfolio_Value'])

list

In [53]:
type(dict_random_forrest_results['XLE'].keys())

dict_keys

In [None]:
dict_record = pd.Series.to_dict(importances)

In [None]:
importances.index[1]

In [None]:
df_random_forrest_results

In [None]:
# Check the balance of our target values
y['signal'].value_counts()

In [None]:
# Split the X and y into X_train, X_test, y_train, y_test.
# The 1-D 'signal' column is passed (not the 2-D y.values) so the targets have
# the shape the classifier expects, matching the per-ticker loop.
X_train, X_test, y_train, y_test = train_test_split(X, y['signal'].values, random_state=10)

In [None]:
# Create the StandardScaler instance
# YOUR CODE HERE
from sklearn.preprocessing import StandardScaler
# YOUR CODE HERE
scaler = StandardScaler()

In [None]:
# Fit the Standard Scaler with the training data
# When fitting scaling functions, only train on the training dataset
# YOUR CODE HERE

X_scaler = scaler.fit(X_train)

In [None]:
# Scale the training and testing data
# YOUR CODE HERE

X_train = X_scaler.transform(X_train)
X_test = X_scaler.transform(X_test)

In [None]:
# Resample the training data with the BalancedRandomForestClassifier
# YOUR CODE HERE
model = BalancedRandomForestClassifier(random_state=1)
model.fit(X_train, y_train)

In [None]:
# Calculated the balanced accuracy score
# YOUR CODE HERE
y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

In [None]:
# Display the confusion matrix
# YOUR CODE HERE

confusion_matrix(y_test, y_pred)#[0,1] = 30 = False Buy, 1,0 = False Sell 1,1 =99 True Sell   0,0 = 160 = True Buys,

In [None]:
# Print the imbalanced classification report
# YOUR CODE HERE
print(classification_report_imbalanced(y_test, y_pred))

In [None]:
classification_report_imbalanced(y_test, y_pred) #[195:199] = rec sell #[185:189] = pre sell # [112:116] = rec buy # [102:106] = Pre buy

In [None]:
# List the features sorted in descending order by feature importance
# YOUR CODE HERE
importances = pd.Series(model.feature_importances_, index=X.columns).sort_values(ascending=False)
importances.nlargest(20)

### Easy Ensemble Classifier

In [None]:
# Train the Classifier
# YOUR CODE HERE
model = EasyEnsembleClassifier(random_state=1)
model.fit(X_train, y_train)


In [None]:
# Calculated the balanced accuracy score
# YOUR CODE HERE

y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

In [None]:
# Display the confusion matrix
# YOUR CODE HERE

confusion_matrix(y_test, y_pred)

In [None]:
# Print the imbalanced classification report
# YOUR CODE HERE
print(classification_report_imbalanced(y_test, y_pred))

In [None]:
# Predict over the whole data set. The model was fitted on scaled features,
# so X goes through the same fitted scaler first.
full_pred = model.predict(X_scaler.transform(X))

In [None]:
## returns test

In [None]:
# Backtest input: closing price plus the model's signal for every row.
# df1 was not created anywhere in the notebook, so it is built here from the
# feature frame X, which has one row per entry of full_pred.
df1 = X[['Close']].copy()
df1['signal'] = full_pred

In [None]:
# Test the model predictions against the full duration for comparison:
# an all-in / all-out trading backtest driven by df1['signal'] and df1['Close'].

fee = 0.01  # 1% fee, charged on purchases and on sales

# set starting conditions
start_fund = 10000
fund = start_fund
stock = 0
status = 'short'

# Portfolio value per row; collected in a list and turned into a DataFrame
# once, instead of growing the DataFrame one row at a time.
portfolio_values = []

# cycle through all rows
for signal, close in zip(df1['signal'], df1['Close']):

    # buy conditions and action
    if status == 'short' and signal == 'buy':
        # position size leaves room for the fee ...
        stock = np.floor(fund / (close * (1 + fee)))
        # ... and the fee is actually deducted (previously only the bare
        # purchase price was taken from the fund, so buying was free)
        fund = round(fund - (close * stock) * (1 + fee), 2)
        status = 'long'

    # sell conditions and actions
    elif status == 'long' and signal == 'sell':
        fund = round(fund + ((close * stock) * (1 - fee)), 2)
        stock = 0
        status = 'short'

    # record the total portfolio value (holdings valued net of the sell fee)
    portfolio_values.append(fund + ((close * stock) * (1 - fee)))

df_temp = pd.DataFrame({'Portfolio_Value': portfolio_values}, index=df1.index)

# calculate profit or loss as % return (positive or negative):
# remaining funds + sale value of any stock held, less the starting value,
# divided by the starting value. Falls back to start_fund when df1 has no rows.
final_value = portfolio_values[-1] if portfolio_values else start_fund
pct_return = round((final_value - start_fund)/start_fund,5) * 100

print(pct_return)
print(df_temp)

In [None]:
df_temp

In [None]:
test = X

In [None]:
df_random_forrest_results.loc['%_return', 'XLE'] = dict_record

In [None]:
df_random_forrest_results

In [None]:
importances.index

In [None]:
dict_t = {}

In [None]:
dict_t = {'ter' : 'sdfgdsfg'}
dict_t = {'fgh' : 'blah'}

dict_t

