In [1]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from imblearn.ensemble import BalancedRandomForestClassifier
# from imblearn.ensemble import EasyEnsembleClassifier #not used any more
from imblearn.metrics import classification_report_imbalanced
from sklearn.preprocessing import StandardScaler
import json

from sklearn.naive_bayes import BernoulliNB


In [2]:
# Read the per-ticker price data, engineered features and optimised signals.
# The CSV has a two-row column header (ticker, field) and the dates in its
# first column; any row containing a missing value is discarded.
file_path = Path('../Data/ETF_data_including_engineered_features_and_signals.csv')
df_ticker_data = pd.read_csv(
    file_path,
    index_col=[0],
    parse_dates=True,
    header=[0, 1],
).dropna()

# Tickers to model, in the order they are processed by the cells below.
ticker_list = [
    'XLE', 'XLF', 'XLU', 'XLI', 'GDX',
    'XLK', 'XLV', 'XLY', 'XLP', 'XLB',
    'XOP', 'IYR', 'XHB', 'ITB', 'VNQ',
    'GDXJ', 'IYE', 'OIH', 'XME', 'XRT',
    'SMH', 'IBB', 'KBE', 'KRE', 'XTL',
]

# Show the cleaned frame as the cell output.
df_ticker_data

Unnamed: 0_level_0,GDX,GDX,GDX,GDX,GDX,GDX,GDX,GDX,GDX,GDX,...,XTL,XTL,XTL,XTL,XTL,XTL,XTL,XTL,XTL,XTL
Unnamed: 0_level_1,ATR,CCI,Close,Dividends,EMA_long,EMA_short,High,Low,MACD_FAST_PERIOD,MACD_SIGNAL_PERIOD,...,High,Low,MACD_FAST_PERIOD,MACD_SIGNAL_PERIOD,MACD_SLOW_PERIOD,Open,RSI,Stock Splits,Volume,signal
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2011-03-16,1.470913,-144.549573,50.820278,0.0,52.584950,52.275360,52.393461,50.305082,0.324209,-0.807016,...,43.691024,42.860203,-0.454148,-0.461295,0.007147,43.691024,35.160164,0.0,594600.0,sell
2011-03-17,1.425647,-115.696023,51.261875,0.0,52.520409,51.937532,51.491873,50.654682,0.145749,-0.788380,...,43.254192,43.117149,-0.538300,-0.436358,-0.101942,43.254192,37.642249,0.0,1700.0,sell
2011-03-18,1.412528,-64.804886,52.126659,0.0,52.501202,52.000574,52.503855,51.593067,0.073255,-0.688700,...,43.069145,42.803391,-0.618339,-0.413117,-0.205222,43.069145,36.169708,0.0,7400.0,sell
2011-03-21,1.402975,-14.156992,53.377853,0.0,52.543966,52.459667,53.405452,52.632660,0.115433,-0.517217,...,43.326326,43.214880,-0.639807,-0.347668,-0.292139,43.309180,40.410325,0.0,1000.0,sell
2011-03-22,1.377019,7.678927,53.515850,0.0,52.591375,52.811728,53.883848,52.844257,0.158172,-0.379583,...,43.309187,43.086295,-0.658226,-0.292870,-0.365356,43.086295,39.672380,0.0,4800.0,sell
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-06-15,1.190403,-161.876171,30.190001,0.0,32.985746,30.709479,30.620001,29.340000,-0.853849,-0.088633,...,77.754791,76.487975,-1.369379,-0.665494,-0.703885,76.487975,38.570940,0.0,2500.0,sell
2022-06-16,1.215374,-103.174728,30.760000,0.0,32.877173,30.726320,30.990000,29.450001,-0.842339,-0.061699,...,75.839594,73.994229,-1.787327,-0.866753,-0.920573,75.839594,32.071075,0.0,2400.0,sell
2022-06-17,1.174991,-83.022222,30.389999,0.0,32.755847,30.614213,30.790001,30.139999,-0.853238,-0.058078,...,76.318399,75.500453,-1.929695,-0.807298,-1.122398,75.500453,38.256364,0.0,2900.0,sell
2022-06-21,1.145348,-64.559803,30.500000,0.0,32.645806,30.576142,30.950001,30.190001,-0.843279,-0.038495,...,77.870003,77.550003,-1.886745,-0.611478,-1.275267,77.550003,42.770727,0.0,3100.0,sell


## Random Forest Classifier Machine Learning Model

In [3]:
# Train, evaluate and backtest one BalancedRandomForestClassifier per ticker.
#
# Results produced by this cell:
#   df_random_forrest_results   - one column per ticker, one row per metric
#   dict_random_forrest_results - per-ticker items that do not fit in a table
#                                 cell (importances, prediction lists,
#                                 portfolio value history)
#
# NOTE(review): train_test_split shuffles the date-ordered rows, and the
# backtest predicts on every row (including the rows the model was trained
# on), so the backtest return is an optimistic, in-sample-heavy figure.

# Explicit label order so the confusion matrix is always 2x2 with
# row/column 0 = 'buy' and row/column 1 = 'sell', even if a class is missing
# from a particular test split.
class_labels = ['buy', 'sell']

# Backtest settings
fee = 0.01         # proportional fee charged on every buy and every sell (1%)
start_fund = 1000  # starting cash for each ticker's backtest

# Metrics are collected in plain dicts and converted to a DataFrame once after
# the loop. Writing mixed float/string values cell-by-cell with .loc upcasts
# columns on the fly, which recent pandas versions deprecate.
metrics_by_ticker = {}
dict_random_forrest_results = {}

progress_count = 0

for progress_count, ticker in enumerate(ticker_list, start=1):
    # print progress of analysis
    print(f"Progress: {progress_count} / {len(ticker_list)} ---- {ticker}")

    metrics = {}
    dict_random_forrest_results[ticker] = {}

    # Features are every column of this ticker except the target signal
    X = df_ticker_data[ticker].drop("signal", axis='columns')
    y = pd.DataFrame({'signal': df_ticker_data[ticker, 'signal']})

    # Class balance of the target, stored as percentages
    # (the row names 'sell_count' / 'buy_count' are kept for compatibility)
    signal_counts = y['signal'].value_counts()
    sell_total = signal_counts.get('sell', 0)
    buy_total = signal_counts.get('buy', 0)
    metrics['sell_count'] = (sell_total / (sell_total + buy_total)) * 100
    metrics['buy_count'] = (buy_total / (sell_total + buy_total)) * 100

    # Split the X and y into X_train, X_test, y_train, y_test
    X_train, X_test, y_train, y_test = train_test_split(
        X, y['signal'].values, random_state=10, test_size=0.3
    )

    # Fit the scaler on the training rows only, then apply it to both splits.
    # y is categorical and is not scaled.
    X_scaler = StandardScaler().fit(X_train)
    X_train = X_scaler.transform(X_train)
    X_test = X_scaler.transform(X_test)

    # Fit the training data with the BalancedRandomForestClassifier
    model = BalancedRandomForestClassifier(random_state=1)
    model.fit(X_train, y_train)

    # Balanced accuracy on the held-out rows
    y_pred = model.predict(X_test)
    metrics['balanced_accuracy_score'] = balanced_accuracy_score(y_test, y_pred)

    # Confusion matrix: rows are actual classes, columns are predicted classes
    temp_CM = confusion_matrix(y_test, y_pred, labels=class_labels)
    actual_buys = temp_CM[0, 0] + temp_CM[0, 1]
    actual_sells = temp_CM[1, 0] + temp_CM[1, 1]
    predicted_buys = temp_CM[0, 0] + temp_CM[1, 0]
    predicted_sells = temp_CM[0, 1] + temp_CM[1, 1]

    # Row-normalised percentages for easy comparison between tickers:
    # '*_Buys %' rows split the actual buys, '*_Sells %' rows the actual sells.
    metrics['True_Buys %'] = (temp_CM[0, 0] / actual_buys) * 100
    metrics['False_Buys %'] = (temp_CM[0, 1] / actual_buys) * 100
    metrics['True_Sells %'] = (temp_CM[1, 1] / actual_sells) * 100
    metrics['False_Sells %'] = (temp_CM[1, 0] / actual_sells) * 100

    # Keep the full text report for reference
    metrics['Classification_Report'] = classification_report_imbalanced(y_test, y_pred)

    # Precision / recall are derived from the confusion matrix rather than
    # sliced out of the report text at fixed character offsets, which breaks
    # as soon as the report layout changes. Rounded to 2 decimals to match
    # the report; an empty denominator is reported as 0.0.
    metrics['Buy_Precision'] = round(float(temp_CM[0, 0] / predicted_buys), 2) if predicted_buys else 0.0
    metrics['Sell_Precision'] = round(float(temp_CM[1, 1] / predicted_sells), 2) if predicted_sells else 0.0
    metrics['Buy_Recall'] = round(float(temp_CM[0, 0] / actual_buys), 2) if actual_buys else 0.0
    metrics['Sell_Recall'] = round(float(temp_CM[1, 1] / actual_sells), 2) if actual_sells else 0.0

    # Feature importances, most important first
    importances = pd.Series(model.feature_importances_, index=X.columns).sort_values(ascending=False)
    metrics['1st_Important_Feature'] = importances.index[0]
    metrics['2nd_Important_Feature'] = importances.index[1]
    metrics['3rd_Important_Feature'] = importances.index[2]
    dict_random_forrest_results[ticker]['importances'] = importances.to_dict()

    # Test-split labels and predictions
    dict_random_forrest_results[ticker]['y_test'] = list(y_test)
    dict_random_forrest_results[ticker]['y_pred'] = list(y_pred)

    # Predictions over the whole history, used to compare trading results with
    # the optimised signals. The model was trained on scaled features, so the
    # full feature set must go through the same fitted scaler before predicting.
    full_pred = model.predict(X_scaler.transform(X))
    dict_random_forrest_results[ticker]['full_pred'] = list(full_pred)

    # Trading backtest over the full duration
    df_backtest = pd.DataFrame({
        'signal': full_pred,
        'Close': df_ticker_data[ticker, 'Close'].values,
    })

    list_portfolio_value = []
    fund = start_fund
    stock = 0
    status = 'short'

    for signal, close in zip(df_backtest['signal'], df_backtest['Close']):
        close = float(close)

        if status == 'short' and signal == 'buy':
            # Buy as many whole units as the cash covers including the fee,
            # and deduct the purchase price plus the fee from the cash.
            stock = np.floor(fund / (close * (1 + fee)))
            fund = round(fund - (close * stock) * (1 + fee), 2)
            status = 'long'
        elif status == 'long' and signal == 'sell':
            # Sell the whole position; proceeds are net of the fee.
            fund = round(fund + ((close * stock) * (1 - fee)), 2)
            stock = 0
            status = 'short'

        # Portfolio value = cash + what the open position would fetch net of fee
        list_portfolio_value.append(float(fund + ((close * stock) * (1 - fee))))

    # Profit or loss as a % of the starting fund. The last recorded portfolio
    # value already includes the liquidation value of any stock still held.
    final_value = list_portfolio_value[-1] if list_portfolio_value else start_fund
    pct_return = round((final_value - start_fund) / start_fund, 5) * 100

    metrics['BackTest % Return'] = pct_return
    dict_random_forrest_results[ticker]['Portfolio_Value'] = list_portfolio_value

    metrics_by_ticker[ticker] = metrics

# One column per ticker, one row per metric (rows in the order recorded above)
df_random_forrest_results = pd.DataFrame({
    name: pd.Series(values, dtype=object)
    for name, values in metrics_by_ticker.items()
})




Progress: 1 / 25 ---- XLE
Progress: 2 / 25 ---- XLF
Progress: 3 / 25 ---- XLU
Progress: 4 / 25 ---- XLI
Progress: 5 / 25 ---- GDX
Progress: 6 / 25 ---- XLK
Progress: 7 / 25 ---- XLV
Progress: 8 / 25 ---- XLY
Progress: 9 / 25 ---- XLP
Progress: 10 / 25 ---- XLB
Progress: 11 / 25 ---- XOP
Progress: 12 / 25 ---- IYR
Progress: 13 / 25 ---- XHB
Progress: 14 / 25 ---- ITB
Progress: 15 / 25 ---- VNQ
Progress: 16 / 25 ---- GDXJ
Progress: 17 / 25 ---- IYE
Progress: 18 / 25 ---- OIH
Progress: 19 / 25 ---- XME
Progress: 20 / 25 ---- XRT
Progress: 21 / 25 ---- SMH
Progress: 22 / 25 ---- IBB
Progress: 23 / 25 ---- KBE
Progress: 24 / 25 ---- KRE
Progress: 25 / 25 ---- XTL


In [4]:

# Persist the metrics table as CSV.
df_random_forrest_results.to_csv('../Data/df_random_forrest_results.csv')

# Persist the per-ticker dictionary as JSON text, once with a .txt extension
# and once with a .json extension (both files receive the same content).
serialised_results = json.dumps(dict_random_forrest_results)
for output_path in ('../Data/dict_random_forrest_results.txt',
                    '../Data/dict_random_forrest_results.json'):
    with open(output_path, 'w') as output_file:
        output_file.write(serialised_results)

## Naive Bayes Bernoulli Classifier Machine Learning Model

In [5]:
# Train, evaluate and backtest one Bernoulli Naive Bayes model per ticker.
#
# Results produced by this cell:
#   df_naive_bayes_results   - one column per ticker, one row per metric
#   dict_naive_bayes_results - per-ticker items that do not fit in a table
#                              cell (prediction lists, portfolio value history)
#
# NOTE(review): train_test_split shuffles the date-ordered rows, and the
# backtest predicts on every row (including the rows the model was trained
# on), so the backtest return is an optimistic, in-sample-heavy figure.

# Explicit label order so the confusion matrix is always 2x2 with
# row/column 0 = 'buy' and row/column 1 = 'sell', even if a class is missing
# from a particular test split.
class_labels = ['buy', 'sell']

# Backtest settings
fee = 0.01         # proportional fee charged on every buy and every sell (1%)
start_fund = 1000  # starting cash for each ticker's backtest

# Metrics are collected in plain dicts and converted to a DataFrame once after
# the loop. Writing mixed float/string values cell-by-cell with .loc upcasts
# columns on the fly, which recent pandas versions deprecate.
metrics_by_ticker = {}
dict_naive_bayes_results = {}

progress_count = 0

for progress_count, ticker in enumerate(ticker_list, start=1):
    # print progress of analysis
    print(f"Progress: {progress_count} / {len(ticker_list)} ---- {ticker}")

    metrics = {}
    dict_naive_bayes_results[ticker] = {}

    # Features are every column of this ticker except the target signal
    X = df_ticker_data[ticker].drop("signal", axis='columns')
    y = pd.DataFrame({'signal': df_ticker_data[ticker, 'signal']})

    # Class balance of the target, stored as percentages
    # (the row names 'sell_count' / 'buy_count' are kept for compatibility)
    signal_counts = y['signal'].value_counts()
    sell_total = signal_counts.get('sell', 0)
    buy_total = signal_counts.get('buy', 0)
    metrics['sell_count'] = (sell_total / (sell_total + buy_total)) * 100
    metrics['buy_count'] = (buy_total / (sell_total + buy_total)) * 100

    # Split the X and y into X_train, X_test, y_train, y_test
    X_train, X_test, y_train, y_test = train_test_split(
        X, y['signal'].values, random_state=10, test_size=0.3
    )

    # No scaling step in this cell: BernoulliNB thresholds each feature itself.
    # `binarize` is a numeric threshold; 1.0 is the value the previous
    # `binarize=True` evaluated to, now written explicitly.
    # NOTE(review): on unscaled inputs a threshold of 1.0 presumably turns
    # always-large columns (e.g. prices, volume) into constants - confirm that
    # this threshold is the intended one.
    model = BernoulliNB(binarize=1.0)
    model.fit(X_train, y_train)

    # Balanced accuracy on the held-out rows
    y_pred = model.predict(X_test)
    metrics['balanced_accuracy_score'] = balanced_accuracy_score(y_test, y_pred)

    # Confusion matrix: rows are actual classes, columns are predicted classes
    temp_CM = confusion_matrix(y_test, y_pred, labels=class_labels)
    actual_buys = temp_CM[0, 0] + temp_CM[0, 1]
    actual_sells = temp_CM[1, 0] + temp_CM[1, 1]
    predicted_buys = temp_CM[0, 0] + temp_CM[1, 0]
    predicted_sells = temp_CM[0, 1] + temp_CM[1, 1]

    # Row-normalised percentages for easy comparison between tickers:
    # '*_Buys %' rows split the actual buys, '*_Sells %' rows the actual sells.
    metrics['True_Buys %'] = (temp_CM[0, 0] / actual_buys) * 100
    metrics['False_Buys %'] = (temp_CM[0, 1] / actual_buys) * 100
    metrics['True_Sells %'] = (temp_CM[1, 1] / actual_sells) * 100
    metrics['False_Sells %'] = (temp_CM[1, 0] / actual_sells) * 100

    # Keep the full text report for reference
    metrics['Classification_Report'] = classification_report_imbalanced(y_test, y_pred)

    # Precision / recall are derived from the confusion matrix rather than
    # sliced out of the report text at fixed character offsets, which breaks
    # as soon as the report layout changes. Rounded to 2 decimals to match
    # the report; an empty denominator is reported as 0.0.
    metrics['Buy_Precision'] = round(float(temp_CM[0, 0] / predicted_buys), 2) if predicted_buys else 0.0
    metrics['Sell_Precision'] = round(float(temp_CM[1, 1] / predicted_sells), 2) if predicted_sells else 0.0
    metrics['Buy_Recall'] = round(float(temp_CM[0, 0] / actual_buys), 2) if actual_buys else 0.0
    metrics['Sell_Recall'] = round(float(temp_CM[1, 1] / actual_sells), 2) if actual_sells else 0.0

    # No feature-importance rows are recorded for this model.

    # Test-split labels and predictions
    dict_naive_bayes_results[ticker]['y_test'] = list(y_test)
    dict_naive_bayes_results[ticker]['y_pred'] = list(y_pred)

    # Predictions over the whole history, used to compare trading results with
    # the optimised signals. The model was fitted on the raw features, so the
    # raw X is passed here as well.
    full_pred = model.predict(X)
    dict_naive_bayes_results[ticker]['full_pred'] = list(full_pred)

    # Trading backtest over the full duration
    df_backtest = pd.DataFrame({
        'signal': full_pred,
        'Close': df_ticker_data[ticker, 'Close'].values,
    })

    list_portfolio_value = []
    fund = start_fund
    stock = 0
    status = 'short'

    for signal, close in zip(df_backtest['signal'], df_backtest['Close']):
        close = float(close)

        if status == 'short' and signal == 'buy':
            # Buy as many whole units as the cash covers including the fee,
            # and deduct the purchase price plus the fee from the cash.
            stock = np.floor(fund / (close * (1 + fee)))
            fund = round(fund - (close * stock) * (1 + fee), 2)
            status = 'long'
        elif status == 'long' and signal == 'sell':
            # Sell the whole position; proceeds are net of the fee.
            fund = round(fund + ((close * stock) * (1 - fee)), 2)
            stock = 0
            status = 'short'

        # Portfolio value = cash + what the open position would fetch net of fee
        list_portfolio_value.append(float(fund + ((close * stock) * (1 - fee))))

    # Profit or loss as a % of the starting fund. The last recorded portfolio
    # value already includes the liquidation value of any stock still held.
    final_value = list_portfolio_value[-1] if list_portfolio_value else start_fund
    pct_return = round((final_value - start_fund) / start_fund, 5) * 100

    metrics['BackTest % Return'] = pct_return
    dict_naive_bayes_results[ticker]['Portfolio_Value'] = list_portfolio_value

    metrics_by_ticker[ticker] = metrics

# One column per ticker, one row per metric (rows in the order recorded above)
df_naive_bayes_results = pd.DataFrame({
    name: pd.Series(values, dtype=object)
    for name, values in metrics_by_ticker.items()
})

Progress: 1 / 25 ---- XLE
Progress: 2 / 25 ---- XLF
Progress: 3 / 25 ---- XLU
Progress: 4 / 25 ---- XLI
Progress: 5 / 25 ---- GDX
Progress: 6 / 25 ---- XLK
Progress: 7 / 25 ---- XLV
Progress: 8 / 25 ---- XLY
Progress: 9 / 25 ---- XLP
Progress: 10 / 25 ---- XLB
Progress: 11 / 25 ---- XOP
Progress: 12 / 25 ---- IYR
Progress: 13 / 25 ---- XHB
Progress: 14 / 25 ---- ITB
Progress: 15 / 25 ---- VNQ
Progress: 16 / 25 ---- GDXJ
Progress: 17 / 25 ---- IYE
Progress: 18 / 25 ---- OIH
Progress: 19 / 25 ---- XME
Progress: 20 / 25 ---- XRT
Progress: 21 / 25 ---- SMH
Progress: 22 / 25 ---- IBB
Progress: 23 / 25 ---- KBE
Progress: 24 / 25 ---- KRE
Progress: 25 / 25 ---- XTL


In [6]:
# Persist the metrics table as CSV.
df_naive_bayes_results.to_csv('../Data/df_naive_bayes_results.csv')

# Persist the per-ticker dictionary as JSON text, once with a .txt extension
# and once with a .json extension (both files receive the same content).
serialised_results = json.dumps(dict_naive_bayes_results)
for output_path in ('../Data/dict_naive_bayes_results.txt',
                    '../Data/dict_naive_bayes_results.json'):
    with open(output_path, 'w') as output_file:
        output_file.write(serialised_results)