In [None]:
# import Libraries 

import numpy as np
import pandas as pd
import hvplot.pandas

# set randomiser seed
from numpy.random import seed
seed(1)
from tensorflow import random
random.set_seed(2)

from sklearn.preprocessing import MinMaxScaler


from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout


In [None]:
ticker_list = ['XLE', 'XLF', 'XLU', 'XLI', 'GDX', 'XLK', 'XLV', 'XLY', 'XLP', 'XLB', 'XOP', 'IYR', 'XHB', 'ITB', 'VNQ', 'GDXJ', 'IYE', 'OIH', 'XME', 'XRT', 'SMH', 'IBB', 'KBE', 'KRE', 'XTL']



# Load CSV ticker data (this is an alternate to the API to keep data consistant)

df_ticker_data = pd.read_csv("../Data/EFT_optimised_signals.csv",
    index_col =[0],
    parse_dates = True,
    header = [0,1]
)


# Replace 'buy' and 'sell' with scalar '1' and '-1' 
df_ticker_data.replace(['buy', 'sell'],[1, -1] ,inplace=True)

df_ticker_data.dropna(inplace = True)

# Check df
df_ticker_data.head()

In [None]:
# this function inputs a data frame, window size and target column name
# this returns a rolling nested list of the features, and a list of the targets

def window_data_chunk(df, window, target_col_name):
    X = []
    y = []
    for i in range(len(df) - window - 1):
        
        df_features = df.drop(axis = 'columns', labels = target_col_name)
        features = df_features.iloc[i:(i + window), :]
        target = df[target_col_name].iloc[i + window]
        X.append(features)
        y.append(target)
        
    return np.array(X), np.array(y).reshape(-1, 1)

In [None]:
Out = X.reshape(2,-1) 

# X[1][1] + X[1][2]


test_train = X.reshape((X.shape[0], X.shape[1]*X.shape[2]))




In [None]:
test_train.shape

In [None]:
test_train[0][170]

In [None]:
split = int(0.7 * len(X))
X_train = X[: split]
X_test = X[split:]
y_train = y[: split]
y_test = y[split:]

In [None]:

# get ticker data
df_input_data = df_ticker_data['XLE']


X, y = window_data_chunk(df_input_data, 20, 'signal')

# X = df_input_data.drop(axis = 'columns', labels = 'signal')
# y = df_input_data['signal']


# Test and train Split

split = int(0.7 * len(X))
X_train = X[: split]
X_test = X[split:]
y_train = y[: split]
y_test = y[split:]


X.reshape((X.shape[0], X.shape[1] * X.shape[2], 1))

X_train = X_train.reshape((X_train.shape[0], X_train.shape[1]*X_train.shape[2]))
X_test  = X_test.reshape((X_test.shape[0], X_test.shape[1] * X_test.shape[2]))


# Create the MinMaxScaler() instance
scaler = MinMaxScaler()

# Fit the MinMaxScaler object with the training feature data X_train
scaler.fit(X_train)

# Scale the features training and testing sets
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)


# Reshape the features for the model
X_train = X_train.reshape((X_train.shape[0], X_train.shape[1], 1))
X_test = X_test.reshape((X_test.shape[0], X_test.shape[1], 1))



# Train the model

model = Sequential()

number_units = 5
dropout_fraction = 0.2

# Layer 1
model.add(LSTM(units=number_units, return_sequences=True, input_shape=(X_train.shape[1], 1)))
model.add(Dropout(dropout_fraction))
# Layer 2
model.add(LSTM(units=number_units, return_sequences=True))
model.add(Dropout(dropout_fraction))
# Layer 3
model.add(LSTM(units=number_units))
model.add(Dropout(dropout_fraction))
# Output layer
model.add(Dense(1))

model.compile(optimizer="adam", loss="mean_squared_error")




In [None]:
X_train.shape

In [None]:
model.summary()

In [None]:
model.fit(X_train, y_train, epochs=2, shuffle=False, batch_size=1, verbose=1)

## Evaluation

In [None]:
model.evaluate(X_test, y_test)

In [None]:
predicted = model.predict(X_test)

In [None]:
predicted

In [None]:
# Create a DataFrame of Real and Predicted values
df_results = pd.DataFrame({
    "Real": y_test.ravel(),
    "Predicted": predicted.ravel()
}, index = df_input_data.index[-len(y_test): ])
df_results.head(50)

In [None]:
df_results.plot()

In [None]:
df_test = df_ticker_data['XLE'].copy




In [None]:
df_test.iloc[5 : (5 + 5), : ]

df_test['signal'].iloc[(5 + 5), ]




In [None]:
df_test['signal'].iloc[(5 + 5)]

In [None]:
df_test.rolling(10).agg(list)

In [None]:
!pip install scikeras

In [None]:
# https://machinelearningmastery.com/binary-classification-tutorial-with-the-keras-deep-learning-library/

# Binary Classification with Sonar Dataset: Standardized Larger
import pandas as pd
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from scikeras.wrappers import KerasClassifier # this is new libray
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline


In [None]:

df_ticker_data = pd.read_csv("../Data/EFT_optimised_signals.csv",
    index_col =[0],
    parse_dates = True,
    header = [0,1]
)


# Replace 'buy' and 'sell' with scalar '1' and '-1' 
df_input_data = df_ticker_data.replace(['buy', 'sell'],[1, -1])
df_input_data = df_input_data['XTL']

df_input_data.dropna(inplace = True)







# load dataset
# dataframe = read_csv("sonar.csv", header=None)
# dataset = dataframe.values

# # split into input (X) and output (Y) variables
# X = dataset[:,0:60].astype(float)
# Y = dataset[:,60]

X = df_input_data.drop(axis = 'columns', labels = 'signal')
y = df_input_data['signal']



# encode class values as integers
encoder = LabelEncoder()
encoder.fit(y)
encoded_Y = encoder.transform(y)

# larger model
def create_larger():
    # create model
    model = Sequential()
    model.add(Dense(60, input_shape=(9,), activation='relu'))
    model.add(Dense(30, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    
    # Compile model
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

estimators = []
estimators.append(('standardize', StandardScaler()))
estimators.append(('mlp', KerasClassifier(model=create_larger, epochs=100, batch_size=5, verbose=0)))
pipeline = Pipeline(estimators)
kfold = StratifiedKFold(n_splits=10, shuffle=True)
results = cross_val_score(pipeline, X, encoded_Y, cv=kfold)
print("Larger: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

In [None]:
predicted = KerasClassifier.predict(X)

In [None]:
X

In [1]:
import warnings
warnings.filterwarnings('ignore')


In [2]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

In [3]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from imblearn.ensemble import BalancedRandomForestClassifier
from imblearn.ensemble import EasyEnsembleClassifier
from imblearn.metrics import classification_report_imbalanced

In [4]:
# Load the data
file_path = Path('../Data/EFT_optimised_signals.csv')
df = pd.read_csv(file_path,
                index_col =[0],
                parse_dates = True,
                header = [0,1]
)

# Preview the data
df.head()

Unnamed: 0_level_0,XTL,XTL,XTL,XTL,XTL,XTL,XTL,KRE,KRE,KRE,...,IBB,KBE,KBE,KBE,KRE,KRE,KRE,XTL,XTL,XTL
Unnamed: 0_level_1,Open,High,Low,Close,Volume,Dividends,Stock Splits,Open,High,Low,...,signal,EMA_short,EMA_long,signal,EMA_short,EMA_long,signal,EMA_short,EMA_long,signal
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2017-07-10,65.807837,66.284233,65.742449,66.284233,2100,0.0,0,49.230791,49.443759,48.991203,...,buy,39.10754,39.10754,sell,49.150928,49.150928,sell,66.284233,66.284233,sell
2017-07-11,65.751801,65.93862,65.723779,65.779823,9200,0.0,0,49.168673,49.230788,48.689498,...,buy,39.063135,39.101042,sell,49.121349,49.1466,sell,66.116096,66.259628,buy
2017-07-12,66.321578,66.321578,65.93859,66.088051,14400,0.0,0,48.813726,49.266282,48.751611,...,buy,39.021686,39.093127,sell,49.066134,49.137288,sell,66.106748,66.251258,buy
2017-07-13,65.863877,66.237521,65.845191,66.069382,14200,0.0,0,49.10657,49.275168,48.831487,...,buy,39.044382,39.092963,sell,49.088486,49.137088,sell,66.094293,66.242386,buy
2017-07-14,66.013344,66.312258,66.013344,66.293579,2300,0.0,0,48.432167,49.115433,48.210327,...,buy,38.99142,39.082843,sell,48.999859,49.121747,sell,66.160721,66.244883,buy


In [13]:
# Create our features
df1 = df['XTL']
df1.dropna(inplace = True)

X = df1.drop("signal", axis = 'columns').copy()
# encode non-calculateable fields to binary
# X = pd.get_dummies(X, columns = ['next_pymnt_d','issue_d','home_ownership','verification_status', 'pymnt_plan', 'hardship_flag', 'debt_settlement_flag', 'application_type', 'initial_list_status'])


# Create our target
y = df1["signal"].to_frame()


In [14]:
# Check the balance of our target values
# YOUR CODE HERE
y['signal'].value_counts()

buy     739
sell    514
Name: signal, dtype: int64

In [15]:
# Split the X and y into X_train, X_test, y_train, y_test
X_train, X_test, y_train, y_test = train_test_split(X, y.values, random_state=10)

In [16]:
# Create the StandardScaler instance
# YOUR CODE HERE
from sklearn.preprocessing import StandardScaler
# YOUR CODE HERE
scaler = StandardScaler()

In [17]:
# Fit the Standard Scaler with the training data
# When fitting scaling functions, only train on the training dataset
# YOUR CODE HERE

X_scaler = scaler.fit(X_train)

In [18]:
# Scale the training and testing data
# YOUR CODE HERE

X_train = X_scaler.transform(X_train)
X_test = X_scaler.transform(X_test)

In [19]:
# Resample the training data with the BalancedRandomForestClassifier
# YOUR CODE HERE
model = BalancedRandomForestClassifier(random_state=1)
model.fit(X_train, y_train)

BalancedRandomForestClassifier(random_state=1)

In [20]:
# Calculated the balanced accuracy score
# YOUR CODE HERE
y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.8662321539416511

In [21]:
# Display the confusion matrix
# YOUR CODE HERE

confusion_matrix(y_test, y_pred)

array([[151,  28],
       [ 15, 120]], dtype=int64)

In [22]:
# Print the imbalanced classification report
# YOUR CODE HERE
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

        buy       0.91      0.84      0.89      0.88      0.87      0.75       179
       sell       0.81      0.89      0.84      0.85      0.87      0.75       135

avg / total       0.87      0.86      0.87      0.86      0.87      0.75       314



In [23]:
# List the features sorted in descending order by feature importance
# YOUR CODE HERE
importances = pd.Series(model.feature_importances_, index=X.columns).sort_values(ascending=False)
importances.nlargest(20)

EMA_long        0.315303
Close           0.145609
EMA_short       0.122882
Low             0.118328
High            0.107512
Open            0.100563
Volume          0.087959
Dividends       0.001844
Stock Splits    0.000000
dtype: float64

### Easy Ensemble Classifier

In [24]:
# Train the Classifier
# YOUR CODE HERE
model = EasyEnsembleClassifier(random_state=1)
model.fit(X_train, y_train)


EasyEnsembleClassifier(random_state=1)

In [25]:
# Calculated the balanced accuracy score
# YOUR CODE HERE

y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.8774053382991931

In [26]:
# Display the confusion matrix
# YOUR CODE HERE

confusion_matrix(y_test, y_pred)

array([[155,  24],
       [ 15, 120]], dtype=int64)

In [27]:
# Print the imbalanced classification report
# YOUR CODE HERE
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

        buy       0.91      0.87      0.89      0.89      0.88      0.77       179
       sell       0.83      0.89      0.87      0.86      0.88      0.77       135

avg / total       0.88      0.88      0.88      0.88      0.88      0.77       314



In [29]:
full_pred = model.predict(X)

In [None]:
## returns test

In [30]:
df1['signal'] = full_pred

In [35]:
# Test Model pridictions against the full duration for comparison

df_temp = pd.DataFrame(columns=['Portfolio_Value'])

# complete trading backtest
fee = 0.01 #(1% fee)

# set starting conditions
start_fund = 1000
fund = start_fund
stock = 0
status = 'short'

# cycle through all rows
for index, row in df1.iterrows():

    # buy conditions and action
    if status == 'short' and row['signal'] == 'buy':

        stock = np.floor(fund/(row['Close']*(1+fee)))

        fund = round(fund - (row['Close']*stock), 2)

        status = 'long'

    #sell conditions and actions
    if status == 'long' and row['signal'] == 'sell':

        fund = round(fund + ((row['Close'] * stock)*(1-fee)), 2)

        stock = 0

        status = 'short'

        
        
    #record the total portfolio value        
    df_temp.loc[index, ('Portfolio_Value')] = fund + ((row['Close'] * stock)*(1-fee))
    
# calculate profit or loss as % return (positive or negative)
# remaining funds + sale of any stock held less starting value, divide by staring value to get % change
final_value = fund + ((row['Close'] * stock)*(1-fee))
pct_return = round((final_value - start_fund)/start_fund,5) * 100

print(pct_return)
print(df_temp)

16.114
           Portfolio_Value
2017-07-10      990.719471
2017-07-11      983.728351
2017-07-12      988.000385
2017-07-13      987.741631
2017-07-14      990.849006
...                    ...
2022-06-23         1166.96
2022-06-24     1187.056958
2022-06-27     1189.690392
2022-06-28     1162.801958
2022-06-29     1161.138825

[1253 rows x 1 columns]
