Simple example of batch-learning LDA classification for a single asset, with the Kelly criterion for position sizing, for comparison with the streaming examples.

In [None]:
import os
import sys
# Put the parent directory on the import path (once) so that modules living
# one level above this notebook can be imported.
module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

In [None]:
import pytz
import pandas as pd
import pyfolio as pf
import seaborn as sns
from time import time
from pprint import pprint as pp
import matplotlib.pyplot as plt
from datetime import datetime as dt
import pandas_datareader.data as web

import cytrader as bt
import cytrader.analyzers as btanalyzers

from sklearn.metrics import accuracy_score, classification_report
import sklearn
from joblib import dump, load

from histData import binance_bars
from utils.tools import to_utc
from features import add_hist_features
# from features_py import add_hist_features
from mlPipelines import ttsplit, feat_pipe1, lda_pipe

from strategies import Classifier, KellyML, SMAStrategy
from backtest import BinanceCommision, SignalData, format_time

sk_ver = sklearn.__version__

import timeit

In [None]:
# Commission rate used by the reporting code further down.
comission = 0.001


class SignalData(bt.feeds.PandasData):
    """
    Pandas-backed data feed exposing price columns plus the model signal.

    Each name in ``cols`` is declared both as a feed line and as a parameter
    with value ``-1``; ``datetime`` is added as a parameter with value
    ``None``.

    NOTE: this definition replaces the ``SignalData`` name imported from
    ``backtest`` earlier in the file.
    """
    # NOTE(review): named OHLCV but there is no 'volume' entry -- confirm
    # this is intentional.
    OHLCV = ['open', 'high', 'low', 'close']
    cols = OHLCV + ['predicted']

    # one feed line per column
    lines = tuple(cols)

    # (name, value) parameter pairs: every column first, then 'datetime'
    params = tuple([(c, -1) for c in cols] + [('datetime', None)])

In [None]:
# --- Run configuration ---------------------------------------------------

# Symbols to process and the bar interval requested for them.
tickers = ['BTCUSDT']
interval = '1h'

# History range; an empty end date is replaced by the current time further down.
start_dt = '2017-01-01'
end_dt = None

# Feature window (presumably 30 days of hourly bars), hold-out fraction and
# starting cash for the backtest.
vol_window = 24 * 30
test_size = 0.2
start_nav = 100_000

# Switches: read bars from a local CSV instead of fetching them, and write the
# feature set to CSV after it is built.
local_csv = False
save_csv = False

In [None]:
# Convert the configured date range with the project's `to_utc` helper and
# echo the effective run parameters.
start_dt = to_utc(start_dt)
print("START DATE:", start_dt)

# An unset end date means "up to now". `dt.now(pytz.utc)` produces the same
# timezone-aware UTC timestamp as localizing `dt.utcnow()`, but avoids
# `utcnow()`, which is deprecated since Python 3.12.
end_dt = to_utc(end_dt) if end_dt else dt.now(pytz.utc)
print("END DATE:", end_dt)
print("TICKERS:", tickers)

In [None]:
# Raw price bars per ticker, keyed by symbol.
sets = {}

for ticker in tickers:
    if local_csv:
        # NOTE(review): the CSV path is hard-coded to a single BTCUSDT file, so
        # every ticker would load the same data -- confirm before running with
        # more than one symbol.
        sec = pd.read_csv('../csv_files/BTCUSDT_1h_2017-08-17_2022-05-10.csv',
                          low_memory=False, index_col=['DateTime'],
                          parse_dates=['DateTime'], infer_datetime_format=True)
    else:
        sec = binance_bars(symbol=ticker, interval=interval,
                           start_dt=start_dt, end_dt=end_dt, limit=None, dtype='df')

    # Store inside the loop: previously this assignment ran once after the
    # loop, so only the last ticker was kept (and an empty `tickers` list
    # raised NameError).
    sets[ticker] = sec

In [None]:
# Replace each ticker's raw bars with its engineered feature set.
for ticker in sets:
    # Use this ticker's own frame. The previous code passed `sec`, i.e.
    # whichever frame happened to be loaded last, for every ticker.
    dset = add_hist_features(ticker, sets[ticker], interval=interval,
                             vol_window=vol_window, test_size=test_size)
    sets[ticker] = dset

    if save_csv:
        # File name records the symbol, interval and the date span covered.
        str_start = dset.index.min().strftime('%Y-%m-%d')
        str_end = dset.index.max().strftime('%Y-%m-%d')
        # NOTE(review): this writes to 'csv_files/' while the loader reads from
        # '../csv_files/' -- confirm which directory is intended.
        f_name = f'csv_files/{ticker}_{interval}_{str_start}_{str_end}.csv'
        dset.to_csv(f_name)

In [None]:
# Print the `.info()` summary of one feature set. `ticker` here is the
# variable left over from the preceding loop, i.e. the last ticker processed.
sets[ticker].info()

In [None]:
# Preview three columns of the BTCUSDT feature set (displayed as the cell's
# output). The symbol is hard-coded rather than taken from `tickers`.
sets['BTCUSDT'][['Price_Returns', 'target', 'Kalman_Filter']]

In [None]:
# Fit, persist, reload and evaluate one LDA model per ticker, then keep only
# the hold-out rows (with the model's predictions attached) for backtesting.
for ticker, dset in sets.items():
    X_train, X_test, y_train, y_test = ttsplit(dset, test_size)

    # Fit the feature pipeline on the training rows only, then apply it to
    # the hold-out rows.
    feat_pipe = feat_pipe1(dset)
    X_train = feat_pipe.fit_transform(X_train)
    X_test = feat_pipe.transform(X_test)

    model = lda_pipe().fit(X_train, y_train)

    # The file name encodes interval, ticker, fit time and sklearn version.
    stamp = dt.now().strftime("%Y_%m_%d-%H_%M")
    model_name = f'../models/LDA_{interval}_{ticker}_{stamp}_skl{sk_ver}.joblib'

    # Round-trip through disk so the predictions come from the saved artefact.
    dump(model, model_name)
    model = load(model_name)

    predictions = model.predict(X_test)

    print()
    accuracy = accuracy_score(y_test, predictions)
    print(ticker, "Accuracy:", f"{accuracy:.2%}")
    print(classification_report(y_test, predictions, zero_division=0))
    display(pd.DataFrame({'Test Set': y_test, 'Predictions': predictions}))

    # Positional split of the full feature set at the same fraction.
    train_len = int(len(dset) * (1 - test_size))
    train_set = dset.iloc[:train_len].copy()
    test_set = dset.iloc[train_len:].copy()

    test_set['predicted'] = predictions

    sets[ticker] = test_set

In [None]:
# Commission rate read by the reporting cell below.
comission = 0.001

# Engine and broker: commission scheme plus starting cash.
cerebro = bt.Cerebro()
comminfo = BinanceCommision()
# comminfo = FixedCommisionScheme()
cerebro.broker.addcommissioninfo(comminfo)
cerebro.broker.setcash(start_nav)
print(f"Opening NAV: ${start_nav:,.2f}")

# One data feed per ticker; the index is renamed to 'datetime' before the
# frame is wrapped in the feed class.
for ticker, bt_df in sets.items():
    bt_df.index.name = 'datetime'
    bt_data = SignalData(dataname=bt_df)
    cerebro.adddata(bt_data, name=ticker)

# Strategy under test.
strategy_kwargs = dict(n_positions=1, min_positions=0,
                       verbose=True, kel_bounds=[0., 1.], kel_window=24*30)
cerebro.addstrategy(KellyML, **strategy_kwargs)
# cerebro.addstrategy(SMAStrategy, n_positions=1, min_positions=0,
#                     verbose=True, ma_window=60, kel_bounds=[0., 1.], kel_window=90)

# Analyzers, registered in this order under the names the later cells look up.
for analyzer_cls, analyzer_name in (
    (bt.analyzers.SQN, 'sqn'),
    (bt.analyzers.PyFolio, 'pyfolio'),
    (btanalyzers.Transactions, 'trans'),
    (btanalyzers.TradeAnalyzer, 'trades'),
):
    cerebro.addanalyzer(analyzer_cls, _name=analyzer_name)

In [None]:
# Run the backtest, timing the call, and read the closing portfolio value.
start = time()
results = cerebro.run()
ending_value = cerebro.broker.getvalue()
duration = time() - start

# Output of the 'trans' analyzer; its length is reported as the trade count.
tx_by_time = results[0].analyzers.trans.get_analysis()
n_trades = len(tx_by_time)

# Span of the backtested data (`bt_df` is the last frame fed to cerebro).
print("Test Start:", bt_df.index.min())
print("Test End:", bt_df.index.max())
test_dt_len = (bt_df.index.max() - bt_df.index.min()).total_seconds() / (24 * 60 * 60)
print(f"Days: {test_dt_len:,.2f}")

print("Opening NAV: ${:,.2f}".format(start_nav))
print(f'Closing NAV ${ending_value:,.2f}')
gross_ret = (ending_value / start_nav) - 1
print("Gross Return: {:,.2%}".format(gross_ret))
print(f'Duration: {format_time(duration)}')
print(f'Trades: {n_trades:,}')
print('System Quality Number: ', results[0].analyzers.sqn.get_analysis())

# Commission is a rate, so it has to be applied to traded notional. The old
# figure multiplied the transaction *count* by the rate, which is not a
# dollar amount.
# NOTE(review): assumes each analyzer entry is a list of fills whose last field
# is the signed traded value (the backtrader `Transactions` layout), and that
# `comission` matches the rate charged by the broker's scheme -- confirm.
traded_notional = sum(abs(fill[-1]) for fills in tx_by_time.values() for fill in fills)
print('Total Comission: ${:,.2f}'.format(traded_notional * comission))

In [None]:
# print('Trades: ')
# pp(results[0].analyzers.trades.get_analysis())

In [None]:
# prepare pyfolio inputs
pyfolio_analyzer = results[0].analyzers.getbyname('pyfolio')
returns, positions, transactions, gross_lev = pyfolio_analyzer.get_pf_items()

# Persist the four result objects. `key` is passed by keyword because newer
# pandas releases deprecate passing it positionally.
returns.to_hdf('backtrader.h5', key='returns')
positions.to_hdf('backtrader.h5', key='positions')
transactions.to_hdf('backtrader.h5', key='transactions/')
gross_lev.to_hdf('backtrader.h5', key='gross_lev')

# Benchmark series from FRED over the same date span, converted to returns
# and localized to UTC before being joined with the strategy returns.
bench_name = 'CBBTCUSD' # 'SP500'
benchmark = web.DataReader(bench_name, 'fred', returns.index.min(), returns.index.max()).squeeze()
benchmark = benchmark.pct_change().tz_localize('UTC')

# Per-timestamp totals of positive and negative transaction values.
daily_tx = transactions.groupby(level=0)
longs = daily_tx.value.apply(lambda x: x.where(x > 0).sum())
shorts = daily_tx.value.apply(lambda x: x.where(x < 0).sum())

fig, axes = plt.subplots(ncols=2, figsize=(15, 5))

# Left panel: compounded strategy vs. benchmark returns.
df = returns.to_frame('Strategy').join(benchmark.to_frame(f'Benchmark ({bench_name})'))
df.add(1).cumprod().sub(1).plot(ax=axes[0], title='Cumulative Return')

# Right panel: long/short transaction value plus the cash column.
# NOTE(review): the series labelled 'PF Value' is `positions.cash` -- confirm
# the label is what is intended.
longs.plot(label='Long', ax=axes[1], title='Positions')
shorts.plot(ax=axes[1], label='Short')
positions.cash.plot(ax=axes[1], label='PF Value')
axes[1].legend()
sns.set()
sns.despine()
fig.tight_layout()

# Missing benchmark returns are filled with 0 before being handed to pyfolio.
pf.create_full_tear_sheet(returns,
                          transactions=transactions,
                          positions=positions,
                          benchmark_rets=benchmark.fillna(0))