In [1]:
import sys
sys.path.append("..")

from core.data_manager import retrieve_data_offline 
from core.constants import BEAR_PATTERNS, BULL_PATTERNS
from core import indicators as ic
from core.common import remove_multi
import vectorbtpro as vbt
import gc
import numbers

import pandas as pd
import numpy as np
from numba import njit
import joblib

from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.preprocessing import StandardScaler 
from sklearn.model_selection import train_test_split
from sklearn import metrics

import pickle

In [11]:
class ML():
    def __init__(
            self,
            period: str,
            indexes: list=["CAC40", "DAX", "NASDAQ"], #,"NYSE"
            ):

        #init
        for k in ["indexes"]:
            setattr(self,k,locals()[k])
        
        for key in ["close","open","low","high","data"]:
            setattr(self,key+"_dic",{})
            setattr(self,key+"_ind_dic",{})

        for ind in self.indexes:
            retrieve_data_offline(self,ind,period)                
            self.data_dic[ind]=self.data
            for d in ["Close","Open","Low","High"]:
                getattr(self,d.lower()+"_dic")[ind]=self.data_dic[ind].get(d)   
                getattr(self,d.lower()+"_ind_dic")[ind]=self.data_ind.get(d)   
            
    def prepare(self,
               test_size:numbers.Number=0.2,
               data_name:str=None,
               ):
        if data_name is None:
            self.defi_x()
            self.x_df=self.flatten(self.all_x)
            self.defi_y()
            self.y_df=self.flatten(self.all_y)
        else:
            self.x_df=pd.read_csv("x_"+data_name+".csv",index_col=[0,1,2])
            self.y_df=pd.read_csv("y_"+data_name+".csv",index_col=[0,1,2])
            
        self.x_train, self.x_test, self.y_train, self.y_test = train_test_split(
            self.x_df, 
            self.y_df, 
            test_size=test_size, 
        )
        print("preparation finished")
        
    def save(self,data_name:str):
        self.x_df.to_csv("x_"+data_name+".csv")
        self.y_df.to_csv("y_"+data_name+".csv")

    def defi_x(self):
        self.all_x={}
        for ind in self.indexes: #CAC, DAX, NASDAQ
            all_x={}
            open_=self.open_dic[ind]
            high=self.high_dic[ind]
            low=self.low_dic[ind]
            close=self.close_dic[ind]
            close_ind=self.close_ind_dic[ind]
            
            t=ic.VBTMA.run(close)
            all_x['MA_ent']=t.entries.astype(float)
            all_x['MA_ex']=t.exits.astype(float)
            
            t=ic.VBTSTOCHKAMA.run(high,low,close)
            all_x['STOCH_ent']=t.entries_stoch.astype(float)
            all_x['STOCH_ex']=t.exits_stoch.astype(float)
            all_x['STOCH_v']=t.stoch
            
            all_x['KAMA_ent']=t.entries_kama.astype(float)
            all_x['KAMA_ex']=t.exits_kama.astype(float)

            t=ic.VBTSUPERTREND.run(high,low,close)
            all_x['SUPERTREND_ent']=t.entries.astype(float)
            all_x['SUPERTREND_ex']=t.exits.astype(float)
                            
            t=vbt.BBANDS.run(close)
            all_x['BBANDS_ent']=t.lower_above(close).astype(float)
            all_x['BBANDS_ex']=t.upper_below(close).astype(float)
            all_x["bandwidth"]=t.bandwidth
            all_x["bandwidth_above"]=t.bandwidth_above(close)
            all_x["bandwidth_below"]=t.bandwidth_below(close)
            
            t=vbt.RSI.run(close,wtype='simple')
            all_x['RI20_ent']=t.rsi_crossed_below(20).astype(float)
            all_x['RI20_ex']=t.rsi_crossed_above(80).astype(float)
            
            all_x['RI30_ent']=t.rsi_crossed_below(30).astype(float)
            all_x['RI30_ex']=t.rsi_crossed_above(70).astype(float)

            for func_name in BULL_PATTERNS:
                all_x[func_name+'_ent']=ic.VBTPATTERNONE.run(open_,high,low,close,func_name, "ent").out.astype(float)
                
            for func_name in BEAR_PATTERNS:
                all_x[func_name+'ex']=ic.VBTPATTERNONE.run(open_,high,low,close,func_name, "ex").out.astype(float)
            
            all_x["GROW_30"]=ic.VBTGROW.run(close,distance=30, ma=False).out
            all_x["GROW_30_rank"]=ic.VBTRANK.run(all_x["GROW_30"]).rank_arr
            all_x["GROW_50"]=ic.VBTGROW.run(close,distance=50, ma=False).out
            all_x["GROW_50_rank"]=ic.VBTRANK.run(all_x["GROW_50"]).rank_arr
          
            t=ic.VBTMA.run(close)
            all_x["MA_ent"]=t.entries.astype(float)
            all_x["MA_ex"]=t.exits.astype(float)
            all_x["MA_fast_over_slow"]=t.fast_over_slow.astype(float)
            
            all_x["KAMA_duration"]=ic.VBTKAMATREND.run(close).duration
            all_x["KAMA_duration_rank"]=ic.VBTRANK.run(all_x["KAMA_duration"]).rank_arr

            all_x["volatility"]=ic.VBTNATR.run(high, low, close).natr
            
            macd=vbt.MACD.run(close, macd_wtype='simple',signal_wtype='simple')
            all_x["hist"]=macd.hist
            all_x["macd"]=macd.macd

            all_x["divergence"]=ic.VBTDIVERGENCE.run(close,close_ind).out
            self.all_x[ind]=all_x
                        
        del t
        gc.collect()
        
    def defi_y(self):
        self.all_y={}
        for ind in self.indexes: #CAC, DAX, NASDAQ
            all_y={}
            open_=self.open_dic[ind]
            high=self.high_dic[ind]
            low=self.low_dic[ind]
            close=self.close_dic[ind]  
            
            t=ic.VBTMINMAX.run(close)
            all_y['max_3mo']=t.maximum
            all_y['min_3mo']=t.minimum 
            
            self.all_y[ind]=all_y
        
    def create_empty_x_df(self, ind, s):
        return pd.MultiIndex.from_arrays([
            self.close_dic[ind].index,  #list(
            [s for ii in self.close_dic[ind].index],
            [ind for ii in self.close_dic[ind].index]
        ])
        
    def flatten(self, input_arr):
        df_total=None
        ts={}

        #remove the multiindex only once
        for ind in self.indexes: #CAC, DAX, NASDAQ    
            ts[ind]={}
            for col in input_arr[ind]:
                ts[ind][col]=remove_multi(input_arr[ind][col])
           
        #somehow vbt is designed with the columns in the other orders so to say, which lead to this very computer intensive function
        for ind in self.indexes: #CAC, DAX, NASDAQ
            for s in self.close_dic[ind].columns:
                dfs=[]
                for col in input_arr[ind]:
                    dfs.append(ts[ind][col][s].rename(col))
                #put columns together
                df=pd.concat(dfs,axis=1)
                #clean
                df=df.fillna(0)
                df=df.replace([np.inf, -np.inf], 0)   
                #get the index
                df.set_index(self.create_empty_x_df(ind,s) ,inplace=True)
                
                #put rows together
                if df_total is None:
                    df_total=df
                else:
                    df_total=pd.concat([df_total,df])
        return df_total
    
    def unflatten(self, df, col: str) -> dict:
        indexes=pd.unique(df.index.get_level_values(2))
        out={}
        out2={}

        for ind in indexes:
            sub_df=y_df[df.index.get_level_values(2)==ind]
            out[ind]={}

            for s in pd.unique(sub_df.index.get_level_values(1)):
                sub_df2=sub_df[sub_df.index.get_level_values(1)==s]
                out[ind][s]=sub_df2[col].values
        
            out2[ind]=pd.DataFrame(data=out[ind],index=pd.unique(sub_df.index.get_level_values(0)))  
        return out2
    
    def train(
        self,
        model_name:str="model"
        ):
        
        self.model_name=model_name
        self.scaler = StandardScaler()  
        self.scaler.fit(self.x_train)
        scaled_x_train=self.scaler.transform(self.x_train)
        self.clf =  MLPRegressor(solver='lbfgs', 
                            alpha=1e-5, 
                            hidden_layer_sizes=(10, 2), 
                            random_state=1,
                            max_iter=10000)
        print("starting the fitting")
        self.clf.fit(scaled_x_train, self.y_train)
        with open("models/"+model_name+".pickle", "wb") as f:
            pickle.dump(self.clf, f)
            
        joblib.dump(self.scaler, "models/scaler_"+self.model_name+".save") 
        print("model saved, starting the testing")
        acc=self.test()

    def test(self, model_name:str="model"):
        self.load_model(model_name)
        scaled_x_test=self.scaler.transform(self.x_test)
        acc = self.clf.score(scaled_x_test, self.y_test)
        print("accurary: "+str(acc))
        return acc  
    
    def load_model(self, model_name:str="model"):
        self.model_name=model_name
        if "scaler" not in self.__dir__():
            self.scaler = joblib.load("models/scaler_"+self.model_name+".save")
        if "clf" not in self.__dir__():   
            with open("models/"+model_name+".pickle", 'rb') as pickle_file:
                self.clf = pickle.load(pickle_file)
    
    def use(self,model_name:str, x_df):
        self.load_model(model_name)
        scaled_x_df=self.scaler.transform(x_df)
        y=self.clf.predict(scaled_x_df)
        return y, pd.DataFrame(data=y,columns=['max_3mo','min_3mo'],index=m.x_df.index)    


In [3]:
period="2007_2023_08"

In [12]:
m=ML(period)

  0%|          | 0/40 [00:00<?, ?it/s]

  0%|          | 0/32 [00:00<?, ?it/s]

  0%|          | 0/71 [00:00<?, ?it/s]

In [165]:
m.prepare()

preparation finished


In [None]:
m.flatten


In [152]:
m.x_df=m.x_df.fillna(0)
m.x_df=m.x_df.replace([np.inf, -np.inf], 0)    

In [102]:
type(m.x_df[m.x_df.index.get_level_values(2)=="CAC40"].index[0][0])

str

In [104]:
type(m.x_df.index[0][0])

str

In [166]:
m.save("first")
    

In [7]:
m.train(model_name="first")

starting the fitting
model saved, starting the testing
accurary: 0.11809063428097971


In [13]:
m.prepare(data_name="first")

preparation finished


In [8]:
m.test(model_name="first")

accurary: 0.11809063428097971


0.11809063428097971

In [14]:
a, y_df=m.use("first",m.x_df) 

In [15]:
out_dic=m.unflatten(y_df,"max_3mo")

In [16]:
ind="NASDAQ"

cand=out_dic[ind].idxmax(axis=1)
ent=ic.VBTFALSE.run(m.close_dic[ind]).out

for i in m.close_dic[ind].index:
    ent.loc[i,cand.loc[i]]=True
ex=~ent  




KeyError: Timestamp('2007-01-03 00:00:00-0500', tz='America/New_York')

In [None]:
pf=vbt.Portfolio.from_signals(m.close_dic[ind], ent,ex,freq="1d",
                            call_seq='auto',cash_sharing=True
                             )
pf.returns_stats()

In [144]:
pf.plot()


Subplot 'orders' does not support grouped data


Subplot 'trade_pnl' does not support grouped data



FigureWidget({
    'data': [{'legendgroup': '0',
              'line': {'color': '#7f7f7f'},
              'mode': 'lines',
              'name': 'Benchmark',
              'showlegend': True,
              'type': 'scatter',
              'uid': '87c7ac17-c8f9-43a7-8238-50c02338ba31',
              'x': array([datetime.datetime(2007, 1, 3, 0, 0, tzinfo=<DstTzInfo 'America/New_York' EST-1 day, 19:00:00 STD>),
                          datetime.datetime(2007, 1, 4, 0, 0, tzinfo=<DstTzInfo 'America/New_York' EST-1 day, 19:00:00 STD>),
                          datetime.datetime(2007, 1, 5, 0, 0, tzinfo=<DstTzInfo 'America/New_York' EST-1 day, 19:00:00 STD>),
                          ...,
                          datetime.datetime(2023, 7, 27, 0, 0, tzinfo=<DstTzInfo 'America/New_York' EDT-1 day, 20:00:00 DST>),
                          datetime.datetime(2023, 7, 28, 0, 0, tzinfo=<DstTzInfo 'America/New_York' EDT-1 day, 20:00:00 DST>),
                          datetime.datetime(2023, 

In [135]:
m.close_dic["CAC40"].iloc[-1000:]

symbol,AC,AI,AIR,ALO,ATO,BN,BNP,CA,CAP,CS,...,SAN,SGO,SLB,STMPA,SU,SW,TEP,TTE,VIE,VIV
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2019-09-06 00:00:00+02:00,39.555298,95.733505,119.158913,38.462688,68.589592,68.916260,31.397350,14.565660,106.809700,15.700559,...,70.634567,29.351543,27.684576,16.910759,70.191940,93.189453,184.778214,34.835148,19.452808,22.960785
2019-09-09 00:00:00+02:00,39.584988,94.396454,119.384796,38.151169,67.448738,67.465752,32.329285,14.854090,105.164299,15.995494,...,68.617432,29.754700,29.212624,17.129179,70.974274,92.499496,177.742538,35.249844,19.208227,22.761127
2019-09-10 00:00:00+02:00,39.367268,95.198685,115.111649,37.654686,66.229187,67.044090,32.950577,15.223639,103.518906,16.239449,...,68.184578,30.977316,31.010326,16.915609,72.011330,92.269516,173.484131,36.048824,19.155819,22.643150
2019-09-11 00:00:00+02:00,39.129761,95.962723,116.241119,38.336136,66.052162,67.685020,32.765671,15.178574,103.001778,16.352325,...,68.530869,30.907202,31.100212,17.551460,72.666306,93.051460,174.965332,35.873806,18.788950,22.643150
2019-09-12 00:00:00+02:00,37.981796,97.834610,117.483536,36.457294,66.091492,68.174141,32.717594,15.228148,102.343620,16.330477,...,69.058952,30.788885,29.662046,17.653393,73.230309,93.281441,179.316299,35.443882,19.033529,22.815582
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-07-25 00:00:00+02:00,33.689999,160.399994,133.020004,26.910000,14.375000,56.759998,58.590000,17.465000,173.100006,27.745001,...,97.620003,58.790001,51.910000,46.785000,163.500000,93.739998,158.500000,54.340000,29.620001,8.370000
2023-07-26 00:00:00+02:00,33.380001,158.220001,133.179993,26.959999,14.440000,55.900002,58.119999,17.975000,173.100006,27.680000,...,96.410004,58.540001,52.189999,46.330002,160.220001,94.239998,158.100006,53.860001,29.740000,8.354000
2023-07-27 00:00:00+02:00,33.730000,161.960007,131.000000,28.200001,14.580000,56.200001,59.849998,18.540001,179.600006,28.070000,...,97.750000,61.220001,52.590000,50.459999,164.380005,93.620003,131.399994,54.320000,29.780001,8.430000
2023-07-28 00:00:00+02:00,34.049999,163.699997,133.419998,27.790001,11.300000,56.430000,60.180000,18.264999,167.000000,28.180000,...,94.959999,61.459999,51.910000,48.395000,162.139999,93.360001,135.600006,54.439999,29.719999,8.224000


In [138]:
pf.trades.records.sort_values(["entry_idx"], ascending=True).head(50)

Unnamed: 0,id,col,size,entry_order_id,entry_idx,entry_price,entry_fees,exit_order_id,exit_idx,exit_price,exit_fees,pnl,return,direction,status,parent_id
199,0,31,3.61212,0,0,27.684576,0.0,1,29,25.976763,0.0,-6.168826,-0.061688,0,1,0
114,0,24,3.291027,0,29,28.511211,0.0,1,30,29.096851,0.0,1.927357,0.020541,0,1,0
81,0,15,2.347733,0,30,40.787655,0.0,1,37,44.6777,0.0,9.132788,0.095373,0,1,0
122,0,27,2.266928,0,37,46.270252,0.0,1,38,44.411015,0.0,-4.214757,-0.040182,0,1,0
82,1,15,2.301004,2,38,43.753326,0.0,3,39,43.73407,0.0,-0.044309,-0.00044,0,1,1
123,1,27,2.347882,2,39,42.860867,0.0,3,41,44.120655,0.0,2.957835,0.029393,0,1,1
200,1,31,3.557017,2,41,29.122738,0.0,3,48,29.662046,0.0,1.91833,0.018518,0,1,1
124,2,27,2.510196,4,48,42.031937,0.0,5,52,41.26857,0.0,-1.9162,-0.018162,0,1,2
201,2,31,3.729765,4,52,27.774462,0.0,5,57,30.381124,0.0,9.72224,0.093851,0,1,2
125,3,27,2.762711,6,57,41.015678,0.0,7,58,41.15617,0.0,0.388137,0.003425,0,1,3
