In [None]:
## Necessary packages
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import numpy as np
import warnings
warnings.filterwarnings("ignore")

# 1. TimeGAN model
from timegan import timegan
# 2. Data loading
from data_loading import real_data_loading, sine_data_generation
# 3. Metrics
from metrics.discriminative_metrics import discriminative_score_metrics
from metrics.predictive_metrics import predictive_score_metrics
from metrics.visualization_metrics import visualization
import pandas as pd
from tqdm import tqdm

In [None]:
def get_intervals(data):
    """Locate the runs of consecutive values in ``data['index']``.

    Args:
        data: DataFrame with an integer ``'index'`` column. Rows whose
            ``'index'`` values increase by exactly 1 belong to the same run.

    Returns:
        A list of ``[start, end]`` positional pairs with ``end`` exclusive, so
        each run can be taken with ``data.iloc[start:end]``. An empty frame
        yields an empty list.
    """
    # Work on positions, not labels, so a non-default DataFrame index is fine.
    index = data['index'].to_numpy()
    n_rows = len(index)
    if n_rows == 0:
        return []

    intervals = []
    run_start = 0
    for pos in range(1, n_rows):
        if index[pos] != index[pos - 1] + 1:
            # ``pos`` is the first row of the next run, i.e. the exclusive end.
            intervals.append([run_start, pos])
            run_start = pos
    # Close the final run at n_rows (exclusive); ending it at n_rows - 1 would
    # silently drop the last row.
    intervals.append([run_start, n_rows])
    return intervals

In [None]:
def interpolation(data):
    """Stretch ``data`` to 24 points and fill the gaps by interpolation.

    NaN placeholders are inserted into ``data`` itself (the list is modified in
    place) at roughly even spacing until it holds 24 entries; the padded list
    is then interpolated with a second-order polynomial.

    Args:
        data: list of numeric values.

    Returns:
        A ``pandas.Series`` holding the interpolated values.
    """
    target_len = 24
    missing = target_len - len(data)
    if missing != 0:
        stride = target_len // missing
        for k in range(missing):
            slot = (stride + 1) * k + stride
            # Clamp so a placeholder is never put after the current last value.
            data.insert(min(slot, len(data) - 1), float('nan'))
    return pd.Series(data).interpolate(method='polynomial', order=2)

In [None]:
def MinMaxScaler(data):
    """Rescale ``data`` along axis 0 with min-max normalisation.

    Args:
        data: array-like accepted by ``np.min`` / ``np.max`` with ``axis=0``.

    Returns:
        ``(data - min) / (max - min + 1e-7)``; the small constant avoids a
        division by zero when a column is constant.
    """
    col_min = np.min(data, 0)
    col_range = np.max(data, 0) - col_min
    return (data - col_min) / (col_range + 1e-7)

In [None]:
def minmax_normlization(data):
    """Min-max normalise ``data`` using its own ``.min()`` and ``.max()``.

    Returns:
        ``(data - data.min()) / (data.max() - data.min() + 1e-7)``.
    """
    lowest = data.min()
    spread = data.max() - lowest
    return (data - lowest) / (spread + 1e-7)

In [None]:
def get_data_of_same_length(data,seq_len):
    """Cut each contiguous run of ``data`` into sliding windows of ``seq_len`` rows.

    Runs are found with ``get_intervals`` from the ``'index'`` column, so no
    window spans a gap in that column.

    Args:
        data: DataFrame containing an ``'index'`` column.
        seq_len: number of rows per window.

    Returns:
        A list of DataFrame slices, each ``seq_len`` rows long and carrying all
        columns of ``data`` (including ``'index'``). Runs with ``seq_len`` rows
        or fewer contribute nothing.
    """
    # The earlier ``data.drop(columns=['index'])`` discarded its result and an
    # unused ``data_processed`` list was allocated; both were dead code and are
    # removed without changing what is returned.
    windows = []
    for start, stop in get_intervals(data):
        data_seg = data.iloc[start:stop, :]
        # Stride-1 windows; the range stops before the window that would end
        # exactly on the run's last row.
        for offset in range(0, len(data_seg) - seq_len):
            windows.append(data_seg.iloc[offset:offset + seq_len, :])
    return windows

# Run through

In [None]:
def dataset_prepareation(path, output_dir='./data'):
    """Normalise a labelled price CSV per ticker and write one CSV per (ticker, regime).

    Every feature column present in the file is min-max scaled separately for
    each ticker with ``MinMaxScaler``. The rows of each ticker are then split
    by ``'label'`` and written to
    ``<output_dir>/data_seg_<tic>_<label>.csv``.

    Args:
        path: CSV file with at least the ``'tic'`` and ``'label'`` columns plus
            the price/volume columns written out below.
        output_dir: directory receiving the segment files; created if missing.
            Defaults to ``'./data'``.

    Returns:
        None. The function only writes files.
    """
    import os  # local import: only needed for the output directory handling

    data = pd.read_csv(path).reset_index()
    tics = data['tic'].unique()
    candidate_features = [ 'open', 'high', 'low', 'close', 'adjcp','zopen', 'zhigh', 'zlow', 'zadjcp', 'zclose', 'zd_5', 'zd_10',
       'zd_15', 'zd_20', 'zd_25', 'zd_30', 'pct_return', 'adjcp_filtered',
       'pct_return_filtered','volume']
    # Keep only the candidates that exist in this file, in file column order.
    features = [col for col in data.columns if col in candidate_features]

    for tic in tics:
        tic_rows = data['tic'] == tic
        data.loc[tic_rows, features] = MinMaxScaler(data.loc[tic_rows, features].astype(float))

    # Regimes are addressed as 0 .. regime_num - 1 below; this assumes the
    # labels in the file are exactly those integers — TODO confirm.
    regime_num = len(data['label'].unique())
    segment_cols = ['index','open','high','low','close','adjcp','volume']

    os.makedirs(output_dir, exist_ok=True)
    for tic in tics:
        tic_rows = data['tic'] == tic
        for j in range(regime_num):
            data_seg = data.loc[tic_rows & (data['label'] == j), segment_cols]
            data_seg.to_csv(os.path.join(output_dir, 'data_seg_' + str(tic) + '_' + str(j) + '.csv'))

## Data Loading

Load original dataset and preprocess the loaded data.

- data_name: stock, energy, or sine
- seq_len: sequence length of the time-series data

In [None]:
# Write the per-ticker / per-regime segment CSVs for the DJI file.
# NOTE(review): absolute, machine-specific path — it will not resolve elsewhere.
dataset_prepareation("/home/hcxia/TradeMaster_dev/TradeMaster/data/data/other/DJI_all_labeled_3_24.csv")

In [None]:
# Write the per-regime segment CSVs for the GOOG file.
# NOTE(review): absolute, machine-specific path — it will not resolve elsewhere.
dataset_prepareation('/home/hcxia/TradeMaster_dev/TradeMaster/data/data/other/GOOG_labeled_3_24.csv')

# Make data dict

In [None]:
def prepare_data_for_trainning(path, seq_len=24):
    """Load one segment CSV and turn it into fixed-length training windows.

    Args:
        path: segment CSV as written by ``dataset_prepareation``. Its
            ``'index'`` column holds each row's position in the source file.
        seq_len: rows per window (default 24, the value used previously).

    Returns:
        A list of ``(seq_len, 6)`` numpy arrays with the columns
        ``open, high, low, close, adjcp, volume`` in that order.
    """
    data = pd.read_csv(path)
    # Keep the stored 'index' column: its gaps mark where the regime was
    # interrupted. Dropping it and renumbering with reset_index() made every
    # file look like one contiguous run, so windows straddled the breaks.
    if 'index' not in data.columns:
        # No stored positions available: fall back to row numbering.
        data = data.reset_index()
    windows = get_data_of_same_length(data, seq_len)
    feature_cols = ['open','high','low','close','adjcp','volume']
    return [window.loc[:, feature_cols].to_numpy() for window in windows]

### GOOG data

In [None]:
# Windowed GOOG sequences, keyed by segment name, for regime labels 0-2.
GOOG_data = {}
for i in range(3):
    segment_name = 'data_seg_GOOG_' + str(i)
    GOOG_data[segment_name] = prepare_data_for_trainning('./data/' + segment_name + '.csv')
    print(i, len(GOOG_data[segment_name]))

### Single stock data

In [None]:
# Windowed sequences per ticker: data_dict_tic[tic][segment_name] -> list of arrays.
data = pd.read_csv("/home/hcxia/TradeMaster_dev/TradeMaster/data/data/other/DJI_all_labeled_3_24.csv").reset_index()
tics = data['tic'].unique()
data_dict_tic = {}
for tic in tics:
    data_dict_tic[tic] = per_regime = {}
    for i in range(3):
        segment_name = 'data_seg_' + str(tic) + '_' + str(i)
        per_regime[segment_name] = prepare_data_for_trainning('./data/' + segment_name + '.csv')
        print(tic, i, len(per_regime[segment_name]))

### Stock group data

In [None]:
# Map every stock_type group id to the tickers that belong to it.
data = pd.read_csv("/home/hcxia/TradeMaster_dev/TradeMaster/data/data/other/DJI_all_labeled_3_24.csv").reset_index()
tic_group_pair = (
    data.loc[:, ['tic', 'stock_type']]
    .groupby(['tic', 'stock_type'])
    .size()
    .reset_index(name='Freq')
)
stock_group_num = data['stock_type'].nunique(dropna=False)
tic_in_group = {}
for group in range(stock_group_num):
    in_group = tic_group_pair['stock_type'] == group
    tic_in_group[group] = list(tic_group_pair.loc[in_group, 'tic'].unique())

In [None]:
# Show which tickers were assigned to each stock_type group.
print(tic_in_group)

In [None]:
# Pool the per-ticker windows of every stock group, regime by regime.
data = pd.read_csv("/home/hcxia/TradeMaster_dev/TradeMaster/data/data/other/DJI_all_labeled_3_24.csv").reset_index()
stock_group_num = len(data['stock_type'].unique())
data_dict_group = {}
for group in range(stock_group_num):
    data_dict_group[group] = {}
    for i in range(3):
        group_key = 'data_seg_' + str(group) + '_' + str(i)
        pooled = data_dict_group[group].setdefault(group_key, [])
        for tic in tic_in_group[group]:
            pooled.extend(data_dict_tic[tic]['data_seg_' + str(tic) + '_' + str(i)])
        print(group, i, len(pooled))

### All dji stock data

In [None]:
# Pool the windows of every ticker into one list per regime.
data = pd.read_csv("/home/hcxia/TradeMaster_dev/TradeMaster/data/data/other/DJI_all_labeled_3_24.csv").reset_index()
tics = data['tic'].unique()
data_all = {}
for i in range(3):
    all_key = 'data_seg_all_' + str(i)
    pooled = data_all.setdefault(all_key, [])
    for tic in tics:
        pooled.extend(data_dict_tic[tic]['data_seg_' + str(tic) + '_' + str(i)])
    print(i, len(pooled))

# data set:

- GOOG_data
- data_dict_tic (dict of dict by tic)
- data_dict_group (dict of dict by group num)
- data_all

In [None]:
# List the keys of each dataset container built in the cells above.
print(GOOG_data.keys())
print(data_dict_tic.keys())
print(data_dict_group.keys())
print(data_all.keys())

# Pre-train Static learning classification discriminator

In [None]:
from sktime.classification.interval_based import TimeSeriesForestClassifier
from sktime.datasets import load_arrow_head
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sktime.classification.kernel_based import RocketClassifier

In [None]:
def get_data_of_same_length_df(data,seq_len):
    """Cut each contiguous run of ``data`` into sliding DataFrame windows.

    Args:
        data: DataFrame with an ``'index'`` column, used by ``get_intervals``
            to find the contiguous runs.
        seq_len: number of rows per window.

    Returns:
        A list of DataFrames of ``seq_len`` rows each, without the ``'index'``
        column. Runs with ``seq_len`` rows or fewer contribute nothing.
    """
    windows = []
    for start, stop in get_intervals(data):
        # 'index' only serves to find the runs. It is dropped here so the row
        # position is not handed to the classifier as a feature; the earlier
        # ``data.drop(...)`` call discarded its result and had no effect.
        data_seg = data.iloc[start:stop, :].drop(columns=['index'])
        for offset in range(0, len(data_seg) - seq_len):
            # Slice the current run. Slicing ``data`` here returned the same
            # leading rows of the whole frame for every run.
            windows.append(data_seg.iloc[offset:offset + seq_len])
    return windows

In [None]:
# Peek at the columns of one segment file.
# NOTE(review): this reads from the working directory, whereas
# dataset_prepareation() writes its files under './data/' — confirm the file exists.
data=pd.read_csv('data_seg_'+"0"+'_'+"0"+'.csv')
display(data.columns)

In [None]:
# Per-ticker check: fit a ROCKET classifier that separates the regime labels of
# one stock and print its accuracy on a random hold-out split.
# NOTE(review): `regime_num` is only assigned inside dataset_prepareation() in
# the visible notebook, so on a fresh kernel this loop raises NameError.
# NOTE(review): the CSVs are read from the working directory and must contain
# the 'pct_return' / '*_filtered' columns, while dataset_prepareation() writes
# under './data/' with only index/open/high/low/close/adjcp/volume — verify
# that these are the intended files.
for tic in tics:
    print(tic)
    X=[]
    y=np.empty(0)
    for j in range(regime_num):
        data=pd.read_csv('data_seg_'+tic+'_'+str(j)+'.csv').loc[:,['index', 'open', 'high', 'low', 'close', 'adjcp',
       'pct_return', 'adjcp_filtered', 'pct_return_filtered']]
        process_data=get_data_of_same_length_df(data,24)
        # One label per window: the regime id of the file it came from.
        label=np.full(len(process_data), j)
        X.extend(process_data)
        y=np.concatenate((y, label), axis=0)
    # No random_state is passed, so the split differs between runs.
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    classifier = RocketClassifier(num_kernels=2000,n_jobs=-1)
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)
    print(accuracy_score(y_test, y_pred))

All single-stock classifiers reach an unbelievable 100% accuracy?

In [None]:
# Per-group check: same ROCKET experiment as above, one classifier per stock group.
# NOTE(review): `regime_num` is not defined at notebook scope in the visible
# cells, and no visible cell writes 'data_seg_<group>_<regime>.csv' files
# (dataset_prepareation() writes per ticker) — confirm where they come from.
for i in range(stock_group_num):
    print('stock_group',i)
    X=[]
    y=np.empty(0)
    for j in range(regime_num):
        data=pd.read_csv('data_seg_'+str(i)+'_'+str(j)+'.csv').loc[:,['index', 'open', 'high', 'low', 'close', 'adjcp',
       'pct_return', 'adjcp_filtered', 'pct_return_filtered']]
        process_data=get_data_of_same_length_df(data,24)
        # One label per window: the regime id of the file it came from.
        label=np.full(len(process_data), j)
        X.extend(process_data)
        y=np.concatenate((y, label), axis=0)
    # No random_state is passed, so the split differs between runs.
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    classifier = RocketClassifier(num_kernels=2000,n_jobs=-1,use_multivariate='yes')
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)
    print(accuracy_score(y_test, y_pred))

Still 1.0?

In [None]:

# X_train, X_test, y_train, y_test = train_test_split(X, y)
# classifier = TimeSeriesForestClassifier()
# classifier.fit(X_train, y_train)
# y_pred = classifier.predict(X_test)
# accuracy_score(y_test, y_pred)

# Pre-train Deep learning classification discriminator

### training

In [12]:
from tsai.all import *
from sklearn.model_selection import train_test_split
my_setup()

os              : Linux-5.15.0-58-generic-x86_64-with-debian-bullseye-sid
python          : 3.7.15
tsai            : 0.3.4
fastai          : 2.7.10
fastcore        : 1.5.27
torch           : 1.13.1+cu117
device          : 4 gpus (['NVIDIA RTX A6000', 'NVIDIA RTX A6000', 'NVIDIA RTX A6000', 'NVIDIA RTX A6000'])
cpu cores       : 64
threads per cpu : 2
RAM             : 503.53 GB
GPU memory      : [47.99, 47.99, 47.99, 47.99] GB


In [13]:
# NOTE(review): the recorded output of this cell is a NameError — it was run on
# a kernel where the cell that builds `data_all` had not been executed.
print(data_all.keys())

NameError: name 'data_all' is not defined

In [None]:
# Build the classifier dataset from data_all: every window is transposed and
# labelled with the regime id (0-2) of the bucket it came from.
X=[]
y=np.empty(0)
for i in range(3):
    data=data_all['data_seg_all_'+str(i)]
    label=np.full(len(data), i)
    # presumably transposed to (variables, time steps) as tsai expects — TODO confirm
    X.extend([p.transpose() for p in data])
    y=np.concatenate((y, label), axis=0)
X=np.array(X)
# NOTE(review): no random_state is passed, so the split differs between runs.
# NOTE(review): the windows come from a stride-1 sliding window
# (get_data_of_same_length), so a random split can put near-identical windows
# on both sides — treat the validation accuracy with care.
X_train, X_test, y_train, y_test = train_test_split(X, y)
# Re-join the two parts into one array plus index splits for tsai.
X, y, splits = combine_split_data([X_train, X_test], [y_train, y_test])
tfms  = [None, [Categorize()]]
dsets = TSDatasets(X, y, tfms=tfms, splits=splits, inplace=True)
dls   = TSDataLoaders.from_dsets(dsets.train, dsets.valid, bs=[64, 128], batch_tfms=[TSStandardize()], num_workers=0)
model = InceptionTime(dls.vars, dls.c)
learn = Learner(dls, model, metrics=accuracy)
learn.fit_one_cycle(25, lr_max=1e-3)
learn.plot_metrics()
# Persist dataloaders, model and learner under ./export for the cells below.
learn.save_all(path='export', dls_fname='dls', model_fname='model', learner_fname='learner')
#     display(type(X_train),X_train.shape)

In [14]:
import pickle
# Restore the learner saved by learn.save_all(path='export', ...) so the
# classifier can be used without retraining.
learn=load_all(path='export', dls_fname='dls', model_fname='model',
           learner_fname='learner', device=None, pickle_module=pickle, verbose=True)

Learner loaded:
path          = 'export'
dls_fname     = '['dls_0.pth', 'dls_1.pth']'
model_fname   = 'model.pth'
learner_fname = 'learner.pkl'


In [None]:
# Rebuild the list of transposed windows from data_all.
# NOTE(review): `y` is reset to an empty array and `label` is computed but never
# appended to it, so after this cell X and y no longer match in length.
X=[]
y=np.empty(0)
for i in range(3):
    data=data_all['data_seg_all_'+str(i)]
    label=np.full(len(data), i)
    X.extend([p.transpose() for p in data])

In [None]:
# NOTE(review): `X_test` is not assigned in this cell; it is whatever a
# previously executed cell left in the kernel — confirm which one is intended.
test_probas, test_targets, test_preds=learn.get_X_preds(X_test, with_decoded=True)

In [15]:
import pickle

# Names of the pickled synthetic datasets stored under ./generated_data/.
data_to_load = [
    "data_dict_tic_APPL_generated",
    "data_dict_group_3_generated",
    "data_dict_group_4_generated",
    "GOOG_data_generated",
    "data_all_generated",
]
# NOTE: pickle.load executes arbitrary code; only load files you produced yourself.
load_data_dict = {}
for data in data_to_load:
    pickle_path = './generated_data/' + data + '.pickle'
    with open(pickle_path, 'rb') as handle:
        load_data_dict[data] = pickle.load(handle)

In [16]:
# Check that the generated GOOG pickle holds one entry per regime.
print(load_data_dict["GOOG_data_generated"].keys())

dict_keys(['data_seg_GOOG_0', 'data_seg_GOOG_1', 'data_seg_GOOG_2'])


In [None]:
# Rebuild the labelled array of original windows and report its shape.
X = []
y = np.empty(0)
for i in range(3):
    data = data_all['data_seg_all_' + str(i)]
    label = np.full(len(data), i)
    X += [p.transpose() for p in data]
    y = np.concatenate([y, label])
X = np.asarray(X)
print(X.shape)

In [17]:
# Generated GOOG windows of regime 0, transposed like the training windows.
X_test = np.array(
    [p.transpose() for p in load_data_dict["GOOG_data_generated"]["data_seg_GOOG_0"]]
)

In [None]:
# Sanity-check the array shape before handing it to the learner.
print(X_test.shape)

In [18]:
# Classify the generated windows with the restored learner.
test_probas, test_targets, test_preds = learn.get_X_preds(X_test)

In [19]:
import json
from collections import Counter
def get_pre_res(pred_res,label):
    """Return the share of predictions equal to ``label``.

    Args:
        pred_res: JSON text encoding a list of predictions; each entry is
            converted with ``int``.
        label: integer class to count.

    Returns:
        Number of entries equal to ``label`` divided by the number of entries.
    """
    predictions = [int(item) for item in json.loads(pred_res)]
    return Counter(predictions)[label] / len(predictions)

In [20]:
def get_style_score(data_dict):
    """Score how often the classifier assigns each dataset to its own regime.

    Args:
        data_dict: mapping of segment name to a sequence of 2-D arrays. The
            last character of each name is read as the expected regime label.

    Returns:
        Dict mapping each segment name to the fraction of its windows that the
        notebook-level ``learn`` predicts as that regime.
    """
    scores = {}
    for name, windows in data_dict.items():
        regime = int(name[-1])
        batch = np.array([window.transpose() for window in windows])
        _, _, decoded = learn.get_X_preds(batch)
        scores[name] = get_pre_res(decoded, regime)
    return scores

In [21]:
# Fraction of generated GOOG windows recognised as their intended regime.
get_style_score(load_data_dict["GOOG_data_generated"])

{'data_seg_GOOG_0': 0.9881129271916791,
 'data_seg_GOOG_1': 0.7913262099308611,
 'data_seg_GOOG_2': 0.9849574885546108}

In [None]:
# List the generated datasets that were loaded.
load_data_dict.keys()

In [None]:
# Style score of every generated dataset.
for k, generated in load_data_dict.items():
    print(k, get_style_score(generated))

In [None]:
# List the generated datasets that were loaded (repeated inspection).
load_data_dict.keys()

In [None]:
# Segment names stored in the generated single-ticker dataset.
load_data_dict['data_dict_tic_APPL_generated'].keys()

In [None]:
# Shape of the generated regime-0 windows. Note the spelling difference: the
# pickle name uses 'APPL' while the segment key inside it uses 'AAPL'.
load_data_dict['data_dict_tic_APPL_generated']['data_seg_AAPL_0'].shape

In [None]:
from random import sample

In [None]:
from sklearn.linear_model import LinearRegression
def calculate_slope(data,plot=False,num=3,name=''):
    """Print summary statistics of the relative linear trend of each sample.

    For every sample a straight line is fitted to column 4 against the time
    step. The slope is expressed as a percentage of the sample's first value,
    and ``describe()`` of all slopes is printed.

    Args:
        data: sequence or array of 2-D samples indexed as ``sample[:, 4]``;
            column 4 is presumably ``adjcp`` — TODO confirm for generated data.
        plot: when true, draw ``num * num`` randomly chosen samples in a grid
            and save the figure to ``./fig/<name>.png``.
        num: side length of the plot grid.
        name: file stem of the saved figure.

    Returns:
        None. Results are printed.

    Raises:
        ValueError: when ``plot`` is true and ``data`` holds fewer than
            ``num * num`` samples.
    """
    # len() equals shape[0] for arrays and also works for plain lists, so no
    # bare ``except`` is needed to choose between the two.
    number_of_sample = len(data)

    if plot:
        import os  # local import: only needed to create the figure directory

        # Draw the random subset only when it is plotted, so small inputs can
        # still be summarised with plot=False.
        sample_list = sample(range(number_of_sample), num * num)
        # squeeze=False keeps ``ax`` two-dimensional even for num == 1.
        fig, ax = plt.subplots(num, num, squeeze=False)
        for i, s in enumerate(sample_list):
            ax[i // num][i % num].plot(data[s][:, 4].flatten())
        os.makedirs('./fig', exist_ok=True)
        # Save before showing: some backends release the figure after show().
        fig.savefig('./fig/'+str(name)+'.png')
        plt.show()

    slope_list = []
    for i in range(number_of_sample):
        data_s_adjcp = data[i][:, 4].reshape(-1, 1)
        if data_s_adjcp[0] == 0:
            # The slope is normalised by the first value; skip a zero start.
            continue
        x = np.arange(len(data_s_adjcp)).reshape(-1, 1)
        reg = LinearRegression().fit(x, data_s_adjcp)
        slope_list.append((100 * reg.coef_ / data_s_adjcp[0])[0][0])
    print(pd.DataFrame(slope_list).describe())
def get_slope_of_dict(data_dict,plot=False,num=3,prefix=''):
    """Run ``calculate_slope`` on every dataset in ``data_dict``.

    Args:
        data_dict: mapping of segment name to the samples of that segment.
        plot, num: forwarded to ``calculate_slope``.
        prefix: prepended to the segment name to form the figure file stem.

    Returns:
        None. Each segment name and its slope statistics are printed.
    """
    # The regime id parsed from the key was never used and made keys without a
    # trailing digit fail, so it is no longer computed.
    for k, v in data_dict.items():
        print(k)
        calculate_slope(v, plot, num, prefix + str(k))

In [None]:
# Segment names available for the original GOOG data.
GOOG_data.keys()

In [None]:
# Slope statistics and a 3x3 sample plot per regime for the original pooled
# data; figures are saved as ./fig/ori_<segment>.png.
get_slope_of_dict(data_all,True,3,'ori_')

In [None]:
# Same for the generated pooled data; figures are saved as ./fig/generated_<segment>.png.
get_slope_of_dict(load_data_dict['data_all_generated'],True,3,'generated_')

In [None]:
# Slope statistics and sample plots for the original GOOG windows.
get_slope_of_dict(GOOG_data,True,3,'ori_')

In [None]:
# Slope statistics and sample plots for the generated GOOG windows.
get_slope_of_dict(load_data_dict['GOOG_data_generated'],True,3,'generated_')

In [None]:
# Slope statistics only (no plots) for the generated pooled data.
get_slope_of_dict(load_data_dict['data_all_generated'])

# Key takeaway

InceptionTime can do the job

In [None]:
# Load the 'LSST' benchmark dataset through tsai.
# NOTE(review): this overwrites the notebook-level X and y built from the stock data.
X, y, splits = get_classification_data('LSST', split_data=False)

In [None]:
# Build tsai dataloaders for the benchmark data.
# NOTE(review): `new_y` is not assigned anywhere in the visible notebook, so this
# cell raises NameError on a fresh kernel — confirm whether `y` was intended or
# a relabelling cell was deleted.
tfms  = [None, TSClassification()] # TSClassification == Categorize
batch_tfms = TSStandardize()
dls = get_ts_dls(X, new_y, splits=splits, tfms=tfms, batch_tfms=batch_tfms, bs=[64, 128])
dls.dataset

In [None]:
def get_std_list(data):
    """Return the standard deviation of the adjusted close for each contiguous run.

    Args:
        data: DataFrame with an ``'index'`` column (used by ``get_intervals``)
            and an ``'adjcp'`` column.

    Returns:
        A list with one population standard deviation (numpy ``std``, ddof=0)
        per run, in run order.
    """
    # The earlier code converted each run to a numpy array and then read
    # ``.adj`` from it, which always raised AttributeError. 'adjcp' is assumed
    # to be the intended column — TODO confirm.
    std_list = []
    for start, stop in get_intervals(data):
        data_seg = data.iloc[start:stop, :]
        std_list.append(data_seg['adjcp'].to_numpy().std())
    return std_list

In [None]:
# Train an InceptionTimePlus classifier through tsai's TSClassifier wrapper and
# export it to mv_clf.pkl.
# NOTE(review): X, y and splits are whatever the last cell that assigned them
# left in the kernel — confirm which dataset this is meant to train on.
batch_tfms = TSStandardize(by_sample=True)
mv_clf = TSClassifier(X, y, splits=splits, path='models', arch=InceptionTimePlus, batch_tfms=batch_tfms, metrics=accuracy, cbs=ShowGraph())
mv_clf.fit_one_cycle(10, 1e-2)
mv_clf.export("mv_clf.pkl")

## inference

## inference