### tree algorithm factor trading platform

### build a basket for each factor ==> ls



In [1]:
import xlwings as xw
import pandas as pd
import numpy as np
import pickle
import datetime as dt
import tdpm_kgh.pckg_tools.utils as ut
import tdpm_kgh.pckg_stats.statistics as stat
from _config_python_pc import *
import tqdm
import itertools
import ray
import os
from sklearn.preprocessing import StandardScaler


In [2]:
### read pickle data and adjust for simulation
# FileName, ut (utils) and stat (statistics) come from project modules imported
# above; what they return is not visible in this file.

date_info = pd.read_pickle(FileName.date_info) # read date information
date_m = ut.calculate_date(date_info, 'M') # monthly date
date_q = ut.calculate_date(date_info,'Q') # quarterly date
date_w = ut.calculate_date(date_info, 'W') # weekly date
bm_const = pd.read_pickle(FileName.bcmk_const) # read bcmk constituent information
price = pd.read_pickle(FileName.stck_price) # read stock price
price_d = pd.read_pickle(FileName.stck_price_d) # read stock price dividend adjusted
# NOTE(review): both return tables below are built from `price`; `price_d` is
# loaded but not used in the visible code -- confirm this is intended.
ret_d = stat.calculate_return(price,'D') # timeseries daily return
ret_d_stack = ret_d.stack().reset_index() # stacked daily return (long format)
ret_d_stack.columns = ['date', 'code', 'return'] # stacked daily return column adjusted
ret_m = stat.calculate_return(price,'M') # timeseries monthly return
ret_m_stack = ret_m.stack().reset_index() # stacked monthly return (long format)
ret_m_stack.columns = ['date', 'code', 'return'] # stacked monthly return column adjusted
# Simulation dates: every entry of date_m from position 71 onwards.
# NOTE(review): the reason for the offset 71 is not visible here -- confirm.
list_date = [date_m[x] for x in range(71,len(date_m))] # range of regression date

In [3]:
class PortfolioSimulator:
    """Buy-and-hold simulation of a basket of tickers over a return table.

    The basket starts from ``initial_weights`` (equal weights when omitted) and
    is never rebalanced: after every row of ``return_df`` the weights drift
    with the realised returns.

    Attributes set by ``simulate()``:
        return_series: portfolio return per row of ``return_df``.
        cumulative_return_series: portfolio value per row, starting from
            ``initial_value`` compounded by the first return.
        current_weights: drifted weights after the last row.
    """

    def __init__(self, return_df, tickers, initial_weights=None, initial_value =1.0):
        """
        return_df : dataframe, index = date, column = ticker, value = return
        tickers : tickers held in the basket (must be columns of return_df)
        initial_weights : weights aligned with tickers, None for equal weights
        initial_value : portfolio value before the first return is applied
        """
        # Accept any iterable of tickers (list, tuple, pandas Index, ...).
        tickers = list(tickers)
        # Missing returns are treated as 0 so that a ticker without data keeps
        # its value instead of poisoning the portfolio return with NaN.
        self.return_df = return_df[tickers].fillna(0)
        self.tickers = tickers
        if initial_weights is None:
            self.initial_weights = [1/len(tickers) for _ in tickers]
        else:
            self.initial_weights = initial_weights
        self.current_weights = list(self.initial_weights)
        # Kept as a one-element list for backward compatibility.
        self.initial_value = [initial_value]

    def simulate(self):
        """Run the simulation and store the result series on the instance.

        Every call restarts from ``initial_weights`` and ``initial_value``, so
        calling it repeatedly gives the same result each time.
        """
        # Column -> position of its weight. The first occurrence wins, which is
        # what list.index() returns, but without a linear scan per lookup.
        first_position = {}
        for position, ticker in enumerate(self.tickers):
            first_position.setdefault(ticker, position)
        positions = [first_position[column] for column in self.return_df.columns]

        weights = list(self.initial_weights)
        # Work on a fresh list: appending to self.initial_value itself would
        # corrupt the starting value for any later call.
        cumulative_returns = [self.initial_value[0]]
        return_series = []

        for daily_returns in self.return_df.to_numpy().tolist():
            valid_weights = [weights[position] for position in positions]
            total_valid_weight = sum(valid_weights)
            normalized_weights = [w/total_valid_weight for w in valid_weights]

            daily_portfolio_return = sum(
                weight * ret for weight, ret in zip(normalized_weights, daily_returns)
            )

            # Compound the previous portfolio value with this row's return.
            cumulative_returns.append(cumulative_returns[-1] * (1 + daily_portfolio_return))
            return_series.append(daily_portfolio_return)

            # Let the weights drift with the realised returns (no rebalancing).
            new_values = [weight * (1 + ret) for weight, ret in zip(normalized_weights, daily_returns)]
            total_value = sum(new_values)
            weights = [val / total_value for val in new_values]

        self.current_weights = weights
        self.cumulative_return_series = pd.Series(cumulative_returns[1:], index=self.return_df.index)
        self.return_series = pd.Series(return_series, index=self.return_df.index)

In [4]:
## code without tqdm

def tree_simulation(strategy: str, data : pd.DataFrame ,ret_df: pd.DataFrame, date_start: str, date_end: str, df_node: pd.DataFrame, directory: str):    
    """Simulate every tree node of one strategy and pickle the return table.

    strategy : value of data['strategy'] to simulate
    data : dataframe with columns 'strategy', 'tree_node' and 'code'
    ret_df : return time series, index = date, column = code
    date_start, date_end : the window simulated is date_start < date <= date_end
    df_node : dataframe whose 'node' column lists the tree nodes to simulate
    directory : prefix of the output path '{directory}{date_start}_{strategy}.pkl'

    Each node becomes an equal-weight PortfolioSimulator basket; the result has
    one column '{strategy}_{node}' per node. Nothing is returned.
    """
    in_window = (ret_df.index > date_start) & (ret_df.index <= date_end)
    ret_d_adj_flt = ret_df[in_window].copy()

    # The strategy filter does not depend on the node, so apply it once.
    data_strategy = data[data['strategy'] == strategy]

    list_simulation = []
    # Iterate the node values themselves: looking them up by position through
    # df_node['node'][n] only works when df_node has the default 0..n-1 index.
    for node in df_node['node']:
        data_flt = list(data_strategy[data_strategy['tree_node'] == node].code)
        sim = PortfolioSimulator(ret_d_adj_flt, data_flt, None, 1)
        sim.simulate()
        list_simulation.append(sim.return_series.to_frame(f'{strategy}_{node}'))

    result = pd.concat(list_simulation, axis = 1)
    result.to_pickle(f'{directory}{date_start}_{strategy}.pkl')


@ray.remote
def ray_tree_simulation(strategy: str, data : pd.DataFrame ,ret_df: pd.DataFrame, date_start: str, date_end: str, df_node: pd.DataFrame, directory: str):    
    """Ray task that runs tree_simulation for one strategy.

    Takes the same arguments as tree_simulation and, like it, writes
    '{directory}{date_start}_{strategy}.pkl' and returns nothing. Delegating
    keeps the serial and the parallel code path from drifting apart.
    """
    tree_simulation(strategy, data, ret_df, date_start, date_end, df_node, directory)


def wrapper_ray_tree_simulation(list_strategy: list, data : pd.DataFrame ,ret_df: pd.DataFrame, date_start: str, date_end: str, df_node: pd.DataFrame, directory: str):
    """Run ray_tree_simulation for every strategy, in batches of parallel tasks.

    list_strategy : strategies to simulate; an empty list is a no-op
    data, ret_df, date_start, date_end, df_node, directory : forwarded unchanged
        to ray_tree_simulation for each strategy

    Ray is started for the duration of the call and always shut down again,
    also when a task fails, so that the next call can initialise it afresh.
    """
    if not list_strategy:
        return

    # Leave three cores free for the rest of the machine, but always keep at
    # least one worker: a batch size of zero would make range() below fail.
    numb_cpu = max((os.cpu_count() or 1) - 3, 1)
    numb_cpu_max = min(len(list_strategy), numb_cpu)

    ray.init(num_cpus=numb_cpu_max)
    try:
        for i in range(0, len(list_strategy), numb_cpu_max):
            print(list_strategy[i] +'_process_begins')
            strategies = list_strategy[i:i+numb_cpu_max]
            actors = [
                ray_tree_simulation.remote(strategy, data, ret_df, date_start, date_end, df_node, directory)
                for strategy in strategies
            ]
            # Block until the whole batch is done before starting the next one.
            ray.get(actors)
            print(list_strategy[i] +'_process_ends')
    finally:
        ray.shutdown()

In [None]:
class TestPerfomanceV2:
    """Performance statistics for a table of strategy returns.

    Simple returns are converted to log returns once in the constructor; the
    ``get_*`` methods fill the corresponding attribute with a dataframe indexed
    by strategy (``ret_cum``, ``ret_cagr``, ``ret_mean``, ``ret_std``, ``ir``,
    ``mdd``, ``pdd``), ``dd`` holds the log drawdown path, and
    ``score_raw`` / ``score_stdz`` / ``score`` hold the combined tables.
    """

    def __init__(self, data:pd.DataFrame):
        """
        data : dataframe (not a string)
            index = date
            column = strategy
            value = normal return 

        Raises ValueError when a string index cannot be parsed as dates.
        """
        self.data = np.log(data+1) #convert normal return to log return

        if isinstance(self.data.index[0], str):
            try:
                self.data.index = pd.to_datetime(self.data.index)
            except (ValueError, TypeError) as err:
                # The day count below needs real dates; fail here with a clear
                # message instead of on the subtraction of two strings.
                raise ValueError('index of data cannot be converted to datetime') from err

        self.list_columns = list(data.columns)
        # Calendar days between first and last observation (used for CAGR).
        self.len_date = (self.data.index[-1] - self.data.index[0]).days
        self.ret_cum = pd.DataFrame()
        self.ret_cagr = pd.DataFrame()
        self.ret_mean = pd.DataFrame()
        self.ret_std = pd.DataFrame()
        self.ir = pd.DataFrame()
        self.mdd = pd.DataFrame()
        self.dd = pd.DataFrame()
        self.pdd = pd.DataFrame()
        self.dd_avg = pd.DataFrame()
        self.score_raw = pd.DataFrame()
        self.score_stdz = pd.DataFrame()
        self.score = pd.DataFrame()
        self.obj = pd.DataFrame()

    def get_ret_cum(self):
        """Store the cumulative simple return of every strategy in ``ret_cum``."""
        self.ret_cum = (np.exp(self.data.sum()) - 1).to_frame('ret_cum')

    def get_ret_cagr(self):
        """Store the annualised compound return in ``ret_cagr``.

        Annualisation uses 365 / (calendar days between first and last row);
        a table spanning zero days raises ZeroDivisionError.
        """
        growth = np.exp(self.data.sum())
        self.ret_cagr = (growth ** (365/self.len_date) - 1).to_frame('ret_cagr')

    def get_ret_mean(self, d:int = None):
        """Store the mean log return scaled by ``d`` periods per year (default 250)."""
        if d is None:
            d=250
        self.ret_mean = (self.data.mean()*d).to_frame('ret_mean')

    def get_ret_std(self, d:int = None):
        """Store the log-return volatility scaled by sqrt(``d``) (default d=250)."""
        if d is None:
            d=250
        self.ret_std = (self.data.std()*np.sqrt(d)).to_frame('ret_std')

    def get_drawdown(self, period: int):
        '''
        period : last period

        Fills ``dd`` (log drawdown path), ``mdd`` (maximum drawdown as a simple
        return) and ``pdd`` (drawdown at the last row, measured inside the last
        ``period`` rows only).
        '''
        cum_sum = self.data.cumsum()
        # The starting wealth (log value 0) counts as a peak too; without it a
        # loss right at the start of the sample would not show up as drawdown.
        cum_max = cum_sum.cummax().clip(lower=0)
        cum_dd = cum_sum-cum_max
        self.dd = cum_dd
        self.mdd = (np.exp(cum_dd.min()) - 1).to_frame('mdd')

        name_period = f'period({period})_dd'
        cum_sum_period = self.data.iloc[-period:,:].cumsum()
        cum_max_period = cum_sum_period.cummax().clip(lower=0)
        cum_dd_period = cum_sum_period-cum_max_period
        self.pdd = (np.exp(cum_dd_period.iloc[-1,:]) - 1).to_frame(name_period)

    def get_ir(self):
        """Store ``ret_cagr / ret_std`` in ``ir``.

        Whichever of the two inputs has not been computed yet is computed
        first (the volatility with its default annualisation factor).
        """
        if self.ret_cagr.empty:
            self.get_ret_cagr()
        if self.ret_std.empty:
            self.get_ret_std()
        self.ir = (self.ret_cagr['ret_cagr'] / self.ret_std['ret_std']).to_frame('ir')

    def get_score_base(self, period = 20, d = 250):
        """Compute every statistic and store them side by side in ``score_raw``.

        period : window (rows) of the period drawdown
        d : periods per year used to annualise mean and volatility

        The table is sorted by ``ir`` in descending order.
        """
        self.get_ret_cum()
        self.get_ret_cagr()
        self.get_ret_mean(d)
        self.get_ret_std(d)
        self.get_ir()
        self.get_drawdown(period)
        final = pd.concat([self.ret_cum, 
                           self.ret_cagr, 
                           self.ret_mean, 
                           self.ret_std,
                           self.ir,
                           self.mdd,
                           self.pdd
                           ], axis=1
                           )
        final = final.sort_values(by=['ir'], ascending=[False])
        self.score_raw = final
        
    def get_score_stdz(self, 
                       period=20, 
                       d = 250,
                       w_ret_cum = 0, 
                       w_ret_cagr = 0, 
                       w_ret_mean = 0, 
                       w_ret_std = 0, 
                       w_ir = 0.8, 
                       w_mdd = 0.5,
                       w_period_dd = -0.5
                       ):
        """Standardise ``score_raw`` column-wise and combine it into one score.

        period, d : forwarded to get_score_base when score_raw must be built
        w_* : weight of the standardised column of the same name

        Stores the standardised table in ``score_stdz`` and the weighted sum,
        sorted in descending order, in ``score``.
        """
        name_period = f'period({period})_dd'
        weights = {'ret_cum': w_ret_cum,
           'ret_cagr': w_ret_cagr,
           'ret_mean': w_ret_mean,
           'ret_std': w_ret_std,
           'ir': w_ir,
           'mdd':w_mdd,
           name_period: w_period_dd,
           }

        # Rebuild the raw table when it is missing or was built for another
        # period: otherwise the period-drawdown weight would match no column
        # and silently drop out of the score.
        if self.score_raw.empty or name_period not in self.score_raw.columns:
            self.get_score_base(period,d=d)

        scaler = StandardScaler()
        score_stdz = pd.DataFrame(scaler.fit_transform(self.score_raw), columns=self.score_raw.columns, index = self.score_raw.index)
        
        score_weighted = score_stdz.mul(pd.Series(weights), axis=1)
        score = score_weighted.sum(axis=1).to_frame()
        score.columns = ['score']
        score = score.sort_values(by=['score'], ascending=False)

        self.score_stdz = score_stdz
        self.score = score

In [5]:
### basic data setup
# Load the tree-node universe of the first simulation date and derive the list
# of strategies and the (descending) list of tree nodes from it.
d = 0
data = pd.read_pickle(f'{Directory.pkl_tmp_dix_factor_tree_node_universe}{list_date[d]}_tree_node.pkl')

list_strategy = list(data.strategy.unique())
list_node = sorted(set(data.tree_node), reverse=True)
df_strategy = pd.DataFrame(list_strategy, columns=['strategy'])
df_node = pd.DataFrame(list_node, columns=['node'])
print(len(list_strategy))
print(len(list_node))

## display: write both tables, with header and without index, to the workbook
wb = xw.Book(FileName.xl_multifactor)
sh = wb.sheets['dix_strategy_tree_algo_sim']
for anchor, frame in (('A10', df_strategy), ('B10', df_node)):
    sh.range(anchor).options(index=False, header=True).value = frame

120
27


In [6]:
### reduced selection
# Every ordered triple of the six base factors, written as 'f1,f2,f3'.
list_strategy_sel = [
    ','.join(triple)
    for triple in itertools.permutations(
        ('trt_60', 'price_momentum', 'value', 'momentum', 'debt', 'dividend'),
        3,
    )
]
print(len(list_strategy_sel))

120


In [7]:
# File-name suffix '_<strategy>.pkl' of the accumulated result of each strategy.
list_strategy_sel_name = [f'_{name}.pkl' for name in list_strategy_sel]

In [8]:
# Read the accumulated return table of every selected strategy and put them
# side by side (one block of columns per strategy).
list_data = [
    pd.read_pickle(f'{Directory.pkl_tmp_simulation_tree_node}{name}')
    for name in list_strategy_sel_name
]
data = pd.concat(list_data, axis=1)

In [11]:
# Score the concatenated strategy returns held in `data`.
test = TestPerfomanceV2(data)
# period=6: window (rows) of the period drawdown; d=12: annualisation factor
# applied to the mean and the volatility.
test.get_score_base(period=6, d= 12)
# Bare expression: displays the raw score table in the notebook.
test.score_raw
# Export the raw score table to an Excel file in the working directory.
test.score_raw.to_excel('tree_node_performance.xlsx')

In [None]:
### monthly_simulation
# For every simulation date from position `start_date` on: simulate all selected
# strategies over the following month, then append each strategy's one-period
# result file to its accumulated file '_<strategy>.pkl' and delete the
# one-period file.
start_date = 113

for d in range(start_date,len(list_date)-1):
    print(list_date[d] + '_process_begins')
    data = pd.read_pickle(f'{Directory.pkl_tmp_dix_factor_tree_node_universe}{list_date[d]}_tree_node.pkl')
    wrapper_ray_tree_simulation(list_strategy_sel, data, ret_m, list_date[d], list_date[d+1], df_node, Directory.pkl_tmp_simulation_tree_node) ## monthly_return

    for strategy in list_strategy_sel:
        path_period = f'{Directory.pkl_tmp_simulation_tree_node}{list_date[d]}_{strategy}.pkl'
        path_total = f'{Directory.pkl_tmp_simulation_tree_node}_{strategy}.pkl'

        merge = pd.read_pickle(path_period)
        if d != 0:
            # Position 0 starts a new accumulated file; later dates append.
            merge = pd.concat([pd.read_pickle(path_total), merge])

        # De-duplicate on the date index, keeping the newest row. Comparing row
        # values (drop_duplicates) ignores the index, so it would drop distinct
        # dates with equal returns and keep a re-run date whose values changed.
        merge = merge[~merge.index.duplicated(keep='last')]
        merge.to_pickle(path_total)
        os.remove(path_period)

In [None]:
# ## with tqdm // 

# def tree_simulation(strategy: str, data : pd.DataFrame ,ret_df: pd.DataFrame, date_start: str, date_end: str, df_node: pd.DataFrame, directory: str):    
#     ret_d_adj_flt = ret_df[(ret_df.index>date_start) & (ret_df.index<=date_end)].copy()    
#     list_simulation = []
#     for n in range(len(df_node)):
#         data_flt = list(data[(data['strategy'] == strategy) & (data['tree_node'] == df_node['node'][n])].code)
#         sim = PortfolioSimulator(ret_d_adj_flt, data_flt, None, 1)
#         sim.simulate()
#         list_simulation.append(sim.return_series.to_frame(f'{strategy}_{df_node["node"][n]}'))
    
#     result = pd.concat(list_simulation, axis = 1)
#     result.to_pickle(f'{directory}{date_start}_{strategy}.pkl')


# @ray.remote
# def ray_tree_simulation(strategy: str, data : pd.DataFrame ,ret_df: pd.DataFrame, date_start: str, date_end: str, df_node: pd.DataFrame, directory: str):    
#     ret_d_adj_flt = ret_df[(ret_df.index>date_start) & (ret_df.index<=date_end)].copy()    
#     list_simulation = []
#     for n in tqdm.tqdm(range(len(df_node))):
#         data_flt = list(data[(data['strategy'] == strategy) & (data['tree_node'] == df_node['node'][n])].code)
#         sim = PortfolioSimulator(ret_d_adj_flt, data_flt, None, 1)
#         sim.simulate()
#         list_simulation.append(sim.return_series.to_frame(f'{strategy}_{df_node["node"][n]}'))
    
#     result = pd.concat(list_simulation, axis = 1)
#     result.to_pickle(f'{directory}{date_start}_{strategy}.pkl')


# def wrapper_ray_tree_simulation(list_strategy: list, data : pd.DataFrame ,ret_df: pd.DataFrame, date_start: str, date_end: str, df_node: pd.DataFrame, directory: str):
#     numb_cpu = os.cpu_count()-3
#     numb_cpu_max = min(len(list_strategy),numb_cpu)
#     ray.init(num_cpus=numb_cpu_max)
#     for i in range(0,len(list_strategy),numb_cpu_max): # tqdm.tqdm(range(0,len(name_list),numb_cpu_max))
#         print(list_strategy[i] +'_process_begins')
#         strategies = list_strategy[i:i+numb_cpu_max]
#         actors = []
#         for strategy in strategies:
#             actors.append(ray_tree_simulation.remote(strategy, data, ret_df, date_start, date_end, df_node, directory))
#         ray.get(actors)
#         print(list_strategy[i] +'_process_ends')
#     ray.shutdown()

In [None]:
### daily_simulation : too slow, so abandoned for now
# Same procedure as the monthly run but on daily returns and for every
# strategy in list_strategy, starting at the first simulation date.
for d in range(len(list_date)-1):
    print(list_date[d] + '_process_begins')
    data = pd.read_pickle(f'{Directory.pkl_tmp_dix_factor_tree_node_universe}{list_date[d]}_tree_node.pkl')
    wrapper_ray_tree_simulation(list_strategy, data, ret_d, list_date[d], list_date[d+1], df_node, Directory.pkl_tmp_simulation_tree_node) # daily_return
    
    for strategy in list_strategy:
        if d == 0 :
            # First date: the one-period result becomes the accumulated file.
            # NOTE(review): this rebinds the global `data` (the node universe)
            # to a simulation result; it is re-read at the next date.
            data = pd.read_pickle(f'{Directory.pkl_tmp_simulation_tree_node}{list_date[d]}_{strategy}.pkl')
            data.to_pickle(f'{Directory.pkl_tmp_simulation_tree_node}_{strategy}.pkl')
        else:
            # Later dates: append the one-period result to the accumulated file.
            # Unlike the monthly run, nothing is de-duplicated and the
            # one-period file is not removed.
            first = pd.read_pickle(f'{Directory.pkl_tmp_simulation_tree_node}_{strategy}.pkl')
            second = pd.read_pickle(f'{Directory.pkl_tmp_simulation_tree_node}{list_date[d]}_{strategy}.pkl')
            merge = pd.concat([first,second])
            merge.to_pickle(f'{Directory.pkl_tmp_simulation_tree_node}_{strategy}.pkl')

In [None]:
# Monthly returns dated after list_date[0] and up to and including list_date[1].
window_start, window_end = list_date[0], list_date[1]
ret_m_flt = ret_m.loc[(ret_m.index > window_start) & (ret_m.index <= window_end)]

In [None]:
# Codes held by the first strategy in the tree node at position 2 of df_node.
strategy = list_strategy[0]
n = 2
is_strategy = data['strategy'] == strategy
is_node = data['tree_node'] == df_node['node'][n]
codes = data.loc[is_strategy & is_node, 'code'].tolist()

In [None]:
# Equal-weight buy-and-hold simulation of the selected codes over the window.
sim = PortfolioSimulator(return_df=ret_m_flt, tickers=codes)
sim.simulate()

In [None]:
# Bare expression: displays the simulated portfolio return series in the notebook.
sim.return_series

In [None]:
# Scratch run without ray: simulate every (strategy, node) basket on daily
# returns for the window that follows list_date[d], collecting one single-column
# frame per basket in list_simulation.
ret_d_adj = ret_d.fillna(0).copy()
list_simulation = []
in_window = (ret_d_adj.index > list_date[d]) & (ret_d_adj.index <= list_date[d + 1])
ret_d_adj_flt = ret_d_adj[in_window]
for s in range(len(df_strategy)):
    print(s)
    strategy_name = df_strategy['strategy'][s]
    strategy_mask = data['strategy'] == strategy_name
    for n in tqdm.tqdm(range(len(df_node))):
        node = df_node['node'][n]
        data_flt = list(data[strategy_mask & (data['tree_node'] == node)].code)
        sim = PortfolioSimulator(ret_d_adj_flt, data_flt, None, 1)
        sim.simulate()
        list_simulation.append(sim.return_series.to_frame(f'{strategy_name}_{node}'))

In [None]:

# ## 너무 느림 
# list_data = []
# for d in tqdm.tqdm(range(len(list_date))):
#     data = pd.read_pickle(f'{Directory.pkl_tmp_dix_factor_tree_node_universe}{list_date[d]}_tree_node.pkl')
#     data_flt = data[(data['strategy'] == df_strategy['strategy'][0])]
#     list_data.append(data_flt)
    

In [None]:
def wrapper_simulation(return_df : pd.DataFrame, rebal_date : list, port : pd.DataFrame, cost = 0.002):
    """Chain buy-and-hold simulations across rebalancing dates, net of trading cost.

    return_df : return time series, index - date, column - stock code, value - return
    rebal_date : list of rebalancing dates
    port : dataframe , column date, code, weight
                        date       code   weight
                        2005-12-29 Axxxx    0.25
                        2005-12-29 Axxxx    0.25
                        ...
                        2006-01-31 Axxxx    0.25
    cost : trading cost per unit of traded weight

    Returns the portfolio value series over all periods (pd.Series indexed by
    date, starting from a value of 1); empty when no period has any return row.

    For every pair of consecutive rebalancing dates the portfolio of the first
    date is held over ``first < date <= second``. At the end of the period the
    drifted weights are compared code by code with the target weights of the
    next date and ``cost`` is charged on the total absolute weight change.
    NOTE(review): charging ``cost`` on buys and sells alike, proportionally to
    the portfolio value, is an assumption -- confirm against the desk's
    cost convention.
    """
    result = []
    initial_value = 1
    for i in range(len(rebal_date) - 1):
        date_from, date_to = rebal_date[i], rebal_date[i + 1]

        in_window = (return_df.index > date_from) & (return_df.index <= date_to)
        return_df_cut = return_df[in_window]
        if return_df_cut.empty:
            # No return row in this period: the value simply carries over.
            continue

        holdings = port[port.date == date_from]
        code = holdings.code.to_list()
        weight = holdings.weight.to_list()

        sim = PortfolioSimulator(return_df_cut, code, weight, initial_value)
        sim.simulate()
        simulation = sim.cumulative_return_series.copy()

        ### cost: map the weights by code before comparing them, because the
        ### two portfolios need not hold the same codes in the same order
        drifted = pd.Series(sim.current_weights, index=code, dtype=float).groupby(level=0).sum()
        target = port[port.date == date_to].groupby('code')['weight'].sum()
        target_total = target.sum()
        if target_total != 0:
            # The simulator normalises weights to sum to 1; do the same here.
            target = target / target_total
            turnover = drifted.sub(target, fill_value=0).abs().sum()
            simulation.iloc[-1] = simulation.iloc[-1] * (1 - turnover * cost)
        # No target portfolio on the next date: nothing to trade into, no cost.

        # The next period starts from the value left after costs.
        initial_value = simulation.iloc[-1]
        result.append(simulation)

    if not result:
        return pd.Series(dtype=float)
    return pd.concat(result)