#### This code is the basis of the currently running Cate ML model

In [2]:
import pandas as pd
pd.options.plotting.backend = "plotly"

import plotly.express as px
import plotly.graph_objects as go

import numpy as np

from make_new_features import get_df_with_features
from get_krx_value import get_krx_mean

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

import warnings
warnings.filterwarnings(action='ignore', category=FutureWarning)

from xgboost import plot_importance
import matplotlib.pyplot as plt

#### Preparing Price Dataset

In [2]:
# Look-back window lengths passed to get_df_with_features and baked into the cache file name.
SHORT_PERIOD = 5
MID_PERIOD = 20
LONG_PERIOD = 60

# Single definition of the feature-cache path so the read and the write below cannot drift apart.
FEATS_CACHE_PATH = (
    "./df_with_feats_f_20130101_t_20230919_"
    f"{SHORT_PERIOD}_{MID_PERIOD}_{LONG_PERIOD}_with_min_max_ratio.pkl"
)

try:
    df_with_feats = pd.read_pickle(FEATS_CACHE_PATH)
except FileNotFoundError:
    # Cache miss only: any other failure (corrupt pickle, Ctrl-C, ...) should surface
    # instead of silently triggering the expensive rebuild.
    df_price = (
        pd.read_pickle("./df_price_price_only_f_20130101_t_20230919.pkl")
        # Keep codes whose 6th character is "0" (the original comment describes this as
        # "normal stocks, not SPAC" -- presumably common shares; confirm).
        .loc[lambda df: df["code"].str[5] == "0"]
        # Dates become "YYYY-MM-DD" strings so they merge with df_krx on equal terms;
        # .assign avoids writing into a filtered slice.
        .assign(date=lambda df: df["date"].dt.strftime("%Y-%m-%d"))
    )

    try:
        df_krx = pd.read_pickle("./df_krx.pkl")
    except FileNotFoundError:
        df_krx = get_krx_mean()
        df_krx.to_pickle("./df_krx.pkl")

    df_price = df_price.merge(df_krx, on="date")

    df_with_feats = get_df_with_features(
        df_price,
        SHORT_PERIOD=SHORT_PERIOD,
        MID_PERIOD=MID_PERIOD,
        LONG_PERIOD=LONG_PERIOD,
    )
    df_with_feats.to_pickle(FEATS_CACHE_PATH)

## ML

In [2]:
import xgboost as xgb
from sklearn.model_selection import train_test_split

In [4]:
# Modelling universe: add two turnover-trend ratios, then narrow the rows one rule at a time.
df_with_feats_ml = (
    df_with_feats
    .assign(
        vol_x_price_sma_long_to_mid=lambda d: d["vol_x_price_sma_long"] / d["vol_x_price_sma_mid"],
        vol_x_price_sma_mid_to_short=lambda d: d["vol_x_price_sma_mid"] / d["vol_x_price_sma_short"],
    )
    # 6th character of the code must be "0".
    [lambda d: d["code"].str[5] == '0']
    # Drop names containing either spelling used to exclude SPACs.
    [lambda d: ~d["name"].str.contains("스펙")]
    [lambda d: ~d["name"].str.contains("스팩")]
    # Mid-window turnover band (exclusive on both ends).
    [lambda d: d["vol_x_price_sma_mid"].gt(0.75e+08)]
    [lambda d: d["vol_x_price_sma_mid"].lt(3.5e+08)]
    # No zero-volume day inside the mid window, and the row itself must have traded.
    [lambda d: d["vol_zero_count_mid"].eq(0)]
    [lambda d: d["change"].lt(0.29)]
    [lambda d: d["volume"].gt(0)]
)

In [6]:
# # All Available Features
# for col in df_with_feats_ml.columns:
#     print(f"'{col}',")

#### EDA & Selecting Features

In [None]:
# Feature columns used for the correlation screen and passed to run_ml_multi_seed.
# Commented-out names are available columns that are currently excluded
# (presumably dropped after the high-correlation screen in the next cell -- confirm).
# The last two entries are the ratios derived when df_with_feats_ml is built.
feats = [
        'change_high',
        'change_low',
        'change_open',
        'close_std_short',
        # 'close_std_mid',
        'close_std_long',
        # 'close_mean_short',
        # 'close_mean_mid',
        # 'close_mean_long',
        'close_min_ratio_short',
        'close_min_ratio_mid',
        'close_min_ratio_long',
        'close_max_ratio_short',
        'close_max_ratio_mid',
        'close_max_ratio_long',
        'open_std_short',
        # 'open_std_mid',
        # 'open_std_long',
        'high_std_short',
        'high_std_mid',
        'high_std_long',
        'low_std_short',
        'low_std_mid',
        # 'low_std_long',
        # 'vol_zero_count_short',
        # 'vol_zero_count_mid',
        # 'vol_zero_count_long',
        'close_change_p_short',
        'close_change_p_mid',
        'close_change_p_long',
        'w_price_vol_corr_long',
        'w_price_vol_corr_mid',
        'w_price_vol_corr_short',
        'krx_corr_short',
        'krx_corr_mid',
        'krx_corr_long',
        'krx_change_std_short',
        'krx_change_std_mid',
        'krx_change_std_long',
        'vol_x_price_sma_long_to_mid',
        'vol_x_price_sma_mid_to_short'
    ]

In [None]:
# Pairwise correlations between the selected feature columns.
corr_matrix = df_with_feats_ml[feats].corr()

# Mask everything except the strict upper triangle so each pair is reported once
# and the diagonal (self-correlation) is ignored.
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

# Collect (column, row) label pairs whose absolute correlation exceeds 0.75.
# Masked cells are NaN and never satisfy the comparison.
high_corr_pairs = [
    (col_label, row_label)
    for col_label in upper.columns
    for row_label, corr_value in upper[col_label].items()
    if abs(corr_value) > 0.75
]

# One pair per line.
for pair in high_corr_pairs:
    print(pair)

In [3]:
# Build the walk-forward schedule: fixed-length training windows, each followed
# (after a gap) by one out-of-sample prediction window.

import exchange_calendars as xcals
krx_cal = xcals.get_calendar("XKRX")

max_date = '2023-09-19'   # last date an out-of-sample window may reach
start_date = '2015-01-02'  # first training session

finish = False
dates = []  # tuples of (train_start, train_end, oos_start, oos_end) as "YYYY-MM-DD" strings

len_of_train = 200            # sessions per training window
gap_from_last_train_date = 7  # depends on the target horizon (rtn_5 -> 7, rtn_20 -> 22)
len_of_pred = 20              # sessions predicted per model refresh

while not finish:
    # sessions_window(d, k)[-1] is the last of k sessions counted from d.
    train_end = krx_cal.sessions_window(start_date, len_of_train)[-1].strftime("%Y-%m-%d")
    oos_start = krx_cal.sessions_window(train_end, gap_from_last_train_date)[-1].strftime("%Y-%m-%d")
    oos_end = krx_cal.sessions_window(oos_start, len_of_pred)[-1].strftime("%Y-%m-%d")

    # The window that would overshoot max_date is clipped to it and ends the schedule.
    finish = oos_end > max_date
    dates.append((start_date, train_end, oos_start, max_date if finish else oos_end))

    if not finish:
        # Slide the training start forward by one prediction-window length.
        start_date = krx_cal.sessions_window(start_date, len_of_pred)[-1].strftime("%Y-%m-%d")

#### Create df_oos_with_proba

In [None]:
def run_ml_multi_seed(features, idx0, idx1, fixed, target):
    """Walk-forward train/predict over ``dates[idx0:idx1]`` with a small seed ensemble.

    For every ``(train_start, train_end, oos_start, oos_end)`` window, the training
    rows are labelled 1 when ``target`` exceeds a threshold. One XGBoost booster is
    fitted per train/validation split seed, and every booster scores the
    out-of-sample rows. The per-row mean and max of those scores are stored as
    ``proba_mean`` / ``proba_max``, and four top-5-per-day selection variants are
    summarised into one return record per window.

    Parameters
    ----------
    features : list of str
        Feature column names used for training and prediction.
    idx0, idx1 : int
        Slice bounds applied to the module-level ``dates`` list.
    fixed : bool
        True labels with a fixed 0.03 threshold; False uses the 75th percentile
        of ``target`` in the cleaned training rows.
    target : str
        Forward-return column such as ``'rtn_5'``. Its trailing integer plus one
        is the divisor applied to each window's summed daily returns.

    Returns
    -------
    rtn_in_period : list of dict
        One record per window with keys ``date`` (the window's oos_end),
        ``return_mean_mean``, ``return_mean_max``, ``return_max_mean``,
        ``return_max_max`` and ``date_n``.
    l_df : list of pandas.DataFrame
        The scored out-of-sample frame of every non-empty window.

    Notes
    -----
    Reads the module-level ``dates`` and ``df_with_feats_ml``.
    """
    THRES_OF_PROBA = 0.5
    TOP_N_PER_DAY = 5
    COST_PER_TRADE = 0.0023   # flat deduction from each daily mean return (presumably trading cost -- confirm)
    SPLIT_SEEDS = [6, 13, 5, 2]

    feats = features
    holding_divisor = int(target.split('_')[-1]) + 1

    # Loop-invariant booster settings.
    params = {
        'max_depth': 5,
        'eta': 0.05,                 # learning rate
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',
        # NOTE(review): the native xgb.train API documents `seed`; confirm that
        # `random_state` is honoured here rather than ignored.
        'random_state': 42,
    }

    def _clean(df):
        # Rows with a missing or infinite feature value cannot be used by the model.
        return (
            df
            .replace([-np.inf, np.inf], np.nan)
            .dropna(subset=feats)
            .reset_index(drop=True)
        )

    def _daily_net_return(df, filter_col, sort_col):
        # Per date: mean target of the top-N rows (ranked by sort_col) among rows
        # whose filter_col clears the probability threshold, minus the flat cost.
        return (
            df
            .loc[lambda d: d[filter_col] > THRES_OF_PROBA]
            .sort_values(sort_col, ascending=False)
            .groupby('date')
            .head(TOP_N_PER_DAY)
            .groupby('date')[target]
            .mean()
            - COST_PER_TRADE
        )

    rtn_in_period = []
    l_df = []

    if fixed:
        print("fixed")
    else:
        print("float")

    for train_start, train_end, oos_start, oos_end in dates[idx0:idx1]:

        df_train_set_ = _clean(
            df_with_feats_ml
            .loc[lambda df: df.date >= train_start]
            .loc[lambda df: df.date < train_end]
        )
        df_oos_set_ = _clean(
            df_with_feats_ml
            .loc[lambda df: df.date >= oos_start]
            .loc[lambda df: df.date < oos_end]
        )

        # Nothing to score in this window: record zeros and move on *before*
        # any XGBoost object is built from an empty frame.
        if df_oos_set_.shape[0] == 0:
            rtn_in_period.append(
                {
                    'date': oos_end,
                    "return_mean_mean": 0,
                    'return_mean_max': 0,
                    'return_max_mean': 0,
                    'return_max_max': 0,
                    'date_n': 0,
                }
            )
            continue

        if fixed:
            THRES_OF_TRUE = 0.03
        else:
            THRES_OF_TRUE = df_train_set_[target].quantile(0.75)

        feature_df = df_train_set_[feats]
        # Binary label; a NaN target compares False and is labelled 0, as before.
        target_df = (df_train_set_[target] > THRES_OF_TRUE).astype(int)

        doos = xgb.DMatrix(df_oos_set_[feats])

        col_list = []
        for seed_idx, seed in enumerate(SPLIT_SEEDS):
            # Only the train/validation split changes between ensemble members.
            X_train, X_valid, y_train, y_valid = train_test_split(
                feature_df, target_df, test_size=0.25, random_state=seed
            )

            dtrain = xgb.DMatrix(X_train, label=y_train)
            dvalid = xgb.DMatrix(X_valid, label=y_valid)

            model = xgb.train(
                dict(params), dtrain,
                1000,
                # 'eval' is listed last, so early stopping monitors it.
                evals=[(dtrain, 'train'), (dvalid, 'eval')],
                early_stopping_rounds=50,
                verbose_eval=False,
            )

            proba_col = f"pred_proba_{seed_idx}"
            df_oos_set_[proba_col] = model.predict(doos)
            col_list.append(proba_col)

        df_oos_set_["proba_mean"] = df_oos_set_[col_list].mean(axis=1)
        df_oos_set_["proba_max"] = df_oos_set_[col_list].max(axis=1)

        l_df.append(df_oos_set_)

        # Naming: sr_selected_<filter>_<sort>.
        sr_selected_mean_mean = _daily_net_return(df_oos_set_, 'proba_mean', 'proba_mean')
        sr_selected_mean_max = _daily_net_return(df_oos_set_, 'proba_mean', 'proba_max')
        sr_selected_max_mean = _daily_net_return(df_oos_set_, 'proba_max', 'proba_mean')
        sr_selected_max_max = _daily_net_return(df_oos_set_, 'proba_max', 'proba_max')

        rtn_in_period.append(
            {
                'date': oos_end,
                "return_mean_mean": sr_selected_mean_mean.sum() / holding_divisor,
                'return_mean_max': sr_selected_mean_max.sum() / holding_divisor,
                'return_max_mean': sr_selected_max_mean.sum() / holding_divisor,
                'return_max_max': sr_selected_max_max.sum() / holding_divisor,
                # Number of dates that produced at least one selection.
                'date_n': sr_selected_mean_max.shape[0],
            }
        )

    return rtn_in_period, l_df

In [None]:
# Run the walk-forward back-test for each target horizon and both labelling modes,
# persist the scored out-of-sample rows, and print summary statistics per result column.
for feat_var, target_var in [(feats, 'rtn_3'), (feats, 'rtn_4'), (feats, 'rtn_5')]:

    feats = feat_var
    target = target_var

    for tf in [True, False]:

        print(target)

        # idx1=-1 slices dates[0:-1], i.e. the final window of the schedule is not run.
        oos_test_result, dfs = run_ml_multi_seed(feats, 0, -1, tf, target)
        df_oos_result = pd.DataFrame(oos_test_result).set_index('date')

        # tf=True -> fixed label threshold, tf=False -> percentile ("float") threshold.
        if tf:
            df_oos_with_proba_fixed = pd.concat(dfs)
            df_oos_with_proba_fixed.to_pickle(f"df_oos_with_proba_fixed_{target}.pkl")
        else:
            df_oos_with_proba_float = pd.concat(dfs)
            df_oos_with_proba_float.to_pickle(f"df_oos_with_proba_float_{target}.pkl")

        for col in df_oos_result.columns:
            sr_org = df_oos_result[col]
            sr = sr_org + 1
            cum_growth = sr.cumprod()

            print(col)
            # The index holds date strings, so the last element must be taken by
            # position (.iloc), not with the label-style [-1].
            print(f"Prod : {round(cum_growth.iloc[-1],2)}")
            print(f"Prod Max : {round(cum_growth.max(),2)}")
            print(f"Prod Min : {round(cum_growth.min(),2)}")
            print(f"min : {round(sr_org.min(),3)}")
            print(f"std : {round(sr_org.std(),3)}")
            print(f"mean : {round(sr_org.mean(),3)}")
            print('\n')


#### Get Best Thresholds for Proba

In [13]:
def objective(trial, df_proba, rtn_n, date_windows=None):
    """Optuna objective: compounded growth of the union of four top-3 selections.

    Four probability thresholds are sampled from ``trial``. Rows are first
    restricted to ``weighted_price_change_p_short >= weighted_price_change_p_mid``.
    Inside every out-of-sample window, four selections are then made, each keeping
    the top 3 rows per date:

    ====================  ==============  ============
    sampled threshold     filter column   sort column
    ====================  ==============  ============
    ``thres_for_max``     ``proba_max``   ``proba_max``
    ``thres_for_mean``    ``proba_max``   ``proba_mean``
    ``thres_mean_max``    ``proba_mean``  ``proba_max``
    ``thres_mean_mean``   ``proba_mean``  ``proba_mean``
    ====================  ==============  ============

    The selections are unioned (de-duplicated on date and code). Each date's mean
    ``rtn_n`` minus 0.0023 is divided by ``n`` (trailing integer of ``rtn_n`` + 1),
    and the results are summed into one return per window.

    Parameters
    ----------
    trial : optuna.trial.Trial (or any object exposing ``suggest_float``)
    df_proba : pandas.DataFrame
        Needs ``date``, ``code``, ``proba_max``, ``proba_mean``, the two
        ``weighted_price_change_p_*`` columns and ``rtn_n``.
    rtn_n : str
        Return column name such as ``'rtn_3'``.
    date_windows : sequence of tuples, optional
        Windows whose elements 2 and 3 are the inclusive start and exclusive end
        of the out-of-sample range. Defaults to the module-level ``dates``.

    Returns
    -------
    float
        Final value of the compounded ``(1 + window return)`` series.
    """
    thres01 = trial.suggest_float("thres_for_max", 0.3, 0.9, step=0.01)
    thres02 = trial.suggest_float("thres_for_mean", 0.3, 0.9, step=0.01)
    thres03 = trial.suggest_float("thres_mean_max", 0.3, 0.9, step=0.01)
    thres04 = trial.suggest_float("thres_mean_mean", 0.3, 0.9, step=0.01)

    windows = dates if date_windows is None else date_windows
    n = int(rtn_n.split('_')[1]) + 1

    # (filter column, threshold, sort column) for the four selections.
    selection_rules = [
        ('proba_max', thres01, 'proba_max'),
        ('proba_max', thres02, 'proba_mean'),
        ('proba_mean', thres03, 'proba_max'),
        ('proba_mean', thres04, 'proba_mean'),
    ]

    # The short-vs-mid filter does not depend on the window, so apply it once.
    eligible = df_proba.loc[
        lambda df: df['weighted_price_change_p_short'] >= df['weighted_price_change_p_mid']
    ]

    l_rtn_p = []

    for date in windows:

        df_ = eligible.loc[lambda df: (df.date >= date[2]) & (df.date < date[3])]

        # Union of the four top-3-per-date selections, built once and reused below.
        picks = pd.concat(
            [
                df_
                .loc[df_[filter_col] > thres]
                .sort_values(sort_col, ascending=False)
                .groupby('date')
                .head(3)
                .reset_index()
                for filter_col, thres, sort_col in selection_rules
            ]
        ).drop_duplicates(subset=['date', 'code'])

        sr = picks.groupby('date')[rtn_n].mean() - 0.0023

        l_rtn_p.append(
            {
                'date': date[3],
                'rtn_p': (sr / n).sum(),
                'days': picks['date'].nunique(),
            }
        )

    df_rtn = pd.DataFrame(l_rtn_p)

    # Total number of dates with at least one selection (progress/diagnostic output).
    print(df_rtn['days'].sum())

    sr_final = df_rtn.set_index('date')['rtn_p']

    # The index holds date strings, so take the last element by position.
    return (sr_final + 1).cumprod().iloc[-1]


In [14]:
import optuna

In [15]:
# Optuna study run
def run_optuna(name, df_, rtn_n, n_trials=400):
    """Maximise ``objective`` over the four probability thresholds and save the trials.

    Parameters
    ----------
    name : str
        Study name; also embedded in the output file name.
    df_ : pandas.DataFrame
        Scored out-of-sample rows forwarded to ``objective``.
    rtn_n : str
        Return column name forwarded to ``objective`` (e.g. ``'rtn_3'``).
    n_trials : int, default 400
        Number of Optuna trials to run.

    Side effects
    ------------
    Writes the trials table to ``./study_result_20240217_{name}.pkl`` with the
    ``params_`` prefix stripped from the column names. Returns nothing.
    """
    study = optuna.create_study(
        direction="maximize",
        study_name=name,
    )

    study.optimize(
        lambda trial: objective(trial, df_, rtn_n),
        n_trials=n_trials,
        show_progress_bar=True,
    )

    # Later cells read the thresholds by their bare names, hence the prefix removal.
    df_study_result = (
        study.trials_dataframe()
        .rename(columns=lambda x: x.replace("params_", ""))
    )
    df_study_result.to_pickle(f"./study_result_20240217_{name}.pkl")

In [None]:
# One threshold search per (return horizon, labelling mode) pair.
for rtn_n in ['rtn_3', 'rtn_4', 'rtn_5']:
    for tf in [True, False]:

        if tf:
            type_str = 'fixed'
        else:
            type_str = 'float'

        # Scored out-of-sample rows written by the back-test cell.
        df_with_proba = pd.read_pickle(f"df_oos_with_proba_{type_str}_{rtn_n}.pkl")

        # Natural join (no explicit key) with the two momentum columns used by the objective.
        df_with_proba = df_with_proba.merge(
            df_with_feats_ml[['code','date','weighted_price_change_p_short', 'weighted_price_change_p_mid']]
        )

        # Rows lacking either momentum value cannot be compared by the objective's filter.
        df_with_proba = df_with_proba.dropna(
            subset=['weighted_price_change_p_short', 'weighted_price_change_p_mid']
        )

        run_optuna(f"best_combi_of_thres_{type_str}_{rtn_n}", df_with_proba, rtn_n)

In [18]:
def get_df_rtn_result(df_with_proba, thres_01, thres_02, thres_03, thres_04, rtn, date_windows=None):
    """Per-window return of the union of four top-3 selections at given thresholds.

    Rows are first restricted to
    ``weighted_price_change_p_short >= weighted_price_change_p_mid``. Inside every
    out-of-sample window, four selections are made, each keeping the top 3 rows
    per date:

    * ``proba_max  > thres_01`` ranked by ``proba_max``
    * ``proba_max  > thres_02`` ranked by ``proba_mean``
    * ``proba_mean > thres_03`` ranked by ``proba_max``
    * ``proba_mean > thres_04`` ranked by ``proba_mean``

    The selections are unioned (de-duplicated on date and code). Each date's mean
    ``rtn`` minus 0.0023 is divided by ``n`` (trailing integer of ``rtn`` + 1),
    and the results are summed over the window.

    Parameters
    ----------
    df_with_proba : pandas.DataFrame
        Needs ``date``, ``code``, ``proba_max``, ``proba_mean``, the two
        ``weighted_price_change_p_*`` columns and ``rtn``.
    thres_01, thres_02, thres_03, thres_04 : float
        Probability thresholds, used as listed above.
    rtn : str
        Return column name such as ``'rtn_3'``.
    date_windows : sequence of tuples, optional
        Windows whose elements 2 and 3 are the inclusive start and exclusive end
        of the out-of-sample range. Defaults to the module-level ``dates``.

    Returns
    -------
    pandas.Series
        Named ``rtn_p``, indexed by each window's end date (index name ``date``).
    """
    windows = dates if date_windows is None else date_windows
    n = int(rtn.split('_')[1]) + 1

    # (filter column, threshold, sort column) for the four selections.
    selection_rules = [
        ('proba_max', thres_01, 'proba_max'),
        ('proba_max', thres_02, 'proba_mean'),
        ('proba_mean', thres_03, 'proba_max'),
        ('proba_mean', thres_04, 'proba_mean'),
    ]

    # The short-vs-mid filter does not depend on the window, so apply it once.
    eligible = df_with_proba.loc[
        lambda df: df['weighted_price_change_p_short'] >= df['weighted_price_change_p_mid']
    ]

    l_rtn_p = []

    for date in windows:

        df_ = eligible.loc[lambda df: (df.date >= date[2]) & (df.date < date[3])]

        # Union of the four top-3-per-date selections.
        picks = pd.concat(
            [
                df_
                .loc[df_[filter_col] > thres]
                .sort_values(sort_col, ascending=False)
                .groupby('date')
                .head(3)
                .reset_index()
                for filter_col, thres, sort_col in selection_rules
            ]
        ).drop_duplicates(subset=['date', 'code'])

        sr = picks.groupby('date')[rtn].mean() - 0.0023

        l_rtn_p.append({'date': date[3], 'rtn_p': (sr / n).sum()})

    return (
        pd.DataFrame(l_rtn_p)
        .set_index('date')
        ['rtn_p']
    )

In [19]:
from scipy import stats

In [25]:
def get_rsquare(df_with_proba, rtn, row):
    """R-squared of a straight-line fit to the cumulative per-window return curve.

    The thresholds are read from ``row`` (attributes ``thres_for_max``,
    ``thres_for_mean``, ``thres_mean_max``, ``thres_mean_mean``) and passed to
    ``get_df_rtn_result``. The cumulative sum of the resulting series is regressed
    on the window index 0..N-1.

    Parameters
    ----------
    df_with_proba : pandas.DataFrame
        Scored out-of-sample rows, forwarded to ``get_df_rtn_result``.
    rtn : str
        Return column name such as ``'rtn_3'``.
    row : pandas.Series or similar
        One study-result row carrying the four threshold attributes.

    Returns
    -------
    float
        Squared correlation coefficient, or NaN when fewer than two windows are
        available. The value is also printed.
    """
    sr_final = get_df_rtn_result(
        df_with_proba,
        row.thres_for_max, row.thres_for_mean, row.thres_mean_max, row.thres_mean_mean,
        rtn,
    )

    cum_rtn = sr_final.cumsum()

    if len(cum_rtn) < 2:
        # A line fit needs at least two points.
        r_squared = float("nan")
    else:
        # The window index is the regressor. R-squared is symmetric in the two
        # variables, but a flat return curve is then a constant response
        # instead of a constant regressor, which linregress cannot fit.
        window_idx = np.arange(len(cum_rtn))
        r_squared = stats.linregress(window_idx, cum_rtn).rvalue ** 2

    print(f"R-squared: {r_squared}")
    return r_squared

#### Add R-Squared of Cumsum Series

In [27]:
# For every case, take the 50 best distinct threshold combinations from the
# Optuna study, attach the R-squared of their cumulative return curve, and save.
typeStr = None

for useFixed in [True, False]:
    typeStr = 'fixed' if useFixed else 'float'

    for rtn in ['rtn_3', 'rtn_4','rtn_5']:
        # Scored rows plus the two momentum columns; rows missing either are dropped.
        df_with_proba = pd.read_pickle(f"df_oos_with_proba_{typeStr}_{rtn}.pkl")
        df_with_proba = df_with_proba.merge(
            df_with_feats_ml[['code','date','weighted_price_change_p_short', 'weighted_price_change_p_mid']]
        )
        df_with_proba = df_with_proba.dropna(
            subset=['weighted_price_change_p_short', 'weighted_price_change_p_mid']
        )

        df_study_result = pd.read_pickle(f"study_result_20240217_best_combi_of_thres_{typeStr}_{rtn}.pkl")

        # Distinct threshold sets only, ranked by objective value, top 50.
        df_study_result = (
            df_study_result
            .drop_duplicates(subset=['thres_for_max', 'thres_for_mean', 'thres_mean_max', 'thres_mean_mean'])
            .sort_values("value", ascending=False)
            .head(50)
        )

        # Row-wise R-squared; get_rsquare prints each value as it is computed.
        df_study_result = df_study_result.assign(
            r_square=lambda df: df.apply(lambda row: get_rsquare(df_with_proba, rtn, row), axis=1)
        )

        df_study_result.to_pickle(f"study_result_add_r2_{typeStr}_{rtn}.pkl")
        

R-squared: 0.9660677112234342
R-squared: 0.9660677112234342
R-squared: 0.9661230405072893
R-squared: 0.9661230405072893
R-squared: 0.9661230405072893
R-squared: 0.9661230405072893
R-squared: 0.965628905788972
R-squared: 0.9656916183288693
R-squared: 0.9656916183288693
R-squared: 0.9657815670544935
R-squared: 0.9657815670544935
R-squared: 0.9658192765303161
R-squared: 0.9658192765303161
R-squared: 0.9658192765303161
R-squared: 0.9658192765303161
R-squared: 0.9653846818189821
R-squared: 0.9653846818189821
R-squared: 0.9659415827738057
R-squared: 0.9653699681628315
R-squared: 0.9655955333007891
R-squared: 0.9655955333007891
R-squared: 0.9652092801781328
R-squared: 0.9650972797014882
R-squared: 0.9649936392395431
R-squared: 0.9649936392395431
R-squared: 0.9639908761133945
R-squared: 0.9642056527287035
R-squared: 0.9636457408770827
R-squared: 0.9642598494230092
R-squared: 0.9644473789420402
R-squared: 0.963747564151217
R-squared: 0.9634465265638502
R-squared: 0.9648036153817274
R-squared: 0

#### Get Best R2 Results

In [80]:
# For every case, pick the threshold combination with the highest R-squared and
# recompute its per-window return series.
l_result = []       # one return Series per case, named "<rtn>_<typeStr>"
l_best_combi = []   # one single-row frame per case describing the chosen combination

for typeStr in ['fixed', 'float']:

    for rtn in ['rtn_3', 'rtn_4', 'rtn_5']:

        case_name = f"{rtn}_{typeStr}"
        print(case_name)

        df_with_proba = (
            pd.read_pickle(f"df_oos_with_proba_{typeStr}_{rtn}.pkl")
            .merge(
                df_with_feats_ml[['code','date','weighted_price_change_p_short', 'weighted_price_change_p_mid']]
            )
        )
        df_study_result_with_rs = pd.read_pickle(f"study_result_add_r2_{typeStr}_{rtn}.pkl")

        percentile_90 = df_study_result_with_rs['r_square'].quantile(0.9)

        # Best row among those at or above the 90th percentile of R-squared.
        # .assign returns a new frame, so no value is written into a slice.
        best_in_top_decile = (
            df_study_result_with_rs.loc[lambda df: df['r_square'] >= percentile_90]
            .sort_values('r_square', ascending=False)
            .head(1)
            .assign(case=case_name)
        )
        l_best_combi.append(best_in_top_decile)

        max_row = df_study_result_with_rs.sort_values('r_square', ascending=False).head(1)
        print(max_row)
        thres_for_max = max_row['thres_for_max'].iloc[0]
        thres_for_mean = max_row['thres_for_mean'].iloc[0]
        thres_mean_max = max_row['thres_mean_max'].iloc[0]
        thres_mean_mean = max_row['thres_mean_mean'].iloc[0]

        sr_final = get_df_rtn_result(df_with_proba, thres_for_max, thres_for_mean, thres_mean_max, thres_mean_mean, rtn)
        # Setting .name is what labels the column after concat. The old
        # Series.rename({...}) call mapped index labels and changed nothing.
        sr_final.name = case_name

        l_result.append(sr_final)
        print('\n')
        print('\n')

rtn_3_fixed
     number       value             datetime_start          datetime_complete  \
144     144  132.612399 2024-02-17 17:41:17.311245 2024-02-17 17:41:26.919777   

                  duration  thres_for_max  thres_for_mean  thres_mean_max  \
144 0 days 00:00:09.608532           0.86            0.51            0.88   

     thres_mean_mean     state  r_square  
144             0.53  COMPLETE  0.966123  


rtn_4_fixed
     number      value             datetime_start          datetime_complete  \
288     288  85.112958 2024-02-17 20:18:43.150731 2024-02-17 20:18:52.493840   

                  duration  thres_for_max  thres_for_mean  thres_mean_max  \
288 0 days 00:00:09.343109           0.89             0.3            0.67   

     thres_mean_mean     state  r_square  
288              0.7  COMPLETE  0.991692  


rtn_5_fixed
     number      value             datetime_start          datetime_complete  \
140     140  60.045734 2024-02-17 22:04:29.963007 2024-02-17 22:04:39.5358

In [81]:
# One row per case: the selected threshold combination, tagged by its `case` column.
pd.concat(l_best_combi)

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,thres_for_max,thres_for_mean,thres_mean_max,thres_mean_mean,state,r_square,case
144,144,132.612399,2024-02-17 17:41:17.311245,2024-02-17 17:41:26.919777,0 days 00:00:09.608532,0.86,0.51,0.88,0.53,COMPLETE,0.966123,rtn_3_fixed
288,288,85.112958,2024-02-17 20:18:43.150731,2024-02-17 20:18:52.493840,0 days 00:00:09.343109,0.89,0.3,0.67,0.7,COMPLETE,0.991692,rtn_4_fixed
140,140,60.045734,2024-02-17 22:04:29.963007,2024-02-17 22:04:39.535849,0 days 00:00:09.572842,0.85,0.3,0.83,0.34,COMPLETE,0.990326,rtn_5_fixed
379,379,121.006597,2024-02-17 19:24:45.215560,2024-02-17 19:24:55.007103,0 days 00:00:09.791543,0.86,0.3,0.55,0.57,COMPLETE,0.996455,rtn_3_float
225,225,134.125601,2024-02-17 21:13:10.039117,2024-02-17 21:13:19.773068,0 days 00:00:09.733951,0.86,0.9,0.75,0.43,COMPLETE,0.9952,rtn_4_float
224,224,82.589792,2024-02-18 05:50:58.912852,2024-02-18 05:51:12.680491,0 days 00:00:13.767639,0.9,0.35,0.79,0.64,COMPLETE,0.989318,rtn_5_float


In [43]:
# Place the per-case return series side by side: one column per case, one row per window end date.
df_result_opt = pd.concat(l_result, axis=1)

In [44]:
# Persist the combined per-window returns; a later cell reloads this file.
df_result_opt.to_pickle("df_oos_opt_result_applied.pkl")

In [65]:
# Additive (cumsum) equity curve of the rtn_3 / fixed-threshold case, one marker per window.
(
    df_result_opt['rtn_3_fixed']
    # + 1
).cumsum().plot(kind='scatter')

In [68]:
# Compounded (cumprod of 1 + return) equity curve of the rtn_3 / float-threshold case.
(
    df_result_opt['rtn_3_float']
    + 1
).cumprod().plot(kind='scatter')

In [49]:
# Side-by-side view of the equal-weight blend of rtn_3_float and rtn_4_float:
# additive growth on the left, compounded growth on the right.
import plotly.graph_objects as go
from plotly.subplots import make_subplots

x = df_result_opt.index
# y1: running sum of the blended per-window return.
y1 = df_result_opt[['rtn_3_float', 'rtn_4_float']].mean(axis=1).cumsum()
# y2: running product of (1 + blended per-window return).
y2 = (df_result_opt[['rtn_3_float', 'rtn_4_float']].mean(axis=1)+1).cumprod()

# Create a subplot with 1 row and 2 columns
fig = make_subplots(rows=1, cols=2)

# Left panel: additive curve, drawn as markers
fig.add_trace(
    go.Scatter(x=x, y=y1, mode='markers', name='Simple'),
    row=1, col=1
)

# Right panel: compounded curve, also a marker scatter (not a bar plot)
fig.add_trace(
    go.Scatter(x=x, y=y2, mode='markers', name='Complex'),
    row=1, col=2
)

# Optionally adjust layout
fig.update_layout(height=400, width=800,)

# Show the plot
fig.show()


In [62]:
# Check VaR
# Tail check per float-threshold case: mean window return, then the
# min / 5% / median / 95% / max quantiles.
for rtn in ['rtn_3', 'rtn_4', 'rtn_5']:

    for typeStr in ['float']:

        case_label = f"{rtn}_{typeStr}"
        window_returns = df_result_opt[case_label]

        print(case_label)
        print(window_returns.mean())
        print(window_returns.quantile([0, 0.05, 0.5, 0.95, 1]))

rtn_3_float
0.050166712534576836
0.00   -0.098400
0.05   -0.065166
0.50    0.047365
0.95    0.192131
1.00    0.252868
Name: rtn_3_float, dtype: float64
rtn_4_float
0.050945219826053614
0.00   -0.115113
0.05   -0.069883
0.50    0.047243
0.95    0.155576
1.00    0.301097
Name: rtn_4_float, dtype: float64
rtn_5_float
0.046362357982864096
0.00   -0.162534
0.05   -0.067549
0.50    0.044283
0.95    0.178135
1.00    0.233305
Name: rtn_5_float, dtype: float64


In [3]:
# Reload the combined per-window returns saved earlier, so the analysis below can
# run without repeating the optimisation.
df_result_opt = pd.read_pickle("df_oos_opt_result_applied.pkl")

In [5]:
# Preview: first ten windows, one column per case.
df_result_opt.head(10)

Unnamed: 0_level_0,rtn_3_fixed,rtn_4_fixed,rtn_5_fixed,rtn_3_float,rtn_4_float,rtn_5_float
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2015-11-26,-0.042859,0.015303,0.008373,0.011477,0.056448,-0.006817
2015-12-23,0.202229,0.058554,0.145738,0.210461,0.193962,0.11865
2016-01-22,0.0448,-0.018662,-0.021817,0.035769,0.016453,-0.030784
2016-02-23,0.120932,0.100118,0.10524,0.081189,0.12495,0.124872
2016-03-22,0.03851,0.153385,0.101362,0.107383,0.085174,0.137864
2016-04-19,-0.008258,0.21976,0.188984,0.180351,0.045118,0.191306
2016-05-18,0.046994,-0.04709,-0.070235,-0.065241,-0.052873,-0.082141
2016-06-15,0.199569,0.193908,0.196229,0.105825,0.155773,0.203059
2016-07-12,0.048098,0.095044,0.020315,0.039037,0.030076,0.020315
2016-08-08,0.04974,0.037977,-0.004322,0.022357,0.064742,0.013883


In [11]:
# Compounded equity curve of the equal-weight blend of the three float-threshold cases.
(
    df_result_opt[['rtn_3_float', 'rtn_4_float', 'rtn_5_float']].mean(axis=1)
    +1
).cumprod().plot(kind="scatter")

In [12]:
# NOTE(review): presumably the annualised growth factor for a ~114x cumulative
# return over ~8.6 years; neither constant is derived in this notebook -- confirm.
114**(1/8.6)

1.73450270996002

In [13]:
# Compounded equity curve of the rtn_3 / float-threshold case on the reloaded results.
(
    df_result_opt['rtn_3_float']
    +1
).cumprod().plot(kind="scatter")

In [14]:
# Pairwise correlation of per-window returns between the six cases.
df_result_opt.corr()

Unnamed: 0,rtn_3_fixed,rtn_4_fixed,rtn_5_fixed,rtn_3_float,rtn_4_float,rtn_5_float
rtn_3_fixed,1.0,0.625138,0.523552,0.718497,0.740857,0.525048
rtn_4_fixed,0.625138,1.0,0.824556,0.83494,0.753842,0.841853
rtn_5_fixed,0.523552,0.824556,1.0,0.800415,0.69134,0.902927
rtn_3_float,0.718497,0.83494,0.800415,1.0,0.774585,0.771783
rtn_4_float,0.740857,0.753842,0.69134,0.774585,1.0,0.70085
rtn_5_float,0.525048,0.841853,0.902927,0.771783,0.70085,1.0


In [17]:
# Standard deviation of per-window returns for rtn_3_fixed alone.
df_result_opt['rtn_3_fixed'].std()

0.09857133659198757

In [16]:

# Standard deviation of the equal-weight blend of rtn_3_fixed and rtn_5_float.
df_result_opt[['rtn_3_fixed', 'rtn_5_float']].mean(axis=1).std()

0.07543192507564465

In [18]:
# Standard deviation of per-window returns for rtn_5_float alone.
df_result_opt['rtn_5_float'].std()

0.07363327236866406

In [19]:
# Compounded equity curve of the equal-weight blend of rtn_3_fixed and rtn_5_float.
(
    df_result_opt[['rtn_3_fixed', 'rtn_5_float']].mean(axis=1)
    +1
).cumprod().plot(kind='scatter')

In [5]:
# Compare every blend that takes at most one case per horizon ('' = horizon left out):
# mean, standard deviation and mean/std ratio of the equal-weight per-window return.
for rtn3 in ['rtn_3_fixed', 'rtn_3_float', '']:
    for rtn4 in ['rtn_4_fixed', 'rtn_4_float', '']:
        for rtn5 in ['rtn_5_fixed', 'rtn_5_float', '']:

            cols = [case for case in (rtn3, rtn4, rtn5) if case != '']

            # With all three horizons left out there is nothing to average;
            # the statistics would all be NaN.
            if not cols:
                continue

            sr = df_result_opt[cols].mean(axis=1)

            print( f"combi : {rtn3} / {rtn4} / {rtn5}")
            print(f"mean : {sr.mean()}")
            print(f"std : {sr.std()}")

            # Per-window mean over std (not annualised, no risk-free rate subtracted).
            print(f"sharpe : {sr.mean()/sr.std()}")
            print('\n')

            

combi : rtn_3_fixed / rtn_4_fixed / rtn_5_fixed
mean : 0.04757255694481994
std : 0.07278096966336152
sharpe : 0.65364005405342


combi : rtn_3_fixed / rtn_4_fixed / rtn_5_float
mean : 0.048608249007069865
std : 0.07229897812108596
sharpe : 0.6723227667984604


combi : rtn_3_fixed / rtn_4_fixed / 
mean : 0.049731194519172746
std : 0.07852646327625361
sharpe : 0.6333049069613639


combi : rtn_3_fixed / rtn_4_float / rtn_5_fixed
mean : 0.0489563413613644
std : 0.0711077930673679
sharpe : 0.6884806748956882


combi : rtn_3_fixed / rtn_4_float / rtn_5_float
mean : 0.04999203342361436
std : 0.07058969331485399
sharpe : 0.7082058453014227


combi : rtn_3_fixed / rtn_4_float / 
mean : 0.051806871143989484
std : 0.07849748970949326
sharpe : 0.659981246989152


combi : rtn_3_fixed /  / rtn_5_fixed
mean : 0.04796190212901982
std : 0.07632717670222296
sharpe : 0.6283725430607074


combi : rtn_3_fixed /  / rtn_5_float
mean : 0.049515440222394715
std : 0.07543192507564465
sharpe : 0.6564255144322465

In [10]:
# Standard deviation of the equal-weight blend of rtn_3_float and rtn_4_float.
(df_result_opt[['rtn_3_float', 'rtn_4_float']]).mean(axis=1).std()

0.06713523624460727

In [32]:
# Distribution of per-window returns for the rtn_3_float / rtn_4_float blend:
# density-normalised histogram with a violin marginal.
data = df_result_opt[['rtn_3_float','rtn_4_float']].mean(axis=1)
fig = px.histogram(data, nbins=30, marginal="violin", histnorm='probability density')
fig.show()

In [41]:
# NOTE(review): presumably the annualised growth factor for a ~130.8x cumulative
# return over ~8.6 years; the source of both numbers is not shown here -- confirm.
(130.80380193278907)**(1/8.6)

1.7624574763697665

In [43]:
# NOTE(review): ad-hoc ratio; the meaning of 5.2 is not stated in this notebook
# (8.6 looks like the year count used in the annualisation cells) -- confirm.
5.2/8.6

0.6046511627906977

In [50]:
# (number of windows, number of cases)
df_result_opt.shape

(103, 6)

In [51]:
# Compounding a flat 5% per window over 103 windows (the row count reported by
# df_result_opt.shape) -- presumably a sanity benchmark for the ~5% mean window
# return seen above; confirm.
1.05**103

152.2291436143277