#### This code is the basis of the currently running Cate ML model

In [1]:
import pandas as pd
pd.options.plotting.backend = "plotly"

import plotly.express as px
import plotly.graph_objects as go

import numpy as np

from make_new_features import get_df_with_features
from get_krx_value import get_krx_mean

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

import warnings
warnings.filterwarnings(action='ignore', category=FutureWarning)

from xgboost import plot_importance
import matplotlib.pyplot as plt

#### Preparing Price Dataset

In [2]:
# Look-back window lengths handed to get_df_with_features (and baked into the cache file name).
SHORT_PERIOD = 5
MID_PERIOD = 10
LONG_PERIOD = 20

# Location of the cached feature table for this combination of window lengths.
FEATS_CACHE_PATH = (
    f"./df_with_feats_f_20130101_t_20230919_{SHORT_PERIOD}_{MID_PERIOD}_{LONG_PERIOD}_with_min_max_ratio.pkl"
)

try:
    df_with_feats = pd.read_pickle(FEATS_CACHE_PATH)
except FileNotFoundError:
    # Only a missing cache triggers the rebuild; any other failure (corrupt pickle,
    # interrupted read, ...) is surfaced instead of silently recomputing everything.

    # Keep rows whose code has '0' as its sixth character
    # (presumably ordinary shares -- confirm against the KRX code scheme).
    # SPAC names are filtered later, when the ML frame is built.
    df_price = (
        pd.read_pickle("./df_price_price_only_f_20130101_t_20230919.pkl")
        .loc[lambda df: df["code"].str[5] == "0"]
        # Dates are compared as 'YYYY-MM-DD' strings everywhere downstream.
        .assign(date=lambda df: df["date"].dt.strftime("%Y-%m-%d"))
    )

    try:
        df_krx = pd.read_pickle("./df_krx.pkl")
    except FileNotFoundError:
        df_krx = get_krx_mean()
        df_krx.to_pickle("./df_krx.pkl")

    # Inner join: price rows without a matching KRX row for that date are dropped.
    df_price = df_price.merge(
        df_krx,
        on='date'
    )

    df_with_feats = get_df_with_features(
        df_price, SHORT_PERIOD=SHORT_PERIOD, MID_PERIOD=MID_PERIOD, LONG_PERIOD=LONG_PERIOD)
    df_with_feats.to_pickle(FEATS_CACHE_PATH)

## ML

In [3]:
import xgboost as xgb
from sklearn.model_selection import train_test_split

In [4]:
# Summary statistics of the `vol_x_price_sma_mid` column.
df_with_feats['vol_x_price_sma_mid'].describe()

count    5.136201e+06
mean     6.584938e+09
std      4.550187e+10
min      0.000000e+00
25%      2.354808e+08
50%      9.026849e+08
75%      3.611053e+09
max      1.792124e+13
Name: vol_x_price_sma_mid, dtype: float64

In [5]:
# Build the modelling frame: add two ratio features, then narrow the universe.
df_with_feats_ml = (
    df_with_feats
    .assign(
        # Ratios between the long/mid and mid/short `vol_x_price_sma_*` columns.
        # A zero denominator yields +/-inf; those rows are discarded later, before training.
        vol_x_price_sma_long_to_mid=lambda df: df["vol_x_price_sma_long"] / df["vol_x_price_sma_mid"],
        vol_x_price_sma_mid_to_short=lambda df: df["vol_x_price_sma_mid"] / df["vol_x_price_sma_short"],
    )
    # Codes whose sixth character is '0'.
    .loc[lambda df: df["code"].str[5] == '0']
    # Drop names containing either SPAC spelling.
    .loc[lambda df: ~df["name"].str.contains("스펙")]
    .loc[lambda df: ~df["name"].str.contains("스팩")]
    # Band on `vol_x_price_sma_long` (previously tried bounds: 0.75e+08 .. 3.5e+08),
    # no zero-volume day in the long window, daily change below 0.29, non-zero volume today.
    .loc[
        lambda df: (
            (df["vol_x_price_sma_long"] > 3.5e+08)
            & (df["vol_x_price_sma_long"] < 9.0e+08)
            & (df["vol_zero_count_long"] == 0)
            & (df["change"] < 0.29)
            & (df["volume"] > 0)
        )
    ]

    # .loc[lambda df : df.close > 1000]
)

In [None]:
# # All Available Features
# for col in df_with_feats_ml.columns:
#     print(f"'{col}',")

#### EDA & Selecting Features

In [97]:
# Feature columns passed to the model and to the correlation check.
# Commented-out entries are candidate features that are currently disabled.
feats = [
        'change_high',
        'change_low',
        'change_open',
        'close_std_short',
        # 'close_std_mid',
        # 'close_std_long',
        # 'close_mean_short',
        # 'close_mean_mid',
        # 'close_mean_long',
        'close_min_ratio_short',
        # 'close_min_ratio_mid',
        # 'close_min_ratio_long',
        'close_max_ratio_short',
        # 'close_max_ratio_mid',
        # 'close_max_ratio_long',
        # 'open_std_short',
        'open_std_mid',
        # 'open_std_long',
        # 'high_std_short',
        'high_std_mid',
        # 'high_std_long',
        # 'low_std_short',
        # 'low_std_mid',
        # 'low_std_long',
        # 'vol_zero_count_short',
        # 'vol_zero_count_mid',
        # 'vol_zero_count_long',
        'close_change_p_short',
        'close_change_p_mid',
        'close_change_p_long',
        'w_price_vol_corr_long',
        'w_price_vol_corr_mid',
        'w_price_vol_corr_short',
        'krx_corr_short',
        'krx_corr_mid',
        'krx_corr_long',
        # 'krx_change_std_short',
        'krx_change_std_mid',
        # 'krx_change_std_long',
        'vol_x_price_sma_long_to_mid',
        'vol_x_price_sma_mid_to_short'
    ]

In [58]:
# feats = [
#         'change_high',
#         'change_low',
#         'change_open',
#         'close_std_short',
#         # 'close_std_mid',
#         'close_std_long',
#         # 'close_mean_short',
#         # 'close_mean_mid',
#         # 'close_mean_long',
#         # 'close_min_ratio_short',
#         'close_min_ratio_mid',
#         'close_min_ratio_long',
#         # 'close_max_ratio_short',
#         'close_max_ratio_mid',
#         'close_max_ratio_long',
#         'open_std_short',
#         # 'open_std_mid',
#         'open_std_long',
#         'high_std_short',
#         'high_std_mid',
#         'high_std_long',
#         'low_std_short',
#         # 'low_std_mid',
#         'low_std_long',
#         # 'vol_zero_count_short',
#         # 'vol_zero_count_mid',
#         # 'vol_zero_count_long',
#         # 'close_change_p_short',
#         'close_change_p_mid',
#         'close_change_p_long',
#         'w_price_vol_corr_long',
#         'w_price_vol_corr_mid',
#         'w_price_vol_corr_short',
#         'krx_corr_short',
#         'krx_corr_mid',
#         'krx_corr_long',
#         # 'krx_change_std_short',
#         # 'krx_change_std_mid',
#         # 'krx_change_std_long',
#         'vol_x_price_sma_long_to_mid',
#         'vol_x_price_sma_mid_to_short'
#     ]

In [98]:
# Absolute correlation above which a feature pair is reported as redundant.
HIGH_CORR_THRESHOLD = 0.8

# Pairwise correlation between the selected feature columns.
corr_matrix = df_with_feats_ml[feats].corr()

# Mask the lower triangle and the diagonal (set to NaN) so every pair is
# inspected once and a feature is never paired with itself.
upper = corr_matrix.where(~np.tril(np.ones(corr_matrix.shape)).astype(bool))

# Collect (column, row) pairs whose absolute correlation exceeds the threshold.
# NaN entries never satisfy the comparison, so masked cells are skipped.
high_corr_pairs = [
    (column, index)
    for column in upper.columns
    for index, value in upper[column].items()
    if abs(value) > HIGH_CORR_THRESHOLD
]

# Print the high correlation pairs
for pair in high_corr_pairs:
    print(pair)

In [100]:
# Walk-forward windows with a fixed-length training set.
# Each entry of `dates` is (train_start, train_end, oos_start, oos_end) as 'YYYY-MM-DD' strings.

import exchange_calendars as xcals
krx_cal = xcals.get_calendar("XKRX")

max_date = '2023-09-19'
# max_date = '2024-01-19'
start_date = '2015-01-02'

len_of_train = 100
gap_from_last_train_date = 7 # This number depends on the target ( rtn_5 -> 7, rtn_20 -> 22)
len_of_pred = 10 # The length of pred for 1 model update

dates = []
finish = False

while not finish:
    # Each boundary is the last session of a window that starts at the previous boundary.
    train_end = krx_cal.sessions_window(start_date, len_of_train)[-1].strftime("%Y-%m-%d")
    oos_start = krx_cal.sessions_window(train_end, gap_from_last_train_date)[-1].strftime("%Y-%m-%d")
    oos_end = krx_cal.sessions_window(oos_start, len_of_pred)[-1].strftime("%Y-%m-%d")

    # The window that would run past max_date is clipped to it and ends the loop.
    finish = oos_end > max_date
    dates.append(
        (start_date, train_end, oos_start, max_date if finish else oos_end)
    )

    if not finish:
        # Slide the training start forward for the next model update.
        start_date = krx_cal.sessions_window(start_date, len_of_pred)[-1].strftime("%Y-%m-%d")

#### Create df_oos_with_proba

In [101]:
def _mean_top_pick_return(df_oos, filter_col, rank_col, target, thres_of_proba, top_n=5, cost=0.0023):
    """Per-date mean `target` of the top-ranked picks, net of a fixed deduction.

    Rows whose `filter_col` exceeds `thres_of_proba` are ranked by `rank_col`
    (descending); the first `top_n` rows of every date are kept and their
    `target` values averaged per date, after which `cost` is subtracted.
    Returns a Series indexed by date (empty when no row passes the filter).
    """
    return (
        df_oos
        .loc[lambda df: df[filter_col] > thres_of_proba]
        .sort_values(rank_col, ascending=False)
        .groupby('date')
        .head(top_n)
        .groupby('date')[target]
        .mean()
        - cost
    )


def run_ml_multi_seed(features, idx0, idx1, fixed, target):
    """Walk-forward training/prediction over the windows in `dates[idx0:idx1]`.

    For every window an XGBoost binary classifier is trained once per split seed
    on the training slice of the module-level `df_with_feats_ml`, and each model
    scores the out-of-sample (OOS) slice. The per-seed probabilities are stored
    as `pred_proba_<i>` together with their row-wise `proba_mean` / `proba_max`.

    Parameters
    ----------
    features : list of str
        Feature column names used for training and prediction.
    idx0, idx1 : int
        Slice bounds applied to the module-level `dates` list.
    fixed : bool
        True  -> a row is labelled positive when `target` > 0.05.
        False -> the threshold is the 0.8 quantile of `target` in the training slice.
    target : str
        Name of the forward-return column; its trailing integer (e.g. 'rtn_5' -> 5)
        plus one is used as the divisor of the summed window return.

    Returns
    -------
    (rtn_in_period, l_df)
        rtn_in_period : list of dict, one per window, with keys 'date' (the window's
            oos_end), four 'return_*' values and 'date_n'.
        l_df : list of the scored OOS DataFrames (windows without OOS rows are omitted).
    """
    feats = features

    THRES_OF_PROBA = 0.5
    SPLIT_SEEDS = [6, 13]  # , 5, 2

    # (filter column, ranking column) behind each reported return series.
    selection_rules = {
        'return_mean_mean': ('proba_mean', 'proba_mean'),
        'return_mean_max': ('proba_mean', 'proba_max'),
        'return_max_mean': ('proba_max', 'proba_mean'),
        'return_max_max': ('proba_max', 'proba_max'),
    }

    # Divisor applied to the summed per-date returns of one window.
    holding_divisor = int(target.split('_')[-1]) + 1

    # Loop-invariant booster parameters.
    params = {
        'max_depth': 5,  # Adjust based on your dataset : originally 5 --> try 8
        'eta': 0.05,     # Learning rate : originally 0.05
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',  # Or use 'auc', 'error', etc. based on your problem
        'random_state': 42,
    }

    rtn_in_period = []
    l_df = []

    print("fixed" if fixed else "float")

    for train_start, train_end, oos_start, oos_end in dates[idx0:idx1]:

        # Both slices are half-open on the right: [start, end).
        df_oos_set_ = (
            df_with_feats_ml
            .loc[lambda df: (df.date >= oos_start) & (df.date < oos_end)]
            .replace([-np.inf, np.inf], np.nan)
            .dropna(subset=feats)
            .reset_index(drop=True)
        )

        if df_oos_set_.shape[0] == 0:
            # Nothing to score in this window: record a flat period and skip training
            # altogether (no DMatrix is built from an empty frame).
            rtn_in_period.append(
                {
                    'date': oos_end,
                    "return_mean_mean": 0,
                    'return_mean_max': 0,
                    'return_max_mean': 0,
                    'return_max_max': 0,
                    'date_n': 0
                }
            )
            continue

        df_train_set_ = (
            df_with_feats_ml
            .loc[lambda df: (df.date >= train_start) & (df.date < train_end)]
            .replace([-np.inf, np.inf], np.nan)
            .dropna(subset=feats)
            .reset_index(drop=True)
        )

        if fixed:
            THRES_OF_TRUE = 0.05
        else:
            THRES_OF_TRUE = df_train_set_[target].quantile(0.8)

        # Binary label; rows with a missing target compare False and are labelled 0.
        df_train_set_["target"] = (df_train_set_[target] > THRES_OF_TRUE).astype(int)

        feature_df = df_train_set_[feats]
        target_df = df_train_set_["target"]

        doos = xgb.DMatrix(df_oos_set_[feats])

        col_list = []
        for seed_idx, seed in enumerate(SPLIT_SEEDS):

            # The seed only changes the train/validation split; the booster seed stays fixed.
            X_train, X_valid, y_train, y_valid = train_test_split(
                feature_df, target_df, test_size=0.3, random_state=seed
            )

            # Create DMatrix for training and validation data
            dtrain = xgb.DMatrix(X_train, label=y_train)
            dvalid = xgb.DMatrix(X_valid, label=y_valid)

            model = xgb.train(
                dict(params), dtrain,
                1000,
                evals=[(dtrain, 'train'), (dvalid, 'eval')],
                early_stopping_rounds=50,
                verbose_eval=False
            )

            # NOTE(review): predict() is called without iteration_range, which looks like
            # it scores with every boosted round rather than the best early-stopping
            # iteration -- confirm this is intended.
            proba_col = f"pred_proba_{seed_idx}"
            df_oos_set_[proba_col] = model.predict(doos)
            col_list.append(proba_col)

        df_oos_set_["proba_mean"] = df_oos_set_[col_list].mean(axis=1)
        df_oos_set_["proba_max"] = df_oos_set_[col_list].max(axis=1)

        l_df.append(df_oos_set_)

        selected = {
            name: _mean_top_pick_return(df_oos_set_, filter_col, rank_col, target, THRES_OF_PROBA)
            for name, (filter_col, rank_col) in selection_rules.items()
        }

        period_result = {'date': oos_end}
        for name, sr_selected in selected.items():
            period_result[name] = sr_selected.sum() / holding_divisor
        # Number of dates with at least one pick under the mean-filter / max-rank rule.
        period_result['date_n'] = selected['return_mean_max'].shape[0]
        rtn_in_period.append(period_result)

        # plot_importance(model)
        # plt.show()

    return rtn_in_period, l_df

In [119]:
# Train/score every (feature set, target) combination and report cumulative OOS returns.
for feat_var, target_var in [(feats, 'rtn_5')]: # , (feats, 'rtn_4'), (feats, 'rtn_5')

    feats, target = feat_var, target_var

    for tf in [True] : #, False]:

        print(target)

        # idx1 = -1: the last entry of `dates` is not evaluated.
        oos_test_result, dfs = run_ml_multi_seed(feats, 0, -1, tf, target)
        df_oos_result = pd.DataFrame(oos_test_result).set_index('date')

        # Persist the scored OOS rows; the threshold-search cells read these pickles back.
        if not tf:
            df_oos_with_proba_float = pd.concat(dfs)
            df_oos_with_proba_float.to_pickle(f"df_oos_with_proba_float_{target}.pkl")
        else:
            df_oos_with_proba_fixed = pd.concat(dfs)
            df_oos_with_proba_fixed.to_pickle(f"df_oos_with_proba_fixed_{target}.pkl")

        for col in df_oos_result.columns:
            sr_org = df_oos_result[col]
            sr = sr_org + 1

            print(col)
            print(f"Cumsum : {round(sr_org.cumsum().iloc[-1],2)}")
            # print(f"Prod : {round(sr.cumprod()[-1],2)}")
            # print(f"Prod Max : {round(sr.cumprod().max(),2)}")
            # print(f"Prod Min : {round(sr.cumprod().min(),2)}")
            # print(f"min : {round(sr_org.min(),3)}")
            # print(f"std : {round(sr_org.std(),3)}")
            # print(f"mean : {round(sr_org.mean(),3)}")
            print('\n')


rtn_5
fixed
return_mean_mean
Cumsum : 1.79


return_mean_max
Cumsum : 1.7


return_max_mean
Cumsum : 2.25


return_max_max
Cumsum : 2.25


date_n
Cumsum : 1053




In [120]:
# Preview the first rows of the per-window out-of-sample results.
df_oos_result.head()

Unnamed: 0_level_0,return_mean_mean,return_mean_max,return_max_mean,return_max_max,date_n
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2015-06-19,-0.025599,-0.040014,-0.016572,-0.022543,9
2015-07-02,-0.051401,-0.049586,-0.046874,-0.050684,9
2015-07-15,0.099078,0.089589,0.090184,0.064137,9
2015-07-28,0.025418,0.022438,-0.030678,-0.039572,9
2015-08-10,0.0035,0.009718,-0.000479,0.005386,8


In [121]:
# Plot the running (non-compounded) sum of the 'return_max_mean' column.
df_oos_result['return_max_mean'].cumsum().plot()

#### Get Best Thresholds for Proba

In [122]:
def objective(trial, df_proba, rtn_n):
    """Optuna objective: compounded return of a four-rule stock selection.

    Four probability thresholds are sampled from [0.3, 0.9] in 0.01 steps. For
    every window in the module-level `dates`, rows of `df_proba` inside
    [oos_start, oos_end) with `weighted_price_change_p_short` >=
    `weighted_price_change_p_mid` are kept, and four candidate lists are built
    (at most 3 rows per date each):

    ================  =============  ============
    parameter         filter column  rank column
    ================  =============  ============
    thres_for_max     proba_max      proba_max
    thres_for_mean    proba_max      proba_mean
    thres_mean_max    proba_mean     proba_max
    thres_mean_mean   proba_mean     proba_mean
    ================  =============  ============

    The lists are pooled, de-duplicated on (date, code), and the per-date mean of
    `rtn_n` minus 0.0023 is divided by (N + 1), N being the integer after the
    first underscore of `rtn_n`, then summed into one return per window.

    Parameters
    ----------
    trial : optuna trial (only `suggest_float` is used)
    df_proba : DataFrame with 'date', 'code', 'proba_max', 'proba_mean', the two
        `weighted_price_change_p_*` columns and the `rtn_n` column.
    rtn_n : str, name of the return column, e.g. 'rtn_5'.

    Returns
    -------
    float
        Final value of the cumulative product of (1 + per-window return).
    """
    thres01 = trial.suggest_float("thres_for_max", 0.3, 0.9, step=0.01)
    thres02 = trial.suggest_float("thres_for_mean", 0.3, 0.9, step=0.01)
    thres03 = trial.suggest_float("thres_mean_max", 0.3, 0.9, step=0.01)
    thres04 = trial.suggest_float("thres_mean_mean", 0.3, 0.9, step=0.01)

    # (filter column, threshold, rank column) for the four candidate lists.
    selection_rules = [
        ('proba_max', thres01, 'proba_max'),
        ('proba_max', thres02, 'proba_mean'),
        ('proba_mean', thres03, 'proba_max'),
        ('proba_mean', thres04, 'proba_mean'),
    ]

    n = int(rtn_n.split('_')[1]) + 1

    def _top_picks(df_window, filter_col, thres, rank_col):
        # Up to 3 highest-ranked rows per date among those above the threshold.
        return (
            df_window
            .loc[lambda df: df[filter_col] > thres]
            .sort_values(rank_col, ascending=False)
            .groupby('date')
            .head(3)
            .reset_index()
        )

    l_rtn_p = []

    for date in dates:

        df_ = df_proba.loc[lambda df: (df.date >= date[2]) & (df.date < date[3])]
        df_ = df_.loc[lambda df: df['weighted_price_change_p_short'] >= df['weighted_price_change_p_mid']]

        # Pool the four lists once; a stock picked by several rules counts once per date.
        df_picks = (
            pd.concat([_top_picks(df_, *rule) for rule in selection_rules])
            .drop_duplicates(subset=['date', 'code'])
        )

        sr = (
            df_picks
            .groupby('date')[rtn_n]
            .mean()
            - 0.0023
        )

        l_rtn_p.append(
            {
                'date': date[3],
                'rtn_p': (sr / n).sum(),
                'days': len(df_picks['date'].unique())
            }
        )

    df_rtn_p = pd.DataFrame(l_rtn_p)
    sr_final = df_rtn_p.set_index('date')['rtn_p']

    # Total number of dates with at least one pick, printed once per trial.
    print(
        df_rtn_p['days'].sum()
    )

    # The index holds date strings, so the last element must be taken by position.
    return (sr_final + 1).cumprod().iloc[-1]


In [123]:
import optuna

In [124]:
# Optuna study run
def run_optuna(name, df_, rtn_n, n_trials=300):
    """Search the four probability thresholds with Optuna and save every trial.

    Parameters
    ----------
    name : str
        Study name; also embedded in the output file name.
    df_ : DataFrame
        Scored out-of-sample rows forwarded to `objective`.
    rtn_n : str
        Return column name forwarded to `objective`, e.g. 'rtn_5'.
    n_trials : int, default 300
        Number of trials to run (300 reproduces the original behaviour).

    Side effects
    ------------
    Writes the trials table to ./study_result_20240217_<name>.pkl with the
    'params_' prefix stripped from its column names. Returns None.
    """
    study = optuna.create_study(
            direction="maximize",
            # direction="minimize",
            study_name=name,
        )

    study.optimize(
        lambda trial : objective(trial, df_, rtn_n),
        n_trials=n_trials,
        show_progress_bar=True,
    )

    # Create study result df & Save to file
    df_study_result = (
        study.trials_dataframe()
        .rename(columns = lambda x : x.replace("params_", ""))
    )
    # The date tag in the file name is fixed because a later cell reads this exact path.
    df_study_result.to_pickle(f"./study_result_20240217_{name}.pkl")

In [125]:
# Run the threshold search for every (target, labelling mode) combination.
for rtn_n in ['rtn_5'] : #, 'rtn_4', 'rtn_5']:
    for tf in [True]: #, False

        if tf:
            type_str = 'fixed'
        else:
            type_str = 'float'

        # Scored OOS rows written by the training cell.
        df_with_proba = pd.read_pickle(f"df_oos_with_proba_{type_str}_{rtn_n}.pkl")
        # No `on=` is given, so the join uses every column the two frames share.
        df_with_proba = df_with_proba.merge(
            df_with_feats_ml[['code','date','weighted_price_change_p_short', 'weighted_price_change_p_mid']]
        )
        df_with_proba = df_with_proba.dropna(
            subset=['weighted_price_change_p_short', 'weighted_price_change_p_mid']
        )

        run_optuna(f"best_combi_of_thres_{type_str}_{rtn_n}", df_with_proba, rtn_n)

[I 2024-02-23 21:46:39,975] A new study created in memory with name: best_combi_of_thres_fixed_rtn_5


  0%|          | 0/300 [00:00<?, ?it/s]

1653
[I 2024-02-23 21:47:05,274] Trial 0 finished with value: 4.578170539284666 and parameters: {'thres_for_max': 0.4, 'thres_for_mean': 0.64, 'thres_mean_max': 0.55, 'thres_mean_mean': 0.6599999999999999}. Best is trial 0 with value: 4.578170539284666.
1661
[I 2024-02-23 21:47:29,720] Trial 1 finished with value: 4.148382302873166 and parameters: {'thres_for_max': 0.81, 'thres_for_mean': 0.8600000000000001, 'thres_mean_max': 0.78, 'thres_mean_mean': 0.32}. Best is trial 0 with value: 4.578170539284666.
1762
[I 2024-02-23 21:47:54,872] Trial 2 finished with value: 4.333217916555996 and parameters: {'thres_for_max': 0.43, 'thres_for_mean': 0.38, 'thres_mean_max': 0.5800000000000001, 'thres_mean_mean': 0.32}. Best is trial 0 with value: 4.578170539284666.
1125
[I 2024-02-23 21:48:20,331] Trial 3 finished with value: 8.592879764855763 and parameters: {'thres_for_max': 0.53, 'thres_for_mean': 0.8999999999999999, 'thres_mean_max': 0.8700000000000001, 'thres_mean_mean': 0.65}. Best is trial 

In [126]:
def get_df_rtn_result(df_with_proba, thres_01, thres_02, thres_03, thres_04, rtn):
    """Per-window return series for one combination of the four thresholds.

    For each window in the module-level `dates`, rows inside [window[2], window[3])
    whose `weighted_price_change_p_short` is at least `weighted_price_change_p_mid`
    are screened by four rules (up to 3 rows per date each):

    - thres_01: proba_max  above threshold, ranked by proba_max
    - thres_02: proba_max  above threshold, ranked by proba_mean
    - thres_03: proba_mean above threshold, ranked by proba_max
    - thres_04: proba_mean above threshold, ranked by proba_mean

    The pooled picks are de-duplicated on (date, code); the per-date mean of `rtn`
    minus 0.0023 is divided by (N + 1), N being the integer after the first
    underscore in `rtn`, and summed per window.

    Returns a Series named 'rtn_p', indexed by each window's end date.
    """
    holding_divisor = int(rtn.split('_')[1]) + 1

    rules = (
        ('proba_max', thres_01, 'proba_max'),
        ('proba_max', thres_02, 'proba_mean'),
        ('proba_mean', thres_03, 'proba_max'),
        ('proba_mean', thres_04, 'proba_mean'),
    )

    records = []

    for window in dates:
        window_start, window_end = window[2], window[3]

        in_window = df_with_proba[
            (df_with_proba.date >= window_start) & (df_with_proba.date < window_end)
        ]
        candidates = in_window[
            in_window['weighted_price_change_p_short'] >= in_window['weighted_price_change_p_mid']
        ]

        per_rule = []
        for filter_col, threshold, rank_col in rules:
            above = candidates[candidates[filter_col] > threshold]
            ranked = above.sort_values(rank_col, ascending=False)
            per_rule.append(ranked.groupby('date').head(3).reset_index())

        unique_picks = pd.concat(per_rule).drop_duplicates(subset=['date', 'code'])

        net_by_date = unique_picks.groupby('date')[rtn].mean() - 0.0023

        records.append(
            {
                'date': window_end,
                'rtn_p': (net_by_date / holding_divisor).sum(),
                'days': len(unique_picks['date'].unique()),
            }
        )

    result = pd.DataFrame(records).set_index('date')
    return result['rtn_p']

In [127]:
from scipy import stats

In [128]:
def get_rsquare(df_with_proba, rtn, row):
    """R-squared of a straight-line fit between cumulative return and step number.

    The per-window returns for the thresholds stored on `row` (attributes
    `thres_for_max`, `thres_for_mean`, `thres_mean_max`, `thres_mean_mean`) are
    obtained from `get_df_rtn_result`, cumulatively summed, and regressed against
    0, 1, 2, ... with `scipy.stats.linregress`. The squared correlation
    coefficient is printed and returned.
    """
    period_returns = get_df_rtn_result(
        df_with_proba,
        row.thres_for_max,
        row.thres_for_mean,
        row.thres_mean_max,
        row.thres_mean_mean,
        rtn,
    )

    cumulative = period_returns.cumsum()
    step_numbers = range(0, len(cumulative))

    fit = stats.linregress(cumulative, step_numbers)
    r_value = fit.rvalue

    print(f"R-squared: {r_value**2}")
    return r_value**2

#### Add R-Squared of Cumsum Series

In [129]:
# Score the 50 best distinct threshold combinations of each study by R-squared.
typeStr = None

for useFixed in [True,]: #, False
    typeStr = 'fixed' if useFixed else 'float'

    for rtn in ['rtn_5',]: # 'rtn_4','rtn_5'
        df_with_proba = (
            pd.read_pickle(f"df_oos_with_proba_{typeStr}_{rtn}.pkl")
            .merge(
                df_with_feats_ml[['code','date','weighted_price_change_p_short', 'weighted_price_change_p_mid']]
            )
            .dropna(subset=['weighted_price_change_p_short', 'weighted_price_change_p_mid'])
        )

        # Trials table saved by run_optuna: keep one row per threshold combination,
        # best objective value first, top 50 only.
        df_study_result = (
            pd.read_pickle(f"study_result_20240217_best_combi_of_thres_{typeStr}_{rtn}.pkl")
            .drop_duplicates(subset=['thres_for_max', 'thres_for_mean', 'thres_mean_max', 'thres_mean_mean'])
            .sort_values("value", ascending=False)
            .head(50)
        )
        # print(df_study_result.columns)

        df_study_result['r_square'] = df_study_result.apply(
            lambda row : get_rsquare(df_with_proba, rtn, row),
            axis=1,
        )

        df_study_result.to_pickle(f"study_result_add_r2_{typeStr}_{rtn}.pkl")
        

R-squared: 0.9701227102749147
R-squared: 0.9701227102749147
R-squared: 0.9699886460863099
R-squared: 0.9701072377649453
R-squared: 0.9701072377649453
R-squared: 0.9698224456127418
R-squared: 0.9699282980153323
R-squared: 0.9697754989091534
R-squared: 0.9696584054001465
R-squared: 0.9694590143729169
R-squared: 0.9704176044271147
R-squared: 0.9704176044271147
R-squared: 0.9704176044271147
R-squared: 0.9704176044271147
R-squared: 0.9704176044271147
R-squared: 0.9704176044271147
R-squared: 0.97040006385207
R-squared: 0.9703595337133984
R-squared: 0.9703595337133984
R-squared: 0.9701408078122722
R-squared: 0.9672278084772971
R-squared: 0.96706607411497
R-squared: 0.9702775030460515
R-squared: 0.9672275213670353
R-squared: 0.9702910578134742
R-squared: 0.9669315235868567
R-squared: 0.9669315235868567
R-squared: 0.9667682364992696
R-squared: 0.9670945906089853
R-squared: 0.9671749364660004
R-squared: 0.9667964305381672
R-squared: 0.9694996234033465
R-squared: 0.97008919192533
R-squared: 0.970

#### Get Best R2 Results

In [130]:
# l_best_combi: one single-row DataFrame per case with the highest-R-squared thresholds.
# l_result:     one per-window return Series per case, computed with those thresholds.
l_result = []
l_best_combi = []

for typeStr in ['fixed']: #, 'float']:

    for rtn in ['rtn_5']:#, 'rtn_4', 'rtn_5']:

        case_name = f"{rtn}_{typeStr}"
        print(case_name)

        df_with_proba = (
            pd.read_pickle(f"df_oos_with_proba_{typeStr}_{rtn}.pkl")
            .merge(
                df_with_feats_ml[['code','date','weighted_price_change_p_short', 'weighted_price_change_p_mid']]
            )
        )
        df_study_result_with_rs = pd.read_pickle(f"study_result_add_r2_{typeStr}_{rtn}.pkl")

        percentile_09 = df_study_result_with_rs['r_square'].quantile(0.9)

        # Best row among those at or above the 90th percentile of R-squared.
        # `.assign` returns a new frame, so no column is written onto a slice.
        max_row = (
            df_study_result_with_rs.loc[lambda df : df['r_square'] >= percentile_09]
            .sort_values('r_square', ascending=False).head(1)
            .assign(case=case_name)
        )

        l_best_combi.append(
            max_row
        )

        max_row = df_study_result_with_rs.sort_values('r_square', ascending=False).head(1)
        print(max_row)
        thres_for_max = max_row['thres_for_max'].iloc[0]
        thres_for_mean = max_row['thres_for_mean'].iloc[0]
        thres_mean_max = max_row['thres_mean_max'].iloc[0]
        thres_mean_mean = max_row['thres_mean_mean'].iloc[0]

        # 'thres_for_max', 'thres_for_mean', 'thres_mean_max', 'thres_mean_mean',

        sr_final = get_df_rtn_result(df_with_proba, thres_for_max, thres_for_mean, thres_mean_max, thres_mean_mean, rtn)
        # The Series name becomes the column label when the results are concatenated.
        sr_final.name = case_name

        l_result.append(sr_final)
        print('\n')

rtn_5_fixed
     number      value             datetime_start          datetime_complete  \
241     241  12.424608 2024-02-23 23:27:37.474785 2024-02-23 23:28:02.537902   

                  duration  thres_for_max  thres_for_mean  thres_mean_max  \
241 0 days 00:00:25.063117           0.49            0.78            0.88   

     thres_mean_mean     state  r_square  
241             0.86  COMPLETE  0.970418  




In [131]:
# Stack the best-threshold rows collected in l_best_combi and pickle them.
(pd.concat(l_best_combi)).to_pickle("best_r2_result.pkl")

In [132]:
# Place the return series in l_result side by side, one column per series.
df_result_opt = pd.concat(l_result, axis=1)

In [133]:
# Save the combined per-window returns to disk.
df_result_opt.to_pickle("df_oos_opt_result_applied.pkl")

In [136]:
# Scatter plot of the running (non-compounded) sum of the 'rtn_5_fixed' returns.
(
    df_result_opt['rtn_5_fixed']
    # + 1
).cumsum().plot(kind='scatter')

In [137]:
# Mean divided by standard deviation of the 'rtn_5_fixed' per-window returns (not annualised).
(df_result_opt['rtn_5_fixed'].mean()) / (df_result_opt['rtn_5_fixed'].std())

0.26279359291637594

In [None]:
# Scatter plot of the compounded growth, i.e. the cumulative product of (1 + return).
# The column must exist in df_result_opt; the cells above only build 'rtn_5_fixed',
# so the previous 'rtn_3_float' reference raised KeyError on a fresh run.
(
    df_result_opt['rtn_5_fixed']
    + 1
).cumprod().plot(kind='scatter')

In [56]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Column to visualise. It must exist in df_result_opt; the cells above only build
# 'rtn_5_fixed', so the previous 'rtn_5_float' value raised KeyError on a fresh run.
col = 'rtn_5_fixed'
x = df_result_opt.index
y1 = df_result_opt[col].cumsum()          # simple (summed) cumulative return
y2 = (df_result_opt[col]+1).cumprod()     # compounded growth of 1 unit

# Create a subplot with 1 row and 2 columns
fig = make_subplots(rows=1, cols=2)

# Left panel: simple cumulative return as a marker scatter
fig.add_trace(
    go.Scatter(x=x, y=y1, mode='markers', name='Simple'),
    row=1, col=1
)

# Right panel: compounded return as a marker scatter
fig.add_trace(
    go.Scatter(x=x, y=y2, mode='markers', name='Complex'),
    row=1, col=2
)

# Optionally adjust layout
fig.update_layout(height=400, width=800,)

# Show the plot
fig.show()


In [50]:
# Check VaR: mean and selected quantiles (min, 5%, median, 95%, max) of the per-window returns.
for rtn in ['rtn_3', 'rtn_4', 'rtn_5']:

    for typeStr in ['fixed']:

        case_col = f'{rtn}_{typeStr}'

        # df_result_opt only holds the cases that were actually optimised above;
        # skip the others instead of failing with KeyError.
        if case_col not in df_result_opt.columns:
            print(f"{case_col} : not in df_result_opt, skipped")
            continue

        print(f"{rtn}_{typeStr}")
        print(
            df_result_opt[case_col].mean()
        )
        print(
            df_result_opt[case_col].quantile([0, 0.05, 0.5, 0.95, 1])
        )

        # cumsum_final = (df_result_opt[f'{rtn}_{typeStr}']+0).cumsum()[-1]
        # cumprod_final = (df_result_opt[f'{rtn}_{typeStr}']+1).cumprod()[-1]

        # annual_cumsum = (cumsum_final / 8.6)*100
        # annual_cumprod = ((cumprod_final)**(1/8.6) - 1)*100

        # print(annual_cumsum)

rtn_3_fixed
0.033933132943604304
0.00   -0.183870
0.05   -0.066981
0.50    0.018598
0.95    0.178004
1.00    0.316937
Name: rtn_3_fixed, dtype: float64
rtn_4_fixed
0.035740915824616505
0.00   -0.165820
0.05   -0.061477
0.50    0.021731
0.95    0.159689
1.00    0.459247
Name: rtn_4_fixed, dtype: float64
rtn_5_fixed
0.03210603745875421
0.00   -0.121034
0.05   -0.072264
0.50    0.014780
0.95    0.158073
1.00    0.504571
Name: rtn_5_fixed, dtype: float64
