#### This Code is the base of the current running Cate ML Model

In [1]:
import pandas as pd
pd.options.plotting.backend = "plotly"

import plotly.express as px
import plotly.graph_objects as go

import numpy as np

from make_new_features_v2 import get_df_with_features
from get_krx_value import get_krx_mean

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

import warnings
warnings.filterwarnings(action='ignore', category=FutureWarning)

from xgboost import plot_importance
import matplotlib.pyplot as plt

#### Preparing Price Dataset

In [2]:
SHORT_PERIOD=5
MID_PERIOD=20
LONG_PERIOD=60

# Single definition of the feature-cache location; it is both read and written below.
FEATS_CACHE_PATH = f"./df_with_feats_f_20130101_t_20230919_{SHORT_PERIOD}_{MID_PERIOD}_{LONG_PERIOD}_new_feats.pkl"

try:
    df_with_feats = pd.read_pickle(FEATS_CACHE_PATH)
    # df_with_feats['date'] = df_with_feats['date'].dt.strftime("%Y-%m-%d")
except Exception as cache_err:
    # `except Exception` (not a bare `except`) so that Ctrl-C / SystemExit during the
    # slow read is not mistaken for a cache miss. Any read failure still falls back to
    # a rebuild, but the reason is now reported instead of being silently swallowed.
    print(f"Feature cache not usable ({cache_err!r}); rebuilding features from price data.")

    # Keep only codes whose 6th character is "0".
    # NOTE(review): the original comment describes this as "Normal Stocks, Not SPAC" — confirm.
    # NOTE(review): df_price is only defined on this rebuild path; later cells that
    # reference df_price fail on a fresh kernel when the cache is hit.
    df_price = (
        pd.read_pickle("./df_price_price_only_f_20130101_t_20230919.pkl")
        .loc[lambda df : df["code"].str[5]== "0"]
        # .loc[lambda df : df.code == '000020']
        # .loc[lambda df : ~df["name"].str.contains("스펙")]
        # .loc[lambda df : ~df["name"].str.contains("스팩")]
    )

    # Dates are kept as "YYYY-MM-DD" strings; the window filters further down compare strings.
    df_price['date'] = df_price['date'].dt.strftime("%Y-%m-%d")

    try:
        df_krx = pd.read_pickle("./df_krx.pkl")
    except Exception as krx_err:
        print(f"KRX cache not usable ({krx_err!r}); fetching it again.")
        df_krx = get_krx_mean()
        df_krx.to_pickle("./df_krx.pkl")

    # Inner join on date: price rows without a matching df_krx row are dropped.
    df_price = df_price.merge(
        df_krx,
        on='date'
    )

    df_with_feats = get_df_with_features(
        df_price, SHORT_PERIOD=SHORT_PERIOD, MID_PERIOD=MID_PERIOD, LONG_PERIOD=LONG_PERIOD)
    df_with_feats.to_pickle(FEATS_CACHE_PATH)

In [30]:
df_with_feats.columns

Index(['date', 'open', 'high', 'low', 'close', 'volume', 'change', 'code',
       'name', 'marcap', 'kospi', 'kosdaq', 'ratio_close_1d_ago',
       'ratio_open_1d_ago', 'ratio_high_1d_ago', 'ratio_low_1d_ago',
       'ratio_vol_1d_ago', 'ratio_close_2d_ago', 'ratio_open_2d_ago',
       'ratio_high_2d_ago', 'ratio_low_2d_ago', 'ratio_vol_2d_ago',
       'ratio_close_3d_ago', 'ratio_open_3d_ago', 'ratio_high_3d_ago',
       'ratio_low_3d_ago', 'ratio_vol_3d_ago', 'ratio_close_4d_ago',
       'ratio_open_4d_ago', 'ratio_high_4d_ago', 'ratio_low_4d_ago',
       'ratio_vol_4d_ago', 'ratio_close_5d_ago', 'ratio_open_5d_ago',
       'ratio_high_5d_ago', 'ratio_low_5d_ago', 'ratio_vol_5d_ago',
       'ratio_close_6d_ago', 'ratio_open_6d_ago', 'ratio_high_6d_ago',
       'ratio_low_6d_ago', 'ratio_vol_6d_ago', 'ratio_close_7d_ago',
       'ratio_open_7d_ago', 'ratio_high_7d_ago', 'ratio_low_7d_ago',
       'ratio_vol_7d_ago', 'ratio_close_8d_ago', 'ratio_open_8d_ago',
       'ratio_high_8d_ago'

## ML

In [12]:
import xgboost as xgb
from sklearn.model_selection import train_test_split

In [36]:
# Modelling universe: one combined row mask instead of a chain of .loc filters.
#   * code whose 6th character is '0'
#   * no zero-volume day counted in the short window (vol_zero_count_short == 0)
#   * daily change below 0.29
#   * positive volume on the day itself
#
# Experimental filters that are currently switched off:
#   df.code == "068270"
#   ~df.name.str.contains("스펙") / ~df.name.str.contains("스팩")
#   0.75e+08 < df["vol_x_price_sma_mid"] < 3.5e+08
#   df['close_std_long'] > 0.03
#   1500e+08 < df.marcap < 8000e+08
#   df.close > 1000
#   derived columns vol_x_price_sma_long_to_mid / vol_x_price_sma_mid_to_short
df_with_feats_ml = df_with_feats.loc[
    lambda df : (
        (df.code.str[5] == '0')
        & (df["vol_zero_count_short"] == 0)
        & (df["change"] < 0.29)
        & (df["volume"] > 0)
    )
]

In [37]:
# # All Available Features
# for col in df_with_feats_ml.columns:
#     print(f"'{col}',")

#### EDA & Selecting Features

In [68]:
# Model inputs: for each lag of 1..9 days the close/open/high/low/vol ratio columns
# (in that order), followed by five pivot / moving-average ratio columns.
feats = [
    f'ratio_{field}_{lag}d_ago'
    for lag in range(1, 10)
    for field in ('close', 'open', 'high', 'low', 'vol')
] + [
    'delta_w_price_sma_short_pivot',
    'delta_w_price_mid_short_pivot',
    'delta_w_price_long_short_pivot',
    'sma_mid_by_short',
    'sma_long_by_short',
]

In [39]:
# for feat in feats :
#     q_low, q_high = df_with_feats_ml[feat].quantile([0.01, 0.99])
#     df_with_feats_ml = df_with_feats_ml.loc[lambda df : df[feat] > q_low].loc[lambda df : df[feat] < q_high]    

In [20]:
# corr_matrix = df_with_feats_ml[feats].corr()

# upper = corr_matrix.where(~np.tril(np.ones(corr_matrix.shape)).astype(bool))

# # Find the pairs where the absolute value of correlation is greater than 0.8
# high_corr_pairs = [(column, index) for column, rowIndex in zip(upper.columns, upper.index) 
#                    for index, value in upper[column].items() if abs(value) > 0.8]

# # Print the high correlation pairs
# for pair in high_corr_pairs:
#     print(pair)

In [40]:
# # For fixed length train dataset

# import exchange_calendars as xcals
# krx_cal = xcals.get_calendar("XKRX")

# max_date = '2023-09-19'
# # max_date = '2024-01-19'
# start_date = '2015-01-02'

# finish = False
# dates = []

# len_of_train = 200
# gap_from_last_train_date = 7 # This number depends on the target ( rtn_5 -> 7, rtn_20 -> 22)
# len_of_pred = 10 # The length of pred for 1 model update

# while not finish :
#     train_end = krx_cal.sessions_window(start_date, len_of_train)[-1].strftime("%Y-%m-%d")
#     oos_start = krx_cal.sessions_window(train_end, gap_from_last_train_date)[-1].strftime("%Y-%m-%d")
#     oos_end = krx_cal.sessions_window(oos_start, len_of_pred)[-1].strftime("%Y-%m-%d")

#     if oos_end > max_date :
#         dates.append(
#             (start_date, train_end, oos_start, max_date)
#         )
#         finish = True

#     else :
#         dates.append(
#             (start_date, train_end, oos_start, oos_end)
#         )

#         start_date = krx_cal.sessions_window(start_date, len_of_pred)[-1].strftime("%Y-%m-%d")

In [54]:
# Walk-forward schedule with a fixed-length training span.
# Every entry of `dates` is a 6-tuple of "YYYY-MM-DD" strings:
#   (train_start, train_end, valid_start, valid_end, oos_start, oos_end)

import exchange_calendars as xcals
krx_cal = xcals.get_calendar("XKRX")

max_date = '2023-09-19'
# max_date = '2024-01-19'
start_date = '2015-01-02'

len_of_train = 200
gap_from_last_train_date = 7 # This number depends on the target ( rtn_5 -> 7, rtn_20 -> 22)
len_of_valid = 3
len_of_pred = 10 # The length of pred for 1 model update


def _last_session_of_window(anchor, n_sessions):
    """Return, as "YYYY-MM-DD", the last entry of krx_cal.sessions_window(anchor, n_sessions)."""
    return krx_cal.sessions_window(anchor, n_sessions)[-1].strftime("%Y-%m-%d")


dates = []
finish = False

while not finish :
    train_end = _last_session_of_window(start_date, len_of_train)

    valid_start = _last_session_of_window(train_end, gap_from_last_train_date)
    valid_end = _last_session_of_window(valid_start, len_of_valid)

    # A one-session window, so oos_start coincides with valid_end.
    oos_start = _last_session_of_window(valid_end, 1)
    oos_end = _last_session_of_window(oos_start, len_of_pred)

    # The first window whose OOS end runs past max_date is clipped to max_date and ends the loop.
    finish = oos_end > max_date

    dates.append(
        (start_date, train_end, valid_start, valid_end, oos_start, max_date if finish else oos_end)
    )

    if not finish :
        # Slide the whole schedule forward for the next model refresh.
        start_date = _last_session_of_window(start_date, len_of_pred)

In [55]:
dates[-2]

('2022-11-01',
 '2023-08-18',
 '2023-08-28',
 '2023-08-30',
 '2023-08-30',
 '2023-09-12')

#### Create df_oos_with_proba

In [41]:
def calc_final_rtn(df, rtn_col='rtn_5', max_rtn_col='max_rtn_in_5days', take_profit=0.05):
    """Return the realised return of one row under a take-profit rule.

    If the row's ``max_rtn_col`` value is strictly greater than ``take_profit``,
    the return is capped at ``take_profit``; otherwise the row's ``rtn_col``
    value is returned unchanged. A NaN ``max_rtn_col`` compares false and
    therefore also falls through to ``rtn_col``.

    Parameters
    ----------
    df : mapping-like row (e.g. a ``pd.Series`` passed by ``DataFrame.apply(axis=1)``)
    rtn_col : key of the holding-period return (default ``'rtn_5'``)
    max_rtn_col : key of the maximum return seen in the holding period
        (default ``'max_rtn_in_5days'``)
    take_profit : level at which the position is assumed to be closed (default ``0.05``)

    The defaults reproduce the previously hard-coded behaviour, so
    ``df.apply(calc_final_rtn, axis=1)`` is unchanged.
    """
    if df[max_rtn_col] > take_profit:
        return take_profit
    return df[rtn_col]

In [80]:
def run_ml_multi_seed_with_valid(features, idx0, idx1, fixed, target):
    """Walk-forward back-test in which a validation window gates the traded codes.

    For every 6-tuple ``(train_start, train_end, valid_start, valid_end,
    oos_start, oos_end)`` in the module-level ``dates[idx0:idx1]``:

    1. slice ``df_with_feats_ml`` into train / validation / OOS frames using
       half-open date ranges ``[start, end)`` and drop rows with missing or
       infinite feature values;
    2. label training rows 1 when ``target`` exceeds the threshold, else 0;
    3. train three XGBoost classifiers, each on a different random 75/25
       train/eval split of the training frame, and score the validation and
       OOS frames with each;
    4. keep the codes that, in the validation frame, had ``proba_mean > 0.3``
       and ``target > 0.03``;
    5. in the OOS frame take, per date, the five highest-``proba_mean`` rows
       among those codes with ``proba_max > 0.5`` and average their ``target``.

    Parameters
    ----------
    features : list of feature column names fed to the model.
    idx0, idx1 : slice bounds into the module-level ``dates`` list.
    fixed : if truthy the positive-label threshold is 0.03, otherwise it is the
        0.8 quantile of ``target`` in the training frame.
    target : name of the forward-return column, e.g. ``'rtn_3'``; the integer
        after the last ``_`` plus one is used as the divisor of the window return.

    Returns
    -------
    (rtn_in_period, l_df) : a list with one result dict per window, and the
        list of scored OOS frames (only for windows with at least one OOS row).

    NOTE(review): the validation filter uses the realised ``target`` of
    validation rows; whether those forward returns are already known at
    ``oos_start`` depends on the target horizon — confirm there is no look-ahead.
    """
    feats = features

    THRES_OF_PROBA = 0.5

    # Divisor of the summed per-date returns of one window (e.g. 'rtn_3' -> 4).
    rtn_divisor = int(target.split('_')[-1]) + 1

    # Booster settings do not depend on the seed or the window, so build them once.
    params = {
        'max_depth': 5, # Adjust based on your dataset : originally 5 --> try 8
        'eta': 0.05,     # Learning rate : originally 0.05
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',  # Or use 'auc', 'error', etc. based on your problem
        'random_state': 42,
    }

    def _slice_and_clean(start, end):
        # Rows with start <= date < end, without inf/NaN in any feature, re-indexed from 0.
        return (
            df_with_feats_ml
            .loc[lambda df : df.date >= start]
            .loc[lambda df : df.date < end]
            .replace([-np.inf, np.inf], np.nan)
            .dropna(subset=feats)
            .reset_index(drop=True)
        )

    rtn_in_period = []
    l_df = []

    if fixed:
        print("fixed")
    else :
        print("float")

    plus = 0
    minus = 0
    total = 0
    rtn_p_sum = 0

    for train_start, train_end, valid_start, valid_end, oos_start, oos_end in dates[idx0 : idx1]:

        df_train_set_ = _slice_and_clean(train_start, train_end)
        df_valid_set_ = _slice_and_clean(valid_start, valid_end)
        df_oos_set_ = _slice_and_clean(oos_start, oos_end)

        if fixed:
            THRES_OF_TRUE = 0.03
        else :
            THRES_OF_TRUE = df_train_set_[target].quantile(0.8)

        # Binary label; a NaN target compares false and is therefore labelled 0.
        df_train_set_["target"] = (df_train_set_[target] > THRES_OF_TRUE).astype(int)

        print(
            df_train_set_["target"].value_counts(normalize=True)
        )

        if df_oos_set_.shape[0] > 0:

            feature_df = df_train_set_[feats]
            target_df = df_train_set_["target"]

            # Built only when there is something to predict.
            dvalid_oos = xgb.DMatrix(df_valid_set_[feats])
            doos = xgb.DMatrix(df_oos_set_[feats])

            col_list = []
            for model_idx, seed in enumerate([6, 12, 24]): #, 13, 5, 2

                # The seed only changes which training rows are held out for early stopping.
                X_train, X_valid, y_train, y_valid = train_test_split(
                    feature_df, target_df, test_size=0.25, random_state=seed
                )

                dtrain = xgb.DMatrix(X_train, label=y_train)
                dvalid = xgb.DMatrix(X_valid, label=y_valid)

                model = xgb.train(
                    params, dtrain,
                    1000,
                    evals=[(dtrain, 'train'), (dvalid, 'eval')],
                    early_stopping_rounds=50,
                    verbose_eval=False
                )

                proba_col = f"pred_proba_{model_idx}"
                df_valid_set_[proba_col] = model.predict(dvalid_oos)
                df_oos_set_[proba_col] = model.predict(doos)
                col_list.append(proba_col)

            # Fix: validation aggregates must come from the validation predictions.
            # They were previously computed from df_oos_set_, which assigned
            # index-aligned OOS probabilities to unrelated validation rows.
            df_valid_set_["proba_mean"] = df_valid_set_[col_list].mean(axis=1)
            df_valid_set_["proba_max"] = df_valid_set_[col_list].max(axis=1)

            df_oos_set_["proba_mean"] = df_oos_set_[col_list].mean(axis=1)
            df_oos_set_["proba_max"] = df_oos_set_[col_list].max(axis=1)

            # Codes the ensemble both liked and that did pay off in the validation window.
            codes_to_buy = (
                df_valid_set_
                .loc[lambda df : df['proba_mean'] > 0.3]
                .loc[lambda df : df[target] > 0.03]
                ['code']
                .unique()
            )

            print(len(codes_to_buy))

            l_df.append(df_oos_set_)

            # Per OOS date: mean target of the (up to) five best-ranked eligible rows,
            # minus a fixed 0.0023 deduction (presumably trading cost — confirm).
            sr_selected_mean_mean = (
                df_oos_set_
                .loc[lambda df : df['proba_max'] > THRES_OF_PROBA]
                .loc[lambda df : df['code'].isin(codes_to_buy)]
                .sort_values('proba_mean', ascending=False)
                .groupby('date')
                .head(5)
                .reset_index()
                .groupby('date')[target]
                .mean()
                -0.0023
            )

            rtn_p = sr_selected_mean_mean.sum()/rtn_divisor

            total += 1
            if rtn_p > 0:
                plus += 1
            elif rtn_p < 0:
                minus += 1

            rtn_p_sum += rtn_p

            print(f"rnt_p : {round(rtn_p, 3)} / plus_ratio : {round(plus/total, 2)} / minus_ratio : {round(minus/total, 2)} / total : {total}")

            print(round(rtn_p_sum, 3))

            rtn_in_period.append(
                {
                    'date' : oos_end,
                    "return_mean_mean" : rtn_p,
                    'date_n' : sr_selected_mean_mean.shape[0]
                }
            )

        else :
            # No usable OOS rows in this window: record a flat period.
            rtn_in_period.append(
                {
                    'date' : oos_end,
                    "return_mean_mean" : 0,
                    'return_mean_max' : 0,
                    'return_max_mean' : 0,
                    'return_max_max' : 0,
                    'date_n' : 0
                }
            )

    return rtn_in_period, l_df

In [81]:
def run_ml_multi_seed(features, idx0, idx1, fixed, target):
    """Walk-forward back-test without a validation gate.

    For every window in the module-level ``dates[idx0:idx1]`` the training
    frame is sliced from ``df_with_feats_ml``, three XGBoost classifiers are
    trained on different random 75/25 train/eval splits, and the OOS frame is
    scored with each. Per OOS date the five highest-``proba_max`` rows are
    taken and their ``final_rtn`` (see ``calc_final_rtn``) is averaged.

    Window tuples may be either ``(train_start, train_end, oos_start, oos_end)``
    or the 6-tuple form that also carries ``valid_start, valid_end`` in the
    middle; only the first two and last two entries are used.

    Parameters
    ----------
    features : list of feature column names fed to the model.
    idx0, idx1 : slice bounds into the module-level ``dates`` list.
    fixed : if truthy the positive-label threshold is 0.03, otherwise it is the
        0.8 quantile of ``target`` in the training frame.
    target : name of the forward-return column used for labelling, e.g.
        ``'rtn_3'``; the integer after the last ``_`` plus one is used as the
        divisor of the window return.

    Returns
    -------
    (rtn_in_period, l_df) : a list with one result dict per window, and the
        list of scored OOS frames (only for windows with at least one OOS row).

    NOTE(review): ``calc_final_rtn`` is applied with its defaults, so the
    reported return is based on ``rtn_5`` / ``max_rtn_in_5days`` whatever
    ``target`` is — confirm this is intended.
    """
    feats = features

    THRES_OF_PROBA = 0.5

    # Divisor of the summed per-date returns of one window (e.g. 'rtn_3' -> 4).
    rtn_divisor = int(target.split('_')[-1]) + 1

    # Booster settings do not depend on the seed or the window, so build them once.
    params = {
        'max_depth': 5, # Adjust based on your dataset : originally 5 --> try 8
        'eta': 0.05,     # Learning rate : originally 0.05
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',  # Or use 'auc', 'error', etc. based on your problem
        'random_state': 42,
    }

    def _slice_and_clean(start, end):
        # Rows with start <= date < end, without inf/NaN in any feature, re-indexed from 0.
        return (
            df_with_feats_ml
            .loc[lambda df : df.date >= start]
            .loc[lambda df : df.date < end]
            .replace([-np.inf, np.inf], np.nan)
            .dropna(subset=feats)
            .reset_index(drop=True)
        )

    rtn_in_period = []
    l_df = []

    if fixed:
        print("fixed")
    else :
        print("float")

    plus = 0
    minus = 0
    total = 0
    rtn_p_sum = 0

    # Fix: star-unpacking accepts both the 4-tuple schedule and the 6-tuple schedule
    # built by the active `dates` cell; a plain 4-name unpack raises ValueError on the latter.
    for train_start, train_end, *_, oos_start, oos_end in dates[idx0 : idx1]:

        df_train_set_ = _slice_and_clean(train_start, train_end)
        df_oos_set_ = _slice_and_clean(oos_start, oos_end)

        if fixed:
            THRES_OF_TRUE = 0.03
        else :
            THRES_OF_TRUE = df_train_set_[target].quantile(0.8)

        # Binary label; a NaN target compares false and is therefore labelled 0.
        df_train_set_["target"] = (df_train_set_[target] > THRES_OF_TRUE).astype(int)

        print(
            df_train_set_["target"].value_counts(normalize=True)
        )

        if df_oos_set_.shape[0] > 0:

            feature_df = df_train_set_[feats]
            target_df = df_train_set_["target"]

            # Built only when there is something to predict.
            doos = xgb.DMatrix(df_oos_set_[feats])

            col_list = []
            for model_idx, seed in enumerate([6, 12, 24]): #, 13, 5, 2

                # The seed only changes which training rows are held out for early stopping.
                X_train, X_valid, y_train, y_valid = train_test_split(
                    feature_df, target_df, test_size=0.25, random_state=seed
                )

                dtrain = xgb.DMatrix(X_train, label=y_train)
                dvalid = xgb.DMatrix(X_valid, label=y_valid)

                model = xgb.train(
                    params, dtrain,
                    2000,
                    evals=[(dtrain, 'train'), (dvalid, 'eval')],
                    early_stopping_rounds=50,
                    verbose_eval=False
                )

                proba_col = f"pred_proba_{model_idx}"
                df_oos_set_[proba_col] = model.predict(doos)
                col_list.append(proba_col)

            df_oos_set_["proba_mean"] = df_oos_set_[col_list].mean(axis=1)
            df_oos_set_["proba_max"] = df_oos_set_[col_list].max(axis=1)

            # The same frame object is stored and then extended with 'final_rtn' below,
            # so the stored frame carries that column too.
            l_df.append(df_oos_set_)

            df_oos_set_['final_rtn'] = df_oos_set_.apply(calc_final_rtn, axis=1)
            print(df_oos_set_.loc[lambda df : df['proba_mean'] > THRES_OF_PROBA][['code', 'date', 'close', 'final_rtn', 'proba_mean']])

            # Per OOS date: mean final_rtn of the five highest-proba_max rows,
            # minus a fixed 0.0023 deduction (presumably trading cost — confirm).
            sr_selected_mean_mean = (
                df_oos_set_
                # .loc[lambda df : df['proba_mean'] > THRES_OF_PROBA]
                .sort_values('proba_max', ascending=False)
                .groupby('date')
                .head(5)
                .reset_index()
                .groupby('date')['final_rtn']
                .mean()
                -0.0023
            )

            rtn_p = sr_selected_mean_mean.sum()/rtn_divisor

            total += 1
            if rtn_p > 0:
                plus += 1
            elif rtn_p < 0:
                minus += 1

            rtn_p_sum += rtn_p

            print(f"rnt_p : {round(rtn_p, 3)} / plus_ratio : {round(plus/total, 2)} / minus_ratio : {round(minus/total, 2)} / total : {total}")

            print(round(rtn_p_sum, 3))

            rtn_in_period.append(
                {
                    'date' : oos_end,
                    "return_mean_mean" : rtn_p,
                    'date_n' : sr_selected_mean_mean.shape[0]
                }
            )

        else :
            # No usable OOS rows in this window: record a flat period.
            rtn_in_period.append(
                {
                    'date' : oos_end,
                    "return_mean_mean" : 0,
                    'return_mean_max' : 0,
                    'return_max_mean' : 0,
                    'return_max_max' : 0,
                    'date_n' : 0
                }
            )

    return rtn_in_period, l_df

In [None]:
# Run the walk-forward back-test for each (feature list, target) pair, persist the
# scored OOS rows, and print summary statistics of the per-window returns.
for feat_var, target_var in [(feats, 'rtn_3'), ]: #, (feats, 'rtn_4'), (feats, 'rtn_5')

    feats = feat_var
    target = target_var

    for tf in [True]: # True,

        print(target)

        # idx1 = -1: the last entry of `dates` is not evaluated.
        # oos_test_result, dfs = run_ml_multi_seed(feats, 0, -1, tf, target)
        oos_test_result, dfs = run_ml_multi_seed_with_valid(feats, 0, -1, tf, target)
        df_oos_result = pd.DataFrame(oos_test_result).set_index('date')

        if tf:
            df_oos_with_proba_fixed = pd.concat(dfs)
            df_oos_with_proba_fixed.to_pickle(f"df_oos_with_proba_fixed_{target}_rm_hl.pkl")
        else:
            df_oos_with_proba_float = pd.concat(dfs)
            df_oos_with_proba_float.to_pickle(f"df_oos_with_proba_float_{target}_rm_hl.pkl")

        for col in df_oos_result.columns:
            sr = (df_oos_result[col]+1)
            sr_org = df_oos_result[col]
            sr_cumprod = sr.cumprod()

            print(col)
            # .iloc[-1]: the index holds date strings, so `[-1]` relied on the
            # deprecated positional fallback of Series.__getitem__.
            print(f"Prod : {round(sr_cumprod.iloc[-1],2)}")
            print(f"Prod Max : {round(sr_cumprod.max(),2)}")
            print(f"Prod Min : {round(sr_cumprod.min(),2)}")
            print(f"min : {round(sr_org.min(),3)}")
            print(f"std : {round(sr_org.std(),3)}")
            print(f"mean : {round(sr_org.mean(),3)}")
            print('\n')


In [4]:
# v_p: equal-weight (0.25 each) average of close/open/high/low multiplied by volume.
# NOTE: this adds a column to df_with_feats in place.
df_with_feats["v_p"] = (df_with_feats['close']*0.25 + df_with_feats['open']*0.25 + df_with_feats['high']*0.25 + df_with_feats['low']*0.25) * df_with_feats['volume']

In [88]:
# NOTE(review): df_price is only assigned on the cache-miss path of the dataset cell,
# so this fails on a fresh kernel when the feature cache is read successfully — confirm.
df_price.head()

Unnamed: 0,date,open,high,low,close,volume,change,code,name,marcap,kospi,kosdaq,v_p
0,2013-01-02,30660,31519,30540,31520,229274,0.03548,5930,삼성전자,416690821990000,2031.1,501.61,7121193000.0
1,2013-01-03,31640,31680,30860,30860,284927,-0.020939,5930,삼성전자,416690821990000,2019.41,499.07,8906818000.0
2,2013-01-04,30800,30840,30199,30500,260120,-0.011666,5930,삼성전자,416690821990000,2011.94,504.84,7955705000.0
3,2013-01-07,30300,30560,29999,30400,252436,-0.003279,5930,삼성전자,416690821990000,2011.25,508.72,7652534000.0
4,2013-01-08,30260,30340,29960,30000,276757,-0.013158,5930,삼성전자,416690821990000,1997.94,509.01,8341456000.0


In [12]:
# Plot v_p over time for code '041020' (requires 'date' to still be a column).
(
    df_with_feats
    .loc[lambda df : df.code == '041020']    
    .set_index('date')
    ['v_p']
    # .cumsum()
).plot()

In [13]:
# Same series as the previous plot, smoothed with a 5-row rolling mean.
(
    df_with_feats
    .loc[lambda df : df.code == '041020']        
    .set_index('date')
    ['v_p']
    .rolling(5).mean()
).plot()

In [16]:
# Move 'date' into the index for the plots below. Guarded so the cell is idempotent:
# the unguarded in-place set_index raised KeyError when the cell was run a second time.
# NOTE: once 'date' is the index, cells that read df_with_feats as a column
# (e.g. .set_index('date') or df.date) no longer work without reloading the frame.
if 'date' in df_with_feats.columns:
    df_with_feats = df_with_feats.set_index('date')

In [17]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Two stacked panels sharing the x axis: v_p on top, close underneath, for code '041020'.
# Expects 'date' to already be the index of df_with_feats.
fig = make_subplots(rows=2, cols=1, shared_xaxes=True)
df_ = df_with_feats.loc[lambda df : df.code == '041020']

x = df_.index
y1 = df_['v_p']
y2 = df_['close']

# One line trace per row (both are Scatter traces).
for row_no, y_values in enumerate([y1, y2], start=1):
    fig.add_trace(
        go.Scatter(x=x, y=y_values),
        row=row_no, col=1
    )

fig.update_layout(height=900, width=1800, title_text="1 Column, 2 Rows Plot with Plotly")

fig.show()

In [None]:
# Compounded growth of the per-window 'return_mean_mean' series from the back-test driver cell.
(df_oos_result['return_mean_mean']+1).cumprod().plot()

#### Get Best Thresholds for Proba

In [None]:
def objective(trial, df_proba, rtn_n):
    """Optuna objective: compounded return of a four-rule stock-picking scheme.

    Four thresholds (each searched in 0.3..0.9, step 0.01) drive four rankings
    of the OOS rows inside every window of the module-level ``dates``:

    ========================  ==============  ============
    parameter                 filter          ranked by
    ========================  ==============  ============
    ``thres_for_max``         proba_max  >    proba_max
    ``thres_for_mean``        proba_max  >    proba_mean
    ``thres_mean_max``        proba_mean >    proba_max
    ``thres_mean_mean``       proba_mean >    proba_mean
    ========================  ==============  ============

    Each ranking contributes its top three rows per date; the union (unique by
    date and code) is averaged per date on ``rtn_n``, reduced by a fixed 0.0023
    (presumably trading cost — confirm), divided by ``horizon + 1`` and summed
    to one return per window.

    Parameters
    ----------
    trial : optuna trial (only ``suggest_float`` is used).
    df_proba : scored OOS rows with at least ``date``, ``code``, ``proba_max``,
        ``proba_mean``, ``weighted_price_change_p_short``,
        ``weighted_price_change_p_mid`` and the ``rtn_n`` column.
    rtn_n : return column name such as ``'rtn_3'``.

    Returns
    -------
    The final value of the cumulative product of ``1 + window return``.
    Also prints the total number of dates on which anything was selected.
    """
    thres01 = trial.suggest_float("thres_for_max", 0.3, 0.9, step=0.01)
    thres02 = trial.suggest_float("thres_for_mean", 0.3, 0.9, step=0.01)
    thres03 = trial.suggest_float("thres_mean_max", 0.3, 0.9, step=0.01)
    thres04 = trial.suggest_float("thres_mean_mean", 0.3, 0.9, step=0.01)

    # Loop-invariant divisor (e.g. 'rtn_3' -> 4).
    n = int(rtn_n.split('_')[1]) + 1

    def _top3_per_date(df, filter_col, thres, rank_col):
        # Rows with filter_col above thres; the three best rank_col values per date.
        return (
            df
            .loc[lambda d : d[filter_col] > thres]
            .sort_values(rank_col, ascending=False)
            .groupby('date')
            .head(3)
            .reset_index()
        )

    l_rtn_p = []

    for date in dates:

        # Fix: the OOS bounds are the LAST two entries of a window tuple. With the
        # 6-tuple schedule, positions 2 and 3 are valid_start / valid_end, so the
        # previous date[2] / date[3] scored the validation window instead.
        # Negative indexing works for the 4-tuple schedule as well.
        oos_start, oos_end = date[-2], date[-1]

        df_ = df_proba.loc[lambda df : df.date >= oos_start].loc[lambda df : df.date < oos_end]
        # Keep rows whose short-window weighted price change is at least the mid-window one.
        df_ = df_.loc[lambda df : df['weighted_price_change_p_short'] >= df['weighted_price_change_p_mid']]

        # Union of the four rankings, built once and reused for return and day count.
        df_picks = pd.concat(
            [
                _top3_per_date(df_, 'proba_max', thres01, 'proba_max'),
                _top3_per_date(df_, 'proba_max', thres02, 'proba_mean'),
                _top3_per_date(df_, 'proba_mean', thres03, 'proba_max'),
                _top3_per_date(df_, 'proba_mean', thres04, 'proba_mean'),
            ]
        ).drop_duplicates(subset=['date','code'])

        sr = (
            df_picks
            .groupby('date')[rtn_n]
            .mean()
            -0.0023
        )

        l_rtn_p.append(
            {
                'date' : oos_end,
                'rtn_p' : (sr/n).sum(),
                'days' : len(df_picks['date'].unique())
            }
        )

    df_rtn_p = pd.DataFrame(l_rtn_p)

    sr_final = df_rtn_p.set_index('date')['rtn_p']

    print(
        df_rtn_p['days'].sum()
    )

    # .iloc[-1]: the index holds date strings, so `[-1]` relied on the deprecated
    # positional fallback of Series.__getitem__.
    return (sr_final+1).cumprod().iloc[-1]


In [None]:
import optuna

In [None]:
# Optuna study run
def run_optuna(name, df_, rtn_n):
    """Maximise ``objective`` over 300 trials and pickle the trial table.

    ``name`` is used both as the study name and in the output file name
    ``./study_result_20240217_<name>.pkl``. ``df_`` and ``rtn_n`` are passed
    straight through to ``objective``. In the saved table the ``params_``
    text is stripped from the column names. Nothing is returned.
    """
    def _score(trial):
        return objective(trial, df_, rtn_n)

    study = optuna.create_study(direction="maximize", study_name=name)
    study.optimize(_score, n_trials=300, show_progress_bar=True)

    # Trial table with the "params_" text removed from every column name.
    trials_table = study.trials_dataframe()
    df_study_result = trials_table.rename(columns=lambda col : col.replace("params_", ""))

    df_study_result.to_pickle(f"./study_result_20240217_{name}.pkl")

In [None]:
# Threshold search per target horizon and labelling mode.
# NOTE(review): the back-test cell writes "df_oos_with_proba_<type>_<target>_rm_hl.pkl",
# while this cell reads the name without the "_rm_hl" suffix — confirm the intended file.
for rtn_n in ['rtn_3', 'rtn_4',]: #  'rtn_5'
    for tf in [ False]: # True,

        if tf:
            type_str = 'fixed'
        else:
            type_str = 'float'

        df_oos_scored = pd.read_pickle(f"df_oos_with_proba_{type_str}_{rtn_n}.pkl")
        df_price_change = df_with_feats_ml[['code','date','weighted_price_change_p_short', 'weighted_price_change_p_mid']]

        # merge() without `on` joins on every column name the two frames share.
        df_with_proba = (
            df_oos_scored
            .merge(df_price_change)
            .dropna(subset=['weighted_price_change_p_short', 'weighted_price_change_p_mid'])
        )

        run_optuna(f"best_combi_of_thres_{type_str}_{rtn_n}", df_with_proba, rtn_n)

In [None]:
def get_df_rtn_result(df_with_proba, thres_01, thres_02, thres_03, thres_04, rtn):
    """Per-window return series of the four-rule selection for fixed thresholds.

    For every window of the module-level ``dates`` the OOS rows of
    ``df_with_proba`` are restricted to those whose
    ``weighted_price_change_p_short`` is at least ``weighted_price_change_p_mid``,
    then four rankings each contribute their top three rows per date:

    * ``proba_max  > thres_01`` ranked by ``proba_max``
    * ``proba_max  > thres_02`` ranked by ``proba_mean``
    * ``proba_mean > thres_03`` ranked by ``proba_max``
    * ``proba_mean > thres_04`` ranked by ``proba_mean``

    The union (unique by date and code) is averaged per date on ``rtn``,
    reduced by a fixed 0.0023 (presumably trading cost — confirm), divided by
    ``horizon + 1`` and summed.

    Returns a ``pd.Series`` named ``'rtn_p'`` indexed by each window's OOS end date.
    """
    n = int(rtn.split('_')[1])+1

    def _top3_per_date(df, filter_col, thres, rank_col):
        # Rows with filter_col above thres; the three best rank_col values per date.
        return (
            df
            .loc[lambda d : d[filter_col] > thres]
            .sort_values(rank_col, ascending=False)
            .groupby('date')
            .head(3)
            .reset_index()
        )

    l_rtn_p = []

    for date in dates:

        # Fix: the OOS bounds are the LAST two entries of a window tuple. With the
        # 6-tuple schedule, positions 2 and 3 are valid_start / valid_end, so the
        # previous date[2] / date[3] scored the validation window instead.
        # Negative indexing works for the 4-tuple schedule as well.
        oos_start, oos_end = date[-2], date[-1]

        df_ = df_with_proba.loc[lambda df : df.date >= oos_start].loc[lambda df : df.date < oos_end]
        df_ = df_.loc[lambda df : df['weighted_price_change_p_short'] >= df['weighted_price_change_p_mid']]

        # Union of the four rankings, built once and reused for return and day count.
        df_picks = pd.concat(
            [
                _top3_per_date(df_, 'proba_max', thres_01, 'proba_max'),
                _top3_per_date(df_, 'proba_max', thres_02, 'proba_mean'),
                _top3_per_date(df_, 'proba_mean', thres_03, 'proba_max'),
                _top3_per_date(df_, 'proba_mean', thres_04, 'proba_mean'),
            ]
        ).drop_duplicates(subset=['date','code'])

        sr = (
            df_picks
            .groupby('date')[rtn]
            .mean()
            -0.0023
        )

        l_rtn_p.append(
            {
                'date' : oos_end,
                'rtn_p' : (sr/n).sum(),
                'days' : len(df_picks['date'].unique())
            }
        )

    return (
        pd.DataFrame(l_rtn_p)
        .set_index('date')
        ['rtn_p']
    )

In [None]:
from scipy import stats

In [None]:
def get_rsquare(df_with_proba, rtn, row):
    """R-squared of a straight-line fit to the cumulative-return path of one trial.

    ``row`` supplies the thresholds through its attributes ``thres_for_max``,
    ``thres_for_mean``, ``thres_mean_max`` and ``thres_mean_mean``. The
    per-window returns from ``get_df_rtn_result`` are cumulatively summed and
    regressed against the positions 0..len-1. The value is printed and returned.
    """
    window_returns = get_df_rtn_result(
        df_with_proba,
        row.thres_for_max,
        row.thres_for_mean,
        row.thres_mean_max,
        row.thres_mean_mean,
        rtn,
    )

    cumulative_path = window_returns.cumsum()
    positions = range(0, len(cumulative_path))

    fit = stats.linregress(cumulative_path, positions)
    r_squared = fit.rvalue**2

    print(f"R-squared: {r_squared}")
    return r_squared

#### Add R-Squared of Cumsum Series

In [None]:
# For every labelling mode and horizon: take the 50 best distinct threshold
# combinations of the saved study, attach the R-squared of their cumulative-return
# path, and save the annotated table.
# NOTE(review): the back-test cell writes "..._rm_hl.pkl" files, while this cell
# reads the name without that suffix — confirm the intended input files.
typeStr = None
THRES_COLS = ['thres_for_max', 'thres_for_mean', 'thres_mean_max', 'thres_mean_mean']

for useFixed in [True, False]: #, False
    typeStr = 'fixed' if useFixed else 'float'

    for rtn in ['rtn_3', 'rtn_4','rtn_5']:
        df_oos_scored = pd.read_pickle(f"df_oos_with_proba_{typeStr}_{rtn}.pkl")
        df_price_change = df_with_feats_ml[['code','date','weighted_price_change_p_short', 'weighted_price_change_p_mid']]

        df_with_proba = (
            df_oos_scored
            .merge(df_price_change)
            .dropna(subset=['weighted_price_change_p_short', 'weighted_price_change_p_mid'])
        )

        df_study_result = pd.read_pickle(f"study_result_20240217_best_combi_of_thres_{typeStr}_{rtn}.pkl")
        # print(df_study_result.columns)

        df_study_result = df_study_result.drop_duplicates(subset=THRES_COLS)
        df_study_result = df_study_result.sort_values("value", ascending=False)
        df_study_result = df_study_result.head(50)

        df_study_result['r_square'] = df_study_result.apply(lambda row : get_rsquare(df_with_proba, rtn, row), axis=1 )

        df_study_result.to_pickle(f"study_result_add_r2_{typeStr}_{rtn}.pkl")
        

#### Get Best R2 Results

In [None]:
# For every labelling mode and horizon: pick the threshold combination with the
# highest R-squared, recompute its per-window return series, and collect both.
l_result = []       # one return Series per case, named "<rtn>_<typeStr>"
l_best_combi = []   # one single-row frame per case describing the chosen combination

for typeStr in ['fixed', 'float']:

    for rtn in ['rtn_3', 'rtn_4', 'rtn_5']:

        case_name = f"{rtn}_{typeStr}"
        print(case_name)

        df_with_proba = (
            pd.read_pickle(f"df_oos_with_proba_{typeStr}_{rtn}.pkl")
            .merge(
                df_with_feats_ml[['code','date','weighted_price_change_p_short', 'weighted_price_change_p_mid']]
            )
        )
        df_study_result_with_rs = pd.read_pickle(f"study_result_add_r2_{typeStr}_{rtn}.pkl")

        pecentile_09 = df_study_result_with_rs['r_square'].quantile(0.9)

        # .assign() returns a new frame, so the 'case' column is no longer written
        # onto a .head(1) slice (which triggered SettingWithCopyWarning).
        l_best_combi.append(
            df_study_result_with_rs.loc[lambda df : df['r_square'] >= pecentile_09]
            .sort_values('r_square', ascending=False)
            .head(1)
            .assign(case=case_name)
        )

        max_row = df_study_result_with_rs.sort_values('r_square', ascending=False).head(1)
        print(max_row)
        thres_for_max = max_row['thres_for_max'].iloc[0]
        thres_for_mean = max_row['thres_for_mean'].iloc[0]
        thres_mean_max = max_row['thres_mean_max'].iloc[0]
        thres_mean_mean = max_row['thres_mean_mean'].iloc[0]

        sr_final = get_df_rtn_result(df_with_proba, thres_for_max, thres_for_mean, thres_mean_max, thres_mean_mean, rtn)
        # Setting .name is what labels the column after pd.concat(axis=1). The former
        # Series.rename({'rtn_p': ...}) only relabels index entries equal to 'rtn_p',
        # which never occur in a date index, so it has been dropped.
        sr_final.name = case_name

        l_result.append(sr_final)
        print('\n')

In [None]:
pd.concat(l_best_combi)

In [None]:
df_result_opt = pd.concat(l_result, axis=1)

In [None]:
df_result_opt.to_pickle("df_oos_opt_result_applied.pkl")

In [None]:
# Running (additive) sum of the 'rtn_3_fixed' per-window returns, drawn as a scatter plot.
(
    df_result_opt['rtn_3_fixed']
    # + 1
).cumsum().plot(kind='scatter')

In [None]:
# Compounded growth (cumulative product of 1 + return) of 'rtn_3_float', drawn as a scatter plot.
(
    df_result_opt['rtn_3_float']
    + 1
).cumprod().plot(kind='scatter')

In [None]:
# `go` is already imported in the notebook's first cell; only make_subplots is new here.
from plotly.subplots import make_subplots

# Equal-weight average of the rtn_3_float and rtn_4_float columns, computed once
# and reused for both panels.
sr_avg = df_result_opt[['rtn_3_float', 'rtn_4_float']].mean(axis=1)

x = df_result_opt.index
y1 = sr_avg.cumsum()           # additive cumulative sum
y2 = (sr_avg + 1).cumprod()    # compounded value of 1 unit

# One row, two columns; titles make each panel readable on its own.
fig = make_subplots(
    rows=1, cols=2,
    subplot_titles=("Cumulative sum", "Cumulative product"),
)

# Left panel: scatter of the additive cumulative sum
fig.add_trace(
    go.Scatter(x=x, y=y1, mode='markers', name='Simple'),
    row=1, col=1
)

# Right panel: scatter of the compounded series
fig.add_trace(
    go.Scatter(x=x, y=y2, mode='markers', name='Complex'),
    row=1, col=2
)

fig.update_layout(height=400, width=800,)

fig.show()


In [None]:
# Check VaR: print the mean and selected quantiles of each float-case column.
for rtn in ['rtn_3', 'rtn_4', 'rtn_5']:

    for typeStr in ['float']:

        case = f"{rtn}_{typeStr}"
        sr_case = df_result_opt[case]

        print(case)
        print(sr_case.mean())
        # min, 5%, median, 95%, max
        print(sr_case.quantile([0, 0.05, 0.5, 0.95, 1]))

In [None]:
# Reload the combined results from the pickle written by the earlier to_pickle cell.
df_result_opt = pd.read_pickle("df_oos_opt_result_applied.pkl")

In [None]:
# Preview the first ten rows.
df_result_opt.head(10)

In [None]:
# Compounded value of 1 unit following the row-wise average of the three float columns.
(
    df_result_opt[['rtn_3_float', 'rtn_4_float', 'rtn_5_float']]
    .mean(axis=1)
    .add(1)
    .cumprod()
    .plot(kind="scatter")
)

In [None]:
# Scratch arithmetic: the 8.6-th root of 114.
# NOTE(review): presumably a total growth multiple annualised over an 8.6-year
# window; both numbers are hard-coded — confirm where they come from.
114**(1/8.6)

In [None]:
# Compounded value of 1 unit following the rtn_3_float column, drawn as a scatter plot.
df_result_opt['rtn_3_float'].add(1).cumprod().plot(kind="scatter")

In [None]:
# Pairwise correlation between the result columns.
df_result_opt.corr()

In [None]:
# Standard deviation of the rtn_3_fixed column on its own.
df_result_opt['rtn_3_fixed'].std()

In [None]:

# Standard deviation of the row-wise average of rtn_3_fixed and rtn_5_float.
df_result_opt[['rtn_3_fixed', 'rtn_5_float']].mean(axis=1).std()

In [None]:
# Standard deviation of the rtn_5_float column on its own.
df_result_opt['rtn_5_float'].std()

In [None]:
# Compounded value of 1 unit following the row-wise average of rtn_3_fixed and rtn_5_float.
(
    df_result_opt[['rtn_3_fixed', 'rtn_5_float']]
    .mean(axis=1)
    .add(1)
    .cumprod()
    .plot(kind='scatter')
)

In [None]:
# Sweep every combination of (fixed | float | omitted) across the three return
# horizons and print mean, std and mean/std of the equal-weight average.
# An empty string means "leave this horizon out".
for rtn3 in ['rtn_3_fixed', 'rtn_3_float', '']:
    for rtn4 in ['rtn_4_fixed', 'rtn_4_float', '']:
        for rtn5 in ['rtn_5_fixed', 'rtn_5_float', '']:

            cols = [col for col in (rtn3, rtn4, rtn5) if col != '']

            # With all three horizons omitted there are no columns to average;
            # the row-wise mean would be all-NaN and every statistic below NaN.
            if not cols:
                continue

            sr = df_result_opt[cols].mean(axis=1)

            print( f"combi : {rtn3} / {rtn4} / {rtn5}")
            print(f"mean : {sr.mean()}")
            print(f"std : {sr.std()}")

            # Plain mean/std ratio per row period (no annualisation applied here).
            print(f"sharpe : {sr.mean()/sr.std()}")
            print('\n')

            

In [None]:
# Standard deviation of the row-wise average of rtn_3_float and rtn_4_float.
(df_result_opt[['rtn_3_float', 'rtn_4_float']]).mean(axis=1).std()

In [None]:
# Distribution of the row-wise average of rtn_3_float and rtn_4_float:
# density-normalised histogram with a violin plot in the margin.
data = df_result_opt[['rtn_3_float','rtn_4_float']].mean(axis=1)
fig = px.histogram(
    data,
    nbins=30,
    marginal="violin",
    histnorm='probability density',
)
fig.show()

In [None]:
# Scratch arithmetic: the 8.6-th root of 130.80380193278907.
# NOTE(review): the first number looks pasted from an earlier cumprod output and
# 8.6 is presumably the window length in years — confirm both.
(130.80380193278907)**(1/8.6)

In [None]:
# Scratch arithmetic with hard-coded values.
# NOTE(review): presumably a cumulative return divided by the 8.6 window length — confirm.
5.2/8.6

In [None]:
# (rows, columns) of the combined results frame.
df_result_opt.shape

In [None]:
# Scratch arithmetic: 5% growth compounded over 103 periods.
# NOTE(review): 103 presumably matches the row count of df_result_opt — confirm.
1.05**103

In [None]:
# Select the return column to analyse and load the matching float-case
# out-of-sample probability frame.
rtn = 'rtn_3'
df_oos_with_proba = pd.read_pickle(f"df_oos_with_proba_float_{rtn}.pkl")

In [None]:
# Baseline without thresholds: for each entry of `dates`, take the three rows
# with the highest proba_mean per day and sum their scaled average returns.
l_rtn_p = []

# Divisor applied to each daily mean: the number in the return column name plus one.
n = int(rtn.split('_')[1])+1

for date in dates:

    # Elements 2 and 3 of each entry bound the window: start inclusive, end exclusive.
    window_start, window_end = date[2], date[3]

    in_window = df_oos_with_proba.loc[
        lambda df : (df.date >= window_start) & (df.date < window_end)
    ]

    # Keep rows with close above 1000, then the top three by proba_mean on each date.
    top_picks = (
        in_window
        .loc[lambda df : df.close > 1000]
        .sort_values('proba_mean', ascending=False)
        .groupby('date')
        .head(3)
        .reset_index()
    )

    # Mean of the selected rows' returns per date, scaled by n.
    daily_rtn = top_picks.groupby('date')[rtn].mean() / n

    l_rtn_p.append({'date' : window_end, 'rtn_p' : daily_rtn.sum()})

# One value per window, indexed by the window's end date.
result = pd.DataFrame(l_rtn_p).set_index('date')['rtn_p']

In [None]:
# Compounded value of 1 unit following `result`.
result.add(1).cumprod().plot()

In [None]:
# Running (additive) sum of `result`, drawn as a scatter plot.
result.cumsum().plot(kind='scatter')

In [None]:
# Final compounded value. `result` is indexed by date labels, so the last
# element is taken by position with .iloc; `[-1]` relied on the deprecated
# positional fallback of Series.__getitem__.
(result+1).cumprod().iloc[-1]

In [None]:
# 8.6-th root of the final compounded value; the last element is taken by
# position with .iloc rather than the deprecated `[-1]` fallback on a
# label-indexed Series.
# NOTE(review): 8.6 is hard-coded, presumably the window length in years — confirm.
((result+1).cumprod().iloc[-1])**(1/8.6)

In [None]:
# Scratch arithmetic: (8.6-th root of 20.37, minus 1) divided by 12.
# NOTE(review): 20.37 looks pasted from an earlier output; presumably an
# annualised rate spread over 12 months — confirm.
((20.37)**(1/8.6) - 1) / 12

In [None]:
# Total additive return divided by 8.6; the last element is taken by position
# with .iloc rather than the deprecated `[-1]` fallback on a label-indexed Series.
# NOTE(review): 8.6 is hard-coded, presumably the window length in years — confirm.
(result).cumsum().iloc[-1] / 8.6

In [None]:
# Scratch arithmetic with hard-coded values.
# NOTE(review): 3.16 looks pasted from an earlier cumsum output; presumably
# divided by years (8.6) and then by 12 months — confirm.
3.16 / 8.6 / 12

In [None]:
# Scratch arithmetic: evaluates to 0.6.
# NOTE(review): presumably 3 positions x 5,000,000 x 4 relative to a
# 100,000,000 base — the meaning of each factor is not shown here; confirm.
(3 * 5_000_000 * 4 ) / 100_000_000

In [None]:
# Total additive return divided by 8.6, times a 1.0 scale factor; the last
# element is taken by position with .iloc rather than the deprecated `[-1]`
# fallback on a label-indexed Series.
# NOTE(review): 8.6 and the 1.0 multiplier are hard-coded — confirm their meaning.
1.0 * (result).cumsum().iloc[-1] / 8.6

In [None]:
# Scratch arithmetic with hard-coded values.
# NOTE(review): 0.6 equals the result of the (3 * 5_000_000 * 4) / 100_000_000
# cell and .9876786620061364 looks pasted from an earlier output; presumably
# scaled to a monthly figure by / 12 — confirm.
0.6 * .9876786620061364 / 12