#### This Code is the base of the current running Cate ML Model

In [1]:
import pandas as pd
pd.options.plotting.backend = "plotly"

import plotly.express as px
import plotly.graph_objects as go

import numpy as np


from make_new_features import get_df_with_features
from get_krx_value import get_krx_mean

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

import warnings
warnings.filterwarnings(action='ignore', category=FutureWarning)

from xgboost import plot_importance
import matplotlib.pyplot as plt

#### Preparing Price Dataset

In [2]:
SHORT_PERIOD=5
MID_PERIOD=20
LONG_PERIOD=60

try :
    df_with_feats = pd.read_pickle(f"./df_with_feats_f_20130101_t_20230919_{SHORT_PERIOD}_{MID_PERIOD}_{LONG_PERIOD}_diff_feats.pkl")
    # df_with_feats['date'] = df_with_feats['date'].dt.strftime("%Y-%m-%d")
except :
    # filtering : Normal Stocks, Not SPAC
    df_price = (
        pd.read_pickle("../230917_df_price_price_only_f_20130101_t_20230919.pkl")
        .loc[lambda df : df["code"].str[5]== "0"]
        # .loc[lambda df : ~df["name"].str.contains("스펙")]
        # .loc[lambda df : ~df["name"].str.contains("스팩")]    
    )

    df_price['date'] = df_price['date'].dt.strftime("%Y-%m-%d")

    try:
        df_krx = pd.read_pickle("./df_krx.pkl")
    except :
        df_krx = get_krx_mean()
        df_krx.to_pickle("./df_krx.pkl")

    df_price = df_price.merge(
        df_krx,
        on='date'
    )
    
    df_with_feats = get_df_with_features(
        df_price, SHORT_PERIOD=SHORT_PERIOD, MID_PERIOD=MID_PERIOD, LONG_PERIOD=LONG_PERIOD)
    df_with_feats.to_pickle(f"./df_with_feats_f_20130101_t_20230919_{SHORT_PERIOD}_{MID_PERIOD}_{LONG_PERIOD}_diff_feats.pkl")

  result = func(self.values, **kwargs)


: 

In [None]:
df_with_feats.columns

Index(['date', 'open', 'high', 'low', 'close', 'volume', 'change', 'code',
       'name', 'marcap', 'kospi', 'kosdaq', 'open_diff_std_short',
       'high_diff_std_short', 'low_diff_std_short', 'close_diff_std_short',
       'vol_diff_std_short', 'open_diff_std_mid', 'high_diff_std_mid',
       'low_diff_std_mid', 'close_diff_std_mid', 'vol_diff_std_mid',
       'open_diff_std_short_L1', 'high_diff_std_short_L1',
       'low_diff_std_short_L1', 'close_diff_std_short_L1',
       'vol_diff_std_short_L1', 'open_diff_std_short_L2',
       'high_diff_std_short_L2', 'low_diff_std_short_L2',
       'close_diff_std_short_L2', 'vol_diff_std_short_L2',
       'open_diff_std_mid_L1', 'high_diff_std_mid_L1', 'low_diff_std_mid_L1',
       'close_diff_std_mid_L1', 'vol_diff_std_mid_L1', 'open_diff_std_mid_L2',
       'high_diff_std_mid_L2', 'low_diff_std_mid_L2', 'close_diff_std_mid_L2',
       'vol_diff_std_mid_L2', 'open_diff_mean_short', 'high_diff_mean_short',
       'low_diff_mean_short', 'clos

## ML

In [None]:
import xgboost as xgb
from sklearn.model_selection import train_test_split

In [None]:
df_with_feats_ml = (
    df_with_feats
    .assign(
        vol_x_price_sma_long_to_mid = lambda df : df.vol_x_price_sma_long / df.vol_x_price_sma_mid,
        vol_x_price_sma_mid_to_short = lambda df : df.vol_x_price_sma_mid / df.vol_x_price_sma_short,
    )
    .loc[lambda df : df.code.str[5] == '0']
    .loc[lambda df : ~df.name.str.contains("스펙")]
    .loc[lambda df : ~df.name.str.contains("스팩")]
    
    .loc[lambda df : df["vol_x_price_sma_mid"] >  0.75e+08]  # 0.75e+08
    .loc[lambda df : df["vol_x_price_sma_mid"] <  3.5e+08]  # 3.5e+08
    .loc[lambda df : df["vol_zero_count_mid"] == 0]
    .loc[lambda df : df["change"] < 0.29]
    .loc[lambda df : df["volume"] > 0]
)

KeyboardInterrupt: 

In [None]:
# For fixed length train dataset

import exchange_calendars as xcals
krx_cal = xcals.get_calendar("XKRX")

max_date = '2023-09-19'
# max_date = '2024-01-19'
start_date = '2015-01-02'

finish = False
dates = []

len_of_train = 200
gap_from_last_train_date = 7 # This number depends on the target ( rtn_5 -> 7, rnt_20 -> 22)
len_of_pred = 20 # The length of pred for 1 model update

while not finish :
    train_end = krx_cal.sessions_window(start_date, len_of_train)[-1].strftime("%Y-%m-%d")
    oos_start = krx_cal.sessions_window(train_end, gap_from_last_train_date)[-1].strftime("%Y-%m-%d")
    oos_end = krx_cal.sessions_window(oos_start, len_of_pred)[-1].strftime("%Y-%m-%d")

    if oos_end > max_date :
        dates.append(
            (start_date, train_end, oos_start, max_date)
        )
        finish = True

    else :
        dates.append(
            (start_date, train_end, oos_start, oos_end)
        )

        start_date = krx_cal.sessions_window(start_date, len_of_pred)[-1].strftime("%Y-%m-%d")

### Feats

In [None]:
for col in df_with_feats_ml.columns:
    print(f"'{col}',")

In [9]:
feats = [
    'vol_x_price_sma_short',
    'close_std_mid',
    'close_std_long',
    'open_std_short',
    'open_std_mid',
    'open_std_long',
    'high_std_short',
    'high_std_mid',
    'low_std_short',
    'close_change_p_long',
    'krx_corr_mid',
    'krx_corr_long',
    'krx_change_std_short',
    'krx_change_std_mid',
    'krx_change_std_long',
    'w_price_vol_corr_long',
    'w_price_vol_corr_mid',

    'close_mean_short',
    'close_mean_mid',
    'close_mean_long',
    'close_min_ratio_short',
    'close_min_ratio_mid',
    'close_min_ratio_long',
    'close_max_ratio_short',
    'close_max_ratio_mid',
    'close_max_ratio_long',   
    'rsi_pivot_short', 'rsi_pivot_mid',
       'rsi_pivot_long'    
]

#### Create df_oos_with_proba

In [17]:
def run_ml_multi_seed(features, idx0, idx1, fixed, target):

    feats = features
    
    THRES_OF_PROBA = 0.5

    rtn_in_period = []
    l_df = []

    if fixed:
        print("fixed")
    else :
        print("float")

    for train_start, train_end, oos_start, oos_end in dates[idx0 : idx1]:

        df_train_set = df_with_feats_ml.loc[lambda df : df.date >= train_start].loc[lambda df : df.date < train_end]
        df_oos_set = df_with_feats_ml.loc[lambda df : df.date >= oos_start].loc[lambda df : df.date < oos_end]

        df_train_set_ = (
            df_train_set
            .replace([-np.inf, np.inf], np.nan)
            .dropna(subset=feats)
            .reset_index(drop=True)
        )

        if fixed:
            THRES_OF_TRUE = 0.03
        else :
            THRES_OF_TRUE = df_train_set_[target].quantile(0.75)
        
        # print(THRES_OF_TRUE)

        df_oos_set_ = (
            df_oos_set      
            .replace([-np.inf, np.inf], np.nan)
            .dropna(subset=feats)
            .reset_index(drop=True)
        )

        df_train_set_["target"] = (
                df_train_set_[target].apply(lambda x : 1 if x > THRES_OF_TRUE else 0)
            )

        feature_df = df_train_set_[feats]
        target_df = df_train_set_["target"]

        feature_df_oos = df_oos_set_[feats]

        doos = xgb.DMatrix(feature_df_oos)        

        if df_oos_set_.shape[0] > 0:
            
            col_list = []
            for id, seed in enumerate([6]):

                X_train, X_valid, y_train, y_valid = train_test_split(
                    feature_df, target_df, test_size=0.25, random_state=seed
                )

                # Create DMatrix for training and validation data
                dtrain = xgb.DMatrix(X_train, label=y_train)
                dvalid = xgb.DMatrix(X_valid, label=y_valid)

                evals = [(dtrain, 'train'), (dvalid, 'eval')]

                params = {
                    'max_depth': 5, # Adjust based on your dataset : originally 5 --> try 8
                    'eta': 0.05,     # Learning rate : originally 0.05
                    'objective': 'binary:logistic',
                    'eval_metric': 'logloss',  # Or use 'auc', 'error', etc. based on your problem
                    'random_state': 42,
                }

                model = xgb.train(
                    params, dtrain,
                    1000,
                    evals=evals,
                    early_stopping_rounds=50,
                    verbose_eval=False
                )

                df_oos_set_[f"pred_proba_{id}"] = model.predict(doos)
                col_list.append(f"pred_proba_{id}")
            
            df_oos_set_["proba_mean"] = df_oos_set_[col_list].mean(axis=1)
            df_oos_set_["proba_max"] = df_oos_set_[col_list].max(axis=1)

            l_df.append(df_oos_set_)

            sr_selected_mean_mean = (
                df_oos_set_
                .loc[lambda df : df['proba_mean'] > THRES_OF_PROBA]
                .sort_values('proba_mean', ascending=False)
                .groupby('date')
                .head(5)
                .reset_index()
                .groupby('date')[target]
                .mean()
                -0.0023
            )       

            # sr_selected_mean_max = (
            #     df_oos_set_
            #     .loc[lambda df : df['proba_mean'] > THRES_OF_PROBA]
            #     .sort_values('proba_max', ascending=False)
            #     .groupby('date')
            #     .head(5)
            #     .reset_index()
            #     .groupby('date')[target]
            #     .mean()
            #     -0.0023
            # )      

            # sr_selected_max_mean = (
            #     df_oos_set_
            #     .loc[lambda df : df['proba_max'] > THRES_OF_PROBA]
            #     .sort_values('proba_mean', ascending=False)
            #     .groupby('date')
            #     .head(5)
            #     .reset_index()
            #     .groupby('date')[target]
            #     .mean()
            #     -0.0023
            # )    

            # sr_selected_max_max = (
            #     df_oos_set_
            #     .loc[lambda df : df['proba_max'] > THRES_OF_PROBA]
            #     .sort_values('proba_max', ascending=False)
            #     .groupby('date')
            #     .head(5)
            #     .reset_index()
            #     .groupby('date')[target]
            #     .mean()
            #     -0.0023
            # )     

            rtn_in_period.append(
                {
                    'date' : oos_end,
                    "return_mean_mean" : sr_selected_mean_mean.sum()/(int(target.split('_')[-1]) + 1),
                    # 'return_mean_max' : sr_selected_mean_max.sum()/(int(target.split('_')[-1]) + 1),
                    # 'return_max_mean' : sr_selected_max_mean.sum()/(int(target.split('_')[-1]) + 1),
                    # 'return_max_max' : sr_selected_max_max.sum()/(int(target.split('_')[-1]) + 1),
                    'date_n' : sr_selected_mean_mean.shape[0]
                }
            )

        else :

        
            # cumsum += 0
            rtn_in_period.append(
                {
                    'date' : oos_end,
                    "return_mean_mean" : 0,
                    'return_mean_max' : 0,
                    'return_max_mean' : 0,
                    'return_max_max' : 0,
                    'date_n' : 0
                }
            )
        
        
    return rtn_in_period, l_df

In [18]:
target_type = {
    'rtn_3' : True,
    'rtn_4' : True,
    'rtn_5' : False
}

In [20]:
for feat_var, target_var in [(feats_rtn_3, target_rtn_3), (feats_rtn_4, target_rtn_4), (feats_rtn_5, target_rtn_5)]: #

    feats = feat_var
    target = target_var

    tf = target_type[target]

    for tf in [True, False]:

        print(target)

        oos_test_result, dfs = run_ml_multi_seed(feats, 0, -1, tf, target)
        df_oos_result = pd.DataFrame(oos_test_result)
        df_oos_result = df_oos_result.set_index('date')

        if tf:
            df_oos_with_proba_fixed = pd.concat(dfs)
            df_oos_with_proba_fixed.to_pickle(f"df_oos_with_proba_fixed_{target}.pkl")
        else:
            df_oos_with_proba_float = pd.concat(dfs)
            df_oos_with_proba_float.to_pickle(f"df_oos_with_proba_float_{target}.pkl")

        for col in df_oos_result.columns:
            sr = (df_oos_result[col]+1)
            sr_org = df_oos_result[col]

            print(col)
            print(f"Prod : {round(sr.cumprod()[-1],2)}")
            print(f"Prod Max : {round(sr.cumprod().max(),2)}")
            print(f"Prod Min : {round(sr.cumprod().min(),2)}")
            print(f"min : {round(sr_org.min(),3)}")
            print(f"std : {round(sr_org.std(),3)}")
            print(f"mean : {round(sr_org.mean(),3)}")
            print('\n')


rtn_3
fixed
return_mean_mean
Prod : 1.95
Prod Max : 2.65
Prod Min : 1.0
min : -0.612
std : 0.097
mean : 0.013


return_mean_max
Prod : 1.78
Prod Max : 2.62
Prod Min : 0.95
min : -0.612
std : 0.095
mean : 0.012


return_max_mean
Prod : 6.99
Prod Max : 6.99
Prod Min : 1.02
min : -0.417
std : 0.083
mean : 0.023


return_max_max
Prod : 5.9
Prod Max : 5.9
Prod Min : 1.03
min : -0.412
std : 0.082
mean : 0.021


date_n
Prod : 0
Prod Max : 9169168889496469504
Prod Min : -9223372036854775808
min : 0
std : 5.142
mean : 10.245


rtn_3
float
return_mean_mean
Prod : 14.73
Prod Max : 14.73
Prod Min : 1.0
min : -0.232
std : 0.076
mean : 0.03


return_mean_max
Prod : 14.87
Prod Max : 14.87
Prod Min : 0.99
min : -0.195
std : 0.075
mean : 0.029


return_max_mean
Prod : 14.32
Prod Max : 14.32
Prod Min : 1.01
min : -0.232
std : 0.072
mean : 0.029


return_max_max
Prod : 13.17
Prod Max : 13.17
Prod Min : 1.01
min : -0.181
std : 0.07
mean : 0.028


date_n
Prod : 0
Prod Max : 9126101401199218688
Prod Min : -

#### Get Best Thresholds for Proba

In [21]:
def objective(trial, df_proba, rtn_n):

    thres01 = trial.suggest_float("thres_for_max", 0.3, 0.9, step=0.01)
    thres02 = trial.suggest_float("thres_for_mean", 0.3, 0.9, step=0.01)
    # thres03 = trial.suggest_float("thres_mean_max", 0.3, 0.9, step=0.01)
    # thres04 = trial.suggest_float("thres_mean_mean", 0.3, 0.9, step=0.01)
    top_n = trial.suggest_int("top_n",3, 20)
    close_minimum = trial.suggest_categorical("close_minimum", [500, 1000, 1500])

    l_rtn_p = []

    for date in dates:

        df_ = df_proba.loc[lambda df : df.date >= date[2]].loc[lambda df : df.date < date[3]].loc[lambda df : df.close > close_minimum]


        df1 = (
            df_
            .loc[lambda df : df['proba_max'] > thres01]
            .sort_values('proba_max', ascending=False)
            .groupby('date')
            .head(top_n)
            .reset_index()
        )

        df2 = (
            df_
            .loc[lambda df : df['proba_max'] > thres01]
            .sort_values('proba_mean', ascending=False)
            .groupby('date')
            .head(top_n)
            .reset_index()
        )

        df3 = (
            df_
            .loc[lambda df : df['proba_mean'] > thres02]
            .sort_values('proba_max', ascending=False)
            .groupby('date')
            .head(top_n)
            .reset_index()
        )

        df4 = (
            df_
            .loc[lambda df : df['proba_mean'] > thres02]
            .sort_values('proba_mean', ascending=False)
            .groupby('date')
            .head(top_n)
            .reset_index()
        )

        sr = (
            pd.concat([df1, df2, df3, df4]).drop_duplicates(subset=['date','code']) #df1, df2, 
            .groupby('date')[rtn_n]
            .mean()
            -0.0023
        )

        n = int(rtn_n.split('_')[1]) + 1

        l_rtn_p.append(
            {
                'date' : date[3],
                'rtn_p' : (sr/n).sum(),
                'days' : pd.concat([df1, df2, df3, df4]).drop_duplicates(subset=['date','code'])['date'].unique().__len__()
            }        
        )

    sr_final = (
        pd.DataFrame(l_rtn_p)
        .set_index('date')
        ['rtn_p']
        # +1
    )

    print(
        pd.DataFrame(l_rtn_p)['days'].sum()
    )

    return (sr_final+1).cumprod()[-1]


In [22]:
import optuna

In [23]:
# Optuna Strudy Run
def run_optuna(name, df_, rtn_n):

    study = optuna.create_study(
            direction="maximize",
            # direction="minimize",
            study_name=name,
        )

    study.optimize(
        lambda trial : objective(trial, df_, rtn_n),
        n_trials=250,
        show_progress_bar=True,
    )

    # Create study result df & Save to file
    df_study_result = (
        study.trials_dataframe()
        .rename(columns = lambda x : x.replace("params_", ""))
    )    
    df_study_result.to_pickle(f"./study_result_20240328_{name}.pkl")

In [25]:
for rtn_n in [ 'rtn_4', 'rtn_5']:
    for tf in [True, False]:

        type_str = 'fixed' if tf else 'float'

        df_with_proba = pd.read_pickle(f"df_oos_with_proba_{type_str}_{rtn_n}.pkl")
        run_optuna(f"best_combi_of_thres_{type_str}_{rtn_n}", df_with_proba, rtn_n)

[I 2024-03-28 10:17:24,674] A new study created in memory with name: best_combi_of_thres_fixed_rtn_4


  0%|          | 0/250 [00:00<?, ?it/s]

344
[I 2024-03-28 10:17:37,171] Trial 0 finished with value: 5.624937226545213 and parameters: {'thres_for_max': 0.8600000000000001, 'thres_for_mean': 0.79, 'top_n': 14, 'close_minimum': 1000}. Best is trial 0 with value: 5.624937226545213.
941
[I 2024-03-28 10:17:51,607] Trial 1 finished with value: 8.30529827686887 and parameters: {'thres_for_max': 0.71, 'thres_for_mean': 0.8899999999999999, 'top_n': 18, 'close_minimum': 500}. Best is trial 1 with value: 8.30529827686887.
1855
[I 2024-03-28 10:18:06,879] Trial 2 finished with value: 5.65073480541642 and parameters: {'thres_for_max': 0.42, 'thres_for_mean': 0.77, 'top_n': 9, 'close_minimum': 1500}. Best is trial 1 with value: 8.30529827686887.
1740
[I 2024-03-28 10:18:20,788] Trial 3 finished with value: 6.777744964950106 and parameters: {'thres_for_max': 0.51, 'thres_for_mean': 0.8899999999999999, 'top_n': 15, 'close_minimum': 500}. Best is trial 1 with value: 8.30529827686887.
1763
[I 2024-03-28 10:18:34,600] Trial 4 finished with v

[I 2024-03-28 11:08:57,044] A new study created in memory with name: best_combi_of_thres_float_rtn_4


  0%|          | 0/250 [00:00<?, ?it/s]

1565
[I 2024-03-28 11:09:09,285] Trial 0 finished with value: 7.624995002006918 and parameters: {'thres_for_max': 0.8700000000000001, 'thres_for_mean': 0.47, 'top_n': 19, 'close_minimum': 1000}. Best is trial 0 with value: 7.624995002006918.
1767
[I 2024-03-28 11:09:21,211] Trial 1 finished with value: 13.446139531172246 and parameters: {'thres_for_max': 0.62, 'thres_for_mean': 0.41, 'top_n': 8, 'close_minimum': 500}. Best is trial 1 with value: 13.446139531172246.
1916
[I 2024-03-28 11:09:32,988] Trial 2 finished with value: 11.495290469298515 and parameters: {'thres_for_max': 0.41, 'thres_for_mean': 0.6599999999999999, 'top_n': 11, 'close_minimum': 500}. Best is trial 1 with value: 13.446139531172246.
1896
[I 2024-03-28 11:09:45,241] Trial 3 finished with value: 6.561476859855622 and parameters: {'thres_for_max': 0.43, 'thres_for_mean': 0.41, 'top_n': 16, 'close_minimum': 1500}. Best is trial 1 with value: 13.446139531172246.
1935
[I 2024-03-28 11:09:57,745] Trial 4 finished with val

[I 2024-03-28 12:01:56,659] A new study created in memory with name: best_combi_of_thres_fixed_rtn_5


  0%|          | 0/250 [00:00<?, ?it/s]

1869
[I 2024-03-28 12:02:10,462] Trial 0 finished with value: 7.970972825307835 and parameters: {'thres_for_max': 0.4, 'thres_for_mean': 0.44, 'top_n': 18, 'close_minimum': 500}. Best is trial 0 with value: 7.970972825307835.
1744
[I 2024-03-28 12:02:23,402] Trial 1 finished with value: 6.301452670264647 and parameters: {'thres_for_max': 0.5, 'thres_for_mean': 0.42, 'top_n': 15, 'close_minimum': 1500}. Best is trial 0 with value: 7.970972825307835.
1309
[I 2024-03-28 12:02:36,341] Trial 2 finished with value: 9.903274490950583 and parameters: {'thres_for_max': 0.6599999999999999, 'thres_for_mean': 0.77, 'top_n': 19, 'close_minimum': 500}. Best is trial 2 with value: 9.903274490950583.
1315
[I 2024-03-28 12:02:49,355] Trial 3 finished with value: 13.029285283716575 and parameters: {'thres_for_max': 0.8700000000000001, 'thres_for_mean': 0.52, 'top_n': 13, 'close_minimum': 500}. Best is trial 3 with value: 13.029285283716575.
910
[I 2024-03-28 12:03:06,420] Trial 4 finished with value: 7.

[I 2024-03-28 12:59:53,469] A new study created in memory with name: best_combi_of_thres_float_rtn_5


  0%|          | 0/250 [00:00<?, ?it/s]

1688
[I 2024-03-28 13:00:07,597] Trial 0 finished with value: 4.474598310287864 and parameters: {'thres_for_max': 0.55, 'thres_for_mean': 0.78, 'top_n': 11, 'close_minimum': 1500}. Best is trial 0 with value: 4.474598310287864.
951
[I 2024-03-28 13:00:20,000] Trial 1 finished with value: 13.841538493877575 and parameters: {'thres_for_max': 0.77, 'thres_for_mean': 0.8200000000000001, 'top_n': 3, 'close_minimum': 1000}. Best is trial 1 with value: 13.841538493877575.
1617
[I 2024-03-28 13:00:32,706] Trial 2 finished with value: 12.686804977887482 and parameters: {'thres_for_max': 0.7, 'thres_for_mean': 0.45999999999999996, 'top_n': 3, 'close_minimum': 500}. Best is trial 1 with value: 13.841538493877575.
1875
[I 2024-03-28 13:00:46,003] Trial 3 finished with value: 6.6137050244632425 and parameters: {'thres_for_max': 0.39, 'thres_for_mean': 0.36, 'top_n': 17, 'close_minimum': 1000}. Best is trial 1 with value: 13.841538493877575.
1582
[I 2024-03-28 13:00:59,105] Trial 4 finished with val

In [None]:
def get_df_rtn_result(df_with_proba, thres_for_max, thres_for_mean, rtn, minimum_close, top_n):
    l_rtn_p = []

    n = int(rtn.split('_')[1])+1

    for date in dates:

        df_ = df_with_proba.loc[lambda df : df.date >= date[2]].loc[lambda df : df.date < date[3]].loc[lambda df : df.close > minimum_close]


        df1 = (
            df_
            .loc[lambda df : df['proba_max'] > thres_for_max]
            .sort_values('proba_max', ascending=False)
            .groupby('date')
            .head(top_n)
            .reset_index()
        )

        df2 = (
            df_
            .loc[lambda df : df['proba_max'] > thres_for_max]
            .sort_values('proba_mean', ascending=False)
            .groupby('date')
            .head(top_n)
            .reset_index()
        )

        df3 = (
            df_
            .loc[lambda df : df['proba_mean'] > thres_for_mean]
            .sort_values('proba_max', ascending=False)
            .groupby('date')
            .head(top_n)
            .reset_index()
        )

        df4 = (
            df_
            .loc[lambda df : df['proba_mean'] > thres_for_mean]
            .sort_values('proba_mean', ascending=False)
            .groupby('date')
            .head(top_n)
            .reset_index()
        )

        sr = (
            pd.concat([df1, df2, df3, df4]).drop_duplicates(subset=['date','code']) #df1, df2, 
            .groupby('date')[rtn]
            .mean()
            -0.0023
        )

        l_rtn_p.append(
            {
                'date' : date[3],
                'rtn_p' : (sr/n).sum(),
                'days' : pd.concat([df1, df2, df3, df4]).drop_duplicates(subset=['date','code'])['date'].unique().__len__()
            }        
        )

    return (
        pd.DataFrame(l_rtn_p)
        .set_index('date')
        ['rtn_p']
    )

In [None]:
from scipy import stats

In [None]:
def get_rsquare(df_with_proba, rtn, row):
    
    sr_final = get_df_rtn_result(df_with_proba, row.thres_for_max, row.thres_for_mean, rtn, row.close_minimum, row.top_n)

    x = sr_final.cumsum()
    y = range(0, len(x))

    slope, intercept, r_value, p_value, std_err = stats.linregress(x, y)

    print(f"R-squared: {r_value**2}")
    return r_value**2

#### Add R-Squared

In [None]:
typeStr = None

for useFixed in [False]: #, False
    if useFixed:
        typeStr = 'fixed'
    else :
        typeStr = 'float'

    for rtn in ['rtn_4','rtn_5']:
        df_with_proba = pd.read_pickle(f"df_oos_with_proba_{typeStr}_{rtn}.pkl")
        df_study_result = pd.read_pickle(f"study_result_20240214_best_combi_of_thres_{typeStr}_{rtn}.pkl")

        df_study_result = df_study_result.drop_duplicates(subset=['thres_for_max','thres_for_mean']).sort_values("value", ascending=False).head(100)

        df_study_result['r_square'] = df_study_result.apply(lambda row : get_rsquare(df_with_proba, rtn, row), axis=1 )

        
        df_study_result.to_pickle(f"study_result_add_r2_{typeStr}_{rtn}.pkl")
        

#### Get Bset R2 Results

In [None]:
l_result = []

for typeStr in ['fixed', 'float']:

    for rtn in ['rtn_3', 'rtn_4', 'rtn_5']:

        print(f"{rtn}_{typeStr}")
        df_with_proba = pd.read_pickle(f"df_oos_with_proba_{typeStr}_{rtn}.pkl")
        df_study_result_with_rs = pd.read_pickle(f"study_result_add_r2_{typeStr}_{rtn}.pkl")

        max_row = df_study_result_with_rs.sort_values('r_square', ascending=False).head(1)
        print(max_row)
        thres_for_max = max_row['thres_for_max'].iloc[0]
        thres_for_mean = max_row['thres_for_mean'].iloc[0]

        sr_final = get_df_rtn_result(df_with_proba, thres_for_max, thres_for_mean, rtn)
        sr_final.name = f"{rtn}_{typeStr}"
        
        l_result.append(
            sr_final.rename({'rtn_p': f"{rtn}_{typeStr}"})
        )
        print('\n')

In [None]:
df_result_opt = pd.concat(l_result, axis=1)

#### Final Result Summary
- Set All to 'fixed'
- rtn_3 : thres_for_max 0.53 / thres_for_mean 0.4 / value 14.149
- rtn_4 : thres_for_max 0.53 / thres_for_mean 0.35 / value 22.543
- rtn_5 : thres_for_max 0.35 / thres_for_mean 0.48 / value 22.785

In [None]:
df_result_opt.head()

In [None]:
df_result_opt.to_pickle("df_oos_opt_result_applied.pkl")

In [None]:
(
    df_result_opt['rtn_5_fixed']
    + 1
).cumprod().plot(kind='scatter')

In [None]:
# Check VaR
for rtn in ['rtn_3', 'rtn_4', 'rtn_5']:

    print(rtn)
    print(
        df_result_opt[f'{rtn}_fixed'].quantile([0, 0.05, 0.5, 0.95, 1])
    )

In [None]:
"""
Pulse extraction using POS algorithm (%(version)s)
"""


import matplotlib
#matplotlib.use("TkAgg")
matplotlib.use("MacOSX")
from matplotlib import pyplot as plt

import os
import sys
sys.path.insert(0, './SkinDetector')
import pkg_resources

import numpy as np
import cv2
import dlib

from imutils.video import VideoStream
from imutils import face_utils
import imutils

import argparse
import skin_detector

def main(user_input=None):
    # EXTRACT PULSE
    pulsedir ="/Volumes/MacMini-Backups/siw-db/live/pulse/"
    start = 0
    end = 450
 
    framerate = 30

    # FREQUENCY ANALYSIS
    nsegments = 12
    
    plot =  False
    image_show = True

    left_increase_ratio = 0.05 #5%
    top_increase_ratio = 0.25 #5%
  
    ap = argparse.ArgumentParser()
    ap.add_argument("-v", "--video", help = "path to the (optional) video file")
    args = vars(ap.parse_args())


    if not args.get("video", False):
        from_webcam = True
        camera = cv2.VideoCapture(0)
        start = 0
        end = 450
	# otherwise, load the video
    else:
        camera = cv2.VideoCapture(args["video"])

    video_file_path = args["video"]
    video_file_name = os.path.basename(video_file_path)
    
    start_index = start
    end_index = end

    # number of final frames
    if end_index > 0:
        nb_frames = end_index - start_index


    # loop on video frames
    frame_counter = 0
    i = start_index

    detector = dlib.get_frontal_face_detector()
    predictor = dlib.shape_predictor("shape_predictor_68_face_landmarks.dat")
   
    while (i >= start_index and i < end_index):
        (grabbed, frame) = camera.read()
    
        if not grabbed:
            continue

        print("Processing frame %d/%d...", i+1, end_index)
        
        h,w,_ = frame.shape

        gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
        rects = detector(gray, 0)

        if len(rects)==0:
            continue

        if image_show:
            show_frame = frame.copy()
       
        if(len(rects)>0):
            rect = rects[0] 
            '''          
            shape = predictor(gray, rect)
            shape = face_utils.shape_to_np(shape)

            for counter,(x, y) in enumerate(shape):
                cv2.circle(show_frame, (x, y), 4, (0, 0, 255), -1)
                cv2.putText(show_frame,str(counter),(x, y), cv2.FONT_HERSHEY_SIMPLEX, 0.4,(255,255,255),1)
            '''
               
            
            left, right, top, bottom = rect.left(), rect.right(), rect.top(),rect.bottom()
            width = abs(right - left)
            height = abs(bottom - top)
            print("Left, right, top, bottom: ",left, right, top, bottom)
            #print("Width and Height of bounding box : ",width,height)
            
            face_left = int(left - (left_increase_ratio/2)*width)
            face_top = int(top - (top_increase_ratio)*height)
            #face_right = int(right + (area_increase_ratio/2)*width)
            #face_bottom = int(bottom + (area_increase_ratio/2)*height)
            
            face_right = right
            face_bottom = bottom
            
            print("Increased coordinates: ",face_left, face_right, face_top, face_bottom)
            
            if image_show:
                cv2.rectangle(show_frame,(left,top),(right,bottom),(255,255,0),3)
                cv2.rectangle(show_frame,(face_left,face_top),(face_right,face_bottom),(0,255,0),3)
            
            face = frame[face_top:face_bottom,face_left:face_right]
            
            if(face.size==0):
                continue
            #    continue
            #Extract face skin pixels
            mask = skin_detector.process(face)
     
            
            #print("Mask shape: ",mask.shape)
            masked_face = cv2.bitwise_and(face, face, mask=mask)
            number_of_skin_pixels = np.sum(mask>0)



            #compute mean
            r = np.sum(masked_face[:,:,2])/number_of_skin_pixels
            g = np.sum(masked_face[:,:,1])/number_of_skin_pixels 
            b = np.sum(masked_face[:,:,0])/number_of_skin_pixels

            if frame_counter==0:
                mean_rgb = np.array([r,g,b])
            else:
                mean_rgb = np.vstack((mean_rgb,np.array([r,g,b])))

            
            print("Mean RGB -> R = {0}, G = {1}, B = {2} ".format(r,g,b))

        if image_show:
            if h>w and h>640:
                    dim = (int(640 * (w/h)),640)    
                    show_frame = cv2.resize(show_frame, dim, interpolation = cv2.INTER_LINEAR)
            if w>h and w>640:
                    dim = (640, int(640 * (h/w)))
                    show_frame = cv2.resize(show_frame, dim, interpolation = cv2.INTER_LINEAR)
         
        #cv2.imshow("frame",show_frame)
        if(image_show):
            cv2.imshow("Masked face",masked_face)
            cv2.waitKey(1)
        frame_counter +=1
        i += 1
        #end loop
    
    camera.release()
    cv2.destroyAllWindows()

    if plot:
        f = np.arange(0,mean_rgb.shape[0])
        plt.plot(f, mean_rgb[:,0] , 'r', f,  mean_rgb[:,1], 'g', f,  mean_rgb[:,2], 'b')
        plt.title("Mean RGB - Complete")
        plt.show()

    #Calculating l
    l = int(framerate * 1.6)
    print("Window Length : ",l)

    H = np.zeros(mean_rgb.shape[0])

    for t in range(0, (mean_rgb.shape[0]-l)):
        #t = 0
        # Step 1: Spatial averaging
        C = mean_rgb[t:t+l-1,:].T
        #C = mean_rgb.T
        print("C shape", C.shape)
        print("t={0},t+l={1}".format(t,t+l))
        if t == 3:
            plot = False

        if plot:
            f = np.arange(0,C.shape[1])
            plt.plot(f, C[0,:] , 'r', f,  C[1,:], 'g', f,  C[2,:], 'b')
            plt.title("Mean RGB - Sliding Window")
            plt.show()
        
        #Step 2 : Temporal normalization
        mean_color = np.mean(C, axis=1)
        #print("Mean color", mean_color)
        
        diag_mean_color = np.diag(mean_color)
        #print("Diagonal",diag_mean_color)
        
        diag_mean_color_inv = np.linalg.inv(diag_mean_color)
        #print("Inverse",diag_mean_color_inv)
        
        Cn = np.matmul(diag_mean_color_inv,C)
        #Cn = diag_mean_color_inv@C
        #print("Temporal normalization", Cn)
        #print("Cn shape", Cn.shape)

        if plot:
            f = np.arange(0,Cn.shape[1])
            #plt.ylim(0,100000)
            plt.plot(f, Cn[0,:] , 'r', f,  Cn[1,:], 'g', f,  Cn[2,:], 'b')
            plt.title("Temporal normalization - Sliding Window")
            plt.show()
    
        #Step 3: 
        projection_matrix = np.array([[0,1,-1],[-2,1,1]])
        S = np.matmul(projection_matrix,Cn)
        #S = projection_matrix@Cn
        print("S matrix",S)
        print("S shape", S.shape)
        if plot:
            f = np.arange(0,S.shape[1])
            #plt.ylim(0,100000)
            plt.plot(f, S[0,:] , 'c', f,  S[1,:], 'm')
            plt.title("Projection matrix")
            plt.show()

        #Step 4:
        #2D signal to 1D signal
        std = np.array([1,np.std(S[0,:])/np.std(S[1,:])])
        print("std",std)
        P = np.matmul(std,S)
        #P = std@S
        print("P",P)
        if plot:
            f = np.arange(0,len(P))
            plt.plot(f, P, 'k')
            plt.title("Alpha tuning")
            plt.show()

        #Step 5: Overlap-Adding
        H[t:t+l-1] = H[t:t+l-1] +  (P-np.mean(P))/np.std(P)

    print("Pulse",H)
    signal = H
    print("Pulse shape", H.shape)
 

    #FFT to find the maxiumum frequency
    # find the segment length, such that we have 8 50% overlapping segments (Matlab's default)
    segment_length = (2*signal.shape[0]) // (nsegments + 1) 

    # the number of points for FFT should be larger than the segment length ...
    '''
    if nfft < segment_length:
        print("(nfft < nperseg): {0}, {1}".format(nfft,segment_length))
    '''
        
    print("nperseg",segment_length)
    
    from matplotlib import pyplot
    pyplot.plot(range(signal.shape[0]), signal, 'g')
    pyplot.title('Filtered green signal')
    pyplot.show()

    

    from scipy.signal import welch
    signal = signal.flatten()
    green_f, green_psd = welch(signal, framerate, 'flattop', nperseg=segment_length) #, scaling='spectrum',nfft=2048)
    print("Green F, Shape",green_f,green_f.shape)
    print("Green PSD, Shape",green_psd,green_psd.shape)

    #green_psd = green_psd.flatten()
    first = np.where(green_f > 0.9)[0] #0.8 for 300 frames
    last = np.where(green_f < 1.8)[0]
    first_index = first[0]
    last_index = last[-1]
    range_of_interest = range(first_index, last_index + 1, 1)

    print("Range of interest",range_of_interest)
    max_idx = np.argmax(green_psd[range_of_interest])
    f_max = green_f[range_of_interest[max_idx]]

    hr = f_max*60.0
    print("Heart rate = {0}".format(hr))

    import scipy.io as sio
    #mat_file_name = pulsedir + "pulse_" + video_file_name[:-4] + "_frame-0-15" + ".mat"
    mat_file_name = "pulse_" + video_file_name[:-4] + "_frame-0-15" + ".mat"
    sio.savemat(mat_file_name,{'pulse':signal, 'heartrate':hr, 'nperseg':segment_length})


    
    from matplotlib import pyplot
    pyplot.semilogy(green_f, green_psd, 'g')
    xmax, xmin, ymax, ymin = pyplot.axis()
    pyplot.vlines(green_f[range_of_interest[max_idx]], ymin, ymax, color='red')
    pyplot.title('Power spectrum of the green signal (HR = {0:.1f})'.format(hr))
    pyplot.show()
    

if __name__ == "__main__":
	main()

In [58]:
#compute mean
for frame_counter in range(0,10):
    r = 1
    g = 2
    b = 3

    if frame_counter==0:
        mean_rgb = np.array([r,g,b])
    else:
        mean_rgb = np.vstack((mean_rgb,np.array([r+frame_counter*0.1,g + frame_counter*0.2,b+frame_counter*0.15])))

In [59]:
mean_rgb.shape

(10, 3)

In [60]:
mean_rgb[1:5,:]

array([[1.1 , 2.2 , 3.15],
       [1.2 , 2.4 , 3.3 ],
       [1.3 , 2.6 , 3.45],
       [1.4 , 2.8 , 3.6 ]])

In [61]:
mean_rgb[1:5,:].T

array([[1.1 , 1.2 , 1.3 , 1.4 ],
       [2.2 , 2.4 , 2.6 , 2.8 ],
       [3.15, 3.3 , 3.45, 3.6 ]])

In [62]:
C = mean_rgb[1:5,:].T
np.mean(C, axis=1)

array([1.25 , 2.5  , 3.375])

In [63]:
mean_color = np.mean(C, axis=1)
np.diag(mean_color)

array([[1.25 , 0.   , 0.   ],
       [0.   , 2.5  , 0.   ],
       [0.   , 0.   , 3.375]])

In [64]:
diag_mean_color = np.diag(mean_color)
np.linalg.inv(diag_mean_color)

array([[0.8      , 0.       , 0.       ],
       [0.       , 0.4      , 0.       ],
       [0.       , 0.       , 0.2962963]])

In [65]:
diag_mean_color_inv = np.linalg.inv(diag_mean_color)
Cn = np.matmul(diag_mean_color_inv,C)
Cn

array([[0.88      , 0.96      , 1.04      , 1.12      ],
       [0.88      , 0.96      , 1.04      , 1.12      ],
       [0.93333333, 0.97777778, 1.02222222, 1.06666667]])

In [66]:
projection_matrix = np.array([[0,1,-1],[-2,1,1]])
projection_matrix

array([[ 0,  1, -1],
       [-2,  1,  1]])

In [67]:
S = np.matmul(projection_matrix,Cn)
S

array([[-0.05333333, -0.01777778,  0.01777778,  0.05333333],
       [ 0.05333333,  0.01777778, -0.01777778, -0.05333333]])

In [69]:
std = np.array([1,np.std(S[0,:])/np.std(S[1,:])])
print("std",std)
P = np.matmul(std,S)
P

std [1. 1.]


array([0., 0., 0., 0.])

In [70]:
(P-np.mean(P))/np.std(P)

  (P-np.mean(P))/np.std(P)


array([nan, nan, nan, nan])

In [72]:
np.std(S)

0.03975231959999616

In [6]:
df_price = (
    pd.read_pickle("../230917_df_price_price_only_f_20130101_t_20230919.pkl")
)

In [14]:
df_close = (
    df_price
    .pivot(
        index='date',
        columns='code',
        values='close'
    )
)

df_high = (
    df_price
    .pivot(
        index='date',
        columns='code',
        values='high'
    )
)

df_low = (
    df_price
    .pivot(
        index='date',
        columns='code',
        values='low'
    )
)

df_open = (
    df_price
    .pivot(
        index='date',
        columns='code',
        values='open'
    )
)

df_volume = (
    df_price
    .pivot(
        index='date',
        columns='code',
        values='volume'
    )
)

In [15]:
df_close_diff = df_close.pct_change()
df_high_diff = df_high.pct_change()
df_low_diff = df_low.pct_change()
df_open_diff = df_open.pct_change()
df_volume_diff = df_volume.pct_change()

In [11]:
df_close.pct_change().head()

code,000020,000040,000050,000070,000080,000100,000120,000140,000150,000180,...,900340,950110,950130,950140,950160,950170,950190,950200,950210,950220
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2013-01-02,,,,,,,,,,,...,,,,,,,,,,
2013-01-03,-0.014778,0.009901,0.02002,-0.002801,0.011146,0.017472,-0.014493,0.020528,0.007607,-0.011047,...,,0.03875,,,,,,,,
2013-01-04,0.003333,-0.001961,0.01855,-0.001404,-0.001575,0.034151,0.053922,0.017241,0.0,-0.005879,...,,0.022864,,,,,,,,
2013-01-07,0.0,-0.001965,0.005287,-0.008439,-0.006309,0.013763,0.0,0.0,0.0,-0.02602,...,,0.051765,,,,,,,,
2013-01-08,0.01495,-0.001969,-0.027583,-0.017021,-0.001587,0.019036,0.051163,0.048023,-0.003775,0.017608,...,,-0.004474,,,,,,,,


In [23]:
df_close['000020'].plot()

In [24]:
df_close_diff["000020"].plot()

In [26]:
np.log(df_volume['000020']).diff().plot()

In [31]:
from scipy.stats import zscore



In [66]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Create subplots
fig = make_subplots(rows=3, cols=1, shared_xaxes=True)

code = '005930'
z_transformed_series = zscore(df_close_diff[code].dropna())
z_transformed_series_v = zscore(np.log(df_volume[code]).diff().dropna())


x = df_close.index
y1 = df_close[code]
y2 = df_close_diff[code]
y3 = np.log(df_volume[code]).diff()

# Add traces to the subplots
fig.add_trace(go.Scatter(x=x, y=y1, mode='lines', name='Trace 1'), row=1, col=1)
fig.add_trace(go.Scatter(x=x, y=y2, mode='lines', name='Trace 2'), row=2, col=1)
fig.add_trace(go.Scatter(x=x, y=y3, mode='lines', name='Trace 3'), row=3, col=1)

# Update layout
fig.update_layout(height=800, width=800, title_text="3-Row Plot")

# Show the plot
fig.show()


divide by zero encountered in log


invalid value encountered in reduce


invalid value encountered in reduce


divide by zero encountered in log



In [67]:
import plotly.graph_objects as go

# Sample data
x = df_close.index
y1 = df_close[code]
y2 = np.log(df_volume[code]).diff().cumsum().rolling(60).mean()
y3 = np.log(df_volume[code]).diff().cumsum().rolling(20).mean()

# Create the first trace with the first y-axis
trace1 = go.Scatter(x=x, y=y1, mode='lines', name='close diff')

# Create the second trace with the second y-axis
trace2 = go.Scatter(x=x, y=y2, mode='lines', name='vol diff', yaxis='y2')

# Create the second trace with the second y-axis
trace3 = go.Scatter(x=x, y=y3, mode='lines', name='vol diff', yaxis='y2')

# Create layout with multiple y-axes
layout = go.Layout(
    yaxis=dict(title='Y-axis 1'),
    yaxis2=dict(title='Y-axis 2', overlaying='y', side='right')
)

# Combine traces and layout into a figure
fig = go.Figure(data=[trace1, trace2, trace3], layout=layout)

# Show the plot
fig.show()


invalid value encountered in accumulate



In [52]:
(y2).cumsum().plot()

In [51]:
df_close[code].plot()

In [72]:
df_close.head()

code,000020,000040,000050,000070,000080,000100,000120,000140,000150,000180,...,900340,950110,950130,950140,950160,950170,950190,950200,950210,950220
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2013-01-02,6090.0,3030.0,8192.0,71400.0,31400.0,25412.0,103500.0,17050.0,103587.0,1720.0,...,,4000.0,,,,,,,,
2013-01-03,6000.0,3060.0,8356.0,71200.0,31750.0,25856.0,102000.0,17400.0,104375.0,1701.0,...,,4155.0,,,,,,,,
2013-01-04,6020.0,3054.0,8511.0,71100.0,31700.0,26739.0,107500.0,17700.0,104375.0,1691.0,...,,4250.0,,,,,,,,
2013-01-07,6020.0,3048.0,8556.0,70500.0,31500.0,27107.0,107500.0,17700.0,104375.0,1647.0,...,,4470.0,,,,,,,,
2013-01-08,6110.0,3042.0,8320.0,69300.0,31450.0,27623.0,113000.0,18550.0,103981.0,1676.0,...,,4450.0,,,,,,,,


In [83]:
from arch.unitroot.cointegration import phillips_ouliaris

l_code_pvalue = []
for code in df_close.columns:

    df_ = (
        df_price.loc[lambda df : df.code == code]
        [['date','close','volume']]
        .dropna()
    )

    s1 = df_['close']
    s2 = np.log(df_['volume'])

    try :
        r = phillips_ouliaris(
                s1, s2, trend="c", test_type="Za", kernel="bartlett"
            )

        l_code_pvalue.append(
            {
                'code' : code,
                'pvalue' : r._pvalue
            }
        )
    except :
        pass


divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered in log


divide by zero encountered 

In [84]:
df_coint_close_vol = pd.DataFrame(l_code_pvalue)

In [104]:
df_coint_close_vol.loc[lambda df : df.pvalue > 0].sort_values('pvalue').head(20)

Unnamed: 0,code,pvalue
895,122310,0.000131
191,9580,0.000131
808,96870,0.000131
1062,214320,0.000132
37,1450,0.000132
242,14440,0.000132
958,143160,0.000132
477,43610,0.000133
56,2310,0.000133
92,4000,0.000133


In [114]:
import plotly.graph_objects as go

code = '377460'
# Sample data
x = df_close.index

s1 = df_close[code].dropna()
s2 = np.log(df_volume[code]).dropna()

# Create the first trace with the first y-axis
trace1 = go.Scatter(x=x, y=s1, mode='lines', name='close diff')

# Create the second trace with the second y-axis
trace2 = go.Scatter(x=x, y=s2, mode='lines', name='vol diff', yaxis='y2')

# Create layout with multiple y-axes
layout = go.Layout(
    yaxis=dict(title='Y-axis 1'),
    yaxis2=dict(title='Y-axis 2', overlaying='y', side='right')
)

# Combine traces and layout into a figure
fig = go.Figure(data=[trace1, trace2], layout=layout)

# Show the plot
fig.show()

In [115]:
import statsmodels.api as sm

# Add a constant to the independent variable
x = sm.add_constant(s2)

# Fit the model
model = sm.OLS(s1, x)
results = model.fit()

# Get the residuals
residuals = results.resid

In [116]:
zscore(residuals).plot()

In [117]:
s1.plot()

In [118]:
np.log(df_volume).diff()


divide by zero encountered in log



code,000020,000040,000050,000070,000080,000100,000120,000140,000150,000180,...,900340,950110,950130,950140,950160,950170,950190,950200,950210,950220
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2013-01-02,,,,,,,,,,,...,,,,,,,,,,
2013-01-03,0.250314,0.092519,0.086348,1.003027,-0.269877,0.395180,0.235152,0.016032,-0.028081,0.175620,...,,0.216640,,,,,,,,
2013-01-04,-0.112988,-0.518960,1.088013,0.245005,-0.663768,-0.382393,-0.073140,0.214668,-0.547546,-0.181673,...,,1.267208,,,,,,,,
2013-01-07,-0.261992,0.185343,-0.766247,-0.880373,0.005826,-0.036151,0.406932,-0.072034,-1.063739,0.697313,...,,-1.652177,,,,,,,,
2013-01-08,0.554099,0.393415,-0.946732,0.592192,0.001474,0.289876,0.855081,0.264280,0.449953,-0.421416,...,,-0.374610,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-09-13,0.033073,-0.675651,1.763794,-1.441069,0.789247,-0.747995,-0.071475,-0.531773,-1.142835,0.593788,...,-2.173284,0.366613,0.712927,0.529766,0.113692,-1.107210,-1.255640,-0.349648,-0.598795,0.657488
2023-09-14,0.115407,-1.136800,-2.589003,1.049382,-0.346124,-0.485909,0.666853,0.055708,-0.029494,-1.238769,...,-0.179503,-0.357857,-0.697386,0.396054,-0.364537,-0.469226,0.441190,-0.289943,0.991716,-0.771830
2023-09-15,-0.316949,1.024719,0.152784,0.184277,0.670836,0.033612,0.147190,-0.071686,-0.111613,0.353781,...,-0.123791,1.292781,-1.367366,-0.136895,0.608179,0.414819,-0.471535,0.573729,-1.513482,-0.232020
2023-09-18,-0.291410,0.148521,0.370929,-1.564014,-0.927604,-0.051864,1.256994,-0.153791,-0.596902,0.696395,...,-0.105188,-0.155890,-0.516194,0.320392,-0.481904,0.365601,0.594986,2.926378,-0.096474,-0.136359


In [119]:
np.log(df_volume['000020']).diff()

date
2013-01-02         NaN
2013-01-03    0.250314
2013-01-04   -0.112988
2013-01-07   -0.261992
2013-01-08    0.554099
                ...   
2023-09-13    0.033073
2023-09-14    0.115407
2023-09-15   -0.316949
2023-09-18   -0.291410
2023-09-19    0.276032
Name: 000020, Length: 2640, dtype: float64