In [1]:
import pandas as pd
from arch.unitroot import ADF, KPSS
from arch.unitroot.cointegration import phillips_ouliaris
from statsmodels.stats.stattools import durbin_watson
from statsmodels.tsa.ardl import UECM
import matplotlib.pyplot as plt
import numpy as np

from statsmodels.tsa.stattools import adfuller
import itertools

import pickle

# Make the pandas `.plot()` accessor render through plotly instead of the default backend.
pd.options.plotting.backend = "plotly"

### Price Data Preparation

In [2]:
# Load the price table from a local pickle. Later cells read the columns
# `code`, `name`, `date`, `close` and `marcap` from it.
# NOTE(review): unpickling can execute arbitrary code - only load files from a trusted source.
df_price = pd.read_pickle("df_price_price_only_f_20130101_t_20230919.pkl")

In [3]:
# Universe: every code with at least one row whose `marcap` exceeds 1e12.
# (Alternative, all codes: df_price.code.unique().tolist())
code_list = (
    df_price.loc[df_price["marcap"] > 1_000_000_000_000, "code"]
    .unique()
    .tolist()
)

In [4]:
# Lookup table code -> name; when a code appears on several rows the last row wins.
code_name_dict = dict(zip(df_price["code"], df_price["name"]))

In [5]:
# Reshape long -> wide: one row per date, one column per code, values are closing prices.
df_close = df_price.pivot(index="date", columns="code", values="close")

In [7]:
# Simple one-period returns. The forward-fill is written out explicitly because
# relying on pct_change's implicit fill_method="pad" default is deprecated in
# pandas and emits a FutureWarning; the values are the same as before.
df_close_change = df_close.ffill().pct_change(1, fill_method=None)
# Absolute one-period price change and the previous period's close.
df_close_diff = df_close.diff()
df_close_lag1 = df_close.shift(1)

# Log returns and their one-period lag.
df_close_log_diff = np.log(df_close).diff()
df_close_log_diff_lag1 = df_close_log_diff.shift(1)

  df_close_change = df_close.pct_change(1)


### Calc Cointegration

#### Calculation Scheme

1. Prepare Stationary data
    1) Check Stationarity for raw time series
    2) (if non-stationary) Check Stationarity for 1st diff
    3) Keep the codes whose 1st difference is stationary

2. Check Cointegration
    1) Pick two codes from the series whose 1st difference is stationary
    2) Run test for cointegration
    3) (if cointegrated) Keep the codes

3. Calculate Coefficients

In [8]:
def get_dates(
    start_date='2015-01-02',
    max_date='2024-05-30',
    len_of_train=200,
    gap_from_last_train_date=7,
    len_of_pred=20,
    calendar=None,
):
    """
    Build rolling (train_start, train_end, oos_start, oos_end) date tuples.

    Every date is a "%Y-%m-%d" string. Starting at `start_date`, each fold is
    derived with `calendar.sessions_window(anchor, count)[-1]`:

    * train_end  - last session of the `len_of_train` window anchored at train_start
    * oos_start  - last session of the `gap_from_last_train_date` window anchored at train_end
    * oos_end    - last session of the `len_of_pred` window anchored at oos_start

    The train start then moves to the last session of the `len_of_pred` window
    anchored at the current train start. Generation stops at the first fold whose
    oos_end is later than `max_date`; that fold is not returned.

    Parameters
    ----------
    start_date : str
        First train start, formatted "%Y-%m-%d".
    max_date : str
        Latest admissible oos_end, formatted "%Y-%m-%d". Compared as a string,
        which is only correct for zero-padded ISO dates.
    len_of_train : int
        Session count passed to `sessions_window` for the training window.
    gap_from_last_train_date : int
        Session count between train_end and oos_start. This number depends on
        the target (original note: rtn_5 -> 6, rtn_20 -> 21).
    len_of_pred : int
        Session count of the prediction window for one model update.
    calendar : object, optional
        Anything exposing `sessions_window(anchor, count)`. When omitted, the
        exchange_calendars "XKRX" calendar is loaded.

    Returns
    -------
    list[tuple[str, str, str, str]]

    Raises
    ------
    ValueError
        If the train start does not move forward between folds, which would
        otherwise loop forever.
    """
    if calendar is None:
        # Imported lazily so the dependency is only needed for the default calendar.
        import exchange_calendars as xcals
        calendar = xcals.get_calendar("XKRX")

    def _last_session(anchor, count):
        # Last session of the window anchored at `anchor`, as a date string.
        return calendar.sessions_window(anchor, count)[-1].strftime("%Y-%m-%d")

    dates = []

    while True:
        train_end = _last_session(start_date, len_of_train)
        oos_start = _last_session(train_end, gap_from_last_train_date)
        oos_end = _last_session(oos_start, len_of_pred)

        if oos_end > max_date:
            break

        dates.append(
            (start_date, train_end, oos_start, oos_end)
        )

        # NOTE(review): the train start and the oos window both advance by a
        # `len_of_pred` window. If `sessions_window` counts the anchor session
        # itself, a fold's oos_end equals the next fold's oos_start (one shared
        # session) - confirm this overlap is intended.
        next_start = _last_session(start_date, len_of_pred)
        if next_start <= start_date:
            raise ValueError(
                "len_of_pred does not advance the training window; "
                f"got len_of_pred={len_of_pred}"
            )
        start_date = next_start

    return dates

In [9]:
# All rolling folds as (train_start, train_end, oos_start, oos_end) string tuples.
dates = get_dates()

In [10]:
# Spot-check one fold: (train_start, train_end, oos_start, oos_end).
dates[-11]

('2022-09-22', '2023-07-12', '2023-07-20', '2023-08-17')

In [11]:
def apply_unit_root_test(timeseries : pd.Series, significance_level : float = 0.05):
    """
    Apply ADF and KPSS tests to the given time series.

    ADF is run three times (regression 'n', 'c' and 'ct', lag chosen by BIC);
    each run whose p-value is not below `significance_level` fails to reject
    the unit-root null and adds 1 to the ADF score. KPSS is run once with
    trend "ct" and lags=-1; a p-value below `significance_level` rejects its
    stationarity null and sets the KPSS score to 1.

    Parameters
    ----------
    timeseries : pd.Series
        Series to test. Callers in this notebook drop NaNs before passing it.
    significance_level : float, default 0.05
        Threshold applied to every p-value.

    Returns
    -------
    tuple[int, int]
        (adf_test_score, kpss_test_score). adf_test_score is in 0..3 and
        kpss_test_score is 0 or 1; a value greater than 0 means that test
        indicates a unit root, so (0, 0) means both tests agree on stationarity.
    """
    adf_test_score = 0

    # `regression` (not `type`) so the builtin is not shadowed.
    for regression in ('n', 'c', 'ct'):
        p_value = adfuller(timeseries, regression=regression, autolag='BIC')[1]

        # Written as "not below" so a NaN p-value still counts as a unit root,
        # exactly like the original else-branch.
        if not (p_value < significance_level):
            adf_test_score += 1

    p_value_kpss = KPSS(timeseries, trend="ct", lags=-1).pvalue
    kpss_test_score = 1 if p_value_kpss < significance_level else 0

    return adf_test_score, kpss_test_score

In [12]:
# Codes whose price level is already stationary, and codes that only become
# stationary after taking the first log difference.
stationay_raw_time_series = []
stationary_1st_diff_time_series = []

for code in code_list:

    try:
        # Step 1: test the raw close series.
        time_series_raw = df_close[code].dropna()
        adf_test_score, kpss_test_score = apply_unit_root_test(time_series_raw)

        if kpss_test_score == 0 and adf_test_score == 0:
            stationay_raw_time_series.append(code)
            continue

        # Step 2: the level is not stationary, so test the first log difference.
        time_series_1st_diff = df_close_log_diff[code].dropna()
        adf_test_score, kpss_test_score = apply_unit_root_test(time_series_1st_diff)

        if kpss_test_score == 0 and adf_test_score == 0:
            stationary_1st_diff_time_series.append(code)
    except Exception as exc:
        # Best effort: skip the code, but report why. `Exception` (rather than
        # a bare except) lets KeyboardInterrupt stop the loop.
        print(f"something wrong with this code : {code} ({type(exc).__name__}: {exc})")

In [13]:
# Check Stationarity Result
# Summary of the stationarity screening.
print(f"Number of total codes : {len(code_list)}")
print(f"Number of stationary at raw : {len(stationay_raw_time_series)}")
print(f"Number of stationary at 1st diff : {len(stationary_1st_diff_time_series)}")


Number of total codes : 260
Number of stationary at raw : 0
Number of stationary at 1st diff : 254


In [14]:
def calc_cointegration(df_combi, combi, start_date, end_date):
    """
    Run a Phillips-Ouliaris cointegration test on one pair over one date window.

    Parameters
    ----------
    df_combi : pd.DataFrame
        Frame indexed by date that holds (at least) the two columns in `combi`.
    combi : sequence
        The two column labels; combi[0] is passed as the first series and
        combi[1] as the second.
    start_date, end_date
        Window bounds compared against the index: rows with
        start_date < index <= end_date are kept (start is exclusive), then rows
        containing NaN are dropped.

    Returns
    -------
    dict
        {'date': end_date, 'code_pair': "<combi[0]>-<combi[1]>", 'p_value': ...}.
        When the test raises, p_value is the sentinel 100 so the pair can never
        pass a significance filter.
    """
    in_window = (df_combi.index > start_date) & (df_combi.index <= end_date)
    df_combi_ = df_combi.loc[in_window].dropna()

    try:
        p_value = phillips_ouliaris(
            df_combi_[combi[0]], df_combi_[combi[1]], trend="c", test_type="Za", kernel="bartlett"
        ).pvalue
    except Exception:
        # Deliberate best effort: a failing pair is recorded with the sentinel
        # instead of aborting the whole scan. `Exception` (not a bare except)
        # keeps KeyboardInterrupt able to stop a long run.
        p_value = 100

    return {
        'date' : end_date,
        'code_pair' : f"{combi[0]}-{combi[1]}",
        'p_value' : p_value
    }


In [16]:
# Every unordered pair of codes whose first difference was found stationary.
combinations = list(itertools.combinations(stationary_1st_diff_time_series, 2))

# One result dict per (pair, window); only the single window dates[-13] is scanned.
l_of_cointegrated_combi_c = []
for combi in combinations:
    df_combi = df_close[list(combi)]

    for date in dates[-13:-12]:
        # date[0] / date[1] are the train start / train end of the fold.
        test_result = calc_cointegration(df_combi, combi, date[0], date[1])
        l_of_cointegrated_combi_c.append(test_result)

In [18]:
# Ten most significant pairs among those with p-value below 0.05.
(
    pd.DataFrame(l_of_cointegrated_combi_c)
    .query("p_value < 0.05")
    .sort_values("p_value")
    .head(10)
)

Unnamed: 0,date,code_pair,p_value
15018,2023-05-17,302440-026960,0.000239
13147,2023-05-17,402340-010060,0.000243
26055,2023-05-17,014680-137400,0.000259
17684,2023-05-17,000720-004000,0.000278
20660,2023-05-17,263750-003410,0.000296
8890,2023-05-17,018260-195940,0.000326
19308,2023-05-17,005940-000880,0.000367
30167,2023-05-17,003230-114090,0.000451
15306,2023-05-17,004020-006800,0.000525
5605,2023-05-17,003550-241560,0.000534


In [None]:
# Rank every tested pair by p-value, most significant first (failed tests carry
# the sentinel 100 and therefore sort last). The results list built by the scan
# is `l_of_cointegrated_combi_c`; the previous name `l_of_cointegrated_combi`
# is never defined in this notebook.
sorted_combi = sorted(l_of_cointegrated_combi_c, key=lambda x: x['p_value'])

In [122]:
# Pair under study: 004020-006800, one of the pairs listed in the p-value table above.
# (A pair could also be taken from the ranking, e.g. sorted_combi[30].)
combi_array = ['004020', '006800']

# Closes of the two codes after 2022-01-01, then only the dates where both have a value.
df_combi_arry = df_close.loc[df_close.index > "2022-01-01", combi_array]
df_combi = df_combi_arry.dropna()


In [123]:
# Display the aligned close prices of the selected pair.
df_combi

code,004020,006800
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2022-01-03,41550.0,8550.0
2022-01-04,42950.0,8560.0
2022-01-05,44400.0,8490.0
2022-01-06,45350.0,8390.0
2022-01-07,44900.0,8470.0
...,...,...
2023-09-13,38550.0,6890.0
2023-09-14,38900.0,7000.0
2023-09-15,39450.0,6990.0
2023-09-18,38950.0,6950.0


In [124]:
# ecm_model = UECM(
#     endog=df_combi[
#         [combi_array[0]]
#         ],
#     lags=1,
#     exog=df_combi[
#         [combi_array[1]]
#         ],
#     order=1,
#     trend="c",
# ).fit()

# print(ecm_model.summary())
# print(combi_array[0], combi_array[1])

In [135]:
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Inputs: the two close series of the selected pair and their display names.
x = df_combi.index
y1, y2 = df_combi[combi_array[0]], df_combi[combi_array[1]]
name1, name2 = code_name_dict[combi_array[0]], code_name_dict[combi_array[1]]

# A secondary y-axis is declared, but both traces below go on the primary (left) axis.
fig = make_subplots(specs=[[{"secondary_y": True}]])

# Each series is divided by its first value so both lines start at 1.
fig.add_trace(
    go.Scatter(x=x, y=y1.div(y1.iloc[0]), name=name1, line={"color": "blue"}),
    secondary_y=False,
)
fig.add_trace(
    go.Scatter(x=x, y=y2.div(y2.iloc[0]), name=name2, line={"color": "red"}),
    secondary_y=False,
)

# Title, size and axis labels.
fig.update_layout(title_text="Dual Y-Axis Example", width=1200, height=600)
fig.update_xaxes(title_text="Date")
fig.update_yaxes(title_text="<b>Primary</b> Y-axis", secondary_y=False)
fig.update_yaxes(title_text="<b>Secondary</b> Y-axis", secondary_y=True)

fig.show()


In [133]:
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# NOTE(review): `zscore` is assigned in the rolling-OLS cell further down, so on
# a fresh kernel this cell only works after that cell has been run.

# Inputs: the two close series of the selected pair and their display names.
x = df_combi.index
y1, y2 = df_combi[combi_array[0]], df_combi[combi_array[1]]
name1, name2 = code_name_dict[combi_array[0]], code_name_dict[combi_array[1]]

fig = make_subplots(specs=[[{"secondary_y": True}]])

# Left axis: raw close of the first code (the second code is not drawn here).
fig.add_trace(
    go.Scatter(x=x, y=y1, name=name1, line={"color": "blue"}),
    secondary_y=False,
)

# Right axis: the z-score series.
fig.add_trace(
    go.Scatter(x=x, y=zscore, name='calculated', line={"color": "green"}),
    secondary_y=True,
)

# Title, size and axis labels.
fig.update_layout(title_text="Dual Y-Axis Example", width=1200, height=600)
fig.update_xaxes(title_text="Date")
fig.update_yaxes(title_text="<b>Primary</b> Y-axis", secondary_y=False)
fig.update_yaxes(title_text="<b>Secondary</b> Y-axis", secondary_y=True)

fig.show()

In [127]:
# Log prices of both legs and their difference (second leg minus first leg).
S1_log, S2_log = np.log(y1), np.log(y2)
log_diff = S2_log.sub(S1_log).rename("Log diff")


In [128]:
from statsmodels.regression.rolling import RollingOLS
import statsmodels.api as sm

# Length of the rolling regression window, in rows of the price series.
window = 20

S1_close = y1
S2_close = y2
# OLS fits no intercept unless a constant column is supplied explicitly.
S2_close_with_const = sm.add_constant(y2)

# Regress S1 on [const, S2] over each trailing window. The exog with the
# constant is passed here; previously it was built but never used, so the
# regression was forced through the origin.
model = RollingOLS(S1_close, S2_close_with_const, window=window)
results = model.fit()

# Spread = residual of S1 against the value fitted with that row's rolling
# coefficients. (`results.mse_resid`, used before, is the window's mean squared
# error - a non-negative dispersion measure, not the residual itself.)
# skipna=False keeps the warm-up rows, where no coefficients exist yet, as NaN.
fitted = (results.params * S2_close_with_const).sum(axis=1, skipna=False)
spread = (S1_close - fitted).rename("Spread")

# Z-score of the spread against its own rolling mean and standard deviation.
zscore = (spread - spread.rolling(window).mean()) / spread.rolling(window).std()
zscore = zscore.rename("Z-score")

In [129]:
# Plot the z-score series (rendered by the plotting backend configured in the first cell).
zscore.plot()