## Bond Data Download

In [100]:
import pickle
from pykrx.website.krx.krxio import KrxWebIo
from tqdm import tqdm
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

data_path = 'simulation_data'

class Bond_YTM(KrxWebIo):
    @property
    def bld(self):
        return 'dbms/MDC/STAT/standard/MDCSTAT11402'
    def fetch(self, strtDd: str, endDd: str, bndKindTpCd: str):
        result = self.read(locale="ko_KR", inqTpCd='E', bndKindTpCd=bndKindTpCd, strtDd=strtDd, endDd=endDd)
        return pd.DataFrame(result['output'])

In [86]:
# 채권 데이터를 저장할 Frame 생성
start_date = '2015-01-02'
end_date = '2025-03-20'
with open(f"{data_path}/KS200_MASK.pkl", "rb") as f:
    frame = pickle.load(f) # 열이 시계열
# start_date와 end_date 사이의 데이터만 추출
frame = frame.loc[:, start_date:end_date]

In [101]:
# bndKindTpCd
# 국고채 10년: 3013
# cd 91일: 3013
# 회사채 AA- 무보증 3년: 3009
# 회사채 BB- 무보증 3년: 3010
def fast_fill_frame(frame: pd.DataFrame, bond_df: pd.DataFrame) -> pd.DataFrame:
    # 1) 복사 & numeric 변환
    frame_ = frame.copy().apply(pd.to_numeric, errors='coerce')
    
    # 2) gov10_df의 'LST_ORD_BAS_YD' 시리즈를 frame 컬럼 순서에 맞게 재색인
    #    (컬럼이 완전히 일치한다면 reindex는 생략 가능)
    series = bond_df.loc['LST_ORD_BAS_YD'].reindex(frame_.columns)
    
    # 3) 벡터 브로드캐스트로 한 번에 모든 행을 채우기
    #    DataFrame[:] = 2D numpy array 형태로 할당하면 모든 셀에 일괄 적용됩니다.
    frame_.iloc[:, :] = np.tile(series.values, (len(frame_), 1))
    
    return frame_

def fetch_bond_df(start_date, end_date, bond_type, frame):
    # 1) 원시 데이터 fetch & 날짜 포맷 정리
    raw = Bond_YTM().fetch(
        strtDd=start_date.replace('-', ''), 
        endDd=end_date.replace('-', ''), 
        bndKindTpCd=bond_type
    ).sort_values(by='DISCLS_DD')
    
    raw['DISCLS_DD'] = raw['DISCLS_DD'].str.replace('/', '-')
    raw = raw.set_index('DISCLS_DD')
    
    # 2) 모든 컬럼을 강제 numeric → float (문자열은 NaN 처리)
    raw = raw.apply(pd.to_numeric, errors='coerce')
    
    # 3) 전치하여 날짜가 컬럼이 되도록
    X = raw.transpose()
    
    # 4) target 시계열 D로 리인덱스 & 정렬
    D = frame.columns
    X = X.sort_index(axis=1)
    X = X.reindex(columns=D)
    
    # 5) 보간 및 끝단 채우기
    X = (
        X
        .interpolate(method="linear", axis=1)  # 중간 NaN 선형 보간
        .ffill(axis=1)                         # 앞단 NaN forward-fill
        .bfill(axis=1)                         # 뒷단 NaN backward-fill
    )
    
    # 6) 원본 범위 밖 컬럼(drop)
    orig_dates = raw.index  # 실제 있는 날짜들
    min_date, max_date = orig_dates.min(), orig_dates.max()
    keep = [d for d in D if min_date <= d <= max_date]
    X_final = X.loc[:, keep]
    
    return fast_fill_frame(frame, X_final)

In [105]:
bond_data_dict={
    'gov10': '3013', # 국고채 10년 금리
    'rf': '4000', # CD 91일 금리
    'corp_aa': '3009', # 회사채 AA- 무보증 3년 금리
    'corp_bb': '3010', # 회사채 BB- 무보증 3년 금리
}
for bond_type, bond_code in bond_data_dict.items():
    print(f"Fetching {bond_type} data...")
    bond_df = fetch_bond_df(start_date, end_date, bond_code, frame)
    print(f"Saving {bond_type} data...")
    with open(f"{data_path}/{bond_type}_bond.pkl", "wb") as f:
        pickle.dump(bond_df, f)
    print(f"{bond_type} data saved.")

Fetching gov10 data...
Saving gov10 data...
gov10 data saved.
Fetching rf data...
Saving rf data...
rf data saved.
Fetching corp_aa data...
Saving corp_aa data...
corp_aa data saved.
Fetching corp_bb data...
Saving corp_bb data...
corp_bb data saved.


## Data Load and Portfolio Construction

In [None]:
start_date = '2015-01-02'
end_date = '2025-03-20'
with open(f"{data_path}/KS200_MASK.pkl", 'rb') as f:
    mask_df:pd.DataFrame = pickle.load(f)
    mask_df = mask_df.loc[:, start_date:end_date]
with open(f"{data_path}/Return.pkl", 'rb') as f:
    returns_df = pickle.load(f)
    returns_df = returns_df.loc[:, start_date:end_date]
with open(f"{data_path}/MarketCap.pkl", 'rb') as f:
    mc_df = pickle.load(f)
    mc_df = mc_df.loc[:, start_date:end_date]
with open(f"{data_path}/ifrs-full_Equity.pkl", 'rb') as f:
    be_df = pickle.load(f)
    be_df = be_df.loc[:, start_date:end_date]
with open(f"{data_path}/KOSPI_Close.pkl", 'rb') as f:
    kospi_close_df = pickle.load(f)
    kospi_close_df = kospi_close_df.loc[:, start_date:end_date]
with open(f"{data_path}/.pkl", 'rb') as f:


In [22]:
import pandas as pd
import statsmodels.api as sm


R     = returns_df
MC    = mc_df
BE    = be_df 
RF_df = rf_df
GOV   = gov_df
CORP  = corp_df
MKT   = kospi_close_df.pct_change(axis=1)
mask  = mask_df.astype(bool)

dates = R.columns
tickers = R.index

NameError: name 'rf_df' is not defined

In [None]:

# ── 2) 일별 리밸런싱으로 SMB, HML 계산 ──
SMB = pd.Series(index=dates, dtype=float)
HML = pd.Series(index=dates, dtype=float)

for dt in dates:
    # 당일 KS200 구성 종목
    in_ks = mask[dt]
    members = in_ks[in_ks].index  # boolean Series -> Index
    
    # (a) size split among 당일 KS200
    me = MC.loc[members, dt].dropna()
    med = me.median()
    small = me[me <= med].index
    big   = me[me >  med].index
    
    # (b) BM split among 당일 KS200
    # BE is presumed already daily-aligned; BE.loc[:,dt] gives BE for that day
    bm = (BE.loc[members, dt] / MC.loc[members, dt]).dropna()
    q30, q70 = bm.quantile([0.3, 0.7])
    low_idx  = bm[bm <= q30].index
    high_idx = bm[bm >  q70].index
    
    # (c) KS200 멤버 범위 유지
    s_idx = small
    b_idx = big
    l_idx = low_idx
    h_idx = high_idx
    
    # (d) 시가총액 가중치
    w_s = MC.loc[s_idx, dt] / MC.loc[s_idx, dt].sum()
    w_b = MC.loc[b_idx, dt] / MC.loc[b_idx, dt].sum()
    w_l = MC.loc[l_idx, dt] / MC.loc[l_idx, dt].sum()
    w_h = MC.loc[h_idx, dt] / MC.loc[h_idx, dt].sum()
    
    # (e) 포트폴리오 일간 수익
    r = R.loc[:, dt]  # 모든 티커
    SMB[dt] = (r.loc[s_idx] * w_s).sum() - (r.loc[b_idx] * w_b).sum()
    HML[dt] = (r.loc[h_idx] * w_h).sum() - (r.loc[l_idx] * w_l).sum()

# ── 3) 팩터 DataFrame 구성 ──
factors = pd.DataFrame({
    'MKT_RF': MKT.iloc[0] - RF_df.iloc[0],
    'SMB':     SMB,
    'HML':     HML,
    'RF':      RF_df.iloc[0]
}, index=dates).dropna()

# ── 4) 3요인 회귀 (매일 편입 종목만 사용) ──
results = []
for tkr in tickers:
    # 당일 편입일자만 사용
    in_ks = mask.loc[tkr]
    use_dates = factors.index[in_ks[factors.index]]
    if len(use_dates) < len(factors.index) * 0.5:
        # 50% 미만 관측치면 스킵
        continue

    # (a) 종목별 일간 초과수익
    y = (R.loc[tkr, use_dates] - RF_df.loc[tkr, use_dates])
    
    # (b) 설명변수 정렬
    X = factors.loc[use_dates, ['MKT_RF','SMB','HML']]
    X = sm.add_constant(X)
    
    # (c) OLS 적합
    res = sm.OLS(y, X).fit()
    
    # (d) 결과 저장
    results.append({
        'Ticker':   tkr,
        'alpha':    res.params['const'],
        'beta_MKT': res.params['MKT_RF'],
        'beta_SMB': res.params['SMB'],
        'beta_HML': res.params['HML'],
        'R2':       res.rsquared
    })

res_df = pd.DataFrame(results)

# ── 5) 결과 요약 ──
print("=== 3요인 모형 (매일 리밸런싱, KS200 동적 편입) ===")
print("평균 α:  ", res_df['alpha'].mean())
print("평균 R²: ", res_df['R2'].mean())
print("\nα 분포:\n", res_df['alpha'].describe())
print("\n베타 분포:\n", res_df[['beta_MKT','beta_SMB','beta_HML']].describe())


In [5]:
beme

Unnamed: 0_level_0,2010-01-04,2010-01-05,2010-01-06,2010-01-07,2010-01-08,2010-01-11,2010-01-12,2010-01-13,2010-01-14,2010-01-15,...,2025-03-07,2025-03-10,2025-03-11,2025-03-12,2025-03-13,2025-03-14,2025-03-17,2025-03-18,2025-03-19,2025-03-20
ticker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
000030,,,,,,,,,,,...,,,,,,,,,,
000050,,,,,,,,,,,...,,,,,,,,,,
000060,,,,,,,,,,,...,,,,,,,,,,
000070,,,,,,,,,,,...,,,,,,,,,,
000080,,,,,,,,,,,...,,,,,,,,,,0.840948
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
450080,,,,,,,,,,,...,,,,,,,,,,
454910,,,,,,,,,,,...,,,,,,,,,,
456040,,,,,,,,,,,...,,,,,,2.37483,,,,
457190,,,,,,,,,,,...,,,,,,,,,,
