In [1]:
import pandas as pd
import numpy as np
import sys
import os
import gc
from loguru import logger

sys.path.append("../../../note")
sys.path.append(os.getcwd())

%load_ext autoreload
%autoreload 2

from module.get_info_FinMind import FinMindClient
from module.get_info_Finlab import FinlabClient
from utils import batch_fetch_prices, run_event_study, process_disposal_events
from analyzer import DisposalAnalyzer

# Global parameters shared by the cells below
DATA_DIR = '../../data/disposal'  # directory where every CSV checkpoint is written
START_DATE = '2018-01-01'         # earliest date requested from the data sources
OFFSET_DAYS = 5                   # buffer of trading days kept around each event (to observe s-5 ~ e+5)

# Make sure the output directory exists; exist_ok keeps re-runs of this cell harmless
os.makedirs(DATA_DIR, exist_ok=True)

# Setup

## Data Preparation

### 抓取處置股名單 (Finlab)

In [29]:
# Download the raw disposal announcements from Finlab
finlab_client = FinlabClient()
print("Fetching disposal information from Finlab...")
finlab_disposal = finlab_client.get_data("disposal_information", start_date=START_DATE)

if finlab_disposal.empty:
    # Nothing came back: expose an empty frame so later cells can test `.empty`
    print("[Error] No data fetched from Finlab.")
    processed_disposal = pd.DataFrame()
else:
    # Parse the date column, then keep only rows on or after START_DATE
    finlab_disposal['date'] = pd.to_datetime(finlab_disposal['date'])
    on_or_after_start = finlab_disposal['date'] >= START_DATE
    finlab_disposal = finlab_disposal.loc[on_or_after_start]

    print(f"Fetched {len(finlab_disposal):,} records.")

    # Pre-processing step from utils.py (per the original note it derives the
    # consecutive "disposal level" of each event — confirm in utils.py)
    processed_disposal = process_disposal_events(finlab_disposal)

    # Keep a checkpoint on disk so later cells can resume without refetching
    save_path = f'{DATA_DIR}/processed_disposal_events.csv'
    processed_disposal.to_csv(save_path, encoding='utf-8-sig', index=False)
    print(f"Saved processed events to: {save_path}")

Fetching disposal information from Finlab...
Fetched 3,383 records.
Columns before processing: ['Stock_id', 'date', '證券名稱', 'condition', '處置措施', '處置內容', 'event_start_date', 'event_end_date', 'interval', 'key_date']
Saved processed events to: ../../data/disposal/processed_disposal_events.csv


### 抓取個股股價 (FinMind)

In [30]:
# Initialise the FinMind client used for the price downloads
fm_client = FinMindClient()

# Fall back to the saved event table when the variable is missing (e.g. after a
# kernel restart), so the notebook can be resumed from this cell.
# Stock_id is forced to text: with dtype inference pandas may parse the codes
# as integers and strip leading zeros, which no longer match the real ids.
if 'processed_disposal' not in locals():
    processed_disposal = pd.read_csv(
        f'{DATA_DIR}/processed_disposal_events.csv',
        dtype={'Stock_id': str},
    )

# Batch download of prices (the helper runs the requests with `max_workers` workers)
if not processed_disposal.empty:
    print("Starting batch price fetch (this may take a while)...")
    price_df = batch_fetch_prices(fm_client, processed_disposal, offset_days=OFFSET_DAYS, max_workers=10)

    if not price_df.empty:
        # Checkpoint the prices so the event-study cell can reload them
        save_path = f'{DATA_DIR}/price_df.csv'
        price_df.to_csv(save_path, index=False)
        print(f"Fetched {len(price_df):,} rows over {price_df['Stock_id'].nunique()} stocks.")
        print(f"Saved prices to: {save_path}")
    else:
        print("[Warning] No price data fetched.")
else:
    print("[Error] Processed disposal dataframe is empty.")

# Release memory held by temporaries; the trailing semicolon keeps the
# collected-object count from being echoed as the cell's output.
gc.collect();

Starting batch price fetch (this may take a while)...
Using pre-processed columns 'event_start_date' and 'event_end_date'.
Starting batch fetch for 1317 stocks with 10 workers...


Fetching Prices: 100%|██████████| 1317/1317 [00:35<00:00, 36.85it/s]


Fetched total 45255 rows.
Fetched 45,255 rows over 1042 stocks.
Saved prices to: ../../data/disposal/price_df.csv


5446

## Market Benchmark

In [35]:
print("Fetching TAIEX (Taiwan Stock Index) data...")

# Re-create the client when this cell is run without the price-fetch cell
if 'fm_client' not in locals():
    fm_client = FinMindClient()

# Start from an empty frame so every failure path below falls through to the
# "no market data" branch without repeating the assignment in each handler.
taiex = pd.DataFrame()
try:
    taiex = fm_client.get_data(
        dataset="TaiwanStockPrice",
        data_id="TAIEX",
        start_date=START_DATE,
        end_date=pd.Timestamp.now().strftime('%Y-%m-%d')
    )
except KeyError as e:
    # A KeyError mentioning 'data' is reported as an invalid API response;
    # any other KeyError is reported as unexpected. Both continue without data.
    if 'data' in str(e):
        print(f"[Warning] FinMind API returned invalid response: {e}")
        print("          Proceeding without Market Return data (AR calculation will be partial).")
    else:
        print(f"[Error] Unexpected KeyError: {e}")
except Exception as e:
    # Deliberate best-effort: the benchmark is optional, so report and continue
    print(f"[Error] Failed to fetch TAIEX data: {e}")

# Treat a missing response like an empty one instead of failing on `.empty`
if taiex is None:
    taiex = pd.DataFrame()

if not taiex.empty:
    # Only the date, open and close columns are needed to compute the return
    market_df = taiex[['date', 'open', 'close']].rename(
        columns={'date': 'Date', 'open': 'market_open', 'close': 'market_close'}
    )
    market_df['Date'] = pd.to_datetime(market_df['Date'])

    # Open-to-close return of the index for each day
    market_df['market_ret'] = (market_df['market_close'] / market_df['market_open']) - 1

    # Drop unusable rows: a zero open yields +/-inf, which dropna() alone
    # would keep, so map infinities to NaN before dropping.
    market_df['market_ret'] = market_df['market_ret'].replace([np.inf, -np.inf], np.nan)
    market_df = market_df.dropna(subset=['market_ret'])

    print(f"Fetched {len(market_df):,} market records.")
else:
    print("[Warning] No TAIEX data available. Market return (market_ret) will be NaN.")
    market_df = pd.DataFrame()

Fetching TAIEX (Taiwan Stock Index) data...
Fetched 1,947 market records.


## Event Integration

In [40]:
# Reload the latest checkpoints so the cell does not depend on kernel state.
# Stock_id is read as text in both files: with dtype inference pandas can parse
# the codes as integers (dropping leading zeros) or as mixed int/str values,
# which makes the two frames disagree and breaks prefix-based filtering later.
price_df = pd.read_csv(f'{DATA_DIR}/price_df.csv', dtype={'Stock_id': str})
processed_disposal = pd.read_csv(
    f'{DATA_DIR}/processed_disposal_events.csv',
    dtype={'Stock_id': str},
)

# Core logic lives in utils.py and returns two frames:
#   1. disposal_wide: wide table (signal use)
#   2. disposal_long: long table (analysis use)
print("Running Event Study algorithm...")
disposal_wide, disposal_long = run_event_study(price_df, processed_disposal, offset_days=OFFSET_DAYS)

if not disposal_long.empty:
    # Attach the market benchmark when the benchmark cell produced one
    if 'market_df' in locals() and not market_df.empty:
        # The merge key must be datetime to line up with market_df['Date']
        disposal_long['Date'] = pd.to_datetime(disposal_long['Date'])
        # Left join keeps every event row even when a date has no market record
        disposal_long = disposal_long.merge(market_df, on='Date', how='left')

    # Persist the final tables
    disposal_wide.to_csv(f'{DATA_DIR}/disposal_df_wide.csv', index=False, encoding='utf-8-sig')
    disposal_long.to_csv(f'{DATA_DIR}/disposal_df_long.csv', index=False, encoding='utf-8-sig')

    print(f"Analysis Ready! Data shapes: Wide {disposal_wide.shape}, Long {disposal_long.shape}")
else:
    print("[Error] Event study returned empty result.")

Running Event Study algorithm...






Detected Disposal Levels: [np.int64(1), np.int64(2), np.int64(3), np.int64(4), np.int64(5), np.int64(6), np.int64(7), np.int64(8), np.int64(9), np.int64(10), np.int64(11), np.int64(12), np.int64(13), np.int64(14), np.int64(15), np.int64(16), np.int64(17), np.int64(18), np.int64(19), np.int64(20), np.int64(21), np.int64(22), np.int64(23), np.int64(24), np.int64(25), np.int64(26), np.int64(27), np.int64(28), np.int64(29), np.int64(30), np.int64(31), np.int64(32)]
Converting to Wide Format...
Analysis completed. Wide shape: (45045, 354), Long shape: (55599, 55)
Merging Market Data...
Analysis Ready! Data shapes: Wide (45045, 354), Long (55599, 58)


## 僅保留股票資訊

In [29]:
# Keep ordinary stocks only: codes that are exactly 4 characters long and do
# not start with "00" (ETF) or "91" (DR).
s_id = disposal_long['Stock_id'].astype(str)
stock_mask = (
    (s_id.str.len() == 4) &
    (~s_id.str.startswith('00')) &
    (~s_id.str.startswith('91'))
)

# Take an explicit copy of the filtered rows so later column changes act on an
# independent frame instead of a slice of the unfiltered one
# (avoids SettingWithCopyWarning and ambiguous in-place edits).
disposal_long = disposal_long[stock_mask].copy()

# Drop the stray index column left behind by an earlier CSV round-trip, if any
if 'Unnamed: 0' in disposal_long.columns:
    disposal_long = disposal_long.drop(columns=['Unnamed: 0'])

# Save without the columns that are entirely empty after filtering
disposal_long.dropna(axis=1, how='all').to_csv(f'{DATA_DIR}/disposal_df_long_stock.csv', index=False)

# Analysis

In [43]:
# Reload the stock-only long table from its checkpoint and hand it to the analyzer
DATA_DIR = '../../data/disposal'
disposal_long = pd.read_csv(
    f'{DATA_DIR}/disposal_df_long_stock.csv',
    low_memory=False,       # infer each column's dtype from the whole file
    parse_dates=['Date'],
)
analyzer = DisposalAnalyzer(disposal_long)
analyzer.display_dataframe()

Unnamed: 0,Date,Stock_id,Open,High,Low,Close,Volume,TradingAmount,trading_idx,prev_trade_date,...,t_label_second,t_label_third,t_label_fourth,t_label_level_5,t_label_level_6,t_label_level_7,daily_ret,market_open,market_close,market_ret
0,2020-07-14,1213,7.10,7.38,7.10,7.38,137685,1013332,11,,...,,,,,,,0.039437,12202.89,12209.01,0.000502
1,2020-07-15,1213,8.11,8.11,8.11,8.11,35955,291593,12,2020-07-14,...,,,,,,,0.000000,12233.99,12202.85,-0.002545
2,2020-07-16,1213,8.92,8.92,8.92,8.92,41813,372971,13,2020-07-15,...,,,,,,,0.000000,12173.04,12157.74,-0.001257
3,2020-07-17,1213,9.81,9.81,9.05,9.81,1009465,9853081,14,2020-07-16,...,,,,,,,0.000000,12195.72,12181.56,-0.001161
4,2020-07-17,1213,9.81,9.81,9.05,9.81,1009465,9853081,14,2020-07-16,...,s-5,,,,,,0.000000,12195.72,12181.56,-0.001161
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47857,2021-08-09,9962,19.85,22.95,19.20,21.10,13423000,283985300,41,2021-08-06,...,,,,,,,0.062972,17505.01,17485.15,-0.001135
47858,2021-08-10,9962,21.10,21.40,19.30,19.45,6969000,139699850,42,2021-08-09,...,,,,,,,-0.078199,17492.45,17323.64,-0.009650
47859,2021-08-11,9962,20.30,20.50,18.05,18.45,5636000,108715150,43,2021-08-10,...,,,,,,,-0.091133,17308.62,17227.18,-0.004705
47860,2021-08-12,9962,19.30,20.00,18.35,19.90,4170000,80480950,44,2021-08-11,...,,,,,,,0.031088,17221.35,17219.94,-0.000082


## overall
`[Disposal Level Statistics]` 中的統計數據基於 **日當沖報酬率** 計算

In [23]:
# Show the overall summary tables; in the recorded output these are the
# disposal-condition distribution and the per-level return statistics.
analyzer.overall_analysis()


[Disposal Condition Distribution]


Unnamed: 0,condition,days_count,event_count,days_pct,5min_count,20min_count
0,因連續3個營業日達本中心作業要點第四條第一項第一款,24208,1222,50.58%,811,313
1,連續三次,11757,593,24.56%,430,115
2,最近10個營業日內有6個營業日,2631,132,5.50%,74,55
3,連續5個營業日,1769,89,3.70%,36,53
4,連續3個營業日及沖銷標準,1693,77,3.54%,62,15
5,最近十個營業日已有六次,1670,85,3.49%,64,17
6,連續三次及當日沖銷標準,1444,66,3.02%,59,7
7,連續五次,1255,63,2.62%,45,16
8,連續5個營業日及沖銷標準,701,32,1.46%,17,15
9,連續五次及當日沖銷標準,594,27,1.24%,25,2



[Disposal Level Statistics]


Unnamed: 0,disposal_level,days_count,event_count,mean,std
0,1,39631,1976,0.36%,4.96%
1,2,6535,328,0.48%,5.45%
2,3,1221,62,0.37%,5.94%
3,4,295,15,0.58%,6.03%
4,5,120,6,0.19%,6.43%
5,6,40,2,0.20%,3.90%
6,7,20,1,-0.27%,3.88%


## Separate by trend

In [44]:
# Label the events by their pre-disposal trend and keep the resulting frame for
# the plots below (the recorded output shows an added `direction` column).
# NOTE(review): `seprate_by_trend` is the method name as exposed by the analyzer
# module (misspelled); the call must match it unless the method itself is renamed.
seperated_df = analyzer.seprate_by_trend()


[處置前趨勢分佈 (s-3 ~ s-1)]


Unnamed: 0,direction,days_count,event_count,days_pct,5min_count,20min_count
0,Overbought,28868,1436,60.32%,1034,344
1,Oversold,18975,955,39.65%,593,265
2,Unknown,19,2,0.04%,1,0



[方向 vs 層級 交叉表 (交易天數)]


disposal_level,1,2,3,4,5,6,7,Total
direction,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Overbought,24390,3812,486,140,40,0,0,28868
Oversold,15233,2712,735,155,80,40,20,18975
Unknown,8,11,0,0,0,0,0,19
Total,39631,6535,1221,295,120,40,20,47862


### 日盤

In [45]:
# Plot returns for the trend-labelled frame using the method's default mode
# (this section is headed as the day session).
analyzer.plot_trend_return(df=seperated_df)

### 夜盤

In [46]:
# Same plot with the 'after_market' mode (this section is headed as the night session).
# NOTE(review): presumably the second argument selects which return is plotted — confirm in analyzer.py.
analyzer.plot_trend_return(seperated_df, 'after_market')

### 夜 + 日

In [47]:
# Same plot with the 'all' mode (this section is headed as night + day combined).
# NOTE(review): presumably the second argument selects which return is plotted — confirm in analyzer.py.
analyzer.plot_trend_return(seperated_df, 'all')

In [50]:
# Display the trend-labelled frame for inspection (the notebook renders a truncated view).
seperated_df

Unnamed: 0,Date,Stock_id,Open,High,Low,Close,Volume,TradingAmount,trading_idx,prev_trade_date,...,t_label_level_6,t_label_level_7,daily_ret,market_open,market_close,market_ret,direction,base_start_date,base_end_date,t_label
0,2020-07-14,1213,7.10,7.38,7.10,7.38,137685,1013332,11,,...,,,,12202.89,12209.01,0.000502,Oversold,2020-07-21,2020-08-06,s-5
1,2020-07-15,1213,8.11,8.11,8.11,8.11,35955,291593,12,2020-07-14,...,,,0.098916,12233.99,12202.85,-0.002545,Oversold,2020-07-21,2020-08-06,s-4
2,2020-07-16,1213,8.92,8.92,8.92,8.92,41813,372971,13,2020-07-15,...,,,0.099877,12173.04,12157.74,-0.001257,Oversold,2020-07-21,2020-08-06,s-3
3,2020-07-17,1213,9.81,9.81,9.05,9.81,1009465,9853081,14,2020-07-16,...,,,0.099776,12195.72,12181.56,-0.001161,Oversold,2020-07-21,2020-08-06,s-2
4,2020-07-20,1213,10.75,10.75,9.43,10.00,503610,5266828,15,2020-07-17,...,,,0.019368,12205.25,12174.54,-0.002516,Oversold,2020-07-21,2020-08-06,s-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40273,2021-08-09,9962,19.85,22.95,19.20,21.10,13423000,283985300,41,2021-08-06,...,,,-0.002364,17505.01,17485.15,-0.001135,Oversold,2021-07-26,2021-08-06,e+1
40274,2021-08-10,9962,21.10,21.40,19.30,19.45,6969000,139699850,42,2021-08-09,...,,,-0.078199,17492.45,17323.64,-0.009650,Oversold,2021-07-26,2021-08-06,e+2
40275,2021-08-11,9962,20.30,20.50,18.05,18.45,5636000,108715150,43,2021-08-10,...,,,-0.051414,17308.62,17227.18,-0.004705,Oversold,2021-07-26,2021-08-06,e+3
40276,2021-08-12,9962,19.30,20.00,18.35,19.90,4170000,80480950,44,2021-08-11,...,,,0.078591,17221.35,17219.94,-0.000082,Oversold,2021-07-26,2021-08-06,e+4
