# 處置股事件研究

In [13]:
# [Env Setup] 載入必要套件與設定
import pandas as pd
import numpy as np
from tqdm import tqdm
from loguru import logger
import sys
import os
import gc
import matplotlib.pyplot as plt
import seaborn as sns

%load_ext autoreload
%autoreload 2
sys.path.append("/Users/xinc./Documents/GitHub/note")
sys.path.append(os.getcwd()) # 加入目前路徑以匯入 utils

from module.get_info_FinMind import FinMindClient, FinMindConfig
from module.get_info_Finlab import FinlabClient
from module.plot_func import plot
from utils import batch_fetch_prices, run_event_study, process_disposal_events

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Part 1: Data Preparation

### 核心功能
1. **資料整合**：自動對接 Finlab (處置資訊) 與 FinMind (股價) API。
2. **智慧分級**：實作連續處置判斷邏輯 (Strict Overlap)，自動識別第 1, 2...N 次處置事件。
3. **動態標籤**：產生時間軸標籤 `s+N` (處置開始/期間) 與 `e+N` (處置結束後)，包含解禁日 `e+0`。
4. **雙軌輸出**：
   - **Wide Format (`disposal_df_wide.csv`)**：訊號表 (Signal Table)，不含價格，專供回測系統產生交易訊號。
   - **Long Format (`disposal_df_long.csv`)**：分析表 (Analysis Table)，含完整價量與報酬率，專供統計研究與視覺化。

### 使用流程
- **Step 1**: 抓取處置公告。
- **Step 1.5**: 執行前處理 (分級與濾網)。
- **Step 2**: 平行化抓取處置期間股價。
- **Step 3**: 執行 Event Study 轉換，產出 Wide/Long CSV。

In [2]:
# [Step 1] 抓取 Finlab 處置股資料
# 若需要 Token，請在初始化時傳入，例如 FinlabClient(token="YOUR_TOKEN")
finlab_client = FinlabClient()
print("Fetching disposal information from Finlab...")

# 抓取資料 (設定較大的範圍以確保涵蓋需求)
finlab_disposal = finlab_client.get_data("disposal_information", start_date='2018-01-01')

# [Manual Filter] 手動篩選日期 (修正 Finlab API 時間過濾限制)
if not finlab_disposal.empty:
    finlab_disposal['date'] = pd.to_datetime(finlab_disposal['date'])
    finlab_disposal = finlab_disposal[finlab_disposal['date'] >= '2018-01-01']
    print(f"Fetched {len(finlab_disposal)} records from Finlab.")
    print(f"Data Range: {finlab_disposal['date'].min()} to {finlab_disposal['date'].max()}")
else:
    print("No data fetched from Finlab.")

Fetching disposal information from Finlab...
請從 https://ai.finlab.tw/api_token 複製驗證碼，貼於此處:

輸入成功!
之後可以使用以下方法自動登入
import finlab
finlab.login("YOUR API TOKEN")
Daily usage: 0.5 / 500 MB - disposal_information
Fetched 3383 records from Finlab.
Data Range: 2018-01-04 00:00:00 to 2025-09-26 00:00:00


In [3]:
# [Step 1.5] 前處理與分級 (Preprocessing)
# 這一步會先將 Finlab 處置資料整理格式，並標記 First/Second Disposal
if 'finlab_disposal' in locals() and not finlab_disposal.empty:
    print("Processing disposal events...")
    processed_disposal = process_disposal_events(finlab_disposal)
    
    # print(f"Processed Data Shape: {processed_disposal.shape}")
    # display(processed_disposal.head())
    
    # Optional: Save for inspection
    processed_disposal.to_csv('../../data/disposal/processed_disposal_events.csv', index=False, encoding='utf-8-sig')
else:
    print("Finlab data not available. Please run Step 1 first.")
    processed_disposal = pd.DataFrame()

Processing disposal events...
Columns before processing: ['Stock_id', 'date', '證券名稱', 'condition', '處置措施', '處置內容', 'event_start_date', 'event_end_date', 'interval', 'key_date']
Processed 3383 events.
Level Distribution:
disposal_level
1     2702
2      453
3      110
4       36
5       17
6        8
7        6
8        5
9        5
10       4
11       4
12       4
13       3
14       3
15       3
16       3
17       3
18       3
19       3
20       2
21       2
22       2
23       1
24       1
Name: count, dtype: int64


In [4]:
# [Step 2] 平行化抓取股價 (FinMind)
# 使用處理過的事件表 (processed_disposal) 以確保連續處置區間不遺漏
logger.remove()
logger.add(sys.stderr, level="WARNING")

# Initialize FinMind Client
fm_client = FinMindClient()

offset_days = 5

if 'processed_disposal' in locals() and not processed_disposal.empty:
    # 開始抓取
    price_df = batch_fetch_prices(fm_client, processed_disposal, offset_days=offset_days, max_workers=10)

    if not price_df.empty:
        print(f"Fetched Price Data Shape: {price_df.shape}")
        display(price_df.head())
    else:
        print("No price data fetched.")
else:
    print("No processed disposal data found. Please run Step 1.5.")

gc.collect()

Using pre-processed columns 'event_start_date' and 'event_end_date'.
Starting batch fetch for 1317 stocks with 10 workers...


Fetching Prices: 100%|██████████| 1317/1317 [00:37<00:00, 34.85it/s]

Fetched total 37289 rows.
Fetched Price Data Shape: (37289, 8)





Unnamed: 0,Date,Stock_id,Open,High,Low,Close,Volume,TradingAmount
15,2020-03-23,00715L,3.4,4.15,3.37,3.89,127647025,488032823
16,2020-03-24,00715L,4.25,4.4,4.01,4.18,183797200,781181720
17,2020-03-25,00715L,4.18,4.29,4.05,4.26,175083000,735960760
18,2020-03-26,00715L,4.22,4.22,3.77,3.92,102904172,403334525
19,2020-03-27,00715L,3.6,3.86,3.6,3.78,76994328,288716343


2338

In [5]:
# [Step 3] 執行 Event Study 分析
# 使用 processed_disposal，其中已經包含 is_first_disposal 等標記

price_df, processed_disposal = pd.read_csv('../../data/disposal/price_df.csv'), pd.read_csv('../../data/disposal/processed_disposal_events.csv')
disposal_wide, disposal_long = run_event_study(price_df, processed_disposal, offset_days=offset_days)

if not disposal_wide.empty:
    print(f"Wide Format Shape: {disposal_wide.shape}")
    print(f"Long Format Shape: {disposal_long.shape}")
    
    print("\n[Wide Head]")
    display(disposal_wide.head())
    
    print("\n[Long Head]")
    display(disposal_long.head())
    
    # Save both files
    disposal_wide.to_csv('../../data/disposal/disposal_df_wide.csv', index=False, encoding='utf-8-sig')
    disposal_long.to_csv('../../data/disposal/disposal_df_long.csv', index=False, encoding='utf-8-sig')
    print("Saved 'disposal_df_wide.csv' and 'disposal_df_long.csv'.")

else:
    print("Analysis returned empty DataFrame.")

Detected Disposal Levels: [np.int64(1), np.int64(2), np.int64(3), np.int64(4), np.int64(5), np.int64(6), np.int64(7), np.int64(8), np.int64(9), np.int64(10), np.int64(11), np.int64(12), np.int64(13), np.int64(14), np.int64(15), np.int64(16), np.int64(17), np.int64(18), np.int64(19), np.int64(20), np.int64(21), np.int64(22), np.int64(23)]
Converting to Wide Format...
Analysis completed. Wide shape: (12561, 186), Long shape: (16813, 49)
Wide Format Shape: (12561, 186)
Long Format Shape: (16813, 49)

[Wide Head]


Unnamed: 0,Date,Stock_id,t_label_first,condition_first,interval_first,event_start_date_first,event_end_date_first,relative_day_first,gap_days_first,calendar_relative_day_first,...,gap_days_level_22,calendar_relative_day_level_22,t_label_level_23,condition_level_23,interval_level_23,event_start_date_level_23,event_end_date_level_23,relative_day_level_23,gap_days_level_23,calendar_relative_day_level_23
0,2020-03-24,00642U,s-3,監視業務督導會報決議,5.0,2020-03-27,2020-04-13,-3.0,0.0,-3.0,...,,,,,,NaT,NaT,,,
1,2020-03-25,00642U,s-2,監視業務督導會報決議,5.0,2020-03-27,2020-04-13,-2.0,0.0,-2.0,...,,,,,,NaT,NaT,,,
2,2020-03-26,00642U,s-1,監視業務督導會報決議,5.0,2020-03-27,2020-04-13,-1.0,0.0,-1.0,...,,,,,,NaT,NaT,,,
3,2020-03-27,00642U,s+0,監視業務督導會報決議,5.0,2020-03-27,2020-04-13,0.0,0.0,0.0,...,,,,,,NaT,NaT,,,
4,2020-03-30,00642U,s+1,監視業務督導會報決議,5.0,2020-03-27,2020-04-13,1.0,2.0,3.0,...,,,,,,NaT,NaT,,,



[Long Head]


Unnamed: 0.1,Unnamed: 0,Date,Stock_id,Open,High,Low,Close,Volume,TradingAmount,trading_idx,...,t_label_level_15,t_label_level_16,t_label_level_17,t_label_level_18,t_label_level_19,t_label_level_20,t_label_level_21,t_label_level_22,t_label_level_23,daily_ret
0,0,2020-03-24,00642U,10.32,10.57,10.17,10.36,42796200,444039684,0,...,,,,,,,,,,0.003876
12,1,2020-03-25,00642U,10.55,10.59,10.44,10.46,39627340,416239309,1,...,,,,,,,,,,-0.008531
24,2,2020-03-26,00642U,10.44,10.46,10.11,10.33,25842235,264724135,2,...,,,,,,,,,,-0.010536
36,3,2020-03-27,00642U,10.24,10.24,10.1,10.17,20479766,207667697,3,...,,,,,,,,,,-0.006836
48,4,2020-03-30,00642U,9.88,9.88,9.65,9.79,33984332,331368684,4,...,,,,,,,,,,-0.009109


Saved 'disposal_df_wide.csv' and 'disposal_df_long.csv'.


In [6]:
# [Step 3.5] 加入大盤報酬 (Integration of TAIEX Benchmark)
# Check if Step 3 output exists
if 'disposal_long' in locals() and not disposal_long.empty:
    print("Fetching TAIEX data...")
    
    # 1. 決定抓取區間
    start_date = disposal_long['Date'].min().strftime('%Y-%m-%d')
    end_date = disposal_long['Date'].max().strftime('%Y-%m-%d')
    print(f"Date Range: {start_date} to {end_date}")
    
    # 2. 透過 FinMind 抓取 TAIEX
    client = FinMindClient()
    taiex_data = client.get_data(
        dataset='TaiwanStockPrice',
        data_id='TAIEX',
        start_date=start_date,
        end_date=end_date
    )
    
    if taiex_data is not None and not taiex_data.empty:
        # 3. 計算大盤報酬
        taiex_df = taiex_data[['date', 'open', 'close']].rename(columns={'date': 'Date', 'open': 'market_open', 'close': 'market_close'}).copy()
        taiex_df['Date'] = pd.to_datetime(taiex_df['Date'])
        taiex_df = taiex_df.sort_values('Date')
        taiex_df['market_ret'] = (taiex_df['market_close']/taiex_df['market_open']) - 1
        
        # 4. 合併回 disposal_long
        # 若已存在先移除避免重複 join
        cols_to_drop = [c for c in ['market_ret', 'market_close'] if c in disposal_long.columns]
        if cols_to_drop:
            disposal_long = disposal_long.drop(columns=cols_to_drop)
            
        disposal_long = pd.merge(disposal_long, taiex_df, on='Date', how='left')
                
        print("Integration Complete! Added columns: 'market_ret'")
        display(disposal_long)
        
    else:
        print("Failed to fetch TAIEX data. Please check connection or token.")
else:
    print("DataFrame 'disposal_long' not found. Please run Step 3 first.")

Fetching TAIEX data...
Date Range: 2018-01-02 to 2025-10-03
Integration Complete! Added columns: 'market_ret'


Unnamed: 0.1,Unnamed: 0,Date,Stock_id,Open,High,Low,Close,Volume,TradingAmount,trading_idx,...,t_label_level_18,t_label_level_19,t_label_level_20,t_label_level_21,t_label_level_22,t_label_level_23,daily_ret,market_open,market_close,market_ret
0,0,2020-03-24,00642U,10.32,10.57,10.17,10.36,42796200,444039684,0,...,,,,,,,0.003876,9083.78,9285.62,0.022220
1,1,2020-03-25,00642U,10.55,10.59,10.44,10.46,39627340,416239309,1,...,,,,,,,-0.008531,9426.43,9644.75,0.023160
2,2,2020-03-26,00642U,10.44,10.46,10.11,10.33,25842235,264724135,2,...,,,,,,,-0.010536,9667.14,9736.36,0.007160
3,3,2020-03-27,00642U,10.24,10.24,10.10,10.17,20479766,207667697,3,...,,,,,,,-0.006836,9807.90,9698.92,-0.011111
4,4,2020-03-30,00642U,9.88,9.88,9.65,9.79,33984332,331368684,4,...,,,,,,,-0.009109,9571.22,9629.43,0.006082
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16808,15,2021-07-23,9962,24.00,25.60,24.00,25.60,22693000,569363250,5,...,,,,,,,0.066667,17603.95,17572.92,-0.001763
16809,16,2021-07-26,9962,23.05,24.70,23.05,23.05,7764000,180799800,6,...,,,,,,,0.000000,17554.93,17403.56,-0.008623
16810,17,2021-07-27,9962,23.80,23.80,22.25,22.50,3173000,71872650,7,...,,,,,,,-0.054622,17394.77,17269.87,-0.007180
16811,18,2021-07-28,9962,22.50,22.60,21.80,21.80,1769000,38968600,8,...,,,,,,,-0.031111,17252.87,17135.22,-0.006819


In [7]:
# [Step 4] 最終篩選 (Final Filter: Common Stocks Only)
# 應用篩選邏輯：只保留代碼長度為 4 且非 00 開頭的股票 (只保留股票)

def is_common_stock(stock_id):
    sid = str(stock_id)
    return len(sid) == 4 and not sid.startswith('00')

if 'disposal_wide' in locals() and not disposal_wide.empty:
    print("Filtering Final Output for Common Stocks Only...")
    
    # Filter Wide Format
    mask_wide = disposal_wide['Stock_id'].apply(is_common_stock)
    final_wide = disposal_wide[mask_wide].copy()
    
    # Filter Long Format
    mask_long = disposal_long['Stock_id'].apply(is_common_stock)
    df = disposal_long[mask_long].copy()
    
    print(f"Wide Format: {len(disposal_wide)} -> {len(final_wide)} rows")
    print(f"Long Format: {len(disposal_long)} -> {len(df)} rows")
    
    # Save filtered versions
    final_wide.to_csv('../../data/disposal/disposal_df_wide.csv', index=False, encoding='utf-8-sig')
    df.to_csv('../../data/disposal/disposal_df_long.csv', index=False, encoding='utf-8-sig')
    print("Saved 'disposal_df_wide.csv' and 'disposal_df_long.csv'.")
    
    # Preview
    display(final_wide.head())
else:
    print("Output dataframes not found. Please run Step 3 first.")

Filtering Final Output for Common Stocks Only...
Wide Format: 12561 -> 11546 rows
Long Format: 16813 -> 14742 rows
Saved 'disposal_df_wide.csv' and 'disposal_df_long.csv'.


Unnamed: 0,Date,Stock_id,t_label_first,condition_first,interval_first,event_start_date_first,event_end_date_first,relative_day_first,gap_days_first,calendar_relative_day_first,...,gap_days_level_22,calendar_relative_day_level_22,t_label_level_23,condition_level_23,interval_level_23,event_start_date_level_23,event_end_date_level_23,relative_day_level_23,gap_days_level_23,calendar_relative_day_level_23
482,2020-07-20,1213,s-1,連續三次,,2020-07-21,2020-08-03,-1.0,0.0,-1.0,...,,,,,,NaT,NaT,,,
483,2020-07-21,1213,s+0,連續三次,,2020-07-21,2020-08-03,0.0,0.0,0.0,...,,,,,,NaT,NaT,,,
484,2020-07-22,1213,s+1,連續三次,,2020-07-21,2020-08-03,1.0,0.0,1.0,...,,,,,,NaT,NaT,,,
485,2020-07-23,1213,s+2,連續三次,,2020-07-21,2020-08-03,2.0,0.0,2.0,...,,,,,,NaT,NaT,,,
486,2020-07-24,1213,s+3,連續三次,,2020-07-21,2020-08-03,3.0,0.0,3.0,...,,,,,,NaT,NaT,,,


---
# Part 2: Statistical Analysis

### 資料欄位說明 (Data Dictionary)

在開始分析前，了解資料集中的關鍵變數定義：

| 欄位名稱 | 說明 | 用途 |
| :--- | :--- | :--- |
| **`t_label`** | 時間軸標籤 | **核心分析欄位**。`s+N` (處置期間), `e+N` (解禁後)。`e+0` 為解禁當日。 |
| **`trading_idx`** | 交易日序號 | 該股票的累積交易日數 (Index)。用來計算精確的 `relative_day`，避免假日干擾。 |
| **`gap_days`** | 交易間隔(日曆日) | 距離上一個交易日過了幾天。`1`=正常, `3`=跨週末, `>3`=長假或暫停交易。 |
| **`relative_day`** | 相對天數 | 距離事件開始日 (`s+0`) 的交易日數差。 |
| **`disposal_level`** | 處置連續等級 | `1`=首次, `2`=續處置 (中間無間斷)。用於觀察連續處置的邊際效應遞減。 |
| **`daily_ret`** | 當日報酬率 | 計算方式：`(Close / Open) - 1`。 |
| **`Stock_id`** | 股票代號 | 經過 Filter 後應僅包含普通股 (4碼)。 |

## overall

In [9]:
# [Analysis Step 1] 載入資料 (Data Loading)
paths = '../../data/disposal/disposal_df_long.csv'

print(f"Reading data from: {paths}")

# 1. 讀取 CSV (解決 DtypeWarning, 解析日期)
df = pd.read_csv(
    paths,
    low_memory=False,
    dtype={'Stock_id': str}, # 強制 Stock_id 為字串
    parse_dates=['Date', 'event_start_date', 'event_end_date'] # 自動轉 datetime
)

print(f"Loaded {len(df):,} rows.")
print(f"Columns: {list(df.columns)} ...")

# Preview
display(df)

Reading data from: ../../data/disposal/disposal_df_long.csv
Loaded 14,742 rows.
Columns: ['Unnamed: 0', 'Date', 'Stock_id', 'Open', 'High', 'Low', 'Close', 'Volume', 'TradingAmount', 'trading_idx', 'prev_trade_date', 'trade_date_diff', 'gap_days', 'event_start_date', 'event_end_date', 'interval', 'condition', 'is_first_disposal', 'is_second_disposal', 'disposal_level', 'calendar_relative_day', 'trading_idx_start', 'trading_idx_end', 'relative_day', 'relative_day_end', 't_label_first', 't_label_second', 't_label_third', 't_label_fourth', 't_label_level_5', 't_label_level_6', 't_label_level_7', 't_label_level_8', 't_label_level_9', 't_label_level_10', 't_label_level_11', 't_label_level_12', 't_label_level_13', 't_label_level_14', 't_label_level_15', 't_label_level_16', 't_label_level_17', 't_label_level_18', 't_label_level_19', 't_label_level_20', 't_label_level_21', 't_label_level_22', 't_label_level_23', 'daily_ret', 'market_open', 'market_close', 'market_ret'] ...


Unnamed: 0.1,Unnamed: 0,Date,Stock_id,Open,High,Low,Close,Volume,TradingAmount,trading_idx,...,t_label_level_18,t_label_level_19,t_label_level_20,t_label_level_21,t_label_level_22,t_label_level_23,daily_ret,market_open,market_close,market_ret
0,0,2020-07-20,1213,10.75,10.75,9.43,10.00,503610,5266828,0,...,,,,,,,-0.069767,12205.25,12174.54,-0.002516
1,0,2020-07-20,1213,10.75,10.75,9.43,10.00,503610,5266828,0,...,,,,,,,-0.069767,12205.25,12174.54,-0.002516
2,1,2020-07-21,1213,11.00,11.00,10.25,10.35,182100,1951863,1,...,,,,,,,-0.059091,12242.32,12397.55,0.012680
3,1,2020-07-21,1213,11.00,11.00,10.25,10.35,182100,1951863,1,...,,,,,,,-0.059091,12242.32,12397.55,0.012680
4,2,2020-07-22,1213,10.80,10.80,10.20,10.50,195049,2045958,2,...,,,,,,,-0.027778,12389.76,12473.27,0.006740
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14737,15,2021-07-23,9962,24.00,25.60,24.00,25.60,22693000,569363250,5,...,,,,,,,0.066667,17603.95,17572.92,-0.001763
14738,16,2021-07-26,9962,23.05,24.70,23.05,23.05,7764000,180799800,6,...,,,,,,,0.000000,17554.93,17403.56,-0.008623
14739,17,2021-07-27,9962,23.80,23.80,22.25,22.50,3173000,71872650,7,...,,,,,,,-0.054622,17394.77,17269.87,-0.007180
14740,18,2021-07-28,9962,22.50,22.60,21.80,21.80,1769000,38968600,8,...,,,,,,,-0.031111,17252.87,17135.22,-0.006819


In [15]:
# [Analysis Step 2] 全區間處置事件分析：Long Format 資料生成 (Global Disposal Long Data)
# 目的：產出「以 Global Event 為基準」的每日資料表 (Long Format)
# 特點：
# 1. 統一合併為 Global Event (不分第一次/第二次)
# 2. **t_label 改用「交易日」計算** (避免假日跳號問題)
# 3. **標籤邏輯更新**：處置結束當日標記為 e+0

if 'df' in locals() and not df.empty:
    print("Generating Global Disposal Long Data (Trading Day Logic, e+0 included)...")
    
    # --- 1. 定義 Global Events (Summary) ---
    cols_needed = ['Stock_id', 'event_start_date', 'event_end_date', 'disposal_level']
    df_events = df[cols_needed].drop_duplicates().copy()
    df_events['event_start_date'] = pd.to_datetime(df_events['event_start_date'], errors='coerce')
    df_events['event_end_date'] = pd.to_datetime(df_events['event_end_date'], errors='coerce')
    df_events = df_events.sort_values(['Stock_id', 'event_start_date'])
    
    # 識別連續群組
    df_events['event_group_id'] = (df_events['disposal_level'] == 1).cumsum()
    
    global_events_summary = df_events.groupby(['Stock_id', 'event_group_id']).agg(
        global_start_date=('event_start_date', 'min'),
        global_end_date=('event_end_date', 'max'),
        max_level=('disposal_level', 'max')
    ).reset_index()
    
    global_events_summary['total_calendar_days'] = (global_events_summary['global_end_date'] - global_events_summary['global_start_date']).dt.days + 1
    global_events_summary = global_events_summary.drop(columns=['event_group_id'])
    
    # --- 2. 準備股價資料 ---
    if 'price_df' in locals() and not price_df.empty:
        source_data = price_df.copy()
        if 'market_ret' not in source_data.columns and 'df' in locals() and 'market_ret' in df.columns:
             mkt_map = df[['Date', 'market_ret']].drop_duplicates().set_index('Date')['market_ret'].to_dict()
             source_data['Date'] = pd.to_datetime(source_data['Date'])
             source_data['market_ret'] = source_data['Date'].map(mkt_map)
    else:
        source_data = df.copy()
        
    if not pd.api.types.is_datetime64_any_dtype(source_data['Date']):
        source_data['Date'] = pd.to_datetime(source_data['Date'])
        
    # --- 3. 合併與計算交易日 ---
    merged_chunks = []
    
    events_dict = global_events_summary.groupby('Stock_id')
    print(f"Processing {len(global_events_summary)} global events into Long Format...")

    # Define Buffer 
    BUFFER_PRE = 60
    BUFFER_POST = 60
    
    for stock_id, group in events_dict:
        stock_prices = source_data[source_data['Stock_id'] == stock_id].sort_values('Date')
        if stock_prices.empty:
            continue
            
        stock_prices = stock_prices.reset_index(drop=True)
        # Date -> Rank (Trading Day Index)
        date_rank_map = pd.Series(stock_prices.index.values, index=stock_prices['Date']).to_dict()
        
        for _, event in group.iterrows():
            gs = event['global_start_date']
            ge = event['global_end_date']
            
            # Find Start Rank
            search_date = gs
            while search_date not in date_rank_map and search_date <= ge:
                 search_date += pd.Timedelta(days=1)
            
            if search_date not in date_rank_map:
                 continue 
            
            start_rank = date_rank_map[search_date]
            
            # Find End Rank (for reference)
            search_end = ge
            while search_end not in date_rank_map and search_end >= gs:
                search_end -= pd.Timedelta(days=1)
            
            end_rank = np.nan
            if search_end in date_rank_map:
                end_rank = date_rank_map[search_end]
            
            # Filter Time Window
            mask = (stock_prices['Date'] >= gs - pd.Timedelta(days=BUFFER_PRE + 20)) &                    (stock_prices['Date'] <= ge + pd.Timedelta(days=BUFFER_POST + 20))
            
            event_prices = stock_prices[mask].copy()
            
            if not event_prices.empty:
                event_prices['global_start_date'] = gs
                event_prices['global_end_date'] = ge
                event_prices['max_level'] = event['max_level']

                # Calculate Relative Trading Days
                current_ranks = event_prices['Date'].map(date_rank_map)
                
                event_prices['rel_day_start'] = current_ranks - start_rank
                
                if not pd.isna(end_rank):
                    event_prices['rel_day_end'] = current_ranks - end_rank
                else:
                    event_prices['rel_day_end'] = np.nan
                
                merged_chunks.append(event_prices)

    if merged_chunks:
        global_long_df = pd.concat(merged_chunks, ignore_index=True)
        
        # --- 4. 生成 t_label ---
        def generate_label(row):
            # Logic: 處置期間內用 s+N，但 **處置結束日(含)以後** 用 e+N
            # 這樣 End Date 當天 (rel_day_end=0) 就會變成 e+0
            
            if row['Date'] < row['global_end_date']:
                d = int(row['rel_day_start'])
                return f"s{'+' if d>=0 else ''}{d}"
            else:
                # Date >= End Date (Includes e+0)
                if pd.isna(row['rel_day_end']): return None
                d = int(row['rel_day_end'])
                return f"e{'+' if d>=0 else ''}{d}"
                
        global_long_df['t_label'] = global_long_df.apply(generate_label, axis=1)
        
        print(f"Generated Global Long DataFrame (e+0 included): {len(global_long_df)} rows.")
        display(global_long_df)
        global_events = global_events_summary 

    else:
        print("No matching price data found.")
        global_events = global_events_summary

else:
    print("Error: 'df' not found.")

Generating Global Disposal Long Data (Trading Day Logic, e+0 included)...
Processing 2088 global events into Long Format...
Generated Global Long DataFrame (e+0 included): 20448 rows.


Unnamed: 0.1,Unnamed: 0,Date,Stock_id,Open,High,Low,Close,Volume,TradingAmount,market_ret,global_start_date,global_end_date,max_level,rel_day_start,rel_day_end,t_label
0,0,2020-07-20,1213,10.75,10.75,9.43,10.00,503610,5266828,-0.002516,2020-07-21,2020-08-06,2,-1,-5,s-1
1,1,2020-07-21,1213,11.00,11.00,10.25,10.35,182100,1951863,0.012680,2020-07-21,2020-08-06,2,0,-4,s+0
2,2,2020-07-22,1213,10.80,10.80,10.20,10.50,195049,2045958,0.006740,2020-07-21,2020-08-06,2,1,-3,s+1
3,3,2020-07-23,1213,11.50,11.50,10.30,11.30,198500,2153125,-0.000803,2020-07-21,2020-08-06,2,2,-2,s+2
4,4,2020-07-24,1213,11.90,11.90,10.80,11.80,292929,3430709,-0.013105,2020-07-21,2020-08-06,2,3,-1,s+3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20443,15,2021-07-23,9962,24.00,25.60,24.00,25.60,22693000,569363250,-0.001763,2021-07-26,2021-08-06,1,-1,-4,s-1
20444,16,2021-07-26,9962,23.05,24.70,23.05,23.05,7764000,180799800,-0.008623,2021-07-26,2021-08-06,1,0,-3,s+0
20445,17,2021-07-27,9962,23.80,23.80,22.25,22.50,3173000,71872650,-0.007180,2021-07-26,2021-08-06,1,1,-2,s+1
20446,18,2021-07-28,9962,22.50,22.60,21.80,21.80,1769000,38968600,-0.006819,2021-07-26,2021-08-06,1,2,-1,s+2


In [16]:
global_long_df.to_csv('test.csv', index=False)

In [64]:
# [Analysis Step 2.1] 計算各時間點的平均報酬與樣本數 (Multi-Level)
# 迴圈繪製所有處置層級 (First, Second, Third, ... etc.)

# 使用 df 作為來源資料 (確保 Step 1/3.5/4 已執行)
if 'df' in locals() and not df.empty:
    
    # 1. 定義排序規則
    def parse_t_val(t_str):
        if not isinstance(t_str, str): return 999
        prefix = t_str[0]
        try:
            val = int(t_str.split('+')[-1]) if '+' in t_str else int(t_str.split('s')[-1]) 
            if 's-' in t_str: val = -int(t_str.split('-')[-1])
            if prefix == 's': return val
            elif prefix == 'e': return 1000 + val
        except:
            return 999
        return 999

    # 2. 找出所有 t_label 欄位
    t_cols = [c for c in df.columns if c.startswith('t_label_')]
    print(f"Found {len(t_cols)} levels to analyze: {t_cols}")
    
    # 3. 迴圈繪圖
    for target_col in t_cols:
        # 檢查該層級是否有足夠數據 (至少 50 筆非空值)
        valid_count = df[target_col].notna().sum()
        if valid_count < 50:
            print(f"Skipping {target_col} (only {valid_count} samples)")
            continue
            
        print(f"\nPlotting Dual Bar Chart for {target_col} (Samples: {valid_count})...")
        
        # 統計
        stats = df.groupby(target_col)['abnormal_ret'].agg(['mean', 'count', 'std']).reset_index()
        
        # 排序
        stats['sort_key'] = stats[target_col].apply(parse_t_val)
        stats = stats.sort_values('sort_key').drop(columns=['sort_key'])
        
        # 繪圖配置：
        # ly='mean' (主圖): 平均報酬 -> 畫長條圖 (ly_type='bar')
        # bar_col='count' (副圖): 樣本數 -> 畫長條圖
        plot(
            df=stats,
            x=target_col,
            ly='mean',       # 上圖：平均超額報酬
            bar_col='count', # 下圖：樣本數
            ly_type='bar',   # 上圖強制畫長條
            note=f"Disposal Effect: {target_col}",
            bar_kwargs={'width': 0.8}
        )
        
        # 顯示統計表 (選前 5 行預覽)
        # display(stats.head(5).style.format({'mean': '{:.4%}', 'std': '{:.4%}'}))

else:
    print("Dataframe 'df' not ready. Please run data preparation steps.")

Found 23 levels to analyze: ['t_label_first', 't_label_second', 't_label_third', 't_label_fourth', 't_label_level_5', 't_label_level_6', 't_label_level_7', 't_label_level_8', 't_label_level_9', 't_label_level_10', 't_label_level_11', 't_label_level_12', 't_label_level_13', 't_label_level_14', 't_label_level_15', 't_label_level_16', 't_label_level_17', 't_label_level_18', 't_label_level_19', 't_label_level_20', 't_label_level_21', 't_label_level_22', 't_label_level_23']

Plotting Dual Bar Chart for t_label_first (Samples: 11627)...



Plotting Dual Bar Chart for t_label_second (Samples: 2252)...



Plotting Dual Bar Chart for t_label_third (Samples: 482)...



Plotting Dual Bar Chart for t_label_fourth (Samples: 126)...



Plotting Dual Bar Chart for t_label_level_5 (Samples: 53)...


Skipping t_label_level_6 (only 34 samples)
Skipping t_label_level_7 (only 24 samples)
Skipping t_label_level_8 (only 17 samples)
Skipping t_label_level_9 (only 15 samples)
Skipping t_label_level_10 (only 15 samples)
Skipping t_label_level_11 (only 0 samples)
Skipping t_label_level_12 (only 14 samples)
Skipping t_label_level_13 (only 12 samples)
Skipping t_label_level_14 (only 12 samples)
Skipping t_label_level_15 (only 12 samples)
Skipping t_label_level_16 (only 13 samples)
Skipping t_label_level_17 (only 15 samples)
Skipping t_label_level_18 (only 11 samples)
Skipping t_label_level_19 (only 8 samples)
Skipping t_label_level_20 (only 0 samples)
Skipping t_label_level_21 (only 0 samples)
Skipping t_label_level_22 (only 0 samples)
Skipping t_label_level_23 (only 0 samples)


## interval == [5, 20]

In [33]:
# [Analysis Step 2.5] 依處置撮合時間 (Interval) 與 層級 (Level) 交叉分析
# 針對 Interval (5, 20) x Level (First, Second...) 進行完整剖析

if 'df' in locals() and not df.empty:
    
    # 1. 定義 Helper Function (排序與繪圖)
    def parse_t_val(t_str):
        if not isinstance(t_str, str): return 999
        prefix = t_str[0]
        try:
            val = int(t_str.split('+')[-1]) if '+' in t_str else int(t_str.split('s')[-1]) 
            if 's-' in t_str: val = -int(t_str.split('-')[-1])
            if prefix == 's': return val
            elif prefix == 'e': return 1000 + val
        except:
            return 999
        return 999
        
    def plot_interval_level(sub_df, interval_val, target_col):
        # 檢查樣本數
        valid_count = sub_df[target_col].notna().sum()
        if valid_count < 30: # 門檻稍微降低
            # print(f"Skipping Interval={interval_val}, {target_col} (only {valid_count} samples)")
            return

        print(f"\nPlotting: Interval={interval_val} | {target_col} (Samples: {valid_count})...")
        
        # 統計
        stats = sub_df.groupby(target_col)['abnormal_ret'].agg(['mean', 'count', 'std']).reset_index()
        
        # 排序
        stats['sort_key'] = stats[target_col].apply(parse_t_val)
        stats = stats.sort_values('sort_key').drop(columns=['sort_key'])
        
        # 繪圖
        plot(
            df=stats,
            x=target_col,
            ly='mean',       # 上圖：Mean AR
            bar_col='count', # 下圖：Sample Count
            ly_type='bar',   # 雙長條
            note=f"Disposal Analysis: Interval={interval_val}, {target_col}",
            bar_kwargs={'width': 0.8}
        )

    # 2. 執行交叉分析
    intervals_to_plot = [5, 20] # 指定觀察 5 和 20
    t_cols = [c for c in df.columns if c.startswith('t_label_')] # 所有層級
    
    print(f"Analyzing {len(intervals_to_plot)} intervals x {len(t_cols)} levels...")

    for inv in intervals_to_plot:
        # 第一層迴圈：篩選 Interval
        mask = df['interval'] == inv
        sub_df = df[mask].copy()
        
        if sub_df.empty:
            print(f"No data for Interval = {inv}")
            continue
            
        print(f"\n{'='*20} Interval = {inv} (Total Rows: {len(sub_df)}) {'='*20}")
        
        # 第二層迴圈：針對每個 Level 繪圖
        for target_col in t_cols:
            plot_interval_level(sub_df, inv, target_col)
            
else:
    print("Dataframe 'df' not ready.")

Analyzing 2 intervals x 23 levels...


Plotting: Interval=5 | t_label_first (Samples: 8397)...




Plotting: Interval=20 | t_label_first (Samples: 2013)...



Plotting: Interval=20 | t_label_second (Samples: 1805)...



Plotting: Interval=20 | t_label_third (Samples: 315)...



Plotting: Interval=20 | t_label_fourth (Samples: 60)...


## condition

In [39]:
# [Analysis Step 3] 處置原因 (Condition) 分析
# 統計資料集中出現的處置原因分佈

if 'df' in locals() and not df.empty:
    target_col = 'condition'
    if target_col in df.columns:
        print(f"=== {target_col} Distribution ===")
        # 1. 總體分佈
        counts = df[target_col].value_counts()
        display(counts.to_frame(name='Count'))
        
        # 2. 依照撮合時間 (Interval) 分類
        if 'interval' in df.columns:
            print("\n=== Condition vs Interval Crosstab ===")
            cross = pd.crosstab(df['condition'], df['interval'])
            display(cross.style.background_gradient(axis=1))
            
    else:
        print(f"Column '{target_col}' not found in dataframe.")
else:
    print("Dataframe 'df' not ready.")

=== condition Distribution ===


Unnamed: 0_level_0,Count
condition,Unnamed: 1_level_1
連續三次,10695
最近十個營業日已有六次,923
連續五次,793
最近10個營業日內有6個營業日,695
連續5個營業日,477
連續3個營業日及沖銷標準,394
連續三次及當日沖銷標準,332
連續5個營業日及沖銷標準,165
連續五次及當日沖銷標準,145
最近三十個營業日已有十二次,70



=== Condition vs Interval Crosstab ===


interval,5.000000,20.000000,25.000000,45.000000,60.000000
condition,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
最近10個營業日內有6個營業日,372,300,0,5,15
最近30個營業日內有12個營業日,15,4,0,0,0
最近三十個營業日已有十二次,60,5,0,0,0
最近十個營業日已有六次,436,408,30,19,25
監視業務督導會報決議,0,17,0,0,12
連續3個營業日及沖銷標準,314,80,0,0,0
連續5個營業日,187,277,0,13,0
連續5個營業日及沖銷標準,95,70,0,0,0
連續三次,6208,2839,243,498,422
連續三次及當日沖銷標準,295,37,0,0,0
