In [1]:
import pandas as pd
from pandas import read_excel
import numpy as np
import pandas as pd
pd.options.mode.chained_assignment = None
from datetime import date, datetime, timedelta, timezone
import copy
from pandas import IndexSlice as idx
pd.set_option('display.max_columns', None)  # 当列太多时不换行
from numpy import exp, nan
import quantstats as qs


import warnings
warnings.filterwarnings('ignore') # 忽略警告
import pandas as pd
from pandas import IndexSlice as idx

import talib as ta
# 计算natr
def natr(df, n):
    """Compute the TA-Lib normalised ATR over ``n`` periods for one price frame.

    Args:
        df: DataFrame with ``high``, ``low`` and ``close`` columns whose
            values can each be converted with ``float()``.
        n: Look-back period forwarded to ``ta.NATR`` as ``timeperiod``.

    Returns:
        The ``df['natr']`` column. Note that this column is written onto
        ``df`` as a side effect.
    """
    def _as_floats(column):
        # Element-wise float() so numeric strings / Decimals are accepted too.
        return np.array(list(map(float, df[column])))

    df['natr'] = ta.NATR(
        _as_floats('high'),
        _as_floats('low'),
        _as_floats('close'),
        timeperiod=n,
    )
    return df['natr']



In [None]:
# 以下为完整的可转债因子分类与计算顺序整理：

# 一、基本价格与波动类因子（转债本身）

# high, low, close, vol 基础字段

# ma_20, momentum_20, volatility_20

# max_value, max_value_position

# zhengfu, zhengfu_cha

# natr_14, natr_[1,3,5,10,20]

# aft_high_cur_close

# 二、OBV量能指标（转债）

# obv, obv_5, obv_10, obv_ratio_5_10

# 三、换手与市值类因子

# turnover_pct

# cap_float_share_rate

# turnover_[5,10,20,60]_avg

# rolling_[1,5,20,50]_avg

# rolling_1_to_5_avg, rolling_5_to_20_avg, rolling_20_to_50_avg

# 四、区间收益率（转债与股票）

# pct_chg_[5,20]

# pct_chg_stk_[5,20]

# 五、成交量均值比因子（转债）

# vol_[3,5,10,20,30,60]_avg

# vol_[N]_to_[M]

# 六、波动率与振幅（转债与股票）

# bodong_[5,10,20,60], bodong_[5,10,20,60]_bd, bodong_20_to_bodong_60

# zhengfu_[1,5,10,20,60], zhengfu_[1,5,10,20,60]_bodong

# 七、跳空与缺口类因子（转债）

# high_jump, low_gap, open_jump, gap_body_ratio

# high_jump_count_[20,100,250]

# low_gap_count_[20,100,250]

# high_jump_count_[20,100,250]_pct, low_gap_count_[20,100,250]_pct

# 八、K线结构因子（转债）

# close_to_high_ratio, close_to_low_ratio

# body_position, upper_shadow_ratio, lower_shadow_ratio

# 九、趋势反转类Alpha因子（转债与股票）

# alpha6, alpha6_stk

# alpha12, alpha12_stk

# alpha83, alpha83_stk

# alpha18, alpha18_stk

# alpha36, alpha36_stk

# alpha89, alpha89_stk

# alpha65, alpha65_stk

# alpha76, alpha76_stk

# alpha92, alpha92_stk

# alpha99, alpha99_stk

# 十、股票与转债联动因子

# stk_up_bond_flat, stk_down_bond_weak, bond_hold_stk_rebound

# stk_chg_[3,5], bond_chg_[3,5]

# stk_up_bond_flat_[3,5]

# stk_down_then_up, bond_rebound, bond_follow_stk_rebound

# 十一、横纵向背离因子（股票与转债）

# dev_bond_vs_stk_[3,5,10]

# dev_bond_short[3]_long[20], dev_bond_short[5]_long[30]

# dev_stk_short[3]_long[20], dev_stk_short[5]_long[30]

# cb_vs_stk_ret_rank_diff_[3,5,10]

# 十二、风险与回撤相关因子（转债）

# cb_dev_from_low_5

# cb_close_std_5

# cb_drawdown_5

# cb_dd_prob_estimate

# 十三、震荡收敛类因子（转债）

# atr_5, atr_10, atr_decay_5_10

# close_std_5, close_std_10, vol_shrink_ratio

# body_pct_mean_5

# shadow_mean_5

# small_body_shadow_ratio, doji_ratio_5

# zhengfu_decay_5_20, range_ratio_5_20

# 十四、脉冲与动能因子（转债）

# high_jump_[15,20,30,40,50,60]

# open_gap_mean_[5,10], open_gap_max_[5,10]

# jump_atr_[3,5,10]

# zscore_pctchg_20

# vol_spike_ratio

# vol_std_decay

# score_high_jump_[15,20,30,40,50,60]_20

# range_jump_potential

# gap_and_go_flag

# 十五、跌不动因子（转债）

# down_freq_[5,10]

# down_amp_[5,10]

# no_fall_score_[5,10]

# 十六、K线结构连续性



In [10]:
def _by_code(df, func):
    """Run ``func`` on every per-code sub-frame and stitch the results together.

    ``func`` must return one value per row of the sub-frame it receives. The
    pieces are re-labelled with the sub-frame's index and re-ordered to match
    ``df.index``, so the result can be assigned straight into ``df`` regardless
    of row order or of how the installed pandas handles ``group_keys``.
    """
    pieces = [
        pd.Series(func(group), index=group.index)
        for _, group in df.groupby('code', sort=False)
    ]
    if not pieces:
        return pd.Series(index=df.index, dtype=float)
    return pd.concat(pieces).reindex(df.index)


def _roll(df, values, window, stat='mean', min_periods=None):
    """Per-code rolling statistic (``mean``/``std``/``sum``/``min``/``max``).

    ``values`` is either a column name of ``df`` or a Series sharing its index.
    The result is indexed like ``df`` (alignment happens on assignment).
    """
    series = df[values] if isinstance(values, str) else values
    rolled = series.groupby(df['code']).rolling(window, min_periods=min_periods)
    return getattr(rolled, stat)().reset_index(level=0, drop=True)


def _roll_apply(df, values, window, func, min_periods=None):
    """Per-code rolling ``apply`` of ``func`` on raw numpy windows."""
    series = df[values] if isinstance(values, str) else values
    rolled = series.groupby(df['code']).rolling(window, min_periods=min_periods)
    return rolled.apply(func, raw=True).reset_index(level=0, drop=True)


def _last_argmax(window_values):
    """Position (0 = oldest bar) of the most recent maximum in the window."""
    return len(window_values) - 1 - np.argmax(window_values[::-1])


def _last_argmin(window_values):
    """Position (0 = oldest bar) of the most recent minimum in the window."""
    return len(window_values) - 1 - np.argmin(window_values[::-1])


def _ts_rank_last(window_values):
    """Rank (average ties) of the newest observation inside its window."""
    return pd.Series(window_values).rank().iloc[-1]


def _mean_of_negatives(window_values):
    """Mean of the negative entries of the window, or 0 when there are none."""
    negatives = window_values[window_values < 0]
    return negatives.mean() if negatives.size else 0


def load_and_prepare_data(filepath, output_path='/Users/yiwei/Desktop/git/cb_data_with_factors.pq'):
    """Load a convertible-bond panel from parquet and append factor columns.

    The input must provide the columns ``code``, ``trade_date``, ``open``,
    ``high``, ``low``, ``close``, ``pre_close``, ``vol``, ``pct_chg``,
    ``turnover``, ``remain_cap``, ``float_share`` and the underlying-stock
    columns ``open_stk``, ``close_stk``, ``vol_stk``, ``pct_chg_stk``.
    Rows of each ``code`` are assumed to already be in chronological order
    and the frame's index is assumed to be unique (NOTE(review): neither is
    enforced here - confirm against the parquet producer).

    Args:
        filepath: Parquet file to read.
        output_path: Where the enriched frame is written once every factor
            has been computed. The default is the historical hard-coded
            location (machine specific); pass ``None`` to skip writing.

    Returns:
        The loaded DataFrame with all factor columns added.

    Notes:
        Behavioural fixes relative to the previous revision:

        * every time-series operation (NATR/SMA/shift/rolling/cummax) is now
          evaluated per ``code`` so values no longer leak between bonds;
        * ``mean_high_jump_*_20`` no longer calls the non-existent
          ``SeriesGroupBy.where`` and averages over the jump days that are
          present in the window (``min_periods=1``);
        * ``zhengfu_5`` / ``zhengfu_20`` keep their rolling-std meaning and
          are no longer overwritten by the range means used for
          ``zhengfu_decay_5_20``;
        * ``zhengfu_cha`` yields NaN instead of ``inf`` when open == close;
        * the parquet file is written at the end, so it contains all factors.
    """
    df = pd.read_parquet(filepath)
    for col in ('high', 'low', 'close', 'vol'):
        df[col] = df[col].astype(float)

    # ------------------------------------------------------------------
    # 1. Basic price / volatility factors (bond)
    # ------------------------------------------------------------------
    df['natr_14'] = _by_code(df, lambda g: ta.NATR(g['high'], g['low'], g['close'], timeperiod=14))
    df['ma_20'] = _by_code(df, lambda g: ta.SMA(g['close'], timeperiod=20))
    df['momentum_20'] = df['close'] / df.groupby('code')['close'].shift(20)
    df['volatility_20'] = _roll(df, 'close', 20, 'std')
    # Running high up to (and excluding) the current bar, per bond.
    df['max_value'] = df.groupby('code')['close'].cummax()
    df['max_value'] = df.groupby('code')['max_value'].shift(1)
    df['max_value_position'] = df['close'] / df['max_value']
    df['zhengfu'] = (df['high'] - df['low']) / df['close']
    df['zhengfu_cha'] = (df['high'] - df['close']) / (df['open'] - df['close']).abs().replace(0, np.nan)

    # ------------------------------------------------------------------
    # 2. On-balance volume and its moving averages
    # ------------------------------------------------------------------
    df['obv'] = _by_code(df, lambda g: ta.OBV(g['close'], g['vol']))
    df['obv_5'] = _roll(df, 'obv', 5)
    df['obv_10'] = _roll(df, 'obv', 10)
    df['obv_ratio_5_10'] = df['obv_5'] / df['obv_10']

    # Multi-period NATR. `natr` writes a column onto the frame it receives,
    # so hand it a copy of each group.
    for n in [1, 3, 5, 10, 20]:
        df[f'natr_{n}'] = _by_code(df, lambda g: natr(g.copy(), n))

    # Next-day take-profit label. This deliberately looks one bar AHEAD and
    # must not be used as a predictor.
    df['aft_high1'] = df.groupby('code')['high'].shift(-1)
    df['aft_high_cur_close'] = (df['aft_high1'] - df['close']) / df['close']

    # ------------------------------------------------------------------
    # 3. Turnover and market-cap factors
    # ------------------------------------------------------------------
    df['turnover_pct'] = df.groupby('trade_date')['turnover'].rank(pct=True)
    df['cap_float_share_rate'] = df['remain_cap'] * 10000 / (df['float_share'] * df['close_stk'])

    # ------------------------------------------------------------------
    # 4. Compounded interval returns (bond and stock)
    # ------------------------------------------------------------------
    bond_gross = df['pct_chg'] + 1
    stk_gross = df['pct_chg_stk'] + 1
    for win in [5, 20]:
        df[f'pct_chg_{win}'] = _roll_apply(df, bond_gross, win, np.prod, min_periods=1) - 1
        df[f'pct_chg_stk_{win}'] = _roll_apply(df, stk_gross, win, np.prod, min_periods=1) - 1

    # Average raw turnover
    for win in [5, 10, 20, 60]:
        df[f'turnover_{win}_avg'] = _roll(df, 'turnover', win)

    # Average cross-sectional turnover percentile
    for win in [1, 5, 20, 50]:
        df[f'rolling_{win}_avg'] = _roll(df, 'turnover_pct', win)
    df['rolling_1_to_5_avg'] = df['rolling_1_avg'] / df['rolling_5_avg']
    df['rolling_5_to_20_avg'] = df['rolling_5_avg'] / df['rolling_20_avg']
    df['rolling_20_to_50_avg'] = df['rolling_20_avg'] / df['rolling_50_avg']

    # ------------------------------------------------------------------
    # 5. Volume averages and every short/long ratio between them
    # ------------------------------------------------------------------
    vol_windows = [3, 5, 10, 20, 30, 60]
    for n in vol_windows:
        df[f'vol_{n}_avg'] = _roll(df, 'vol', n)
    for n in vol_windows:
        for m in vol_windows:
            if n < m:
                df[f'vol_{n}_to_{m}'] = df[f'vol_{n}_avg'] / df[f'vol_{m}_avg']

    # ------------------------------------------------------------------
    # 6. Return volatility (stock: bodong_N, bond: bodong_N_bd) and
    #    amplitude volatility
    # ------------------------------------------------------------------
    for win in [5, 10, 20, 60]:
        df[f'bodong_{win}'] = _roll(df, 'pct_chg_stk', win, 'std') * (win ** 0.5)
        df[f'bodong_{win}_bd'] = _roll(df, 'pct_chg', win, 'std') * (win ** 0.5)
    df['bodong_20_to_bodong_60'] = df['bodong_20'] / df['bodong_60']

    # Note: a 1-bar rolling std is always NaN, so zhengfu_1 carries no signal.
    for win in [1, 5, 10, 20, 60]:
        df[f'zhengfu_{win}'] = _roll(df, 'zhengfu', win, 'std')
        df[f'zhengfu_{win}_bodong'] = df[f'zhengfu_{win}'] * (win ** 0.5)

    # ------------------------------------------------------------------
    # 7. Alpha101-style factors, computed for the bond ('') and the
    #    underlying stock ('_stk')
    # ------------------------------------------------------------------
    instruments = [('', 'close', 'vol', 'open'), ('_stk', 'close_stk', 'vol_stk', 'open_stk')]
    for sfx, c, v, o in instruments:
        # Alpha6: -corr(rank(delta(close, 10)), rank(vol), 10)
        df[f'delta_close_10{sfx}'] = df.groupby('code')[c].diff(10)
        df[f'rank_delta_close_10{sfx}'] = df.groupby('trade_date')[f'delta_close_10{sfx}'].rank()
        df[f'rank_vol{sfx}'] = df.groupby('trade_date')[v].rank()
        df[f'alpha6{sfx}'] = -1 * _by_code(
            df, lambda g: g[f'rank_delta_close_10{sfx}'].rolling(10).corr(g[f'rank_vol{sfx}'])
        )

        # Alpha12: sign(delta(vol, 1)) * -1 * delta(close, 1)
        df[f'delta_vol_1{sfx}'] = df.groupby('code')[v].diff(1)
        df[f'delta_close_1{sfx}'] = df.groupby('code')[c].diff(1)
        df[f'alpha12{sfx}'] = np.sign(df[f'delta_vol_1{sfx}']) * -1 * df[f'delta_close_1{sfx}']

        # Alpha83: position of the latest 30-bar closing high (29 = today)
        df[f'alpha83{sfx}'] = _roll_apply(df, c, 30, _last_argmax)

        # Alpha18: close / cross-sectional rank of the 20-bar mean close
        df[f'mean_close_20{sfx}'] = _roll(df, c, 20)
        df[f'rank_mean_close_20{sfx}'] = df.groupby('trade_date')[f'mean_close_20{sfx}'].rank()
        df[f'alpha18{sfx}'] = df[c] / df[f'rank_mean_close_20{sfx}']

        # Alpha36: corr(vol, close, 5) + corr(vol, open, 5)
        df[f'alpha36{sfx}'] = _by_code(
            df, lambda g: g[v].rolling(5).corr(g[c]) + g[v].rolling(5).corr(g[o])
        )

        # Alpha89: ts_argmax(close, 30) / ts_argmin(close, 30)
        max_pos = _roll_apply(df, c, 30, _last_argmax)
        min_pos = _roll_apply(df, c, 30, _last_argmin)
        df[f'alpha89{sfx}'] = max_pos / (min_pos + 1e-9)

        # Alpha65: corr(rank(close), rank(vol), 6)
        # NOTE(review): the ranks are taken over each code's FULL history,
        # which uses future observations - confirm this is intended.
        df[f'alpha65{sfx}'] = _by_code(
            df, lambda g: g[c].rank().rolling(6).corr(g[v].rank())
        )

        # Alpha76: -1 * ts_rank(corr(close, vol, 10), 10)
        df[f'alpha76{sfx}'] = _by_code(
            df, lambda g: -1 * g[c].rolling(10).corr(g[v]).rolling(10).apply(_ts_rank_last, raw=True)
        )

        # Alpha92: (delta(close, 5) / close) * vol
        df[f'alpha92{sfx}'] = df.groupby('code')[c].diff(5) / df[c] * df[v]

        # Alpha99: -1 * ts_rank(cov(rank(close), rank(vol), 5), 5)
        # NOTE(review): same full-history rank caveat as Alpha65.
        df[f'alpha99{sfx}'] = _by_code(
            df, lambda g: -1 * g[c].rank().rolling(5).cov(g[v].rank()).rolling(5).apply(_ts_rank_last, raw=True)
        )

    # ------------------------------------------------------------------
    # 8. Stock / bond linkage flags
    # ------------------------------------------------------------------
    stk_ret_lag1 = df.groupby('code')['pct_chg_stk'].shift(1)
    stk_ret_lag2 = df.groupby('code')['pct_chg_stk'].shift(2)
    # Stock rallies while the bond stays flat.
    df['stk_up_bond_flat'] = ((df['pct_chg_stk'] > 0.03) & (df['pct_chg'] < 0.01)).astype(int)
    df['stk_down_bond_weak'] = ((df['pct_chg_stk'] < -0.03) & (df['pct_chg'] < df['pct_chg_stk'])).astype(int)
    df['bond_hold_stk_rebound'] = ((stk_ret_lag1 < -0.03) & (df['pct_chg_stk'] > 0.01) & (df['pct_chg'] > 0.005)).astype(int)

    # ------------------------------------------------------------------
    # 9. Cross-sectional / longitudinal divergence
    # ------------------------------------------------------------------
    for win in [3, 5, 10]:
        df[f'stk_ret_{win}'] = _roll(df, 'pct_chg_stk', win)
        df[f'bond_ret_{win}'] = _roll(df, 'pct_chg', win)
        df[f'dev_bond_vs_stk_{win}'] = df[f'bond_ret_{win}'] - df[f'stk_ret_{win}']

    # Recent mean return versus the instrument's own longer-run mean.
    for short_win, long_win in [(3, 20), (5, 30)]:
        df[f'dev_bond_short{short_win}_long{long_win}'] = (
            _roll(df, 'pct_chg', short_win) - _roll(df, 'pct_chg', long_win)
        )
    for short_win, long_win in [(3, 20), (5, 30)]:
        df[f'dev_stk_short{short_win}_long{long_win}'] = (
            _roll(df, 'pct_chg_stk', short_win) - _roll(df, 'pct_chg_stk', long_win)
        )

    # Legacy aliases of the 3/5-bar mean returns.
    df['stk_chg_3'] = df['stk_ret_3']
    df['bond_chg_3'] = df['bond_ret_3']
    df['stk_chg_5'] = df['stk_ret_5']
    df['bond_chg_5'] = df['bond_ret_5']

    # Lagging bond: stock up over the window, bond not (catch-up potential).
    df['stk_up_bond_flat_3'] = ((df['stk_chg_3'] > 0.03) & (df['bond_chg_3'] < 0.01)).astype(int)
    df['stk_up_bond_flat_5'] = ((df['stk_chg_5'] > 0.05) & (df['bond_chg_5'] < 0.01)).astype(int)

    # Stock sells off two bars ago then rebounds, and the bond follows.
    df['stk_down_then_up'] = ((stk_ret_lag2 < -0.03) & (df['pct_chg_stk'] > 0.02)).astype(int)
    df['bond_rebound'] = (df['pct_chg'] > 0.01).astype(int)
    df['bond_follow_stk_rebound'] = ((df['stk_down_then_up'] == 1) & (df['bond_rebound'] == 1)).astype(int)

    # ------------------------------------------------------------------
    # 10. Reversal / drawdown risk
    # ------------------------------------------------------------------
    # Relative strength: bond return rank minus stock return rank per day.
    for win in [3, 5, 10]:
        df[f'cb_ret_rank_{win}'] = df.groupby('trade_date')[f'bond_ret_{win}'].rank()
        df[f'stk_ret_rank_{win}'] = df.groupby('trade_date')[f'stk_ret_{win}'].rank()
        df[f'cb_vs_stk_ret_rank_diff_{win}'] = df[f'cb_ret_rank_{win}'] - df[f'stk_ret_rank_{win}']

    # Distance from the 5-bar closing low (larger = already rebounded).
    df['cb_low_5'] = _roll(df, 'close', 5, 'min')
    df['cb_dev_from_low_5'] = (df['close'] - df['cb_low_5']) / df['cb_low_5']

    df['cb_close_std_5'] = _roll(df, 'close', 5, 'std')

    # Drawdown from the 5-bar closing high (more negative = more released).
    df['cb_high_5'] = _roll(df, 'close', 5, 'max')
    df['cb_drawdown_5'] = (df['close'] - df['cb_high_5']) / df['cb_high_5']

    # Expected downside = historical frequency x average size of down days,
    # both measured on the previous bar's return.
    df['cb_ret_1'] = df.groupby('code')['pct_chg'].shift(1)
    df['cb_fall_flag'] = (df['cb_ret_1'] < 0).astype(int)
    df['cb_fall_freq_10'] = _roll(df, 'cb_fall_flag', 10)
    df['cb_fall_amp_10'] = _roll_apply(df, 'cb_ret_1', 10, _mean_of_negatives)
    df['cb_dd_prob_estimate'] = df['cb_fall_freq_10'] * df['cb_fall_amp_10']

    # ------------------------------------------------------------------
    # 11. Gap / jump factors
    # ------------------------------------------------------------------
    df['high_jump'] = (df['high'] / df['pre_close'] - 1) > 0.025
    df['low_gap'] = (df['low'] / df['pre_close'] - 1) < -0.025
    df['open_jump'] = (df['open'] / df['pre_close'] - 1).abs()
    df['gap_body_ratio'] = (df['open'] - df['pre_close']) / (df['close'] - df['open']).replace(0, np.nan)

    for win in [20, 100, 250]:
        df[f'high_jump_count_{win}'] = _roll(df, 'high_jump', win, 'sum', min_periods=1)
        df[f'low_gap_count_{win}'] = _roll(df, 'low_gap', win, 'sum', min_periods=1)
        df[f'high_jump_count_{win}_pct'] = df.groupby('trade_date')[f'high_jump_count_{win}'].rank(pct=True)
        df[f'low_gap_count_{win}_pct'] = df.groupby('trade_date')[f'low_gap_count_{win}'].rank(pct=True)
        # Rows that are never flagged keep whatever 'filter' held before
        # (NaN if the column did not exist).
        df.loc[df[f'high_jump_count_{win}_pct'] < 0.1, 'filter'] = True

    # ------------------------------------------------------------------
    # 12. Candle structure
    # ------------------------------------------------------------------
    full_range = (df['high'] - df['low']).replace(0, np.nan)
    df['close_to_high_ratio'] = (df['close'] - df['low']) / full_range
    df['close_to_low_ratio'] = (df['high'] - df['close']) / full_range
    df['body_position'] = (df['close'] - df['open']) / full_range
    df['upper_shadow_ratio'] = (df['high'] - df[['close', 'open']].max(axis=1)) / full_range
    df['lower_shadow_ratio'] = (df[['close', 'open']].min(axis=1) - df['low']) / full_range

    # ------------------------------------------------------------------
    # 13. Range contraction ("can't rise / can't fall")
    # ------------------------------------------------------------------
    df['range_today'] = df['high'] - df['low']
    df['atr_5'] = _roll(df, 'range_today', 5)
    df['atr_10'] = _roll(df, 'range_today', 10)
    df['atr_20'] = _roll(df, 'range_today', 20)
    df['atr_5_decay'] = df['atr_5'] / df['atr_20']
    df['atr_decay_5_10'] = df['atr_5'] / df['atr_10']  # near 1 = steady range, well below 1 = contracting
    # Historically these two were computed from the same 5/20-bar mean range
    # as atr_5_decay; kept under their own names for downstream consumers.
    df['zhengfu_decay_5_20'] = df['atr_5'] / df['atr_20']
    df['range_ratio_5_20'] = df['atr_5'] / df['atr_20']

    # Closing-price dispersion shrinking (< 1 = converging).
    df['close_std_5'] = _roll(df, 'close', 5, 'std')
    df['close_std_10'] = _roll(df, 'close', 10, 'std')
    df['vol_shrink_ratio'] = df['close_std_5'] / df['close_std_10']

    # Candle body and shadow size relative to the previous close.
    body = (df['close'] - df['open']).abs()
    shadow = df['range_today'] - body
    df['body_pct'] = body / df['pre_close']
    df['body_pct_mean_5'] = _roll(df, 'body_pct', 5)
    df['shadow_ratio'] = shadow / df['pre_close']
    df['shadow_mean_5'] = _roll(df, 'shadow_ratio', 5)

    # Tiny body with long shadows, and how often doji bars occur.
    df['small_body_shadow_ratio'] = shadow / (body + 1e-6)
    is_doji = (body / (df['range_today'] + 1e-6)) < 0.15
    df['doji_ratio_5'] = _roll(df, is_doji, 5)

    # ------------------------------------------------------------------
    # 14. Pulse probability x pulse size
    # ------------------------------------------------------------------
    # (column label, intraday-high threshold over the previous close)
    jump_levels = [(15, 0.015), (20, 0.02), (30, 0.03), (40, 0.04), (50, 0.05), (60, 0.06)]
    for label, thres in jump_levels:
        flag_col = f'high_jump_{label}'
        df[flag_col] = ((df['high'] / df['pre_close'] - 1) > thres).astype(int)
        df[f'count_high_jump_{label}_20'] = _roll(df, flag_col, 20, 'sum')
        # Mean bond return on the jump days inside the last 20 bars; NaN when
        # the window holds no jump day.
        jump_ret = df['pct_chg'].where(df[flag_col] == 1)
        df[f'mean_high_jump_{label}_20'] = _roll(df, jump_ret, 20, min_periods=1)
        # A window without jumps scores 0; the 20-bar warm-up stays NaN
        # through the count.
        df[f'score_high_jump_{label}_20'] = (
            df[f'count_high_jump_{label}_20'] * df[f'mean_high_jump_{label}_20'].fillna(0)
        )

    # Opening-gap size statistics (open_jump is an absolute value).
    for n in [5, 10]:
        df[f'open_gap_mean_{n}'] = _roll(df, 'open_jump', n)
        df[f'open_gap_max_{n}'] = _roll(df, 'open_jump', n, 'max')

    # How far today's high sits above the N-bar mean close, in close-std units.
    for n in [3, 5, 10]:
        close_mean = _roll(df, 'close', n)
        close_std = _roll(df, 'close', n, 'std')
        df[f'jump_atr_{n}'] = (df['high'] - close_mean) / (close_std + 1e-6)

    # 20-bar z-score of the daily return.
    df['zscore_pctchg_20'] = (
        (df['pct_chg'] - _roll(df, 'pct_chg', 20)) / (_roll(df, 'pct_chg', 20, 'std') + 1e-6)
    )

    # Volume spike and volume-dispersion contraction.
    df['vol_spike_ratio'] = df['vol'] / (df['vol_20_avg'] + 1e-6)
    df['vol_std_decay'] = _roll(df, 'vol', 5, 'std') / (_roll(df, 'vol', 20, 'std') + 1e-6)

    # Gap up by more than 2% and close above the open.
    df['gap_and_go_flag'] = ((df['open'] > df['pre_close'] * 1.02) & (df['close'] > df['open'])).astype(int)

    # Today's range relative to its 5-bar average.
    df['range_atr_5'] = df['range_today'] / df['atr_5']
    df['range_jump_potential'] = (df['range_atr_5'] > 1.5).astype(int)

    # ------------------------------------------------------------------
    # 15. "Won't fall": few down days, and small ones
    # ------------------------------------------------------------------
    for win in [5, 10]:
        df[f'down_freq_{win}'] = _roll_apply(df, 'pct_chg', win, lambda w: (w < 0).mean())
        df[f'down_amp_{win}'] = _roll_apply(df, 'pct_chg', win, _mean_of_negatives)
        df[f'no_fall_score_{win}'] = (1 - df[f'down_freq_{win}']) * (-df[f'down_amp_{win}'])

    # ------------------------------------------------------------------
    # 16. Candle-direction continuity (share of bull/bear flips in 5 bars)
    # ------------------------------------------------------------------
    df['kline_direction'] = np.sign(df['close'] - df['open'])
    df['kline_direction_shift1'] = df.groupby('code')['kline_direction'].shift(1)
    df['kline_flip'] = (df['kline_direction'] * df['kline_direction_shift1'] < 0).astype(int)
    df['kline_flip_ratio_5'] = _roll(df, 'kline_flip', 5)

    # Persist only after every factor exists so the file is complete.
    if output_path is not None:
        df.to_parquet(output_path)

    return df

In [16]:
# Helper functions (safe_division, rolling_downside_stats, rolling_high_jump_stats)
# ... (keep the helper functions as they were) ...
def safe_division(numerator, denominator, default=np.nan):
    """Divide ``numerator`` by ``denominator`` without producing zero-division artefacts.

    Zeros in ``denominator`` (which must offer ``.replace``, e.g. a Series)
    are turned into NaN before dividing, and every NaN in the quotient is
    then replaced by ``default``.
    """
    quotient = numerator / denominator.replace(0, np.nan)
    return quotient.fillna(default)

def rolling_downside_stats(series, window):
    """Rolling frequency and size statistics of negative observations.

    Args:
        series: Numeric Series (e.g. daily returns) for a single instrument.
        window: Rolling window length in rows (>= 1).

    Returns:
        Tuple ``(freq, mean_amp, std_amp)`` of Series indexed like ``series``:
        the share of negative values in the window, the mean of those
        negative values, and their standard deviation. Positions without
        enough observations are reported as 0.
    """
    # Require roughly 60% of the window to be populated, but at least one row.
    min_obs = max(1, int(window * 0.6))
    # A std needs two points, yet min_periods may never exceed the window:
    # without the clamp pandas raises ValueError for window == 1.
    min_obs_std = min(window, max(2, min_obs))

    is_down = series < 0
    down_series = series.where(is_down)

    freq = is_down.rolling(window, min_periods=min_obs).mean().fillna(0)
    mean_amp = down_series.rolling(window, min_periods=min_obs).mean().fillna(0)
    std_amp = down_series.rolling(window, min_periods=min_obs_std).std().fillna(0)
    return freq, mean_amp, std_amp

def rolling_high_jump_stats(jump_flag, pct_chg, window):
    """Rolling count of jump days and statistics of the returns on those days.

    Args:
        jump_flag: Series marking jump days, either boolean or 0/1 integers.
        pct_chg: Return Series aligned with ``jump_flag``.
        window: Rolling window length in rows (>= 1).

    Returns:
        Tuple ``(count, mean_ret, std_ret)`` of Series: the number of flagged
        rows in the window, and the mean / standard deviation of ``pct_chg``
        over the flagged rows. Positions without enough observations are
        reported as 0.
    """
    # Require roughly 60% of the window to be populated, but at least one row.
    min_obs = max(1, int(window * 0.6))
    # A std needs two points, yet min_periods may never exceed the window:
    # without the clamp pandas raises ValueError for window == 1.
    min_obs_std = min(window, max(2, min_obs))

    # Series.where only accepts a boolean condition; coerce 0/1 flags (and
    # treat missing flags as "no jump") so both encodings are supported.
    jump_mask = jump_flag.fillna(0).astype(bool)
    jump_returns = pct_chg.where(jump_mask)

    count = jump_flag.rolling(window, min_periods=min_obs).sum().fillna(0)
    mean_ret = jump_returns.rolling(window, min_periods=min_obs).mean().fillna(0)
    std_ret = jump_returns.rolling(window, min_periods=min_obs_std).std().fillna(0)
    return count, mean_ret, std_ret
# --------------------------------------------------------------------------

def _concat_per_code(df, func, columns=None):
    """Run ``func`` on each code's sub-frame and concatenate the results by index.

    Used instead of ``groupby('code').apply(func)`` for functions that return a
    Series/DataFrame indexed like their input: ``apply`` pivots such results
    into a wide frame when there is only one group, whereas a plain concat
    always keeps the original row labels.

    Args:
        df: Frame with a 'code' column.
        func: Callable taking one code's sub-frame and returning a Series or
            DataFrame that carries the sub-frame's index.
        columns: Column names ``func`` produces when it returns a DataFrame;
            only used to build an empty result when ``df`` has no groups.
    """
    pieces = [func(group) for _, group in df.groupby('code', sort=False)]
    if pieces:
        return pd.concat(pieces)
    if columns is None:
        return pd.Series(dtype=float)
    return pd.DataFrame(columns=list(columns), dtype=float)


def _rolling_days_since(series, window, min_periods, arg_extreme):
    """Rows elapsed between a rolling window's extreme value and its last row.

    ``arg_extreme`` is ``np.nanargmax`` or ``np.nanargmin``. Rolling ``apply``
    only calls the function once at least ``min_periods`` non-NaN values are in
    the window, so the window is never all-NaN here.
    """
    return series.rolling(window, min_periods=min_periods).apply(
        lambda values: (len(values) - 1) - arg_extreme(values),
        raw=True,  # plain ndarray windows: same result, no per-window Series
    )


def calculate_factors(df, restore_multiindex=False): # Added option to restore index
    """
    Calculates convertible bond and corresponding stock factors based on the checklist.
    Handles DataFrame with 'code' and 'trade_date' as columns OR index levels.
    Excludes rank, percentage rank (pct=True), and explicit 'score_' factors.

    The frame is sorted by ['code', 'trade_date'] and every rolling/shift
    operation is done per 'code'. Sections whose input columns are missing are
    skipped. 'close' is used unconditionally by several sections.

    Relies on ``safe_division``, ``rolling_downside_stats`` and
    ``rolling_high_jump_stats`` defined in this notebook, and on TA-Lib (``ta``).

    Args:
        df (pd.DataFrame): Input DataFrame.
        restore_multiindex (bool): If True, sets ['code', 'trade_date'] back as index at the end.

    Returns:
        pd.DataFrame: DataFrame with added factor columns.

    Raises:
        ValueError: If 'code' or 'trade_date' is found neither in the columns
            nor in the index level names.
    """
    print("Starting factor calculation...")

    # --- Input Validation and Index Handling ---
    keys = ('code', 'trade_date')
    in_columns = {key: key in df.columns for key in keys}
    in_index = {key: key in df.index.names for key in keys}
    missing = [key for key in keys if not (in_columns[key] or in_index[key])]
    if missing:
        raise ValueError(f"DataFrame must contain 'code' and 'trade_date' either as columns or index levels. Missing: {missing}")

    if all(in_columns.values()):
        print("Found 'code' and 'trade_date' in columns.")
    elif all(in_index.values()):
        print("Found 'code' and 'trade_date' in MultiIndex. Resetting index.")
        df = df.reset_index()
    else:
        # One key is a column and the other an index level.
        print("Found 'code' and 'trade_date' split between columns and index. Resetting index.")
        df = df.reset_index()

    # Work on a sorted copy so the caller's frame is never mutated.
    df = df.sort_values(by=['code', 'trade_date']).copy()
    # --- End Index Handling ---


    # 0. Data Type Preparation & Safety
    base_cols = ['high', 'low', 'close', 'open', 'vol', 'pre_close', 'pct_chg', 'turnover', 'remain_cap', 'float_share']
    stk_cols = ['high_stk', 'low_stk', 'close_stk', 'open_stk', 'vol_stk', 'pct_chg_stk']
    for col in base_cols + stk_cols:
        if col in df.columns:
            # Non-numeric entries and infinities both become NaN.
            df[col] = pd.to_numeric(df[col], errors='coerce')
            df[col] = df[col].replace([np.inf, -np.inf], np.nan)


    # === I. Basic Price & Volatility (CB) ===
    print("Calculating: I. Basic Price & Volatility (CB)")
    # TA-Lib only accepts float64 input, hence the explicit casts.
    df['ma_20'] = df.groupby('code')['close'].transform(lambda x: ta.SMA(x.astype(float), timeperiod=20))
    df['momentum_20'] = df.groupby('code')['close'].transform(lambda x: safe_division(x, x.shift(20)))
    df['volatility_20'] = df.groupby('code')['close'].transform(lambda x: x.rolling(20, min_periods=10).std())
    # Highest close seen strictly before the current row.
    df['max_value'] = df.groupby('code')['close'].transform(lambda x: x.cummax().shift(1))
    df['max_value_position'] = safe_division(df['close'], df['max_value'])
    if all(c in df.columns for c in ['high', 'low', 'close']):
        df['zhengfu'] = safe_division(df['high'] - df['low'], df['close'])
        if 'open' in df.columns:
            df['zhengfu_cha'] = safe_division(df['high'] - df['close'], (df['open'] - df['close']).abs())

        def natr_by_code(period):
            def per_code(group):
                if group[['high', 'low', 'close']].isnull().all().all():
                    return pd.Series(index=group.index, dtype=float)
                values = ta.NATR(
                    group['high'].astype(float),
                    group['low'].astype(float),
                    group['close'].astype(float),
                    timeperiod=period,
                )
                # Re-wrap so the result carries the group's index whether
                # TA-Lib hands back a Series or a bare ndarray.
                return pd.Series(np.asarray(values), index=group.index)
            return _concat_per_code(df, per_code)

        for n in [14, 1, 3, 5, 10, 20]:
            df[f'natr_{n}'] = natr_by_code(n)
    # Future return (Label): next row's high relative to today's close.
    if 'high' in df.columns:
        df['aft_high1'] = df.groupby('code')['high'].shift(-1)
        df['aft_high_cur_close'] = safe_division(df['aft_high1'] - df['close'], df['close'])


    # === II. OBV (CB) ===
    print("Calculating: II. OBV (CB)")
    if 'close' in df.columns and 'vol' in df.columns:
        def obv_per_code(group):
            if group[['close', 'vol']].isnull().all().all():
                return pd.Series(index=group.index, dtype=float)
            values = ta.OBV(group['close'].astype(float), group['vol'].astype(float))
            return pd.Series(np.asarray(values), index=group.index)

        df['obv'] = _concat_per_code(df, obv_per_code)
        df['obv_5'] = df.groupby('code')['obv'].transform(lambda x: x.rolling(5, min_periods=3).mean())
        df['obv_10'] = df.groupby('code')['obv'].transform(lambda x: x.rolling(10, min_periods=5).mean())
        df['obv_ratio_5_10'] = safe_division(df['obv_5'], df['obv_10'])


    # === III. Turnover & Cap ===
    print("Calculating: III. Turnover & Cap")
    print("  - Skipping: turnover_pct, rolling_*_avg, rolling_*_to_*_avg (rank/pct based)")
    if 'turnover' in df.columns:
        for win in [5, 10, 20, 60]:
            df[f'turnover_{win}_avg'] = df.groupby('code')['turnover'].transform(lambda x: x.rolling(window=win, min_periods=int(win*0.6)).mean())
    if all(col in df.columns for col in ['remain_cap', 'float_share', 'close_stk']):
        df['cap_float_share_rate'] = safe_division(df['remain_cap'] * 10000, (df['float_share'] * df['close_stk']))


    # === IV. Rolling Returns (CB & Stock) ===
    print("Calculating: IV. Rolling Returns (CB & Stock)")
    # NOTE(review): compounding (x + 1) assumes returns are fractions, not percent - confirm.
    if 'pct_chg' in df.columns:
        for win in [3, 5, 10, 20]:
            df[f'pct_chg_{win}'] = df.groupby('code')['pct_chg'].transform(
                lambda x: (x + 1).rolling(win, min_periods=max(1,int(win*0.6))).apply(np.prod, raw=True) - 1
            )
            df[f'bond_ret_mean_{win}'] = df.groupby('code')['pct_chg'].transform(lambda x: x.rolling(win, min_periods=max(1,int(win*0.6))).mean())

    if 'pct_chg_stk' in df.columns:
        for win in [3, 5, 10, 20]:
            df[f'pct_chg_stk_{win}'] = df.groupby('code')['pct_chg_stk'].transform(
                lambda x: (x + 1).rolling(win, min_periods=max(1,int(win*0.6))).apply(np.prod, raw=True) - 1
            )
            df[f'stk_ret_mean_{win}'] = df.groupby('code')['pct_chg_stk'].transform(lambda x: x.rolling(win, min_periods=max(1,int(win*0.6))).mean())


    # === V. Volume Avg Ratio (CB) ===
    print("Calculating: V. Volume Avg Ratio (CB)")
    if 'vol' in df.columns:
        vol_windows = [3, 5, 10, 20, 30, 60]
        for n in vol_windows:
            df[f'vol_{n}_avg'] = df.groupby('code')['vol'].transform(lambda x: x.rolling(n, min_periods=int(n*0.6)).mean())
        # Ratio of every shorter-window average to every longer-window average.
        for n in vol_windows:
            for m in vol_windows:
                if n < m:
                    df[f'vol_{n}_to_{m}'] = safe_division(df[f'vol_{n}_avg'], df[f'vol_{m}_avg'])


    # === VI. Volatility & Amplitude (CB & Stock) ===
    print("Calculating: VI. Volatility & Amplitude (CB & Stock)")
    if 'pct_chg_stk' in df.columns:
        for win in [5, 10, 20, 60]:
            df[f'bodong_{win}'] = df.groupby('code')['pct_chg_stk'].transform(lambda x: x.rolling(win, min_periods=int(win*0.6)).std() * (win ** 0.5))
        df['bodong_20_to_bodong_60'] = safe_division(df['bodong_20'], df['bodong_60'])

    if 'pct_chg' in df.columns:
        for win in [5, 10, 20, 60]:
            df[f'bodong_{win}_bd'] = df.groupby('code')['pct_chg'].transform(lambda x: x.rolling(win, min_periods=int(win*0.6)).std() * (win ** 0.5))

    if 'zhengfu' in df.columns:
        for win in [1, 5, 10, 20, 60]:
            # win=1 always yields NaN (std of a single point); kept for column compatibility.
            df[f'zhengfu_{win}'] = df.groupby('code')['zhengfu'].transform(lambda x: x.rolling(win, min_periods=max(1,int(win*0.6))).std())
            df[f'zhengfu_{win}_bodong'] = df[f'zhengfu_{win}'] * (win ** 0.5)


    # === VII. Jump & Gap (CB) ===
    print("Calculating: VII. Jump & Gap (CB)")
    print("  - Skipping: high_jump_count_*_pct, low_gap_count_*_pct (pct based)")
    if all(c in df.columns for c in ['high', 'low', 'open', 'close', 'pre_close']):
        # Temporary flag columns; dropped in the final cleanup.
        df['_high_jump_flag_temp'] = (safe_division(df['high'], df['pre_close']) - 1) > 0.025
        df['_low_gap_flag_temp'] = (safe_division(df['low'], df['pre_close']) - 1) < -0.025
        df['open_jump'] = (safe_division(df['open'], df['pre_close']) - 1).abs()
        df['gap_body_ratio'] = safe_division(df['open'] - df['pre_close'], (df['close'] - df['open']))

        for win in [20, 100, 250]:
            df[f'high_jump_count_{win}'] = df.groupby('code')['_high_jump_flag_temp'].transform(lambda x: x.rolling(window=win, min_periods=int(win*0.6)).sum())
            df[f'low_gap_count_{win}'] = df.groupby('code')['_low_gap_flag_temp'].transform(lambda x: x.rolling(window=win, min_periods=int(win*0.6)).sum())


    # === VIII. K-Line Structure (CB) ===
    print("Calculating: VIII. K-Line Structure (CB)")
    if all(c in df.columns for c in ['high', 'low', 'open', 'close']):
        high_low_diff = (df['high'] - df['low']).replace(0, np.nan)
        body_top = df[['close', 'open']].max(axis=1)
        body_bottom = df[['close', 'open']].min(axis=1)
        df['close_to_high_ratio'] = safe_division(df['close'] - df['low'], high_low_diff)
        df['close_to_low_ratio'] = safe_division(df['high'] - df['close'], high_low_diff)
        df['body_position'] = safe_division(df['close'] - df['open'], high_low_diff)
        df['upper_shadow_ratio'] = safe_division(df['high'] - body_top, high_low_diff)
        df['lower_shadow_ratio'] = safe_division(body_bottom - df['low'], high_low_diff)


    # === IX. Trend Reversal Alpha Factors (CB & Stock) ===
    print("Calculating: IX. Trend Reversal Alpha Factors (CB & Stock)")
    print("  - Skipping: alpha6, alpha18, alpha65, alpha76, alpha99 (rank based)")
    if 'vol' in df.columns:
        df['delta_vol_1'] = df.groupby('code')['vol'].transform(lambda x: x.diff(1))
    df['delta_close_1'] = df.groupby('code')['close'].transform(lambda x: x.diff(1))
    df['delta_close_5'] = df.groupby('code')['close'].transform(lambda x: x.diff(5))
    if 'vol_stk' in df.columns:
        df['delta_vol_1_stk'] = df.groupby('code')['vol_stk'].transform(lambda x: x.diff(1))
    if 'close_stk' in df.columns:
        df['delta_close_1_stk'] = df.groupby('code')['close_stk'].transform(lambda x: x.diff(1))
        df['delta_close_5_stk'] = df.groupby('code')['close_stk'].transform(lambda x: x.diff(5))

    # Alpha12
    if all(c in df.columns for c in ['delta_vol_1', 'delta_close_1']):
        df['alpha12'] = np.sign(df['delta_vol_1']) * -1 * df['delta_close_1']
    if all(c in df.columns for c in ['delta_vol_1_stk', 'delta_close_1_stk']):
        df['alpha12_stk'] = np.sign(df['delta_vol_1_stk']) * -1 * df['delta_close_1_stk']

    # Alpha83 (rows since the 30-row high): window_length - 1 - position_of_max.
    # np.nanargmax ignores NaNs and returns the position of the first maximum.
    print("  - Calculating alpha83...")
    df['alpha83'] = df.groupby('code')['close'].transform(
        lambda x: _rolling_days_since(x, 30, 15, np.nanargmax)
    )

    if 'close_stk' in df.columns:
        print("  - Calculating alpha83_stk...")
        df['alpha83_stk'] = df.groupby('code')['close_stk'].transform(
            lambda x: _rolling_days_since(x, 30, 15, np.nanargmax)
        )

    # Alpha36 (Volume-Price Correlation)
    if all(c in df.columns for c in ['vol', 'close', 'open']):
        def calc_alpha36(x):
            corr_close = x['vol'].rolling(5, min_periods=3).corr(x['close'])
            corr_open = x['vol'].rolling(5, min_periods=3).corr(x['open'])
            # A missing correlation on one side counts as 0 instead of poisoning the sum.
            return corr_close.add(corr_open, fill_value=0)
        df['alpha36'] = _concat_per_code(df, calc_alpha36)

    if all(c in df.columns for c in ['vol_stk', 'close_stk', 'open_stk']):
        def calc_alpha36_stk(x):
            corr_close = x['vol_stk'].rolling(5, min_periods=3).corr(x['close_stk'])
            corr_open = x['vol_stk'].rolling(5, min_periods=3).corr(x['open_stk'])
            return corr_close.add(corr_open, fill_value=0)
        df['alpha36_stk'] = _concat_per_code(df, calc_alpha36_stk)

    # Alpha89 = (rows since 30-row high) / (rows since 30-row low).
    # A zero denominator (low is the latest row) yields NaN via safe_division.
    print("  - Calculating alpha89 (dependent on corrected alpha83)...")
    df['argmin_close_30_idx_pos'] = df.groupby('code')['close'].transform(
        lambda x: _rolling_days_since(x, 30, 15, np.nanargmin)
    )
    df['alpha89'] = safe_division(df['alpha83'], df['argmin_close_30_idx_pos'])

    if 'close_stk' in df.columns:
        df['argmin_close_30_idx_pos_stk'] = df.groupby('code')['close_stk'].transform(
            lambda x: _rolling_days_since(x, 30, 15, np.nanargmin)
        )
        df['alpha89_stk'] = safe_division(df['alpha83_stk'], df['argmin_close_30_idx_pos_stk'])

    # Alpha92 (Price Change * Volume)
    if all(c in df.columns for c in ['delta_close_5', 'close', 'vol']):
        df['alpha92'] = safe_division(df['delta_close_5'], df['close']) * df['vol']
    if all(c in df.columns for c in ['delta_close_5_stk', 'close_stk', 'vol_stk']):
        df['alpha92_stk'] = safe_division(df['delta_close_5_stk'], df['close_stk']) * df['vol_stk']


    # === X. Stock & CB Linkage ===
    print("Calculating: X. Stock & CB Linkage")
    if 'pct_chg' in df.columns and 'pct_chg_stk' in df.columns:
        df['stk_up_bond_flat'] = ((df['pct_chg_stk'] > 0.03) & (df['pct_chg'] < 0.01)).astype(int)
        df['stk_down_bond_weak'] = ((df['pct_chg_stk'] < -0.03) & (df['pct_chg'] < df['pct_chg_stk'])).astype(int)
        # Lagged vars
        df['pct_chg_stk_lag1'] = df.groupby('code')['pct_chg_stk'].shift(1)
        df['pct_chg_stk_lag2'] = df.groupby('code')['pct_chg_stk'].shift(2)
        df['bond_hold_stk_rebound'] = ((df['pct_chg_stk_lag1'] < -0.03) & (df['pct_chg_stk'] > 0.01) & (df['pct_chg'] > 0.005)).astype(int)
        df['stk_down_then_up'] = ((df['pct_chg_stk_lag2'] < -0.03) & (df['pct_chg_stk'] > 0.02)).astype(int)
        df['bond_rebound'] = (df['pct_chg'] > 0.01).astype(int)
        df['bond_follow_stk_rebound'] = ((df['stk_down_then_up'] == 1) & (df['bond_rebound'] == 1)).astype(int)
        # Multi-day linkage
        if all(c in df.columns for c in ['stk_ret_mean_3', 'bond_ret_mean_3']):
            df['stk_up_bond_flat_3'] = ((df['stk_ret_mean_3'] > 0.01) & (df['bond_ret_mean_3'] < 0.003)).astype(int)
        if all(c in df.columns for c in ['stk_ret_mean_5', 'bond_ret_mean_5']):
            df['stk_up_bond_flat_5'] = ((df['stk_ret_mean_5'] > 0.015) & (df['bond_ret_mean_5'] < 0.005)).astype(int)


    # === XI. Horizontal & Vertical Deviation ===
    print("Calculating: XI. Horizontal & Vertical Deviation")
    print("  - Skipping: cb_vs_stk_ret_rank_diff (rank based)")
    for win in [3, 5, 10]:
        if f'bond_ret_mean_{win}' in df.columns and f'stk_ret_mean_{win}' in df.columns:
            df[f'dev_bond_vs_stk_{win}'] = df[f'bond_ret_mean_{win}'] - df[f'stk_ret_mean_{win}']

    # Vertical requires longer term means calculated here
    if 'pct_chg' in df.columns:
        df['bond_ret_mean_20'] = df.groupby('code')['pct_chg'].transform(lambda x: x.rolling(20, min_periods=12).mean())
        df['bond_ret_mean_30'] = df.groupby('code')['pct_chg'].transform(lambda x: x.rolling(30, min_periods=18).mean())
        df['dev_bond_short3_long20'] = df['bond_ret_mean_3'] - df['bond_ret_mean_20']
        df['dev_bond_short5_long30'] = df['bond_ret_mean_5'] - df['bond_ret_mean_30']

    if 'pct_chg_stk' in df.columns:
        df['stk_ret_mean_20'] = df.groupby('code')['pct_chg_stk'].transform(lambda x: x.rolling(20, min_periods=12).mean())
        df['stk_ret_mean_30'] = df.groupby('code')['pct_chg_stk'].transform(lambda x: x.rolling(30, min_periods=18).mean())
        df['dev_stk_short3_long20'] = df['stk_ret_mean_3'] - df['stk_ret_mean_20']
        df['dev_stk_short5_long30'] = df['stk_ret_mean_5'] - df['stk_ret_mean_30']


    # === XII. Risk & Drawdown (CB) ===
    print("Calculating: XII. Risk & Drawdown (CB)")
    df['cb_low_5'] = df.groupby('code')['close'].transform(lambda x: x.rolling(5, min_periods=3).min())
    df['cb_dev_from_low_5'] = safe_division(df['close'] - df['cb_low_5'], df['cb_low_5'])
    df['cb_close_std_5'] = df.groupby('code')['close'].transform(lambda x: x.rolling(5, min_periods=3).std())
    df['cb_high_5'] = df.groupby('code')['close'].transform(lambda x: x.rolling(5, min_periods=3).max())
    df['cb_drawdown_5'] = safe_division(df['close'] - df['cb_high_5'], df['cb_high_5'])
    # cb_dd_prob_estimate is computed in section XV.


    # === XIII. Consolidation (CB) ===
    print("Calculating: XIII. Consolidation (CB)")
    if all(c in df.columns for c in ['high', 'low', 'close', 'open', 'pre_close']):
        df['range_hl'] = df['high'] - df['low']
        # NOTE: "atr_*" here is the rolling mean of high - low, not TA-Lib's true range.
        df['atr_5'] = df.groupby('code')['range_hl'].transform(lambda x: x.rolling(5, min_periods=3).mean())
        df['atr_10'] = df.groupby('code')['range_hl'].transform(lambda x: x.rolling(10, min_periods=6).mean())
        df['atr_20'] = df.groupby('code')['range_hl'].transform(lambda x: x.rolling(20, min_periods=12).mean())
        df['atr_decay_5_10'] = safe_division(df['atr_5'], df['atr_10'])
        df['atr_decay_5_20'] = safe_division(df['atr_5'], df['atr_20'])

        df['close_std_10'] = df.groupby('code')['close'].transform(lambda x: x.rolling(10, min_periods=6).std())
        df['vol_shrink_ratio'] = safe_division(df['cb_close_std_5'], df['close_std_10'])

        df['body_abs'] = (df['close'] - df['open']).abs()
        df['body_pct'] = safe_division(df['body_abs'], df['pre_close'])
        df['body_pct_mean_5'] = df.groupby('code')['body_pct'].transform(lambda x: x.rolling(5, min_periods=3).mean())
        df['shadow'] = df['range_hl'] - df['body_abs']
        df['shadow_ratio'] = safe_division(df['shadow'], df['pre_close'])
        df['shadow_mean_5'] = df.groupby('code')['shadow_ratio'].transform(lambda x: x.rolling(5, min_periods=3).mean())
        # A zero-size body gets the sentinel ratio 100 rather than NaN.
        df['small_body_shadow_ratio'] = safe_division(df['shadow'], df['body_abs'], default=100)

        df['is_doji'] = safe_division(df['body_abs'], df['range_hl']) < 0.15
        df['doji_ratio_5'] = df.groupby('code')['is_doji'].transform(lambda x: x.rolling(5, min_periods=3).mean())


    # === XIV. Impulse & Momentum (CB) ===
    print("Calculating: XIV. Impulse & Momentum (CB)")
    print("  - Skipping: score_high_jump_* (score based)")
    if all(c in df.columns for c in ['high', 'pre_close', 'pct_chg']):
        thresholds = [0.015, 0.02, 0.03, 0.04, 0.05, 0.06]
        windows = [20, 120, 250, 500]
        # Loop-invariant: intraday high relative to the previous close.
        high_vs_pre_close = safe_division(df['high'], df['pre_close']) - 1

        for thres in thresholds:
            # round() guards the column name against float error in thres * 1000.
            thres_name = int(round(thres * 1000))
            flag_col = f'high_jump_{thres_name}_flag'
            df[flag_col] = high_vs_pre_close > thres

            for win in windows:
                print(f"  - Calculating high_jump stats for thres={thres}, win={win}...")
                count_col = f'hj_count_{thres_name}_{win}'
                mean_col = f'hj_mean_{thres_name}_{win}'
                std_col = f'hj_std_{thres_name}_{win}'

                def calc_hj_stats(group):
                    count_s, mean_s, std_s = rolling_high_jump_stats(group[flag_col], group['pct_chg'], win)
                    return pd.DataFrame({count_col: count_s, mean_col: mean_s, std_col: std_s})

                stats_df = _concat_per_code(df, calc_hj_stats, columns=[count_col, mean_col, std_col])
                # Assign by row label instead of df.join: re-running on a frame
                # that already has these columns then overwrites them rather
                # than raising "columns overlap".
                for col in (count_col, mean_col, std_col):
                    df[col] = stats_df[col]


    if 'open_jump' in df.columns:
        for n in [5, 10]:
            df[f'open_gap_mean_{n}'] = df.groupby('code')['open_jump'].transform(lambda x: x.rolling(n, min_periods=int(n*0.6)).mean())
            df[f'open_gap_max_{n}'] = df.groupby('code')['open_jump'].transform(lambda x: x.rolling(n, min_periods=int(n*0.6)).max())

    if 'high' in df.columns and 'close' in df.columns:
        for n in [3, 5, 10]:
            close_mean_n = df.groupby('code')['close'].transform(lambda x: x.rolling(n, min_periods=max(1,int(n*0.6))).mean())
            close_std_n = df.groupby('code')['close'].transform(lambda x: x.rolling(n, min_periods=max(1,int(n*0.6))).std())
            df[f'jump_atr_{n}'] = safe_division(df['high'] - close_mean_n, close_std_n)

    if 'pct_chg' in df.columns:
        pct_mean_20 = df.groupby('code')['pct_chg'].transform(lambda x: x.rolling(20, min_periods=12).mean())
        pct_std_20 = df.groupby('code')['pct_chg'].transform(lambda x: x.rolling(20, min_periods=12).std())
        df['zscore_pctchg_20'] = safe_division(df['pct_chg'] - pct_mean_20, pct_std_20)

    if 'vol' in df.columns:
        df['vol_ma20'] = df.groupby('code')['vol'].transform(lambda x: x.rolling(20, min_periods=12).mean())
        df['vol_spike_ratio'] = safe_division(df['vol'], df['vol_ma20'], default=1.0)
        df['vol_std_5'] = df.groupby('code')['vol'].transform(lambda x: x.rolling(5, min_periods=3).std())
        df['vol_std_20'] = df.groupby('code')['vol'].transform(lambda x: x.rolling(20, min_periods=12).std())
        df['vol_std_decay'] = safe_division(df['vol_std_5'], df['vol_std_20'])

    if 'range_hl' in df.columns and 'atr_5' in df.columns:
        df['range_today'] = df['range_hl']
        df['range_atr_5'] = safe_division(df['range_today'], df['atr_5'])
        df['range_jump_potential'] = (df['range_atr_5'] > 1.5).astype(int)

    if all(c in df.columns for c in ['open', 'pre_close', 'close']):
        df['gap_and_go_flag'] = ((safe_division(df['open'], df['pre_close']) - 1 > 0.01) & (df['close'] > df['open'])).astype(int)


    # === XV. Downside Resilience (CB) ===
    print("Calculating: XV. Downside Resilience (CB)")
    print("  - Skipping: no_fall_score_* (score based)")
    if 'pct_chg' in df.columns:
        windows = [20, 60, 120, 250]

        for win in windows:
            print(f"  - Calculating downside stats for win={win}...")
            stat_cols = [f'down_freq_{win}', f'down_amp_mean_{win}', f'down_amp_std_{win}']

            def calc_downside(group):
                freq_s, mean_s, std_s = rolling_downside_stats(group['pct_chg'], win)
                return pd.DataFrame(dict(zip(stat_cols, (freq_s, mean_s, std_s))))

            stats_df = _concat_per_code(df, calc_downside, columns=stat_cols)
            for col in stat_cols:
                df[col] = stats_df[col]

        # Original cb_dd_prob_estimate (10d lag1 based)
        df['cb_ret_lag1'] = df.groupby('code')['pct_chg'].shift(1)
        df['cb_fall_flag'] = (df['cb_ret_lag1'] < 0).astype(int)
        df['cb_fall_freq_10'] = df.groupby('code')['cb_fall_flag'].transform(lambda x: x.rolling(10, min_periods=6).mean())
        # Mean of the negative lagged returns in the window, 0 when there are none.
        df['cb_fall_amp_10'] = df.groupby('code')['cb_ret_lag1'].transform(
            lambda x: x.rolling(10, min_periods=6).apply(lambda s: s[s < 0].mean() if (s < 0).any() else 0, raw=True)
        )
        df['cb_dd_prob_estimate'] = df['cb_fall_freq_10'] * df['cb_fall_amp_10']


    # === XVI. K-Line Structure Continuity ===
    print("Calculating: XVI. K-Line Structure Continuity")
    if all(c in df.columns for c in ['close', 'open']):
        df['kline_direction'] = np.sign(df['close'] - df['open'])
        df['kline_direction_shift1'] = df.groupby('code')['kline_direction'].shift(1)
        # A flip is a candle whose direction is opposite to the previous one.
        df['kline_flip'] = (df['kline_direction'] * df['kline_direction_shift1'] < 0).astype(int)
        df['kline_flip_ratio_5'] = df.groupby('code')['kline_flip'].transform(lambda x: x.rolling(5, min_periods=3).mean())


    # --- Final Cleanup & Optional Index Restore ---
    # Drop temporary columns if any (like _high_jump_flag_temp)
    temp_cols = [col for col in df.columns if col.startswith('_') and col.endswith('_temp')]
    df = df.drop(columns=temp_cols, errors='ignore')

    if restore_multiindex:
        print("Restoring MultiIndex ['code', 'trade_date']...")
        df = df.set_index(['code', 'trade_date'])

    print("Factor calculation finished.")
    return df

In [18]:
# --- Helper Functions ---
def safe_division(numerator, denominator, default=np.nan):
    """Divide ``numerator`` by ``denominator``, returning ``default`` where the result is undefined.

    Inputs are coerced to numeric first (unparseable entries become NaN). A
    zero denominator is treated as missing, and any NaN or infinite quotient is
    replaced by ``default``.

    Args:
        numerator: Scalar, list/tuple, numpy array, pandas Series or DataFrame.
        denominator: Same kinds as ``numerator``.
        default: Replacement for undefined quotients.

    Returns:
        A pandas object when either operand is one, a numpy array when the
        operands are plain array-likes, otherwise a scalar. If the operands
        cannot be divided at all, a Series of ``default`` (indexed like the
        first pandas operand) or ``default`` itself.
    """
    def _to_numeric(value):
        # pd.to_numeric only accepts 1-D input, so coerce DataFrames column-wise.
        if isinstance(value, pd.DataFrame):
            return value.apply(pd.to_numeric, errors='coerce')
        if hasattr(value, '__iter__'):
            return pd.to_numeric(value, errors='coerce')
        return value

    try:
        numerator = _to_numeric(numerator)
        denominator = _to_numeric(denominator)

        if isinstance(denominator, (pd.Series, pd.DataFrame)):
            denominator = denominator.replace(0, np.nan)
        elif hasattr(denominator, '__iter__'):
            # Lists/tuples/arrays come back from pd.to_numeric as ndarrays,
            # which have no ``.replace``.
            denom_values = np.asarray(denominator, dtype=float)
            denominator = np.where(denom_values == 0, np.nan, denom_values)
        elif isinstance(denominator, (int, float, np.number)) and denominator == 0:
            denominator = np.nan

        result = numerator / denominator

        if isinstance(result, (pd.Series, pd.DataFrame)):
            # Replace inf/-inf that might result from large numbers / small numbers
            result = result.replace([np.inf, -np.inf], np.nan)
            return result.fillna(default)
        if hasattr(result, '__iter__'):
            result_values = np.asarray(result, dtype=float)
            return np.where(np.isfinite(result_values), result_values, default)
        if np.isinf(result) or np.isnan(result):
            return default
        return result

    except (TypeError, ValueError):
        # Handle cases where inputs cannot be converted to numeric
        if hasattr(numerator, 'shape'):
            return pd.Series(default, index=getattr(numerator, 'index', None), dtype=float)
        elif hasattr(denominator, 'shape'):
            return pd.Series(default, index=getattr(denominator, 'index', None), dtype=float)
        else:
            return default

def ts_rank(series, window):
    """Percentile rank (0-1] of the last element of ``series`` among all its elements.

    ``window`` is accepted for call-site symmetry but is not used: the whole
    ``series`` passed in is ranked. Returns NaN when ``series`` holds no
    non-null value.
    """
    has_data = series.notna().any()
    if not has_data:
        return np.nan
    pct_ranks = series.rank(pct=True)
    return pct_ranks.iloc[-1]

# Rolling correlation helper
def rolling_corr(x_series, y_series, window, min_periods):
    """Rolling correlation of ``x_series`` with ``y_series`` over ``window`` rows."""
    windowed_x = x_series.rolling(window=window, min_periods=min_periods)
    return windowed_x.corr(y_series)

# Rolling covariance helper
def rolling_cov(x_series, y_series, window, min_periods):
    """Rolling covariance of ``x_series`` with ``y_series`` over ``window`` rows."""
    windowed_x = x_series.rolling(window=window, min_periods=min_periods)
    return windowed_x.cov(y_series)

# Rolling rank helper (needed for Alpha 65, 99 inner rank)
def rolling_series_rank(series, window, min_periods):
    """Placeholder for a within-window rolling rank; performs no computation.

    Ranking inside the rolling window is not the same as a daily
    (cross-sectional) rank. For Alpha 65/99 the rank is applied
    cross-sectionally before rolling, so this helper is not used by the
    current alphas and is kept only in case another rolling-rank concept
    needs it.

    Returns:
        None, always.
    """
    # Sketch of the within-window variant, left disabled:
    # return series.rolling(window=window, min_periods=min_periods).apply(lambda x: x.rank().iloc[-1], raw=False)
    return None

# Assume 'natr' function uses TA-Lib's NATR if not provided externally
def apply_natr(group, n):
    """Compute TA-Lib NATR with period ``n`` for one group's rows.

    Returns an all-NaN Series indexed like ``group`` when the group has no
    price data at all or fewer than ``n`` rows; otherwise returns whatever
    ``ta.NATR`` yields for the float-cast high/low/close columns.
    """
    price_cols = ['high', 'low', 'close']
    prices = group[price_cols]
    if prices.isnull().all().all() or len(group) < n:
        return pd.Series(np.nan, index=group.index)
    # TA-Lib requires float input.
    high, low, close = (prices[name].astype(float) for name in price_cols)
    return ta.NATR(high, low, close, timeperiod=n)

def _apply_per_code(df, func, *args, **kwargs):
    """Run ``func`` on each 'code' group and return one Series on the original row labels.

    Used instead of ``df.groupby('code').apply(func)`` because, when the frame
    contains a single code, pandas stacks the lone per-group Series into a
    one-row wide DataFrame, which cannot be assigned to a column.

    Args:
        df (pd.DataFrame): Frame with a 'code' column.
        func (callable): Called as ``func(group, *args, **kwargs)``; expected to
            return a Series (or array-like) with one value per group row.

    Returns:
        pd.Series: Concatenation of the per-group results, indexed by the
        groups' own row labels; an all-NaN Series on ``df.index`` when there
        are no groups.
    """
    pieces = []
    for _, group in df.groupby('code'):
        piece = func(group, *args, **kwargs)
        if not isinstance(piece, pd.Series):
            # Array-like results carry no labels; attach the group's labels.
            piece = pd.Series(piece, index=group.index)
        pieces.append(piece)
    if not pieces:
        return pd.Series(np.nan, index=df.index, dtype=float)
    return pd.concat(pieces)


# --- Main Factor Calculation Function ---
def calculate_factors(df, restore_multiindex=False):
    """
    Compute derived factors for convertible bonds and their underlying stocks
    (including rank-based alpha factors).

    Accepts a DataFrame with 'code' and 'trade_date' either as columns or as
    MultiIndex levels. Each factor section runs only when the columns it needs
    are present, so partial inputs yield a partial factor set instead of an error.
    Relies on the module-level helpers ``safe_division``, ``rolling_corr``,
    ``rolling_cov``, ``ts_rank``, ``apply_natr`` and on TA-Lib (``ta``).

    NOTE(review): thresholds such as ``pct_chg > 0.01`` and the ``(1 + r)``
    compounding suggest returns are expected as decimal fractions, not
    percentages - confirm against the data source.

    Args:
        df (pd.DataFrame): Input data.
        restore_multiindex (bool): If True and the input had the
            ['code', 'trade_date'] MultiIndex, set it back as the index at the end.

    Returns:
        pd.DataFrame: A copy sorted by ['code', 'trade_date'] with factor columns added.

    Raises:
        ValueError: If 'code' and 'trade_date' are found neither in the columns
            nor in the MultiIndex levels.
    """
    print("开始因子计算...")

    # --- Input validation and index handling ---
    if isinstance(df.index, pd.MultiIndex) and all(name in df.index.names for name in ['code', 'trade_date']):
        print("检测到 'code' 和 'trade_date' 在 MultiIndex 中，正在重置索引...")
        df = df.reset_index()
        is_multiindex_input = True
    elif all(col in df.columns for col in ['code', 'trade_date']):
        print("检测到 'code' 和 'trade_date' 在列中。")
        is_multiindex_input = False
    else:
        raise ValueError("DataFrame 必须包含 'code' 和 'trade_date'，可以是在列中或作为 MultiIndex 的层级。")

    # Every per-code rolling calculation below relies on this ordering.
    df = df.sort_values(by=['code', 'trade_date']).copy()
    # --- End index handling ---


    # 0. Dtype preparation and sanity checks
    print("步骤 0: 准备数据类型...")
    numeric_cols = ['high', 'low', 'close', 'open', 'vol', 'pre_close', 'pct_chg', 'turnover', 'remain_cap', 'float_share',
                    'high_stk', 'low_stk', 'close_stk', 'open_stk', 'vol_stk', 'pct_chg_stk']
    for col in numeric_cols:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors='coerce')
            df[col] = df[col].replace([np.inf, -np.inf], np.nan)


    # === I. Basic price and volatility factors (bond itself) ===
    print("计算: I. 基本价格与波动类因子（转债本身）")
    if all(c in df.columns for c in ['high', 'low', 'close']):
        # NATR
        print("  - 计算 NATR...")
        df['natr_14'] = _apply_per_code(df, apply_natr, n=14)
        for n in [1, 3, 5, 10, 20]:
            df[f'natr_{n}'] = _apply_per_code(df, apply_natr, n=n)

        # Amplitude
        df['zhengfu'] = safe_division(df['high'] - df['low'], df['close'])
        if 'open' in df.columns:
            df['zhengfu_cha'] = safe_division(df['high'] - df['close'], (df['open'] - df['close']).abs())

    if 'close' in df.columns:
        # MA, Momentum, Volatility
        print("  - 计算 MA, Momentum, Volatility...")
        df['ma_20'] = df.groupby('code')['close'].transform(lambda x: ta.SMA(x.astype(float), timeperiod=20))
        df['momentum_20'] = df.groupby('code')['close'].transform(lambda x: safe_division(x, x.shift(20)))
        df['volatility_20'] = df.groupby('code')['close'].transform(lambda x: x.rolling(20, min_periods=10).std())
        # Max value: running high up to (not including) the current row
        df['max_value'] = df.groupby('code')['close'].transform(lambda x: x.cummax().shift(1))
        df['max_value_position'] = safe_division(df['close'], df['max_value'])

    # Next-day take-profit feature (label): shift(-1) reads the NEXT row's high.
    if 'high' in df.columns and 'close' in df.columns:
        print("  - 计算次日止盈特征...")
        df['aft_high1'] = df.groupby('code')['high'].shift(-1)
        df['aft_high_cur_close'] = safe_division(df['aft_high1'] - df['close'], df['close'])


    # === II. OBV volume indicators (bond) ===
    print("计算: II. OBV量能指标（转债）")
    if all(c in df.columns for c in ['close', 'vol']):
        df['obv'] = _apply_per_code(
            df,
            lambda x: ta.OBV(x['close'].astype(float), x['vol'].astype(float)) if not x[['close','vol']].isnull().all().all() else pd.Series(index=x.index, dtype=float)
        )
        if 'obv' in df.columns:
            df['obv_5'] = df.groupby('code')['obv'].transform(lambda x: x.rolling(5, min_periods=3).mean())
            df['obv_10'] = df.groupby('code')['obv'].transform(lambda x: x.rolling(10, min_periods=5).mean())
            df['obv_ratio_5_10'] = safe_division(df['obv_5'], df['obv_10'])


    # === III. Turnover and market-cap factors ===
    print("计算: III. 换手与市值类因子")
    if 'turnover' in df.columns:
        print("  - 计算 turnover 相关因子...")
        # Cross-sectional turnover percentile; intermediate input of the rolling_*_avg factors
        df['turnover_pct_temp'] = df.groupby('trade_date')['turnover'].rank(pct=True)
        # Mean turnover
        for win in [5, 10, 20, 60]:
            df[f'turnover_{win}_avg'] = df.groupby('code')['turnover'].transform(lambda x: x.rolling(window=win, min_periods=int(win*0.6)).mean())
        # Rolling mean of the turnover percentile (depends on turnover_pct_temp)
        if 'turnover_pct_temp' in df.columns:
            for win in [1, 5, 20, 50]:
                df[f'rolling_{win}_avg'] = df.groupby('code')['turnover_pct_temp'].transform(lambda x: x.rolling(window=win, min_periods=max(1,int(win*0.6))).mean())
            # Ratios between turnover-percentile means
            if all(c in df.columns for c in ['rolling_1_avg', 'rolling_5_avg', 'rolling_20_avg', 'rolling_50_avg']):
                df['rolling_1_to_5_avg'] = safe_division(df['rolling_1_avg'], df['rolling_5_avg'])
                df['rolling_5_to_20_avg'] = safe_division(df['rolling_5_avg'], df['rolling_20_avg'])
                df['rolling_20_to_50_avg'] = safe_division(df['rolling_20_avg'], df['rolling_50_avg'])
            # Drop intermediate temp column
            df = df.drop(columns=['turnover_pct_temp'])
        else:
            print("  警告: 无法计算 rolling_avg 等因子，因为 turnover_pct_temp 未成功计算。")

    if all(col in df.columns for col in ['remain_cap', 'float_share', 'close_stk']):
        print("  - 计算 cap_float_share_rate...")
        # NOTE(review): the * 10000 looks like a unit conversion for remain_cap - confirm units.
        df['cap_float_share_rate'] = safe_division(df['remain_cap'] * 10000, (df['float_share'] * df['close_stk']))


    # === IV. Interval returns (bond and stock) ===
    print("计算: IV. 区间收益率（转债与股票）")
    # The *_ret_mean_* names are reused by the deviation factors in sections X / XI
    windows_ret = [3, 5, 10, 20]
    if 'pct_chg' in df.columns:
        print("  - 计算转债区间收益率...")
        for win in windows_ret:
            # Compounded return over the window
            df[f'pct_chg_{win}'] = df.groupby('code')['pct_chg'].transform(
                lambda x: (x + 1).rolling(win, min_periods=max(1,int(win*0.6))).apply(np.prod, raw=True) - 1
            )
            # Arithmetic mean return over the window
            df[f'bond_ret_mean_{win}'] = df.groupby('code')['pct_chg'].transform(lambda x: x.rolling(win, min_periods=max(1,int(win*0.6))).mean())

    if 'pct_chg_stk' in df.columns:
        print("  - 计算股票区间收益率...")
        for win in windows_ret:
            df[f'pct_chg_stk_{win}'] = df.groupby('code')['pct_chg_stk'].transform(
                lambda x: (x + 1).rolling(win, min_periods=max(1,int(win*0.6))).apply(np.prod, raw=True) - 1
            )
            df[f'stk_ret_mean_{win}'] = df.groupby('code')['pct_chg_stk'].transform(lambda x: x.rolling(win, min_periods=max(1,int(win*0.6))).mean())


    # === V. Volume moving-average ratio factors (bond) ===
    print("计算: V. 成交量均值比因子（转债）")
    if 'vol' in df.columns:
        vol_windows = [3, 5, 10, 20, 30, 60]
        print("  - 计算均量...")
        for n in vol_windows:
            df[f'vol_{n}_avg'] = df.groupby('code')['vol'].transform(lambda x: x.rolling(n, min_periods=int(n*0.6)).mean())
        print("  - 计算量比...")
        # One ratio per (short, long) window pair
        for n in vol_windows:
            for m in vol_windows:
                if n < m and f'vol_{n}_avg' in df.columns and f'vol_{m}_avg' in df.columns:
                    df[f'vol_{n}_to_{m}'] = safe_division(df[f'vol_{n}_avg'], df[f'vol_{m}_avg'])


    # === VI. Volatility and amplitude (bond and stock) ===
    print("计算: VI. 波动率与振幅（转债与股票）")
    bodong_windows = [5, 10, 20, 60]
    if 'pct_chg_stk' in df.columns:
        print("  - 计算股票波动率...")
        for win in bodong_windows:
            # Rolling std scaled by sqrt(window)
            df[f'bodong_{win}'] = df.groupby('code')['pct_chg_stk'].transform(lambda x: x.rolling(win, min_periods=int(win*0.6)).std() * (win ** 0.5))
        if all(c in df.columns for c in ['bodong_20', 'bodong_60']):
            df['bodong_20_to_bodong_60'] = safe_division(df['bodong_20'], df['bodong_60'])

    if 'pct_chg' in df.columns:
        print("  - 计算转债波动率...")
        for win in bodong_windows:
            df[f'bodong_{win}_bd'] = df.groupby('code')['pct_chg'].transform(lambda x: x.rolling(win, min_periods=int(win*0.6)).std() * (win ** 0.5))

    if 'zhengfu' in df.columns:
        print("  - 计算振幅波动...")
        # NOTE: for win == 1 the sample std of a single observation is NaN,
        # so zhengfu_1 / zhengfu_1_bodong are always NaN.
        for win in [1, 5, 10, 20, 60]:
            df[f'zhengfu_{win}'] = df.groupby('code')['zhengfu'].transform(lambda x: x.rolling(win, min_periods=max(1,int(win*0.6))).std())
            df[f'zhengfu_{win}_bodong'] = df[f'zhengfu_{win}'] * (win ** 0.5)


    # === VII. Jump and gap factors (bond) ===
    print("计算: VII. 跳空与缺口类因子（转债）")
    if all(c in df.columns for c in ['high', 'low', 'open', 'close', 'pre_close']):
        print("  - 计算基础跳空/缺口指标...")
        # Boolean flags; rows with missing prices compare False
        df['high_jump'] = (safe_division(df['high'], df['pre_close']) - 1) > 0.025 # Used in count
        df['low_gap'] = (safe_division(df['low'], df['pre_close']) - 1) < -0.025   # Used in count
        df['open_jump'] = (safe_division(df['open'], df['pre_close']) - 1).abs()
        df['gap_body_ratio'] = safe_division(df['open'] - df['pre_close'], (df['close'] - df['open']))

        jump_windows = [20, 100, 250]
        print("  - 计算跳空/缺口统计...")
        if 'high_jump' in df.columns:
            for win in jump_windows:
                df[f'high_jump_count_{win}'] = df.groupby('code')['high_jump'].transform(lambda x: x.rolling(window=win, min_periods=int(win*0.6)).sum())
                # Cross-sectional percentile of the count
                df[f'high_jump_count_{win}_pct'] = df.groupby('trade_date')[f'high_jump_count_{win}'].rank(pct=True)
        if 'low_gap' in df.columns:
            for win in jump_windows:
                df[f'low_gap_count_{win}'] = df.groupby('code')['low_gap'].transform(lambda x: x.rolling(window=win, min_periods=int(win*0.6)).sum())
                df[f'low_gap_count_{win}_pct'] = df.groupby('trade_date')[f'low_gap_count_{win}'].rank(pct=True)


    # === VIII. Candlestick structure factors (bond) ===
    print("计算: VIII. K线结构因子（转债）")
    if all(c in df.columns for c in ['high', 'low', 'open', 'close']):
        # Inverse of the day's range; NaN when high == low, so the ratios below become NaN too
        high_low_diff = safe_division(1.0, df['high'] - df['low'])
        df['close_to_high_ratio'] = (df['close'] - df['low']) * high_low_diff
        df['close_to_low_ratio'] = (df['high'] - df['close']) * high_low_diff
        df['body_position'] = (df['close'] - df['open']) * high_low_diff
        df['upper_shadow_ratio'] = (df['high'] - df[['close', 'open']].max(axis=1)) * high_low_diff
        df['lower_shadow_ratio'] = (df[['close', 'open']].min(axis=1) - df['low']) * high_low_diff


    # === IX. Trend-reversal alpha factors (bond and stock) ===
    print("计算: IX. 趋势反转类Alpha因子（转债与股票）")
    # --- Prerequisites ---
    # Each prerequisite is guarded so a frame lacking 'close' or 'vol' skips the
    # dependent alphas instead of raising KeyError. The guards are interleaved
    # to keep the original column order.
    print("  - 计算 Alpha 因子前置数据...")
    if 'close' in df.columns:
        df['delta_close_1'] = df.groupby('code')['close'].transform(lambda x: x.diff(1))
    if 'vol' in df.columns:
        df['delta_vol_1'] = df.groupby('code')['vol'].transform(lambda x: x.diff(1))
    if 'close' in df.columns:
        df['delta_close_5'] = df.groupby('code')['close'].transform(lambda x: x.diff(5))
        df['delta_close_10'] = df.groupby('code')['close'].transform(lambda x: x.diff(10))
        df['mean_close_20'] = df.groupby('code')['close'].transform(lambda x: x.rolling(20, min_periods=10).mean())

    if 'close_stk' in df.columns:
        df['delta_close_1_stk'] = df.groupby('code')['close_stk'].transform(lambda x: x.diff(1))
        df['delta_close_5_stk'] = df.groupby('code')['close_stk'].transform(lambda x: x.diff(5))
        df['delta_close_10_stk'] = df.groupby('code')['close_stk'].transform(lambda x: x.diff(10))
        df['mean_close_20_stk'] = df.groupby('code')['close_stk'].transform(lambda x: x.rolling(20, min_periods=10).mean())
    if 'vol_stk' in df.columns:
        df['delta_vol_1_stk'] = df.groupby('code')['vol_stk'].transform(lambda x: x.diff(1))

    # --- Cross-sectional ranks (can be slow) ---
    print("  - 计算截面排名 (可能较慢)...")
    if 'delta_close_10' in df.columns:
        df['rank_delta_close_10'] = df.groupby('trade_date')['delta_close_10'].rank()
    if 'vol' in df.columns:
        df['rank_vol'] = df.groupby('trade_date')['vol'].rank()
    if 'mean_close_20' in df.columns:
        df['rank_mean_close_20'] = df.groupby('trade_date')['mean_close_20'].rank()
    if 'close' in df.columns: # Rank close needed for Alpha65, 99
        df['rank_close'] = df.groupby('trade_date')['close'].rank()

    if 'delta_close_10_stk' in df.columns:
        df['rank_delta_close_10_stk'] = df.groupby('trade_date')['delta_close_10_stk'].rank()
    if 'vol_stk' in df.columns:
        df['rank_vol_stk'] = df.groupby('trade_date')['vol_stk'].rank()
    if 'mean_close_20_stk' in df.columns:
        df['rank_mean_close_20_stk'] = df.groupby('trade_date')['mean_close_20_stk'].rank()
    if 'close_stk' in df.columns: # Rank close_stk needed for Alpha65_stk, 99_stk
        df['rank_close_stk'] = df.groupby('trade_date')['close_stk'].rank()

    # --- Alpha calculations ---
    # Alpha6: -corr(rank(delta(close, 10)), rank(vol), 10)
    print("  - 计算 Alpha6...")
    if all(c in df.columns for c in ['rank_delta_close_10', 'rank_vol']):
        df['alpha6'] = _apply_per_code(
            df, lambda x: rolling_corr(x['rank_delta_close_10'], x['rank_vol'], 10, 6) * -1
        )
    if all(c in df.columns for c in ['rank_delta_close_10_stk', 'rank_vol_stk']):
        df['alpha6_stk'] = _apply_per_code(
            df, lambda x: rolling_corr(x['rank_delta_close_10_stk'], x['rank_vol_stk'], 10, 6) * -1
        )

    # Alpha12: sign(delta(vol, 1)) * -1 * delta(close, 1)
    print("  - 计算 Alpha12...")
    if all(c in df.columns for c in ['delta_vol_1', 'delta_close_1']):
        df['alpha12'] = np.sign(df['delta_vol_1']) * -1 * df['delta_close_1']
    if all(c in df.columns for c in ['delta_vol_1_stk', 'delta_close_1_stk']):
        df['alpha12_stk'] = np.sign(df['delta_vol_1_stk']) * -1 * df['delta_close_1_stk']

    # Alpha83: number of rows since the highest close within the last 30 rows
    print("  - 计算 Alpha83...")
    if 'close' in df.columns:
        df['alpha83'] = df.groupby('code')['close'].transform(
            lambda x: x.rolling(30, min_periods=15).apply(
                lambda s: (len(s) - 1) - np.nanargmax(s.to_numpy()) if not s.isnull().all() else np.nan, raw=False
            )
        )
    if 'close_stk' in df.columns:
        df['alpha83_stk'] = df.groupby('code')['close_stk'].transform(
            lambda x: x.rolling(30, min_periods=15).apply(
                lambda s: (len(s) - 1) - np.nanargmax(s.to_numpy()) if not s.isnull().all() else np.nan, raw=False
            )
        )

    # Alpha18: close / rank(mean(close, 20))
    print("  - 计算 Alpha18...")
    if all(c in df.columns for c in ['close', 'rank_mean_close_20']):
        df['alpha18'] = safe_division(df['close'], df['rank_mean_close_20'])
    if all(c in df.columns for c in ['close_stk', 'rank_mean_close_20_stk']):
        df['alpha18_stk'] = safe_division(df['close_stk'], df['rank_mean_close_20_stk'])

    # Alpha36: (correlation(vol, close, 5)) + (correlation(vol, open, 5))
    print("  - 计算 Alpha36...")
    if all(c in df.columns for c in ['vol', 'close', 'open']):
        def calc_alpha36(x):
            corr_close = rolling_corr(x['vol'], x['close'], 5, 3)
            corr_open = rolling_corr(x['vol'], x['open'], 5, 3)
            # fill_value=0: if only one correlation is NaN the other is used alone
            return corr_close.add(corr_open, fill_value=0)
        df['alpha36'] = _apply_per_code(df, calc_alpha36)
    if all(c in df.columns for c in ['vol_stk', 'close_stk', 'open_stk']):
        def calc_alpha36_stk(x):
            corr_close = rolling_corr(x['vol_stk'], x['close_stk'], 5, 3)
            corr_open = rolling_corr(x['vol_stk'], x['open_stk'], 5, 3)
            return corr_close.add(corr_open, fill_value=0)
        df['alpha36_stk'] = _apply_per_code(df, calc_alpha36_stk)

    # Alpha89: (rows since 30-row high) / (rows since 30-row low);
    # safe_division yields NaN when the low is the current row (denominator 0).
    print("  - 计算 Alpha89...")
    if 'close' in df.columns:
        df['argmin_close_30_idx_pos'] = df.groupby('code')['close'].transform(
            lambda x: x.rolling(30, min_periods=15).apply(
                lambda s: (len(s) - 1) - np.nanargmin(s.to_numpy()) if not s.isnull().all() else np.nan, raw=False
            )
        )
        if 'alpha83' in df.columns: # Check dependencies
            df['alpha89'] = safe_division(df['alpha83'], df['argmin_close_30_idx_pos'])
    if 'close_stk' in df.columns:
        df['argmin_close_30_idx_pos_stk'] = df.groupby('code')['close_stk'].transform(
            lambda x: x.rolling(30, min_periods=15).apply(
                lambda s: (len(s) - 1) - np.nanargmin(s.to_numpy()) if not s.isnull().all() else np.nan, raw=False
            )
        )
        if 'alpha83_stk' in df.columns: # Check dependencies
            df['alpha89_stk'] = safe_division(df['alpha83_stk'], df['argmin_close_30_idx_pos_stk'])

    # Alpha65: correlation(rank(close), rank(vol), 6)
    print("  - 计算 Alpha65...")
    if all(c in df.columns for c in ['rank_close', 'rank_vol']):
        df['alpha65'] = _apply_per_code(
            df, lambda x: rolling_corr(x['rank_close'], x['rank_vol'], 6, 4)
        )
    if all(c in df.columns for c in ['rank_close_stk', 'rank_vol_stk']):
        df['alpha65_stk'] = _apply_per_code(
            df, lambda x: rolling_corr(x['rank_close_stk'], x['rank_vol_stk'], 6, 4)
        )

    # Alpha76: -1 * ts_rank(correlation(close, vol, 10), 10)
    print("  - 计算 Alpha76...")
    if all(c in df.columns for c in ['close', 'vol']):
        df['corr_close_vol_10'] = _apply_per_code(
            df, lambda x: rolling_corr(x['close'], x['vol'], 10, 6)
        )
        # Apply ts_rank using rolling apply
        df['alpha76'] = df.groupby('code')['corr_close_vol_10'].transform(
            lambda x: -1 * x.rolling(10, min_periods=6).apply(ts_rank, raw=False, args=(10,))
        )
    if all(c in df.columns for c in ['close_stk', 'vol_stk']):
        df['corr_close_vol_10_stk'] = _apply_per_code(
            df, lambda x: rolling_corr(x['close_stk'], x['vol_stk'], 10, 6)
        )
        df['alpha76_stk'] = df.groupby('code')['corr_close_vol_10_stk'].transform(
            lambda x: -1 * x.rolling(10, min_periods=6).apply(ts_rank, raw=False, args=(10,))
        )

    # Alpha92: (delta(close, 5)/close) * vol
    print("  - 计算 Alpha92...")
    if all(c in df.columns for c in ['delta_close_5', 'close', 'vol']):
        df['alpha92'] = safe_division(df['delta_close_5'], df['close']) * df['vol']
    if all(c in df.columns for c in ['delta_close_5_stk', 'close_stk', 'vol_stk']):
        df['alpha92_stk'] = safe_division(df['delta_close_5_stk'], df['close_stk']) * df['vol_stk']

    # Alpha99: -1 * ts_rank(cov(rank(close), rank(vol), 5), 5)
    print("  - 计算 Alpha99...")
    if all(c in df.columns for c in ['rank_close', 'rank_vol']):
        df['cov_rank_close_vol_5'] = _apply_per_code(
            df, lambda x: rolling_cov(x['rank_close'], x['rank_vol'], 5, 3)
        )
        df['alpha99'] = df.groupby('code')['cov_rank_close_vol_5'].transform(
            lambda x: -1 * x.rolling(5, min_periods=3).apply(ts_rank, raw=False, args=(5,))
        )
    if all(c in df.columns for c in ['rank_close_stk', 'rank_vol_stk']):
        df['cov_rank_close_vol_5_stk'] = _apply_per_code(
            df, lambda x: rolling_cov(x['rank_close_stk'], x['rank_vol_stk'], 5, 3)
        )
        df['alpha99_stk'] = df.groupby('code')['cov_rank_close_vol_5_stk'].transform(
            lambda x: -1 * x.rolling(5, min_periods=3).apply(ts_rank, raw=False, args=(5,))
        )


    # === X. Stock / bond linkage factors ===
    print("计算: X. 股票与转债联动因子")
    if 'pct_chg' in df.columns and 'pct_chg_stk' in df.columns:
        print("  - 计算日内联动...")
        df['stk_up_bond_flat'] = ((df['pct_chg_stk'] > 0.03) & (df['pct_chg'] < 0.01)).astype(int)
        df['stk_down_bond_weak'] = ((df['pct_chg_stk'] < -0.03) & (df['pct_chg'] < df['pct_chg_stk'])).astype(int)
        # Lagged stock returns
        df['pct_chg_stk_lag1'] = df.groupby('code')['pct_chg_stk'].shift(1)
        df['pct_chg_stk_lag2'] = df.groupby('code')['pct_chg_stk'].shift(2)
        if 'pct_chg_stk_lag1' in df.columns:
            df['bond_hold_stk_rebound'] = ((df['pct_chg_stk_lag1'] < -0.03) & (df['pct_chg_stk'] > 0.01) & (df['pct_chg'] > 0.005)).astype(int)
        if 'pct_chg_stk_lag2' in df.columns:
            df['stk_down_then_up'] = ((df['pct_chg_stk_lag2'] < -0.03) & (df['pct_chg_stk'] > 0.02)).astype(int)
        df['bond_rebound'] = (df['pct_chg'] > 0.01).astype(int)
        if 'stk_down_then_up' in df.columns: # Check dependency
            df['bond_follow_stk_rebound'] = ((df['stk_down_then_up'] == 1) & (df['bond_rebound'] == 1)).astype(int)

        print("  - 计算多日联动 (滞涨)...")
        # Multi-day linkage: stk_chg_N / bond_chg_N alias the MEAN returns from section IV
        df['stk_chg_3'] = df['stk_ret_mean_3'] if 'stk_ret_mean_3' in df.columns else np.nan
        df['bond_chg_3'] = df['bond_ret_mean_3'] if 'bond_ret_mean_3' in df.columns else np.nan
        df['stk_chg_5'] = df['stk_ret_mean_5'] if 'stk_ret_mean_5' in df.columns else np.nan
        df['bond_chg_5'] = df['bond_ret_mean_5'] if 'bond_ret_mean_5' in df.columns else np.nan

        if all(c in df.columns for c in ['stk_chg_3', 'bond_chg_3']):
            df['stk_up_bond_flat_3'] = ((df['stk_chg_3'] > 0.03) & (df['bond_chg_3'] < 0.01)).astype(int)
        if all(c in df.columns for c in ['stk_chg_5', 'bond_chg_5']):
            df['stk_up_bond_flat_5'] = ((df['stk_chg_5'] > 0.05) & (df['bond_chg_5'] < 0.01)).astype(int)


    # === XI. Horizontal / vertical divergence factors (stock and bond) ===
    print("计算: XI. 横纵向背离因子（股票与转债）")
    print("  - 计算横向背离...")
    for win in [3, 5, 10]:
        # Deviation using mean returns
        if f'bond_ret_mean_{win}' in df.columns and f'stk_ret_mean_{win}' in df.columns:
            df[f'dev_bond_vs_stk_{win}'] = df[f'bond_ret_mean_{win}'] - df[f'stk_ret_mean_{win}']
        # Rank difference (uses the compounded returns from section IV)
        if f'pct_chg_{win}' in df.columns :
            df[f'cb_ret_rank_{win}'] = df.groupby('trade_date')[f'pct_chg_{win}'].rank()
        if f'pct_chg_stk_{win}' in df.columns:
            df[f'stk_ret_rank_{win}'] = df.groupby('trade_date')[f'pct_chg_stk_{win}'].rank()
        if f'cb_ret_rank_{win}' in df.columns and f'stk_ret_rank_{win}' in df.columns:
            df[f'cb_vs_stk_ret_rank_diff_{win}'] = df[f'cb_ret_rank_{win}'] - df[f'stk_ret_rank_{win}']

    print("  - 计算纵向背离...")
    # Longer-term means (short-window minus long-window return)
    if 'pct_chg' in df.columns:
        df['bond_ret_mean_20'] = df.groupby('code')['pct_chg'].transform(lambda x: x.rolling(20, min_periods=12).mean())
        df['bond_ret_mean_30'] = df.groupby('code')['pct_chg'].transform(lambda x: x.rolling(30, min_periods=18).mean())
        if 'bond_ret_mean_3' in df.columns and 'bond_ret_mean_20' in df.columns:
            df['dev_bond_short3_long20'] = df['bond_ret_mean_3'] - df['bond_ret_mean_20']
        if 'bond_ret_mean_5' in df.columns and 'bond_ret_mean_30' in df.columns:
            df['dev_bond_short5_long30'] = df['bond_ret_mean_5'] - df['bond_ret_mean_30']

    if 'pct_chg_stk' in df.columns:
        df['stk_ret_mean_20'] = df.groupby('code')['pct_chg_stk'].transform(lambda x: x.rolling(20, min_periods=12).mean())
        df['stk_ret_mean_30'] = df.groupby('code')['pct_chg_stk'].transform(lambda x: x.rolling(30, min_periods=18).mean())
        if 'stk_ret_mean_3' in df.columns and 'stk_ret_mean_20' in df.columns:
            df['dev_stk_short3_long20'] = df['stk_ret_mean_3'] - df['stk_ret_mean_20']
        if 'stk_ret_mean_5' in df.columns and 'stk_ret_mean_30' in df.columns:
            df['dev_stk_short5_long30'] = df['stk_ret_mean_5'] - df['stk_ret_mean_30']


    # === XII. Risk and drawdown factors (bond) ===
    print("计算: XII. 风险与回撤相关因子（转债）")
    if 'close' in df.columns:
        print("  - 计算低点距离/标准差/回撤...")
        df['cb_low_5'] = df.groupby('code')['close'].transform(lambda x: x.rolling(5, min_periods=3).min())
        df['cb_dev_from_low_5'] = safe_division(df['close'] - df['cb_low_5'], df['cb_low_5'])
        df['cb_close_std_5'] = df.groupby('code')['close'].transform(lambda x: x.rolling(5, min_periods=3).std())
        df['cb_high_5'] = df.groupby('code')['close'].transform(lambda x: x.rolling(5, min_periods=3).max())
        df['cb_drawdown_5'] = safe_division(df['close'] - df['cb_high_5'], df['cb_high_5'])

    if 'pct_chg' in df.columns:
        print("  - 计算下跌风险预估...")
        # Previous row's return, so the estimate excludes the current row
        df['cb_ret_1'] = df.groupby('code')['pct_chg'].shift(1)
        df['cb_fall_flag'] = (df['cb_ret_1'] < 0).astype(int)
        df['cb_fall_freq_10'] = df.groupby('code')['cb_fall_flag'].transform(lambda x: x.rolling(10, min_periods=6).mean())
        # Mean of the negative returns in the window; 0 when there are none
        df['cb_fall_amp_10'] = df.groupby('code')['cb_ret_1'].transform(
            lambda x: x.rolling(10, min_periods=6).apply(lambda s: s[s < 0].mean() if (s < 0).any() else 0, raw=True)
        )
        df['cb_dd_prob_estimate'] = df['cb_fall_freq_10'] * df['cb_fall_amp_10']


    # === XIII. Consolidation / convergence factors (bond) ===
    print("计算: XIII. 震荡收敛类因子（转债）")
    if all(c in df.columns for c in ['high', 'low', 'close', 'open', 'pre_close']):
        print("  - 计算 ATR/振幅/价格波动 收敛...")
        # "atr_N" here is the rolling mean of (high - low), not TA-Lib's true-range ATR
        df['range_hl'] = df['high'] - df['low']
        df['atr_5'] = df.groupby('code')['range_hl'].transform(lambda x: x.rolling(5, min_periods=3).mean())
        df['atr_10'] = df.groupby('code')['range_hl'].transform(lambda x: x.rolling(10, min_periods=6).mean())
        df['atr_20'] = df.groupby('code')['range_hl'].transform(lambda x: x.rolling(20, min_periods=12).mean())
        df['atr_5_decay'] = safe_division(df['atr_5'], df['atr_20'])
        df['atr_decay_5_10'] = safe_division(df['atr_5'], df['atr_10'])

        # These two are numerically identical to atr_5_decay; kept as separate output columns
        df['zhengfu_decay_5_20'] = safe_division(df['atr_5'], df['atr_20'])
        df['range_ratio_5_20'] = safe_division(df['atr_5'], df['atr_20'])

        # Close std deviation shrink
        df['close_std_10'] = df.groupby('code')['close'].transform(lambda x: x.rolling(10, min_periods=6).std())
        if 'cb_close_std_5' in df.columns: # Dependency from Sec XII
            df['vol_shrink_ratio'] = safe_division(df['cb_close_std_5'], df['close_std_10'])

        print("  - 计算 K线实体/影线/十字星 特征...")
        df['body_abs'] = (df['close'] - df['open']).abs()
        df['body_pct'] = safe_division(df['body_abs'], df['pre_close'])
        df['body_pct_mean_5'] = df.groupby('code')['body_pct'].transform(lambda x: x.rolling(5, min_periods=3).mean())

        df['shadow'] = df['range_hl'] - df['body_abs']
        df['shadow_ratio'] = safe_division(df['shadow'], df['pre_close'])
        df['shadow_mean_5'] = df.groupby('code')['shadow_ratio'].transform(lambda x: x.rolling(5, min_periods=3).mean())
        # default=100 stands in for "infinitely large" when the body is zero
        df['small_body_shadow_ratio'] = safe_division(df['shadow'], df['body_abs'], default=100)

        df['is_doji'] = safe_division(df['body_abs'], df['range_hl']) < 0.15
        df['doji_ratio_5'] = df.groupby('code')['is_doji'].transform(lambda x: x.rolling(5, min_periods=3).mean())


    # === XIV. Pulse and momentum factors (bond) ===
    print("计算: XIV. 脉冲与动能因子（转债）")
    if all(c in df.columns for c in ['high', 'pre_close', 'pct_chg', 'vol', 'close', 'low']):
        print("  - 计算高脉冲统计 (count, mean, score)...")
        thresholds = [0.015, 0.02, 0.03, 0.04, 0.05, 0.06]
        pulse_window = 20
        for thres in thresholds:
            # Threshold in per-mille, used as the column-name suffix (0.015 -> 15)
            thres_name = int(thres*1000)
            # Flag
            df[f'high_jump_{thres_name}'] = (safe_division(df['high'], df['pre_close']) - 1) > thres
            # Count (rolling sum of flags)
            df[f'count_high_jump_{thres_name}_{pulse_window}'] = df.groupby('code')[f'high_jump_{thres_name}'].transform(
                lambda x: x.rolling(pulse_window, min_periods=int(pulse_window*0.6)).sum()
            )
            # Mean return on jump days: non-jump rows are masked to NaN, so one jump day suffices
            df[f'mean_high_jump_{thres_name}_{pulse_window}'] = _apply_per_code(
                df,
                lambda x: x['pct_chg'].where(x[f'high_jump_{thres_name}']).rolling(pulse_window, min_periods=1).mean()
            )
            # Score
            if f'count_high_jump_{thres_name}_{pulse_window}' in df.columns and f'mean_high_jump_{thres_name}_{pulse_window}' in df.columns:
                df[f'score_high_jump_{thres_name}_{pulse_window}'] = df[f'count_high_jump_{thres_name}_{pulse_window}'] * df[f'mean_high_jump_{thres_name}_{pulse_window}']

        print("  - 计算其他脉冲指标...")
        # Z-score of today's return against its own 20-row history
        pct_mean_20 = df.groupby('code')['pct_chg'].transform(lambda x: x.rolling(20, min_periods=12).mean())
        pct_std_20 = df.groupby('code')['pct_chg'].transform(lambda x: x.rolling(20, min_periods=12).std())
        df['zscore_pctchg_20'] = safe_division(df['pct_chg'] - pct_mean_20, pct_std_20)

        # Volume spike & decay
        df['vol_ma20'] = df.groupby('code')['vol'].transform(lambda x: x.rolling(20, min_periods=12).mean())
        df['vol_spike_ratio'] = safe_division(df['vol'], df['vol_ma20'], default=1.0)
        df['vol_std_5'] = df.groupby('code')['vol'].transform(lambda x: x.rolling(5, min_periods=3).std())
        df['vol_std_20'] = df.groupby('code')['vol'].transform(lambda x: x.rolling(20, min_periods=12).std())
        df['vol_std_decay'] = safe_division(df['vol_std_5'], df['vol_std_20'])

        # Open gap stats
        if 'open_jump' in df.columns: # Calculated in Sec VII
            for n in [5, 10]:
                df[f'open_gap_mean_{n}'] = df.groupby('code')['open_jump'].transform(lambda x: x.rolling(n, min_periods=int(n*0.6)).mean())
                df[f'open_gap_max_{n}'] = df.groupby('code')['open_jump'].transform(lambda x: x.rolling(n, min_periods=int(n*0.6)).max())

        # Jump ATR: distance of today's high from the rolling mean close, in rolling std units
        for n in [3, 5, 10]:
            close_mean_n = df.groupby('code')['close'].transform(lambda x: x.rolling(n, min_periods=max(1,int(n*0.6))).mean())
            close_std_n = df.groupby('code')['close'].transform(lambda x: x.rolling(n, min_periods=max(1,int(n*0.6))).std())
            df[f'jump_atr_{n}'] = safe_division(df['high'] - close_mean_n, close_std_n)

        # Range jump potential
        if 'range_hl' in df.columns and 'atr_5' in df.columns: # Calculated in XIII
            df['range_today'] = df['range_hl'] # Alias for clarity
            df['range_atr_5'] = safe_division(df['range_today'], df['atr_5'])
            df['range_jump_potential'] = (df['range_atr_5'] > 1.5).astype(int)

        # Gap-and-go flag: opens more than 2% above pre_close and closes above the open
        if 'open' in df.columns and 'pre_close' in df.columns:
            df['gap_and_go_flag'] = ((safe_division(df['open'], df['pre_close']) - 1 > 0.02) & (df['close'] > df['open'])).astype(int)


    # === XV. "Refuses to fall" factors (bond) ===
    print("计算: XV. 跌不动因子（转债）")
    if 'pct_chg' in df.columns:
        print("  - 计算下跌频率/幅度/评分...")
        for win in [5, 10]:
            # Down frequency
            df[f'down_freq_{win}'] = df.groupby('code')['pct_chg'].transform(
                lambda x: x.rolling(win, min_periods=int(win*0.6)).apply(lambda s: (s < 0).mean(), raw=True)
            )
            # Down amplitude (mean of negative returns, 0 when there are none)
            df[f'down_amp_{win}'] = df.groupby('code')['pct_chg'].transform(
                lambda x: x.rolling(win, min_periods=int(win*0.6)).apply(lambda s: s[s < 0].mean() if (s < 0).any() else 0, raw=True)
            )
            # Score
            df[f'no_fall_score_{win}'] = (1 - df[f'down_freq_{win}']) * (-df[f'down_amp_{win}'])


    # === XVI. Candlestick direction continuity ===
    print("计算: XVI. K线结构连续性")
    if all(c in df.columns for c in ['close', 'open']):
        print("  - 计算K线方向反转率...")
        df['kline_direction'] = np.sign(df['close'] - df['open'])
        df['kline_direction_shift1'] = df.groupby('code')['kline_direction'].shift(1)
        if 'kline_direction_shift1' in df.columns: # Check dependency
            # A flip is a strict sign change; a flat candle (direction 0) never counts
            df['kline_flip'] = (df['kline_direction'] * df['kline_direction_shift1'] < 0).astype(int)
            df['kline_flip_ratio_5'] = df.groupby('code')['kline_flip'].transform(lambda x: x.rolling(5, min_periods=3).mean())


    # --- Final cleanup & optional index restore ---
    print("步骤 XVII: 清理临时列和恢复索引 (如果需要)...")
    # Intermediate columns used only for calculation
    cols_to_drop = [
        'rank_delta_close_10', 'rank_vol', 'rank_mean_close_20', 'rank_close',
        'rank_delta_close_10_stk', 'rank_vol_stk', 'rank_mean_close_20_stk', 'rank_close_stk',
        'corr_close_vol_10', 'corr_close_vol_10_stk',
        'cov_rank_close_vol_5', 'cov_rank_close_vol_5_stk',
        'argmin_close_30_idx_pos', 'argmin_close_30_idx_pos_stk',
    ]
    # Only drop what was actually created
    cols_exist = [col for col in cols_to_drop if col in df.columns]
    if cols_exist:
        df = df.drop(columns=cols_exist)

    if restore_multiindex and is_multiindex_input:
        print("  - 恢复 MultiIndex ['code', 'trade_date']...")
        df = df.set_index(['code', 'trade_date'])
    elif restore_multiindex and not is_multiindex_input:
        print("  - 警告: 原始输入没有 MultiIndex，无法恢复。")


    print("因子计算完成。")
    return df

In [24]:
# --- Helper Functions (from previous code, ensure they are defined) ---
def safe_division(numerator, denominator, default=np.nan):
    """Divide `numerator` by `denominator`, substituting `default` for invalid results.

    A result is considered invalid when the denominator is zero or NaN, or when
    the quotient is +/-inf or NaN.

    Args:
        numerator: Scalar, pandas Series, NumPy array or list/tuple of numbers.
            Iterable inputs are coerced with ``pd.to_numeric(errors='coerce')``.
        denominator: Same kinds of input as `numerator`. Zeros are treated as
            missing so that no inf is produced.
        default: Value returned in place of every invalid result.

    Returns:
        A pandas object when either operand is a pandas object, a NumPy array
        when the operands are plain arrays/lists, otherwise a scalar. If the
        inputs cannot be coerced or divided (TypeError/ValueError), a Series
        filled with `default` is returned when an operand exposes a `shape`,
        else `default` itself.
    """
    try:
        if hasattr(numerator, '__iter__'):
            numerator = pd.to_numeric(numerator, errors='coerce')
        if hasattr(denominator, '__iter__'):
            denominator = pd.to_numeric(denominator, errors='coerce')
            if hasattr(denominator, 'replace'):
                denominator = denominator.replace(0, np.nan)
            else:
                # pd.to_numeric returns a plain ndarray for list/array input;
                # ndarrays have no .replace, so mask the zeros with NumPy.
                denominator = np.asarray(denominator, dtype=float)
                denominator = np.where(denominator == 0, np.nan, denominator)
        elif isinstance(denominator, (int, float)) and denominator == 0:
            denominator = np.nan

        result = numerator / denominator

        if isinstance(result, np.ndarray):
            # ndarray results have neither .replace nor .fillna.
            return np.where(np.isfinite(result), result, default)
        if hasattr(result, '__iter__'):
            result = result.replace([np.inf, -np.inf], np.nan)
            return result.fillna(default)
        if np.isinf(result) or np.isnan(result):
            return default
        return result
    except (TypeError, ValueError):
        # Inputs could not be coerced/divided: fall back to an all-default result
        # shaped like whichever operand looks array-like.
        shape = getattr(numerator, 'shape', getattr(denominator, 'shape', None))
        index = getattr(numerator, 'index', getattr(denominator, 'index', None))
        if shape is not None:
            return pd.Series(default, index=index, dtype=float)
        return default

def apply_ta_func(group, func, required_cols, **kwargs): # group FIRST, then func, required_cols, **kwargs
    """Safely apply a TA-Lib style function to one group of rows.

    This is the single definition of the helper; an earlier variant with the
    argument order ``(func, group, ...)`` was shadowed by this one and has been
    removed.

    Args:
        group (pd.DataFrame): Rows of one group (e.g. one bond code).
        func (callable): Function called as ``func(**{col: ndarray}, **kwargs)``,
            i.e. each required column is passed as a float NumPy array under its
            own column name.
        required_cols (list[str]): Columns of `group` that are passed to `func`.
        **kwargs: Forwarded to `func`. ``timeperiod`` (default 1) is also used to
            size the data-sufficiency check below.

    Returns:
        pd.Series: float Series indexed like `group`. All-NaN when the group is
        too short, when the most recent rows hold too few non-null values, when
        the required columns are entirely null, or when `func` raises.
    """
    timeperiod = kwargs.get('timeperiod', 1)
    # Require 1.5x the lookback, and never fewer than 5 rows.
    min_len_needed = max(5, int(timeperiod * 1.5))

    def _all_nan():
        return pd.Series(np.nan, index=group.index, dtype=float)

    if len(group) < min_len_needed:
        return _all_nan()

    # Proxy check on the trailing rows only: every required column must have at
    # least `timeperiod` non-null values among the last `min_len_needed` rows.
    tail_non_null = group[required_cols].iloc[-min_len_needed:].notnull().sum()
    if (tail_non_null < timeperiod).any():
        return _all_nan()

    if group[required_cols].isnull().all().all():
        return _all_nan()

    try:
        # Each required column is handed over as a float ndarray.
        args = {col: group[col].astype(float).to_numpy() for col in required_cols}
        result_array = func(**args, **kwargs)
        return pd.Series(result_array, index=group.index, dtype=float)
    except Exception:
        # Deliberate best effort: one bad group must not abort the whole
        # groupby; the group simply gets NaN.
        return _all_nan()

# --- New Advanced Factor Calculation Function ---

def calculate_advanced_factors(df, restore_multiindex=False):
    """
    Compute the supplementary "advanced" convertible-bond rotation factors.

    The function adds, in order: SMA/EMA levels and deviations (bond and stock),
    enhanced stock/bond divergence signals, momentum acceleration, ADX/CCI,
    pulse-readiness and up/down volatility measures, a historical risk/reward
    ratio, and per-date cross-sectional percentile ranks of a few key factors.
    Sections whose input columns are missing are skipped.

    Args:
        df (pd.DataFrame): Frame holding 'code', 'trade_date' (as columns or as
            MultiIndex levels) plus 'close', 'high', 'low', 'open', 'pct_chg'.
            Other base-factor columns are used when present.
        restore_multiindex (bool): If True and the input was indexed by
            ['code', 'trade_date'], set that MultiIndex again before returning.

    Returns:
        pd.DataFrame: A new frame, sorted by ['code', 'trade_date'], with the
        advanced factor columns added. The input frame is not modified.

    Raises:
        ValueError: If a required base column is missing.
    """
    print("开始计算高级因子...")

    required_base_cols = ['code', 'trade_date', 'close', 'high', 'low', 'open', 'pct_chg'] # Minimum required
    if not all(col in df.columns or col in df.index.names for col in required_base_cols):
         raise ValueError(f"输入 DataFrame 缺少必要的基础列: {required_base_cols}")

    # --- Index handling ---
    if isinstance(df.index, pd.MultiIndex) and all(name in df.index.names for name in ['code', 'trade_date']):
        print("检测到 MultiIndex，正在重置...")
        df = df.reset_index()
        is_multiindex_input = True
    elif all(col in df.columns for col in ['code', 'trade_date']):
        print("检测到列 'code', 'trade_date'。")
        is_multiindex_input = False
    else: # Should not happen if input comes from calculate_factors
        raise ValueError("输入 DataFrame 必须包含 'code' 和 'trade_date' 列或索引。")

    # Every rolling/shift below assumes rows are in date order within each code.
    # Always sort on the actual keys: inspecting the index says nothing about
    # row order (after reset_index it is always a monotonic RangeIndex).
    # sort_values returns a new frame, so the caller's frame is left untouched.
    df = df.sort_values(by=['code', 'trade_date'])
    # --- End of index handling ---


    # === XVI.b Moving-average system (MA & EMA) and deviation from it ===
    print("计算: XVI.b 移动平均线系统 (MA & EMA) 及其偏离度")
    ma_windows = [5, 10, 20, 30, 50, 60, 120, 250, 500]
    # Bond columns carry no suffix; the mirrored stock columns end in '_stk'.
    for suffix, label in (('', '转债'), ('_stk', '股票')):
        price_col = f'close{suffix}'
        if price_col not in df.columns:
            continue
        print(f"  - 计算{label} MA/EMA 及偏离...")
        for n in ma_windows:
            # SMA and relative deviation of price from it
            df[f'ma_{n}{suffix}'] = df.groupby('code')[price_col].transform(lambda x: ta.SMA(x.astype(float), timeperiod=n))
            df[f'ma_dev_{n}{suffix}'] = safe_division(df[price_col], df[f'ma_{n}{suffix}']) - 1
            # EMA and relative deviation of price from it
            df[f'ema_{n}{suffix}'] = df.groupby('code')[price_col].transform(lambda x: ta.EMA(x.astype(float), timeperiod=n))
            df[f'ema_dev_{n}{suffix}'] = safe_division(df[price_col], df[f'ema_{n}{suffix}']) - 1


    # === XVII. Enhanced stock/bond divergence factors ===
    print("计算: XVII. 增强型股债背离因子")
    # Requires stk_ret_mean_X, bond_ret_mean_X and dev_bond_vs_stk_X to exist already.
    required_divergence_cols = ['stk_ret_mean_1', 'stk_ret_mean_3', 'bond_ret_mean_1', 'bond_ret_mean_3', 'dev_bond_vs_stk_1', 'dev_bond_vs_stk_3']
    if all(c in df.columns for c in required_divergence_cols):
        print("  - 计算股强债弱/债超跌/债抗跌 信号...")
        # "Stock strong, bond lagging" signal (example thresholds).
        # Each comparison is parenthesised on its own: `&` binds tighter than `>`.
        stk_ret_strong = df['stk_ret_mean_3'] > 0.015
        if all(c in df.columns for c in ['ma_5_stk', 'ma_10_stk']):
            stk_ma_bullish = df['ma_5_stk'] > df['ma_10_stk']
        else:
            # No stock moving averages available: rely on the return condition alone.
            stk_ma_bullish = pd.Series(True, index=df.index)
        stk_trend_up = stk_ret_strong & stk_ma_bullish
        bond_lagging = (df['bond_ret_mean_3'] < 0.005) & (df['dev_bond_vs_stk_3'] < -0.01)
        df['stk_strong_bond_lag_signal'] = (stk_trend_up & bond_lagging).astype(int)

        # Bond oversold relative to the stock
        df['bond_oversold_vs_stk_signal'] = ((df['stk_ret_mean_1'] >= -0.01) & (df['bond_ret_mean_1'] < -0.015) & (df['dev_bond_vs_stk_1'] < -0.01)).astype(int)

        # Bond resilient while the stock falls
        df['bond_resilient_signal'] = ((df['stk_ret_mean_1'] < -0.01) & (df['bond_ret_mean_1'] > -0.005) & (df['dev_bond_vs_stk_1'] > 0.005)).astype(int)

        print("  - 计算股债背离 Z-Score...")
        # Z-score of the 3-day bond-vs-stock deviation over a 20-row window per code
        dev_mean = df.groupby('code')['dev_bond_vs_stk_3'].transform(lambda x: x.rolling(20, min_periods=10).mean())
        dev_std = df.groupby('code')['dev_bond_vs_stk_3'].transform(lambda x: x.rolling(20, min_periods=10).std())
        df['dev_bond_vs_stk_zscore_3'] = safe_division(df['dev_bond_vs_stk_3'] - dev_mean, dev_std)
    else:
        print("  警告: 缺少计算增强背离因子所需的基础因子。")


    # === XVIII. Momentum acceleration and trend persistence ===
    print("计算: XVIII. 动量加速与趋势持续性因子")
    if all(c in df.columns for c in ['bond_ret_mean_1', 'bond_ret_mean_3']):
        print("  - 计算转债动量加速...")
        df['bond_ret_accel_1_3'] = df['bond_ret_mean_1'] - df.groupby('code')['bond_ret_mean_3'].shift(1)
    if all(c in df.columns for c in ['stk_ret_mean_1', 'stk_ret_mean_3']):
        print("  - 计算股票动量加速...")
        df['stk_ret_accel_1_3'] = df['stk_ret_mean_1'] - df.groupby('code')['stk_ret_mean_3'].shift(1)

    # ADX / CCI (bond)
    ta_adx_cci_cols = ['high', 'low', 'close']
    if all(c in df.columns for c in ta_adx_cci_cols):
        print("  - 计算转债 ADX/CCI (using lambda)...")
        # A lambda is used so that the group is passed positionally to the helper.
        df['adx_14'] = df.groupby('code', group_keys=False).apply(
            lambda group: apply_ta_func(group, func=ta.ADX, required_cols=ta_adx_cci_cols, timeperiod=14)
        )
        df['cci_14'] = df.groupby('code', group_keys=False).apply(
            lambda group: apply_ta_func(group, func=ta.CCI, required_cols=ta_adx_cci_cols, timeperiod=14)
        )

    # ADX / CCI (stock)
    ta_adx_cci_cols_stk = ['high_stk', 'low_stk', 'close_stk']
    if all(c in df.columns for c in ta_adx_cci_cols_stk):
        print("  - 计算股票 ADX/CCI (using lambda)...")
        df['adx_14_stk'] = df.groupby('code', group_keys=False).apply(
            lambda group: apply_ta_func(group, func=ta.ADX, required_cols=ta_adx_cci_cols_stk, timeperiod=14)
        )
        df['cci_14_stk'] = df.groupby('code', group_keys=False).apply(
            lambda group: apply_ta_func(group, func=ta.CCI, required_cols=ta_adx_cci_cols_stk, timeperiod=14)
        )

    # === XIX. Pulse potential and risk assessment ===
    print("计算: XIX. 脉冲潜力与精确风险评估")
    # Pulse readiness score (example): percentile rank (over the whole frame) of
    # the sum of four "quietness" inputs; a missing input contributes a constant.
    print("  - 计算脉冲准备分数...")
    readiness_cols = ['vol_shrink_ratio', 'vol_std_decay', 'doji_ratio_5', 'body_pct_mean_5']
    if any(c in df.columns for c in readiness_cols):
        df['pulse_readiness_score'] = (df.get('vol_shrink_ratio', 1) +
                                     df.get('vol_std_decay', 1) +
                                     df.get('doji_ratio_5', 1) +
                                     df.get('body_pct_mean_5', 1) * 5 # body size is weighted x5
                                     ).rank(pct=True)
    else:
        # With every input missing the sum would be a plain int, which has no .rank.
        print("  警告: 缺少计算脉冲准备分数所需的基础因子。")

    # Upside vs downside volatility
    print("  - 计算上下行波动率对比...")
    if 'pct_chg' in df.columns:
        vol_window = 20
        vol_min_periods = max(2, int(vol_window * 0.5))
        # Std of positive-only / negative-only returns inside the window; windows
        # with too few such days yield 0. Direct column assignment (instead of
        # DataFrame.join) keeps the step re-runnable on a frame that already has
        # these columns.
        df['upside_vol_20'] = df.groupby('code')['pct_chg'].transform(
            lambda s: s.where(s > 0).rolling(vol_window, min_periods=vol_min_periods).std().fillna(0)
        )
        df['downside_vol_20'] = df.groupby('code')['pct_chg'].transform(
            lambda s: s.where(s < 0).rolling(vol_window, min_periods=vol_min_periods).std().fillna(0)
        )
        df['upside_bias_vol_20'] = safe_division(df['upside_vol_20'], df['downside_vol_20'])

    # Recent pulse success rate
    print("  - 计算近期脉冲成功率...")
    if all(c in df.columns for c in ['high', 'open', 'close']):
        df['intra_pulse_15'] = (safe_division(df['high'], df['open']) - 1) > 0.015 # intraday high > 1.5% above open
        df['pulse_success_15'] = df['intra_pulse_15'] & (df['close'] > df['open']) # pulse and close above open
        df['recent_pulse_success_rate_20'] = df.groupby('code')['pulse_success_15'].transform(lambda x: x.rolling(20, min_periods=10).mean())

    # Historical risk/reward ratio
    print("  - 计算历史风险回报比...")
    if 'pct_chg' in df.columns:
        # Mask with .where (instead of filtering rows) so both series stay aligned
        # to every row. Row filtering left mean_up defined only on up days and
        # mean_down_abs only on down days, making the ratio NaN everywhere.
        mean_up = df.groupby('code')['pct_chg'].transform(lambda x: x.where(x > 0).rolling(60, min_periods=20).mean())
        mean_down_abs = df.groupby('code')['pct_chg'].transform(lambda x: x.where(x < 0).abs().rolling(60, min_periods=20).mean())
        df['risk_reward_ratio_hist_60'] = safe_division(mean_up, mean_down_abs)


    # === XX. Market sentiment and relative strength ===
    print("计算: XX. 市场情绪与相对强度因子")
    # Beta (needs index data; disabled)
    # if 'index_ret' in df.columns and 'pct_chg' in df.columns:
    #     print("  - 计算滚动 Beta...")
    #     cov = df.groupby('code').apply(lambda x: x['pct_chg'].rolling(20, min_periods=12).cov(x['index_ret'])).reset_index(level=0, drop=True)
    #     var_index = df.groupby('code')['index_ret'].transform(lambda x: x.rolling(20, min_periods=12).var())
    #     df['beta_rolling_20'] = safe_division(cov, var_index)
    # else:
    #     print("  - 跳过 Beta 计算 (缺少 'index_ret' 列)。")

    # Sector relative strength (needs sector data; disabled)
    # if 'sector_ret' in df.columns and 'pct_chg_stk' in df.columns:
    #     print("  - 计算行业相对强度...")
    #     df['relative_strength_sector'] = df['pct_chg_stk'] - df['sector_ret']
    # else:
    #     print("  - 跳过行业相对强度计算 (缺少 'sector_ret' 列)。")

    # Cross-sectional (per trade_date) percentile rank of key factors.
    # The dict value is the `ascending` flag passed to rank().
    print("  - 计算关键因子截面排名...")
    factors_to_rank = {
        'dev_bond_vs_stk_3': True,         # assumed: smaller (bond lagging) is better — TODO confirm direction
        'stk_strong_bond_lag_signal': False, # signal == 1 is good
        'pulse_readiness_score': True,     # smaller score is better
        # 'down_freq_20': True,            # lower frequency is better (if computed upstream)
        'risk_reward_ratio_hist_60': False,# larger ratio is better
        'upside_bias_vol_20': False,       # larger ratio is better
    }
    for factor, ascending in factors_to_rank.items():
        if factor in df.columns:
            df[f'rank_{factor}'] = df.groupby('trade_date')[factor].rank(method='first', ascending=ascending, pct=True)
        else:
            print(f"  警告: 因子 '{factor}' 不存在，无法计算排名。")

    # --- Final cleanup & optional index restore ---
    print("步骤 XXI: 清理和恢复索引 (如果需要)...")

    if restore_multiindex and is_multiindex_input:
        print("  - 恢复 MultiIndex ['code', 'trade_date']...")
        df = df.set_index(['code', 'trade_date'])
        # Make sure the restored index is still sorted.
        if not df.index.is_monotonic_increasing:
             df = df.sort_index()
    elif restore_multiindex and not is_multiindex_input:
        print("  - 警告: 原始输入没有 MultiIndex，无法恢复。")


    print("高级因子计算完成。")
    return df

# --- Example Usage ---
# 1. 首先运行基础因子计算
# df_base_factors = calculate_factors(df_raw.copy(), restore_multiindex=False) # Ensure output has columns

# 2. 然后运行高级因子计算
# df_advanced_factors = calculate_advanced_factors(df_base_factors.copy(), restore_multiindex=True) # Can restore index at the end

# print(df_advanced_factors.info())
# print(df_advanced_factors.tail())

In [26]:
# Show every column when a wide frame is displayed.
pd.options.display.max_columns = None

# Pipeline for this cell: raw convertible-bond parquet -> base factors -> parquet.
# (Index data, '/Users/yiwei/Desktop/git/index.pq', is not loaded here.)
df = pd.read_parquet('/Users/yiwei/Desktop/git/cb_data.pq')
df_with_factors = calculate_factors(df)
df_with_factors.to_parquet('/Users/yiwei/Desktop/git/cb_data_with_factors2.pq')

开始因子计算...
检测到 'code' 和 'trade_date' 在 MultiIndex 中，正在重置索引...
步骤 0: 准备数据类型...
计算: I. 基本价格与波动类因子（转债本身）
  - 计算 NATR...
  - 计算 MA, Momentum, Volatility...
  - 计算次日止盈特征...
计算: II. OBV量能指标（转债）
计算: III. 换手与市值类因子
  - 计算 turnover 相关因子...
  - 计算 cap_float_share_rate...
计算: IV. 区间收益率（转债与股票）
  - 计算转债区间收益率...
  - 计算股票区间收益率...
计算: V. 成交量均值比因子（转债）
  - 计算均量...
  - 计算量比...
计算: VI. 波动率与振幅（转债与股票）
  - 计算股票波动率...
  - 计算转债波动率...
  - 计算振幅波动...
计算: VII. 跳空与缺口类因子（转债）
  - 计算基础跳空/缺口指标...
  - 计算跳空/缺口统计...
计算: VIII. K线结构因子（转债）
计算: IX. 趋势反转类Alpha因子（转债与股票）
  - 计算 Alpha 因子前置数据...
  - 计算截面排名 (可能较慢)...
  - 计算 Alpha6...
  - 计算 Alpha12...
  - 计算 Alpha83...
  - 计算 Alpha18...
  - 计算 Alpha36...
  - 计算 Alpha89...
  - 计算 Alpha65...
  - 计算 Alpha76...
  - 计算 Alpha92...
  - 计算 Alpha99...
计算: X. 股票与转债联动因子
  - 计算日内联动...
  - 计算多日联动 (滞涨)...
计算: XI. 横纵向背离因子（股票与转债）
  - 计算横向背离...
  - 计算纵向背离...
计算: XII. 风险与回撤相关因子（转债）
  - 计算低点距离/标准差/回撤...
  - 计算下跌风险预估...
计算: XIII. 震荡收敛类因子（转债）
  - 计算 ATR/振幅/价格波动 收敛...
  - 计算 K线实体/影线/十字星 特征...
计算: XIV. 脉冲与

In [25]:
# Show every column when a wide frame is displayed.
pd.options.display.max_columns = None

# Pipeline for this cell: base-factor parquet -> advanced factors -> parquet.
df = pd.read_parquet('/Users/yiwei/Desktop/git/cb_data_with_factors2.pq')
cb_data_with_factors_enhanced = calculate_advanced_factors(df)
cb_data_with_factors_enhanced.to_parquet('/Users/yiwei/Desktop/git/cb_data_with_factors_enhanced.pq')

开始计算高级因子...
检测到列 'code', 'trade_date'。
计算: XVI.b 移动平均线系统 (MA & EMA) 及其偏离度
  - 计算转债 MA/EMA 及偏离...
  - 计算股票 MA/EMA 及偏离...
计算: XVII. 增强型股债背离因子
  警告: 缺少计算增强背离因子所需的基础因子。
计算: XVIII. 动量加速与趋势持续性因子
  - 计算转债 ADX/CCI (using lambda)...
  - 计算股票 ADX/CCI (using lambda)...
计算: XIX. 脉冲潜力与精确风险评估
  - 计算脉冲准备分数...
  - 计算上下行波动率对比...
  - 计算近期脉冲成功率...
  - 计算历史风险回报比...
计算: XX. 市场情绪与相对强度因子
  - 计算关键因子截面排名...
  警告: 因子 'stk_strong_bond_lag_signal' 不存在，无法计算排名。
步骤 XXI: 清理和恢复索引 (如果需要)...
高级因子计算完成。


In [None]:
# New section: "can't rise" + "can't fall" + pulse-likelihood factor combination
# (includes a filter-then-rank example)
# =========================

# ... (existing factors are kept; omitted here) ...

# =========================
# DEMO: combined signal (re-rank after filtering)
# =========================

# Goal: pick the bonds that are "not falling + converging", then re-score that
# subset on turnover and pulse potential.

# 1) Filter condition (here: not falling + converging)
filter_mask = (df['no_fall_score_10'] > 0.01) & (df['atr_decay_5_10'] < 0.8)
df_filtered = df[filter_mask].copy()

# 2) Cross-sectional percentile ranks inside the subset, per trade_date
df_filtered['turnover_score'] = df_filtered.groupby('trade_date')['turnover'].rank(pct=True)
df_filtered['surge_score'] = df_filtered.groupby('trade_date')['jump_atr_5'].rank(pct=True)

# 3) Combined score; the weights are adjustable (default 0.5 + 0.5)
df_filtered['combo_score'] = 0.5 * df_filtered['turnover_score'] + 0.5 * df_filtered['surge_score']

# 4) Final signal: 1 when the combined score ranks above the 80th percentile of its date
df_filtered['signal_combo_top20'] = (df_filtered.groupby('trade_date')['combo_score'].rank(pct=True) > 0.8).astype(int)

# 5) Optional: write the signal back onto the main frame. Rows outside the
#    filtered subset get NaN. Any signal column left by a previous run is dropped
#    first; otherwise the merge would produce signal_combo_top20_x/_y on re-run.
df = df.drop(columns=['signal_combo_top20'], errors='ignore').merge(
    df_filtered[['code', 'trade_date', 'signal_combo_top20']],
    on=['code', 'trade_date'],
    how='left',
)

In [None]:
# gpt v1:
# Convertible bond + underlying stock factor module
# (no scoring/ranking logic; ordered by category)
# =========================

def _rolling_by_code(values, window, stat='mean'):
    """Rolling `stat` ('mean' or 'std') of `values`, computed separately per bond code.

    `values` must be row-aligned with the module-level `df`. The result keeps the
    index of `values`, so it can be assigned straight back to `df`. Using
    transform avoids `groupby.apply(...).reset_index(0, drop=True)`, whose
    alignment depends on how the installed pandas version handles group keys.
    """
    if 'code' in df.columns:
        codes = df['code'].to_numpy()
    else:
        codes = df.index.get_level_values('code').to_numpy()
    return values.groupby(codes).transform(lambda s: getattr(s.rolling(window), stat)())


# === Volatility / convergence factors ===
# NOTE: "atr" here is the rolling mean of the plain high-low range, not a
# true-range based ATR.
hl_range = df['high'] - df['low']
range_mean_5 = _rolling_by_code(hl_range, 5)
range_mean_20 = _rolling_by_code(hl_range, 20)

df['atr_5'] = range_mean_5
df['atr_20'] = range_mean_20
df['atr_5_decay'] = df['atr_5'] / df['atr_20']

# Amplitude convergence (same high-low range means, kept under their own names)
df['zhengfu_5'] = range_mean_5
df['zhengfu_20'] = range_mean_20
df['zhengfu_decay_5_20'] = df['zhengfu_5'] / df['zhengfu_20']
df['range_ratio_5_20'] = range_mean_5 / range_mean_20

# Candlestick-structure convergence
df['body'] = (df['close'] - df['open']).abs()
df['shadow'] = (df['high'] - df['low']) - df['body']
df['small_body_shadow_ratio'] = df['shadow'] / (df['body'] + 1e-6)
df['is_doji'] = (df['body'] / (df['high'] - df['low'] + 1e-6)) < 0.15
df['doji_ratio_5'] = _rolling_by_code(df['is_doji'].astype(float), 5)

# === Pulse factors ===
for thres in [0.015, 0.02, 0.03, 0.04, 0.05, 0.06]:
    # round() guards the column name against float truncation of thres * 1000
    df[f'high_jump_{int(round(thres * 1000))}'] = ((df['high'] / df['pre_close'] - 1) > thres).astype(int)

for n in [3, 5, 10]:
    close_mean = _rolling_by_code(df['close'], n)
    close_std = _rolling_by_code(df['close'], n, 'std')
    df[f'jump_atr_{n}'] = (df['high'] - close_mean) / (close_std + 1e-6)

df['zscore_pctchg_20'] = df.groupby('code')['pct_chg'].transform(lambda x: (x - x.rolling(20).mean()) / (x.rolling(20).std() + 1e-6))
df['range_today'] = hl_range
df['range_atr_5'] = df['range_today'] / range_mean_5
df['range_jump_potential'] = (df['range_atr_5'] > 1.5).astype(int)

# === "Not falling" factors ===
for win in [5, 10]:
    # Share of down days in the window
    df[f'down_freq_{win}'] = df.groupby('code')['pct_chg'].transform(
        lambda x: x.rolling(win).apply(lambda s: (s < 0).mean(), raw=True)
    )
    # Mean return of the down days in the window (0 when there is none)
    df[f'down_amp_{win}'] = df.groupby('code')['pct_chg'].transform(
        lambda x: x.rolling(win).apply(lambda s: s[s < 0].mean() if (s < 0).any() else 0, raw=True)
    )
    df[f'no_fall_score_{win}'] = (1 - df[f'down_freq_{win}']) * (-df[f'down_amp_{win}'])

# === Sentiment and structure factors ===
vol_ma20 = _rolling_by_code(df['volume'], 20)
df['vol_spike_ratio'] = df['volume'] / (vol_ma20 + 1e-6)
vol_std_5 = _rolling_by_code(df['volume'], 5, 'std')
vol_std_20 = _rolling_by_code(df['volume'], 20, 'std')
df['vol_std_decay'] = vol_std_5 / (vol_std_20 + 1e-6)
df['gap_and_go_flag'] = ((df['open'] > df['pre_close'] * 1.02) & (df['close'] > df['open'])).astype(int)
df['gap_body_ratio'] = (df['open'] - df['pre_close']) / (df['close'] - df['open']).replace(0, np.nan)

# === Optional mirrored fields for the underlying stock (suffix _stk) ===
# These mirror the bond factors above so bond/stock behaviour can be compared.
df['jump_atr_5_stk'] = (df['high_stk'] - _rolling_by_code(df['close_stk'], 5)) / \
                        (_rolling_by_code(df['close_stk'], 5, 'std') + 1e-6)
df['vol_spike_ratio_stk'] = df['vol_stk'] / (_rolling_by_code(df['vol_stk'], 20) + 1e-6)
df['gap_and_go_flag_stk'] = ((df['open_stk'] > df['pre_close_stk'] * 1.02) & (df['close_stk'] > df['open_stk'])).astype(int)