In [54]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import warnings
from scipy.stats import pearsonr
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from scipy.stats import pearsonr
from hyperopt import fmin, tpe, hp, Trials


# from hyperopt import fmin, tpe, hp, Trials
# from sklearn.model_selection import train_test_split
# from sklearn.preprocessing import StandardScaler
# import xgboost as xgb
# from sklearn.metrics import mean_squared_error, r2_score

# Silence every warning for the rest of the session.
# NOTE(review): this also hides the pandas chained-assignment / deprecation
# warnings that cells further down would otherwise raise.
warnings.filterwarnings("ignore")
pd.set_option('display.max_columns', None)
plt.rcParams['font.sans-serif'] = ['SimHei']  # use the SimHei font, or any other CJK-capable font installed on this system
plt.rcParams['axes.unicode_minus'] = False  # make the minus sign render correctly with that font

In [55]:
# Average of the level-1 ask and bid; the notebook derives its return target from it.
def mid_price(df):
    """Return the level-1 mid price as a float Series.

    :param df: frame with ``AskPrice1`` and ``BidPrice1`` columns.
    :return: ``(AskPrice1 + BidPrice1) / 2`` cast to float, indexed like ``df``.
    """
    best_ask = df['AskPrice1']
    best_bid = df['BidPrice1']
    return ((best_ask + best_bid) / 2).astype(float)


In [56]:
def weighted_price(df):
    """Return the volume-weighted level-1 price as a float Series.

    Each side's price is weighted by that same side's resting volume:
    ``(AskPrice1*AskVolume1 + BidPrice1*BidVolume1) / (AskVolume1 + BidVolume1)``.

    The previous version read ``df['AskVolume']`` in the numerator while the
    denominator used ``AskVolume1``; it now uses ``AskVolume1`` consistently.

    :param df: frame with ``AskPrice1``, ``BidPrice1``, ``AskVolume1`` and
        ``BidVolume1`` columns.
    :return: float Series indexed like ``df``. Rows whose total level-1
        volume is zero come out as NaN/inf (plain division, no guarding).
    """
    ask_notional = df['AskPrice1'] * df['AskVolume1']
    bid_notional = df['BidPrice1'] * df['BidVolume1']
    total_depth = df['AskVolume1'] + df['BidVolume1']
    return ((ask_notional + bid_notional) / total_depth).astype(float)

In [57]:
# Build lagged copies of the two factor columns.
def create_lagged_features(df, window_size1,window_size2):
    """Add lagged ``factor`` / ``factor1`` columns and drop incomplete rows.

    Columns ``factor_lag{window_size1}_{i}`` (i = 1..window_size1) and
    ``factor1_lag{window_size2}_{j}`` (j = 1..window_size2) are written into
    ``df`` itself, so the caller's frame is modified in place.

    :param df: frame with ``factor`` and ``factor1`` columns.
    :param window_size1: number of lags to create for ``factor``.
    :param window_size2: number of lags to create for ``factor1``.
    :return: a new frame equal to ``df`` without any row containing NaN
        (the leading rows always have NaN lags).
    """
    lag_plan = (('factor', window_size1), ('factor1', window_size2))
    for source_col, depth in lag_plan:
        for lag in range(1, depth + 1):
            df[f'{source_col}_lag{depth}_{lag}'] = df[source_col].shift(lag)
    return df.dropna()

In [58]:
def calculate_ic(factors, returns):
    """
    Information coefficient (IC): Pearson correlation between a predictor and
    the realised future return.

    Observations are paired by position. Pairs in which either value is NaN
    or infinite are ignored, so a single missing value no longer turns the
    whole IC into NaN.

    :param factors: 1-D predictor values - a single factor, or the fitted
        values (y-pred) of a multi-factor model.
    :param returns: realised future returns, same length as ``factors``.
    :return: the IC, or NaN when fewer than two finite pairs remain.
    :raises ValueError: if the two inputs do not have the same length.
    """
    factor_values = np.ravel(np.asarray(factors, dtype=float))
    return_values = np.ravel(np.asarray(returns, dtype=float))
    if factor_values.shape != return_values.shape:
        raise ValueError("factors and returns must have the same length")

    # Keep only the positions where both sides are usable numbers.
    usable = np.isfinite(factor_values) & np.isfinite(return_values)
    if usable.sum() < 2:
        return np.nan

    ic, _ = pearsonr(factor_values[usable], return_values[usable])
    return ic

In [59]:
# The day session cannot use the preceding night session's data (and vice versa),
# so each session is trimmed on its own.
def process_day(group, prev_period, back_period):
    """Trim the head and tail of each session within one trading day.

    The rows of ``group`` are split at 08:00:00 on the group's trading date
    (taken from the first row): rows with ``exchange_time`` before that
    instant form one session, the remaining rows the other. From each session
    the first ``prev_period`` rows and the last ``back_period`` rows are
    dropped, and the two trimmed sessions are concatenated.

    ``back_period=0`` now keeps the tail; the previous ``iloc[a:-0]`` slice
    evaluated to ``iloc[a:0]`` and silently discarded the whole session.

    :param group: rows of a single trading day with ``trading_date`` and
        ``exchange_time`` (datetime) columns; must not be empty.
    :param prev_period: number of leading rows to drop per session.
    :param back_period: number of trailing rows to drop per session.
    :return: DataFrame with the trimmed pre-08:00 rows followed by the
        trimmed remaining rows.
    """
    def _trim(session):
        # An explicit end position avoids the "-0 means 0" slicing pitfall.
        stop = max(len(session) - back_period, 0)
        return session.iloc[prev_period:stop]

    trading_day = group['trading_date'].iloc[0]
    split_time = pd.to_datetime(trading_day.strftime('%Y-%m-%d') + ' 08:00:00')

    before_8 = _trim(group[group['exchange_time'] < split_time])
    after_8 = _trim(group[group['exchange_time'] >= split_time])
    return pd.concat([before_8, after_8])


In [60]:
def calculate_ic_multiple_factors(factors, returns):
    """
    Information coefficient (IC) of a linear multi-factor model.

    The factors are standardised, an ordinary least-squares regression of the
    returns on the standardised factors is fitted, and the IC is the Pearson
    correlation between the fitted values and the realised returns. Fitting
    and scoring use the same rows, so this is an in-sample figure.

    Inputs are not validated here: callers are expected to pass NaN-free
    data with one return per factor row.

    Parameters:
    factors (DataFrame): one column per factor, one row per sample.
    returns (array-like): realised future returns aligned with ``factors``.

    Returns:
    float: Pearson correlation between fitted and realised returns.
    """
    # Put every factor on a zero-mean / unit-variance scale.
    standardized = StandardScaler().fit_transform(factors)

    # Factors are the regressors, returns the response.
    fitted_model = LinearRegression().fit(standardized, returns)

    # Correlate the in-sample fitted values with the realised returns.
    ic, _ = pearsonr(fitted_model.predict(standardized), returns)
    return ic


In [61]:
def rolling_mad_optimized(series, window):
    """Rolling median of absolute deviations from the rolling median.

    Step 1: rolling median of ``series``.
    Step 2: absolute distance of every point from the rolling median at that
            same position.
    Step 3: rolling median of those distances.

    Both rolling passes use ``min_periods=1``, so the output has no leading
    NaN. Each deviation is measured against its own position's rolling
    median, not against the median of the window being summarised in step 3.

    :param series: numeric pandas Series.
    :param window: rolling window length passed to ``Series.rolling``.
    :return: Series indexed like ``series``.
    """
    def _rolling_median(values):
        return values.rolling(window=window, min_periods=1).median()

    deviation = (series - _rolling_median(series)).abs()
    return _rolling_median(deviation)

In [None]:
"""def calculate_ic_multiple_factors(factors, returns):
    
    # 计算多因子模型的IC，通过回归预测值与实际收益的相关系数。
    
    # 参数:
    # factors (DataFrame): 每列为一个因子，行是样本（股票、日期等）。
    # returns (array-like): 实际的未来收益，和因子数据对应。
    
    # 返回:
    # float: 信息系数（IC），即回归预测值与实际收益的皮尔逊相关系数。
    
    # 创建弹性网回归模型，并通过交叉验证自动选择最优的 alpha 和 l1_ratio
    elastic_net_model = ElasticNetCV(alphas=np.logspace(-6, 6, 13), l1_ratio=np.linspace(0, 1, 11), cv=5)

    # 进行回归训练：因子作为自变量，收益作为因变量
    elastic_net_model.fit(factors, returns)

    # 获取回归预测值
    predicted_returns = elastic_net_model.predict(factors)

    # 计算预测值与实际收益的相关系数（即IC）
    ic, _ = pearsonr(predicted_returns, returns)

    # 输出结果
    print(f"最优的正则化参数 alpha: {elastic_net_model.alpha_}")
    print(f"最优的 L1/L2 比例 (l1_ratio): {elastic_net_model.l1_ratio_}")
    print(f"信息系数（IC）：{ic}")

    return ic"""

In [None]:
# Load the tick data and make sure the trading date is a real datetime.
table = pd.read_parquet('ag')
table['trading_date'] = pd.to_datetime(table['trading_date'])

# Sample period (both bounds inclusive).
start_time_0 = pd.to_datetime('2023-07-01')
end_time_0 = pd.to_datetime('2024-06-30')

# Keep only the rows whose trading date falls inside the sample period.
on_or_after_start = table['trading_date'] >= start_time_0
on_or_before_end = table['trading_date'] <= end_time_0
table = table[on_or_after_start & on_or_before_end]

# A zero AskPrice1 is replaced by BidPrice1 ...
table['AskPrice1'] = table['AskPrice1'].mask(table['AskPrice1'] == 0, table['BidPrice1'])

# ... and then a zero BidPrice1 by the (already patched) AskPrice1.
# Rows where both quotes are zero therefore stay at zero.
table['BidPrice1'] = table['BidPrice1'].mask(table['BidPrice1'] == 0, table['AskPrice1'])

In [27]:
# Row-to-row change of the running counters (presumably cumulative - TODO confirm).
# The first row has no predecessor and is NaN.
tick_increments = {
    'current_volume': 'Volume',
    'Position Increase': 'OpenInterest',
    'current_turnover': 'Turnover',
}
for increment_col, counter_col in tick_increments.items():
    table[increment_col] = table[counter_col].diff()

# Average traded price of the tick: turnover / (volume * 15).
# NOTE(review): 15 looks like the contract multiplier - confirm. A tick with
# zero volume yields NaN (0/0) or inf here.
table['current_avg_price'] = table['current_turnover'] / (table['current_volume'] * 15)
table['mid_price'] = mid_price(table)


In [28]:
# Label each tick as an aggressive buy, an aggressive sell, or passive,
# by comparing the last price with the previous tick's best quotes.
prev_best_ask = table['AskPrice1'].shift(1)
prev_best_bid = table['BidPrice1'].shift(1)
traded_at_or_above_ask = table['last'] >= prev_best_ask
traded_at_or_below_bid = table['last'] <= prev_best_bid

table['主动买or主动卖'] = '被动买卖'
table.loc[traded_at_or_above_ask, '主动买or主动卖'] = '主动买'
# Applied last, so the sell label wins if both conditions hold.
table.loc[traded_at_or_below_bid, '主动买or主动卖'] = '主动卖'

In [29]:
#这500ms中成交的价格对为Low Price和High Price（假设只有两个价格成交，这里的价格对指的是当期的价格）
#思考：用当期数据比较合理还是用前一期数据比较合理？
#table['Low Price比例'] = (table['current_avg_price'] - table['AskPrice1'])/(table['BidPrice1'] - table['AskPrice1'])
#table['High Price比例'] = 1 - table['Low Price比例']
#'High Price比例'反映高价交易的成交比例；'Low Price比例'反映低价交易的成交比例
#主动买：当High Price比例大于1时，说明交易以超过卖一价达成，即交易的买一价可能是卖二、卖三，反映市场的看多力量；如果出现Low Price比例大于1，可能是因为出现大单，在切片时间段内价格持续快速上涨，导致此时的买二价可能是原来的卖二、卖三等
#主动卖：当Low Price比例大于1时，说明交易以低于买一价达成，即交易的卖一价可能是买二、买三，反映市场的看空力量；如果出现High Price比例大于1，也可能是因为出现大单，在切片时间段内价格持续快速下跌，导致此时的卖二价可能是原来的买二、买三等

In [30]:
# Base information: which pair of adjacent book prices (taken from the previous
# tick's book) the current tick traded between, and the share of the tick's
# volume done at the lower / higher price of that pair.
#
# All conditional writes go through table.loc[mask, column]. The previous
# table[column].loc[mask] = ... form is chained assignment: it updates a
# temporary object and is a silent no-op under pandas Copy-on-Write.
#
# NOTE(review): the ask side is walked up to level 4, the bid side only to
# level 3, so the label 'BidPrice4 and BidPrice3' is never produced even though
# later cells test for it. Left as is - confirm whether that is intended.
low_col = '前一期Low Price比例'
high_col = '前一期High Price比例'
pair_col = '前一期成交价格对'

avg_px = table['current_avg_price']
prev_ask = {level: table[f'AskPrice{level}'].shift(1) for level in (1, 2, 3, 4)}
prev_bid = {level: table[f'BidPrice{level}'].shift(1) for level in (1, 2, 3)}


def _low_price_share(upper_px, lower_px):
    """Weight on lower_px when avg_px is written as a mix of upper_px and lower_px."""
    return (avg_px - upper_px) / (lower_px - upper_px)


# Level 1: assume the tick traded between the previous best bid and best ask.
table[pair_col] = 'BidPrice1 and AskPrice1'
table[low_col] = _low_price_share(prev_ask[1], prev_bid[1])
table[high_col] = 1 - table[low_col]

# Level 2: a share above 1 means the average price lies outside the assumed
# pair, so move one level deeper on that side. Every mask is evaluated at the
# point of the statement, i.e. the second update sees the result of the first.
table.loc[table[high_col] > 1, pair_col] = 'AskPrice1 and AskPrice2'
table.loc[table[low_col] > 1, pair_col] = 'BidPrice2 and BidPrice1'
table.loc[table[high_col] > 1, low_col] = _low_price_share(prev_ask[2], prev_ask[1])
table.loc[table[low_col] > 1, low_col] = _low_price_share(prev_bid[1], prev_bid[2])
table[high_col] = 1 - table[low_col]

# Level 3: same step again for ticks that are still out of range.
table.loc[table[high_col] > 1, pair_col] = 'AskPrice2 and AskPrice3'
table.loc[table[low_col] > 1, pair_col] = 'BidPrice3 and BidPrice2'
table.loc[table[high_col] > 1, low_col] = _low_price_share(prev_ask[3], prev_ask[2])
table.loc[table[low_col] > 1, low_col] = _low_price_share(prev_bid[2], prev_bid[3])
table[high_col] = 1 - table[low_col]

# Level 4: ask side only.
table.loc[table[high_col] > 1, pair_col] = 'AskPrice3 and AskPrice4'
table.loc[table[high_col] > 1, low_col] = _low_price_share(prev_ask[4], prev_ask[3])
table[high_col] = 1 - table[low_col]

In [31]:
# Target y: change of the mid price over the next 120 rows,
# i.e. mid[t + 120] - mid[t]. The last 120 rows have no future value and are
# set to 0. The result is assigned back explicitly because fillna(inplace=True)
# on table['return'] is chained assignment and does nothing under pandas
# Copy-on-Write.
table['return'] = (-mid_price(table).diff(-120)).fillna(0)

In [32]:
# Missing-value handling.
# Every result is assigned back to its column: replace/fillna with inplace=True
# on table[col] is chained assignment and is a silent no-op under pandas
# Copy-on-Write. fillna(method='ffill') is deprecated, so .ffill() is used.

# Price-share columns: infinities (zero-width price pair) become NaN, then 0.
for ratio_col in ['前一期Low Price比例', '前一期High Price比例']:
    table[ratio_col] = table[ratio_col].replace([np.inf, -np.inf], np.nan).fillna(0)

# Per-tick increments and the target: a missing value is treated as 0.
for zero_fill_col in ['current_volume', 'Position Increase', 'current_turnover', 'return']:
    table[zero_fill_col] = table[zero_fill_col].fillna(0)

# Average traded price: carry the last known value forward.
# NOTE(review): infinities in this column are not cleaned here.
table['current_avg_price'] = table['current_avg_price'].ffill()

#table['Low Price比例'].fillna(0, inplace=True)
#table['High Price比例'].fillna(0, inplace=True)



In [33]:
#table['Difference in Trading Volume'] = table['current_volume']*(table['High Price比例'] -table['Low Price比例'])
#table['factor3'] = (table['Difference in Trading Volume'] - table['Difference in Trading Volume'].mean())/table['Difference in Trading Volume'].std()

In [34]:
#因子：前一期不同成交价格比例之差
#同时我们对成交价格对不为“AskPrice1 and BidPrice1”的数据进行惩罚。
#如果全都以买价成交，说明有下降趋势，我们给他一个负的惩罚项；如果全都以卖价成交，说明有上升趋势，我们给一个正的惩罚项。

# table['Difference in Trading Pct'] = table['前一期High Price比例'] - table['前一期Low Price比例']
# table['New_Difference in Trading Pct'] = 0
# table['New_Difference in Trading Pct'].loc[table['前一期成交价格对'] == 'BidPrice1 and AskPrice1'] = table['Difference in Trading Pct']
# table['New_Difference in Trading Pct'].loc[(table['前一期成交价格对'] == 'AskPrice1 and AskPrice2')] = w1 + table['Difference in Trading Pct']
# table['New_Difference in Trading Pct'].loc[(table['前一期成交价格对'] == 'AskPrice2 and AskPrice3')] = w2 + table['Difference in Trading Pct']
# table['New_Difference in Trading Pct'].loc[(table['前一期成交价格对'] == 'AskPrice3 and AskPrice4')] = w3 + table['Difference in Trading Pct']
# table['New_Difference in Trading Pct'].loc[(table['前一期成交价格对'] == 'BidPrice2 and BidPrice1')] = -w1 + table['Difference in Trading Pct']
# table['New_Difference in Trading Pct'].loc[(table['前一期成交价格对'] == 'BidPrice3 and BidPrice2')] = -w2 + table['Difference in Trading Pct']
# table['New_Difference in Trading Pct'].loc[(table['前一期成交价格对'] == 'BidPrice4 and BidPrice3')] = -w3 + table['Difference in Trading Pct']
# table['factor'] = table['New_Difference in Trading Pct']
# table['factor'] = (table['Difference in Trading Pct'] -  table['Difference in Trading Pct'].mean())/table['Difference in Trading Pct'].std()
# table['factor'] = table['前一期High Price比例'] - table['前一期Low Price比例']

In [35]:
#因子：主动买卖信号
# table['factor1'] = 0
# table['factor1'].loc[table['主动买or主动卖']=='主动买'] = 1
# table['factor1'].loc[(table['前一期成交价格对'] == 'AskPrice1 and AskPrice2')] = 1.5
# table['factor1'].loc[(table['前一期成交价格对'] == 'AskPrice2 and AskPrice3')] = 2
# table['factor1'].loc[table['主动买or主动卖']=='主动卖'] = -1
# table['factor1'].loc[(table['前一期成交价格对'] == 'BidPrice2 and BidPrice1')] = -1.5
# table['factor1'].loc[(table['前一期成交价格对'] == 'BidPrice3 and BidPrice2')] = -2




In [36]:
#想法是用短期std()相对于长期std()来反映比较大的波动，但是std()无法反映变动方向，于是考虑加入主动买卖信号来判断方向
#用“-1、0、1”做解释变量还是用其他指标
#首先尝试，std（）的相对强弱* factor1
# table['std_5_tick'] = table['current_volume'].rolling(window_size1).std()
# table['std_120_tick'] = table['current_volume'].rolling(window_size2).std()
# table['factor2'] = table['std_5_tick']-table['std_120_tick']


In [37]:
# table['factor3'] = (table['current_avg_price'] - table['mid_price'])
# table['factor3'] = table['factor3']-table['factor3'].rolling(window_size3).mean()

# table['factor3'] = table['current_avg_price'] - table['mid_price']
# table[['factor3','return']].corr()

In [38]:
# table['factor'].fillna(0,inplace=True)
# table['factor1'].fillna(0,inplace=True)
# table['factor2'].fillna(0,inplace=True)
# table['factor3'].fillna(0,inplace=True)

In [39]:
# exchange_time must be a datetime column for the comparisons below.
table['exchange_time'] = pd.to_datetime(table['exchange_time'])

# Inspection window (both bounds inclusive).
start_time = pd.to_datetime('2023-12-01 09:09:00.500')
end_time = pd.to_datetime('2023-12-01 09:48:00.500')

# Rows whose exchange time falls inside the window.
inside_window = table['exchange_time'].between(start_time, end_time)
filtered_table = table[inside_window]

In [41]:
# Build the modelling frame:
#   1. drop the trading dates 2024-04-08 and 2024-05-20,
#   2. per trading date, cut the first and last 120 rows of each session
#      (see process_day).
# The previous version grouped `table` instead of the date-filtered frame,
# so step 1 was thrown away.
table['trading_date'] = pd.to_datetime(table['trading_date'])

excluded_dates = [pd.to_datetime('2024-04-08'), pd.to_datetime('2024-05-20')]
new_table = table[~table['trading_date'].isin(excluded_dates)]
new_table = new_table.groupby('trading_date').apply(process_day, prev_period=120, back_period=120)

In [None]:
def _build_factor_frame(frame, params):
    """Compute the four model factors for one hyper-parameter set.

    Follows the pipeline of the notebook's evaluation cell (the one that uses
    ``best[...]``) so the search scores what is finally reported, with two
    deliberate differences:
    infinities are turned into NaN before standardising (a single inf would
    otherwise make the column's mean/std unusable), and the factor2 fallback
    tests ``|transformed| > 1.5`` instead of ``transformed > 1.5``.

    Nothing is written back to ``frame``.

    :param frame: modelling frame with the price-pair label, aggressor label,
        price-share, ``current_volume``, ``current_avg_price`` and
        ``mid_price`` columns.
    :param params: dict with ``w1``..``w6`` and ``window_size1``..``window_size4``.
    :return: DataFrame with columns ``factor``, ``factor1``, ``factor2``,
        ``factor3``; standardised, NaN/inf replaced by 0.
    """
    w1, w2, w3 = params['w1'], params['w2'], params['w3']
    w4, w5, w6 = params['w4'], params['w5'], params['w6']
    short_std_window = int(params['window_size1'])
    long_std_window = int(params['window_size2'])
    momentum_window = int(params['window_size3'])
    factor_window = int(params['window_size4'])

    pair = frame['前一期成交价格对']
    side = frame['主动买or主动卖']
    volume = frame['current_volume']

    # Signed weight per tick; later rules override earlier ones.
    weight = pd.Series(0.0, index=frame.index)
    weight_rules = (
        (side == '主动买', w1),
        (pair == 'AskPrice1 and AskPrice2', w2),
        (pair == 'AskPrice2 and AskPrice3', w3),
        (side == '主动卖', -w1),
        (pair == 'BidPrice2 and BidPrice1', -w2),
        (pair == 'BidPrice3 and BidPrice2', -w3),
    )
    for rule_mask, rule_weight in weight_rules:
        weight = weight.mask(rule_mask, rule_weight)

    # factor: weighted volume minus its rolling mean.
    signed_volume = weight * volume
    factor = signed_volume - signed_volume.rolling(factor_window).mean()

    # factor1: high-minus-low price share plus an offset per price pair.
    share_gap = frame['前一期High Price比例'] - frame['前一期Low Price比例']
    pair_offsets = {
        'BidPrice1 and AskPrice1': 0.0,
        'AskPrice1 and AskPrice2': w4,
        'AskPrice2 and AskPrice3': w5,
        'AskPrice3 and AskPrice4': w6,
        'BidPrice2 and BidPrice1': -w4,
        'BidPrice3 and BidPrice2': -w5,
        'BidPrice4 and BidPrice3': -w6,
    }
    known_pair = pair.isin(list(pair_offsets))
    factor1 = (share_gap + pair.map(pair_offsets)).where(known_pair, 0)

    # factor2: short-window volume std relative to a lagged long-window std,
    # signed by the weight.
    std_short = volume.rolling(short_std_window).std()
    std_long = volume.shift(factor_window).rolling(long_std_window).std()
    factor2 = (std_short / std_long) * weight

    # factor3: (average traded price - mid price) minus its rolling mean.
    momentum = frame['current_avg_price'] - frame['mid_price']
    factor3 = momentum - momentum.rolling(window=momentum_window).mean()

    factors = pd.DataFrame(
        {'factor': factor, 'factor1': factor1, 'factor2': factor2, 'factor3': factor3}
    ).replace([np.inf, -np.inf], np.nan)
    factors = (factors - factors.mean()) / factors.std()

    # Compress factor2; keep the raw value where the transform leaves [-1.5, 1.5]
    # (which includes the points where the transform is infinite).
    raw_factor2 = factors['factor2']
    squashed = np.log((3 / (raw_factor2 + 1.5) - 1).abs()) / (-25)
    factors['factor2'] = squashed.where(squashed.abs() <= 1.5, raw_factor2)

    return factors.replace([np.inf, -np.inf], np.nan).fillna(0)


def objective(params):
    """Hyperopt objective: negative in-sample IC of the four-factor model.

    The factor matrix is now built inside the function from ``new_table`` and
    ``params``. Previously the function read the globals ``factors`` and
    ``returns``, which are only created in a later cell: a fresh kernel raised
    NameError, and a stale kernel scored the same matrix for every trial.
    The function no longer writes intermediate columns into ``new_table``.

    :param params: sample from the search space (``w1``..``w6``,
        ``window_size1``..``window_size4``).
    :return: ``-IC`` (hyperopt minimises), or ``1e10`` when the constraint
        ``w3 > w2 > w1`` is violated or the IC is not a finite number.
    """
    w1 = params['w1']
    w2 = params['w2']
    w3 = params['w3']

    # Hard constraint w3 > w2 > w1, enforced with a very large penalty.
    if not (w3 > w2 > w1):
        return 1e10

    factors = _build_factor_frame(new_table, params)
    returns = new_table['return'].fillna(0)

    ic = calculate_ic_multiple_factors(factors, returns)

    # A NaN loss (e.g. constant fitted values) would corrupt the search.
    if not np.isfinite(ic):
        return 1e10

    # Maximising the IC == minimising its negative.
    return -ic

# Search space: six weights on a 0.1 grid and four integer window lengths.
weight_upper_bounds = {'w1': 2, 'w2': 2, 'w3': 2, 'w4': 1, 'w5': 2, 'w6': 3}
window_names = ['window_size1', 'window_size2', 'window_size3', 'window_size4']

space = {
    weight_name: hp.quniform(weight_name, 0, upper_bound, 0.1)
    for weight_name, upper_bound in weight_upper_bounds.items()
}
space.update({
    window_name: hp.randint(window_name, 5, 120)
    for window_name in window_names
})
# 'window_size5': hp.randint('window_size5', 0, 5)

# Run the hyper-parameter search.
# NOTE(review): the algorithm is reached as tpe.rand.suggest, i.e. through the
# `rand` name inside the tpe module - presumably random search; confirm whether
# tpe.suggest was intended. No random state is fixed, so runs are not repeatable.
trials = Trials()
best = fmin(
    fn=objective,
    space=space,
    algo=tpe.rand.suggest,
    max_evals=1000,
    trials=trials,
)

# Report the best parameters found.
for weight_name in weight_upper_bounds:
    print(f"best {weight_name}", best[weight_name])
for window_name in window_names:
    print(f"Best {window_name}: ", best[window_name])
# print("Best window_size5: ", best['window_size5'])




In [49]:

# Replace missing factor values by 0.
# - Assigned back explicitly: fillna(inplace=True) on new_table[col] is chained
#   assignment and does nothing under pandas Copy-on-Write.
# - Columns that do not exist yet are skipped, so the cell no longer raises
#   KeyError when it is run before the factors have been computed.
for factor_name in ['factor', 'factor1', 'factor2', 'factor3']:
    if factor_name in new_table.columns:
        new_table[factor_name] = new_table[factor_name].fillna(0)


In [None]:
# Rebuild the four factors with the best hyper-parameters and report the IC.
#
# Fixes compared with the previous version of this cell:
# - conditional writes use new_table.loc[mask, col]; new_table[col].loc[mask] = ...
#   is chained assignment and a silent no-op under pandas Copy-on-Write;
# - weight columns start as float (0.0) instead of int, since floats are written;
# - the two rolling stds are kept in local variables, so factor2 stays correct
#   when window_size1 == window_size2 (both used to land in the same column);
# - infinities are converted to NaN before standardising (a single inf made the
#   column mean/std unusable and wiped out the whole factor);
# - the factor2 fallback tests |transform| > 1.5 (was abs(transform > 1.5)) and
#   uses where() so that inf * 0 no longer produces NaN.
pair_label = new_table['前一期成交价格对']
trade_side = new_table['主动买or主动卖']
tick_volume = new_table['current_volume']

short_std_window = int(best['window_size1'])
long_std_window = int(best['window_size2'])
momentum_window = int(best['window_size3'])
factor_window = int(best['window_size4'])

# factor: signed, weighted volume minus its rolling mean.
# Later rules override earlier ones.
new_table['factor_weight'] = 0.0
weight_rules = [
    (trade_side == '主动买', best['w1']),
    (pair_label == 'AskPrice1 and AskPrice2', best['w2']),
    (pair_label == 'AskPrice2 and AskPrice3', best['w3']),
    (trade_side == '主动卖', -best['w1']),
    (pair_label == 'BidPrice2 and BidPrice1', -best['w2']),
    (pair_label == 'BidPrice3 and BidPrice2', -best['w3']),
]
for rule_mask, rule_weight in weight_rules:
    new_table.loc[rule_mask, 'factor_weight'] = rule_weight
new_table['factor'] = new_table['factor_weight'] * tick_volume
new_table['factor'] = new_table['factor'] - new_table['factor'].rolling(factor_window).mean()

# factor1: high-minus-low price share plus an offset that depends on the price pair.
new_table['Difference in Trading Pct'] = new_table['前一期High Price比例'] - new_table['前一期Low Price比例']
pair_offsets = {
    'BidPrice1 and AskPrice1': 0.0,
    'AskPrice1 and AskPrice2': best['w4'],
    'AskPrice2 and AskPrice3': best['w5'],
    'AskPrice3 and AskPrice4': best['w6'],
    'BidPrice2 and BidPrice1': -best['w4'],
    'BidPrice3 and BidPrice2': -best['w5'],
    'BidPrice4 and BidPrice3': -best['w6'],
}
# Rows with a label outside the table above keep the value 0.
new_table['New_Difference in Trading Pct'] = (
    new_table['Difference in Trading Pct'] + pair_label.map(pair_offsets)
).where(pair_label.isin(list(pair_offsets)), 0)

new_table['factor1'] = new_table['New_Difference in Trading Pct']

# factor2: short-window volume std relative to a lagged long-window std, signed by the weight.
std_short = tick_volume.rolling(short_std_window).std()
std_long = tick_volume.shift(factor_window).rolling(long_std_window).std()  # long window
new_table[f"std_{best['window_size1']}_tick"] = std_short
new_table[f"std_{best['window_size2']}_tick"] = std_long
new_table['factor2'] = (std_short / std_long) * new_table['factor_weight']

# new_table['Median'] = new_table['current_volume'] - new_table['current_volume'].median()
# new_table['factor2'] = new_table['Median'].rolling(best['window_size1']).median()

# factor3: (average traded price - mid price) minus its rolling mean.
new_table['Momentum'] = new_table['current_avg_price'] - new_table['mid_price']
new_table['factor3'] = new_table['Momentum'] - new_table['Momentum'].rolling(window=momentum_window).mean()

# Standardise (z-score) each factor.
cols_to_standardize = ['factor','factor1','factor2','factor3']
finite_factors = new_table[cols_to_standardize].replace([np.inf, -np.inf], np.nan)
new_table[cols_to_standardize] = (finite_factors - finite_factors.mean()) / finite_factors.std()

# Compress factor2; keep the raw value where the transform leaves [-1.5, 1.5].
new_table['transform_factor2'] = np.log(np.abs(3/(new_table['factor2']+1.5)-1))/(-25)
within_range = new_table['transform_factor2'].abs() <= 1.5
new_table['factor2'] = new_table['transform_factor2'].where(within_range, new_table['factor2'])
new_table['factor2'] = new_table['factor2'].replace([np.inf, -np.inf], np.nan)

# Clean copy for the regression: remaining NaN become 0.
new_table_cleaned = new_table[['factor', 'factor1', 'factor2', 'factor3', 'return']].fillna(0)

factor_cols = ['factor', 'factor1', 'factor2', 'factor3']
# Factor matrix and realised returns.
factors = new_table_cleaned[factor_cols]
returns = new_table_cleaned['return']

# # Check whether any missing values are left
# print("Factors中是否存在NaN值：", factors.isnull().any().any())
# print("Returns中是否存在NaN值：", returns.isnull().any())
# # Create lagged features
# window_size1 = 10
# window_size2 = 10
# new_table = create_lagged_features(new_table, window_size1,window_size2)

# factor_cols.extend([col for col in new_table.columns if 'factor_lag' in col or 'factor1_lag' in col])
ic_value = calculate_ic_multiple_factors(factors,returns)
print(f"该多因子模型 IC 值为：{ic_value}")



In [None]:
# Baseline factor: level-1 volume imbalance (ask - bid) / (ask + bid).
# Previously the infinities were replaced through a chained in-place call and
# the dropna() result was discarded, so NaN rows still reached the correlation
# and the reported IC was NaN.
level1_imbalance = (new_table['AskVolume1']-new_table['BidVolume1'])/(new_table['AskVolume1']+new_table['BidVolume1'])
new_table['Base_factor'] = level1_imbalance.replace([np.inf, -np.inf], np.nan)

# Score only the rows where both the factor and the return are available.
base_sample = new_table[['Base_factor', 'return']].dropna()

ic_value = calculate_ic(base_sample['Base_factor'], base_sample['return'])
print(f'经典因子的 IC 值：{ic_value}')

In [None]:
# Correlation matrix of the factors and the return.
# The membership check keeps the cell idempotent: re-running it used to append
# "return" again and duplicate the column in the selection.
if "return" not in factor_cols:
    factor_cols.append("return")
new_table[factor_cols].corr()