In [40]:
import warnings

import numpy as np
import pandas as pd
import xgboost as xgb
from hyperopt import Trials, fmin, hp, space_eval, tpe
from matplotlib import pyplot as plt
from scipy.stats import pearsonr
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Silence every warning for the rest of the session.
warnings.simplefilter("ignore")

# Show all DataFrame columns when a frame is displayed.
pd.options.display.max_columns = None

# Plot settings: a CJK-capable font (SimHei; swap in any font installed on
# your system) and an ASCII minus sign so negative tick labels render.
plt.rcParams.update({
    'font.sans-serif': ['SimHei'],
    'axes.unicode_minus': False,
})

In [41]:
# Mid price from the best ask and best bid; the return target is derived from it.
def mid_price(df):
    """Return the average of `AskPrice1` and `BidPrice1` as a float Series."""
    best_quotes_sum = df['AskPrice1'] + df['BidPrice1']
    return (best_quotes_sum / 2).astype(float)


In [42]:
# Build lagged copies of the two factor columns.
def create_lagged_features(df, window_size1, window_size2):
    """
    Return a new frame with lagged `factor` / `factor1` columns appended.

    :param df: frame containing the columns 'factor' and 'factor1'. It is
        NOT modified; a new frame is returned.
    :param window_size1: number of lags of 'factor' to add; columns are named
        'factor_lag{window_size1}_{i}' for i = 1..window_size1.
    :param window_size2: number of lags of 'factor1' to add; columns are named
        'factor1_lag{window_size2}_{j}' for j = 1..window_size2.
    :return: copy of `df` plus the lag columns, without the leading rows whose
        lags are undefined.
    """
    n_lags_factor = int(window_size1)
    n_lags_factor1 = int(window_size2)

    lagged = {}
    for i in range(1, n_lags_factor + 1):
        lagged[f'factor_lag{n_lags_factor}_{i}'] = df['factor'].shift(i)
    for j in range(1, n_lags_factor1 + 1):
        lagged[f'factor1_lag{n_lags_factor1}_{j}'] = df['factor1'].shift(j)

    if not lagged:
        # No lags requested: nothing to add and nothing to drop.
        return df.copy()

    # `assign` works on a copy, so the caller's frame keeps its columns.
    result = df.assign(**lagged)
    # Only drop rows whose *lag* values are missing. A bare dropna() would
    # also discard rows because of NaNs in unrelated columns, making the
    # sample depend on whatever other columns happen to exist in `df`.
    # NOTE(review): shift() is positional, so lags run across whatever row
    # boundaries exist in `df` (e.g. between sessions) - confirm this is intended.
    return result.dropna(subset=list(lagged))

In [43]:
def calculate_ic(factors, returns):
    """
    Compute the information coefficient (IC) of a single factor or of a
    fitted multi-factor model's predictions.

    The IC is the Pearson correlation between `factors` and `returns`,
    paired by position. Pairs in which either value is NaN or infinite are
    ignored, so one bad observation no longer turns the whole IC into nan.

    :param factors: predictor values (array-like), e.g. a factor column or y_pred.
    :param returns: realised future returns (array-like) of the same length.
    :return: the IC as a float; nan when fewer than two valid pairs remain.
    :raises ValueError: if the two inputs do not have the same shape.
    """
    factor_values = np.asarray(factors, dtype=float)
    return_values = np.asarray(returns, dtype=float)
    if factor_values.shape != return_values.shape:
        raise ValueError(
            f"factors and returns must have the same shape, got "
            f"{factor_values.shape} and {return_values.shape}"
        )

    valid = np.isfinite(factor_values) & np.isfinite(return_values)
    if valid.sum() < 2:
        # A correlation is undefined with fewer than two observations.
        return float('nan')

    # Pearson correlation coefficient on the valid pairs only.
    ic, _ = pearsonr(factor_values[valid], return_values[valid])
    return float(ic)

In [44]:
# The day session cannot see the preceding night session's data (and vice
# versa), so each session is trimmed independently.
def process_day(group, prev_period, back_period):
    """
    Trim the first and last rows of each session within one trading day.

    Rows are split at 08:00 on the trading date (taken from the first row's
    'trading_date'): rows with 'exchange_time' before that instant form one
    session, the remaining rows the other.

    :param group: rows of a single trading day, with columns 'trading_date'
        and 'exchange_time'.
    :param prev_period: number of leading rows to drop from each session.
    :param back_period: number of trailing rows to drop from each session;
        0 drops nothing.
    :return: the two trimmed sessions concatenated (before-08:00 part first).
    """
    trading_day = pd.Timestamp(group['trading_date'].iloc[0]).normalize()
    split_time = trading_day + pd.Timedelta(hours=8)
    is_before_split = group['exchange_time'] < split_time

    def _trim(session):
        # An explicit stop index is required: `iloc[a:-0]` is `iloc[a:0]`,
        # i.e. empty, so a negative-stop slice breaks for back_period == 0.
        stop = max(len(session) - back_period, 0)
        return session.iloc[prev_period:stop]

    before_8 = _trim(group[is_before_split])
    after_8 = _trim(group[~is_before_split])
    return pd.concat([before_8, after_8])


In [45]:
# Cut-off date as a datetime; only trading days strictly after it are kept.
target_date = pd.to_datetime('2023-06-01')

# Read the parquet dataset 'ag' and apply the date filter.
table = pd.read_parquet('ag')
is_after_cutoff = table['trading_date'] > target_date
table = table[is_after_cutoff]

In [46]:
# Row-to-row increments of the Volume / OpenInterest / Turnover columns.
volume_delta = table['Volume'].diff()
turnover_delta = table['Turnover'].diff()

table['current_volume'] = volume_delta
table['Position Increase'] = table['OpenInterest'].diff()
table['current_turnover'] = turnover_delta
# Average traded price of the interval.
# NOTE(review): 15 is presumably the contract multiplier - confirm.
table['current_avg_price'] = turnover_delta / (volume_delta * 15)

In [47]:
# Classify each row as an aggressive buy, an aggressive sell, or passive,
# by comparing `last` with the previous row's best quotes.
hits_prev_ask = table['last'] >= table['AskPrice1'].shift(1)
hits_prev_bid = table['last'] <= table['BidPrice1'].shift(1)

# np.select takes the first matching condition, so the sell test is listed
# first: a row meeting both conditions is labelled as a sell.
table['主动买or主动卖'] = np.select(
    [hits_prev_bid, hits_prev_ask],
    ['主动卖', '主动买'],
    default='被动买卖',
)

In [48]:
#这500ms中成交的价格对为Low Price和High Price（假设只有两个价格成交，这里的价格对指的是当期的价格）
#思考：用当期数据比较合理还是用前一期数据比较合理？
#table['Low Price比例'] = (table['current_avg_price'] - table['AskPrice1'])/(table['BidPrice1'] - table['AskPrice1'])
#table['High Price比例'] = 1 - table['Low Price比例']
#'High Price比例'反映高价交易的成交比例；'Low Price比例'反映低价交易的成交比例
#主动买：当High Price比例大于1时，说明交易以超过卖一价达成，即交易的买一价可能是卖二、卖三，反映市场的看多力量；如果出现Low Price比例大于1，可能是因为出现大单，在切片时间段内价格持续快速上涨，导致此时的买二价可能是原来的卖二、卖三等
#主动卖：当Low Price比例大于1时，说明交易以低于买一价达成，即交易的卖一价可能是买二、买三，反映市场的看空力量；如果出现High Price比例大于1，也可能是因为出现大单，在切片时间段内价格持续快速下跌，导致此时的卖二价可能是原来的买二、买三等

In [49]:
# Base information: the pair of prices the interval's volume is assumed to have
# traded at (relative to the PREVIOUS row's book), and the share of volume
# traded at the lower / higher price of that pair.
pair_col = '前一期成交价格对'
low_col = '前一期Low Price比例'
high_col = '前一期High Price比例'

avg_price = table['current_avg_price']
# Previous row's quotes, levels 1-4 on both sides.
# NOTE(review): assumes the table has BidPrice4 alongside AskPrice4 - confirm.
prev_ask = {level: table[f'AskPrice{level}'].shift(1) for level in range(1, 5)}
prev_bid = {level: table[f'BidPrice{level}'].shift(1) for level in range(1, 5)}


def _low_price_share(low_leg, high_leg):
    """Share of volume traded at `low_leg` if all trades hit `low_leg` or `high_leg`."""
    return (avg_price - high_leg) / (low_leg - high_leg)


# Start from the best bid / best ask pair.
table[pair_col] = 'BidPrice1 and AskPrice1'
table[low_col] = _low_price_share(prev_bid[1], prev_ask[1])
table[high_col] = 1 - table[low_col]

# A share above 1 means the average price lies outside the current pair, so
# move one level deeper on that side and recompute. Columns are rewritten as a
# whole (np.where) instead of via chained `table[col].loc[mask] = ...`, which
# pandas does not guarantee to write back to the frame and which is a no-op
# under copy-on-write.
for level in (1, 2, 3):
    beyond_ask = table[high_col] > 1
    beyond_bid = table[low_col] > 1
    table.loc[beyond_ask, pair_col] = f'AskPrice{level} and AskPrice{level + 1}'
    table.loc[beyond_bid, pair_col] = f'BidPrice{level + 1} and BidPrice{level}'

    table[low_col] = np.where(
        beyond_ask,
        _low_price_share(prev_ask[level], prev_ask[level + 1]),
        table[low_col],
    )
    # The bid-side mask is re-evaluated after the ask-side update.
    # The third level now treats the bid side symmetrically to the ask side,
    # so the 'BidPrice4 and BidPrice3' pair can actually be assigned.
    table[low_col] = np.where(
        table[low_col] > 1,
        _low_price_share(prev_bid[level + 1], prev_bid[level]),
        table[low_col],
    )
    table[high_col] = 1 - table[low_col]

In [50]:
# Target y: change in mid price over the next 120 rows (mid[t+120] - mid[t]).
mid = mid_price(table)
table['return'] = mid.diff(-120).mul(-1)


In [51]:
# Missing-value handling.
# Columns are reassigned instead of using `table[col].method(..., inplace=True)`:
# in-place calls on a column selection are not guaranteed to reach the frame
# and do nothing under pandas copy-on-write.
share_cols = ['前一期Low Price比例', '前一期High Price比例']
increment_cols = ['current_volume', 'Position Increase', 'current_turnover']

# Infinite shares (zero price gap between the two legs) become missing first ...
table[share_cols] = table[share_cols].replace([np.inf, -np.inf], np.nan)
# ... the per-row increments default to 0 ...
table[increment_cols] = table[increment_cols].fillna(0)
# ... the average price carries the last known value forward
# (`.ffill()` replaces the deprecated `fillna(method='ffill')`) ...
table['current_avg_price'] = table['current_avg_price'].ffill()
# ... and missing shares / returns are set to 0.
table[share_cols] = table[share_cols].fillna(0)
table['return'] = table['return'].fillna(0)



In [52]:
#table['Difference in Trading Volume'] = table['current_volume']*(table['High Price比例'] -table['Low Price比例'])
#table['factor3'] = (table['Difference in Trading Volume'] - table['Difference in Trading Volume'].mean())/table['Difference in Trading Volume'].std()

In [53]:
# Factor: difference between the high-price and low-price traded shares.
# Rows whose price pair is not 'BidPrice1 and AskPrice1' get an extra offset:
# trades on the bid side suggest a downward move (negative offset), trades on
# the ask side suggest an upward move (positive offset).
pair_offset = {
    'BidPrice1 and AskPrice1': 0.0,
    'AskPrice1 and AskPrice2': 1.0,
    'AskPrice2 and AskPrice3': 1.5,
    'AskPrice3 and AskPrice4': 2.0,
    'BidPrice2 and BidPrice1': -1.0,
    'BidPrice3 and BidPrice2': -1.5,
    'BidPrice4 and BidPrice3': -2.0,
}

table['Difference in Trading Pct'] = table['前一期High Price比例'] - table['前一期Low Price比例']

# One vectorised lookup replaces seven chained `table[col].loc[mask] = ...`
# assignments, which pandas does not guarantee to write back to the frame.
offset = table['前一期成交价格对'].map(pair_offset)
adjusted_diff = offset + table['Difference in Trading Pct']
# Rows with an unrecognised pair label keep the neutral value 0, as before.
table['New_Difference in Trading Pct'] = adjusted_diff.where(offset.notna(), 0)

# Z-score over the whole sample.
adjusted = table['New_Difference in Trading Pct']
table['factor'] = (adjusted - adjusted.mean()) / adjusted.std()

#table['factor'] = (table['Difference in Trading Pct'] -  table['Difference in Trading Pct'].mean())/table['Difference in Trading Pct'].std()


In [54]:
# Factor: signed aggressor signal, with a larger magnitude when the price pair
# sits deeper in the book.
side = table['主动买or主动卖']
pair = table['前一期成交价格对']

# np.select takes the FIRST matching condition, so the list is ordered from
# highest to lowest priority. This reproduces the original sequence of
# overwrites, where the last assignment won. Building the column in one step
# avoids chained `table['factor1'].loc[mask] = ...`, which pandas does not
# guarantee to write back to the frame.
# NOTE(review): an aggressive-sell row overrides an ask-side pair label, and
# fourth-level pairs fall back to the plain +/-1 signal - confirm this is intended.
factor1_rules = [
    (pair == 'BidPrice3 and BidPrice2', -3),
    (pair == 'BidPrice2 and BidPrice1', -2),
    (side == '主动卖', -1),
    (pair == 'AskPrice2 and AskPrice3', 3),
    (pair == 'AskPrice1 and AskPrice2', 2),
    (side == '主动买', 1),
]
table['factor1'] = np.select(
    [condition for condition, _ in factor1_rules],
    [value for _, value in factor1_rules],
    default=0,
)


In [55]:
# Idea: the ratio of short-window to long-window std flags unusually large
# moves; std carries no direction, so it is multiplied by the aggressor signal.
# Open question: use "-1 / 0 / 1" as the explanatory variable or another indicator?
# First attempt: relative std strength * factor1.
avg_price = table['current_avg_price']
short_window_std = avg_price.rolling(5).std()
long_window_std = avg_price.rolling(120).std()

table['std_5_tick'] = short_window_std
table['std_120_tick'] = long_window_std
table['std_intensity'] = short_window_std / long_window_std
table['factor2'] = table['std_intensity'] * table['factor1']


In [56]:
# Missing factor values become 0. The columns are reassigned because
# `table[col].fillna(..., inplace=True)` acts on a column selection and is not
# guaranteed to update the frame (it is a no-op under copy-on-write).
signal_cols = ['factor', 'factor1', 'factor2']
table[signal_cols] = table[signal_cols].fillna(0)

In [35]:
# Data preparation: drop two trading days, then trim the first and last 120
# rows of every session via process_day.
table['trading_date'] = pd.to_datetime(table['trading_date'])

# Remove the rows whose 'trading_date' is 2024-04-08 or 2024-05-20.
excluded_days = pd.to_datetime(['2024-04-08', '2024-05-20'])
table = table[~table['trading_date'].isin(excluded_days)]

# Iterate over the groups instead of calling `groupby(...).apply(...)`:
# - apply() prepends 'trading_date' as an index level while it is still a
#   column, which makes later reset_index / groupby on that name fail;
# - passing the grouping column to the applied function is deprecated, and
#   process_day needs that column.
# Groups are visited in sorted date order, the same order apply() uses.
table = pd.concat(
    [
        process_day(day_rows, prev_period=120, back_period=120)
        for _, day_rows in table.groupby('trading_date')
    ]
)


In [None]:
# IC of factor2 against the return target.
ic_value = calculate_ic(returns=table['return'], factors=table['factor2'])
print(f"因子2的 IC 值为{ic_value}")

In [None]:

# Hyperparameter search space.
space = {
    'window_size1': hp.randint('window_size1', 0, 15),  # number of lags of `factor`
    'window_size2': hp.randint('window_size2', 0, 15),  # number of lags of `factor1`
    'n_estimators': hp.randint('n_estimators', 1, 100),  # number of trees
    # Tree depth in {1, 2, 3}. randint is used instead of hp.choice because
    # fmin() reports the *index* of an hp.choice option (0..2), not its value,
    # so reading best['max_depth'] directly would give the wrong depth.
    'max_depth': hp.randint('max_depth', 1, 4),
    'learning_rate': hp.uniform('learning_rate', 0.01, 0.1)  # learning rate
}

def optimize_hyperparameters(params):
    """
    Hyperopt objective: negative validation IC of an XGBoost model fitted on
    lagged factor features.

    The lagged sample is split chronologically. The last 30 % is the hold-out
    test set and is never used here. The remaining 70 % is split again into a
    fitting part and a validation part, and the IC is measured on the
    validation part. Scoring on the rows the model was fitted on would simply
    reward the most flexible configuration.

    Parameters:
    params: the current hyperparameter combination (from Hyperopt), with keys
        'window_size1', 'window_size2', 'n_estimators', 'max_depth',
        'learning_rate'.

    Returns:
    float: the negative IC, because Hyperopt minimises the objective. 0.0 is
        returned when the configuration yields no features or an undefined IC.
    """
    window_size1 = int(params['window_size1'])  # lags of `factor`
    window_size2 = int(params['window_size2'])  # lags of `factor1`
    n_estimators = int(params['n_estimators'])  # number of trees
    max_depth = int(params['max_depth'])  # maximum tree depth
    learning_rate = float(params['learning_rate'])

    # Build the lagged features on a copy of the global table.
    df_lagged = create_lagged_features(table.copy(), window_size1, window_size2)
    factor_cols = [col for col in df_lagged.columns if 'factor_lag' in col or 'factor1_lag' in col]
    if not factor_cols or df_lagged.empty:
        # Both windows can be 0, which leaves nothing to fit on; score such a
        # configuration as "no predictive power" instead of crashing the search.
        return 0.0

    X = df_lagged[factor_cols]
    y = df_lagged['return']

    # Chronological splits (no shuffling): hold-out test set first, then
    # fit / validation inside the development part.
    X_dev, _, y_dev, _ = train_test_split(X, y, test_size=0.3, shuffle=False)
    X_fit, X_valid, y_fit, y_valid = train_test_split(X_dev, y_dev, test_size=0.3, shuffle=False)

    # Fit the scaler on the fitting rows only so no later data leaks into it.
    scaler = StandardScaler()
    X_fit_scaled = scaler.fit_transform(X_fit)
    X_valid_scaled = scaler.transform(X_valid)

    model = xgb.XGBRegressor(
        objective='reg:squarederror',  # regression
        n_estimators=n_estimators,
        max_depth=max_depth,
        learning_rate=learning_rate,
        subsample=0.8,
        colsample_bytree=0.8,
        reg_alpha=0.5,   # L1 regularisation
        reg_lambda=1.0,  # L2 regularisation (`lambda_` is not a valid parameter name)
        n_jobs=-1
    )
    model.fit(X_fit_scaled, y_fit)

    # IC = Pearson correlation between predictions and realised returns.
    ic = calculate_ic(model.predict(X_valid_scaled), y_valid)
    if not np.isfinite(ic):
        # e.g. constant predictions; a NaN loss would confuse the optimiser.
        return 0.0

    # Negated because Hyperopt minimises and we want to maximise the IC.
    return float(-ic)


# Evaluation budget for the search.
max_evals = 10

# Run the TPE search; `trials` records every evaluation.
trials = Trials()
best = fmin(
    optimize_hyperparameters,
    space,
    algo=tpe.suggest,
    max_evals=max_evals,
    trials=trials,
)

print("最佳超参数配置:", best)


In [None]:
# Refit with the tuned hyperparameters and measure the IC on the hold-out set.

# fmin() returns raw labels (for hp.choice that is the option's index);
# space_eval maps them back to the actual parameter values.
best_params = space_eval(space, best)

df_lagged = create_lagged_features(
    table.copy(),
    int(best_params['window_size1']),
    int(best_params['window_size2']),
)
# Use the same feature set as the tuning objective: lags of BOTH factors.
factor_cols = [col for col in df_lagged.columns if 'factor_lag' in col or 'factor1_lag' in col]
X = df_lagged[factor_cols]  # feature matrix
y = df_lagged['return']  # target: future return

# Chronological 70 / 30 split (no shuffling).
X_train_raw, _, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle=False)

# Standardise with statistics from the training rows only, so nothing from the
# test period leaks into the features.
scaler = StandardScaler().fit(X_train_raw)
X_scaled = scaler.transform(X)
n_train = len(X_train_raw)
X_train, X_test = X_scaled[:n_train], X_scaled[n_train:]

# Same model family and regularisation as in the tuning objective.
model = xgb.XGBRegressor(
    objective='reg:squarederror',  # regression
    n_estimators=int(best_params['n_estimators']),
    max_depth=int(best_params['max_depth']),
    learning_rate=float(best_params['learning_rate']),
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0.5,   # L1 regularisation
    reg_lambda=1.0,  # L2 regularisation
    n_jobs=-1
)

# Fit on the training part, predict the test part.
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

ic_value1 = calculate_ic(y_pred, y_test)
print(f"测试集上的 IC 值: {ic_value1}")


In [None]:
# Benchmark factor: level-1 volume imbalance, (ask - bid) / (ask + bid).
level1_volume = table['AskVolume1'] + table['BidVolume1']
imbalance = (table['AskVolume1'] - table['BidVolume1']) / level1_volume
# A zero denominator gives +/-inf or NaN; store those as missing. The column is
# assigned directly because an in-place replace on `table[col]` is not
# guaranteed to reach the frame.
table['Base_factor'] = imbalance.replace([np.inf, -np.inf], np.nan)

# The original `table['Base_factor'].dropna()` discarded its result, so NaNs
# still reached the correlation. Restrict both series to the valid rows.
has_factor = table['Base_factor'].notna()
ic_value = calculate_ic(table.loc[has_factor, 'Base_factor'], table.loc[has_factor, 'return'])
print(f'经典因子的 IC 值：{ic_value}')