In [30]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import warnings
from scipy.stats import pearsonr
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# from hyperopt import fmin, tpe, hp, Trials
# from sklearn.model_selection import train_test_split
# from sklearn.preprocessing import StandardScaler
# import xgboost as xgb
# from sklearn.metrics import mean_squared_error, r2_score

# 忽略所有警告
warnings.filterwarnings("ignore")
pd.set_option('display.max_columns', None)
plt.rcParams['font.sans-serif'] = ['SimHei']  # 使用黑体，或者使用你系统上可用的其他字体
plt.rcParams['axes.unicode_minus'] = False  # 解决负号显示问题

In [31]:
#利用AskPrice1和BidPrice1计算价格的平均值，进而计算分钟频的收益率
def mid_price(df):
    mid = (df['AskPrice1'] + df['BidPrice1'])/2
    mid = mid.astype(float)
    return mid


In [32]:
def weighted_price(df):
    weighted = (df['AskPrice1']*df['AskVolume1'] + df['BidPrice1']*df['BidVolume1'])/(df['AskVolume1'] + df['BidVolume1'])
    weighted = weighted.astype(float)
    return weighted

In [33]:
# 构造滞后特征
def create_lagged_features(df, window_size1,window_size2):
    for i in range(1, window_size1 + 1):
        df[f'factor_lag{window_size1}_{i}'] = df['factor'].shift(i)
    for j in range(1, window_size2 + 1):
        df[f'factor1_lag{window_size2}_{j}'] = df['factor1'].shift(j)
    df = df.dropna()  # 删除NaN行，因为前几行会有缺失值
    return df

In [34]:
def calculate_ic(factors, returns):
    """
    计算单因子或多因子模型的IC。
    :param factors: 预测因子,可以是单因子,也可以是多因子拟合后得到的y-pred。
    :param returns: 实际的未来回报,Series 格式。
    :return: 信息系数(IC)
    """
    # 计算皮尔逊相关系数
    ic, _ = pearsonr(factors, returns)
    return ic

In [35]:
#由于日盘无法获取前一期夜盘数据，夜盘也无法获取前一期日盘的数据
def process_day(group, prev_period, back_period):
    split_time = pd.to_datetime(group['trading_date'].iloc[0].strftime('%Y-%m-%d') + ' 08:00:00')
    before_8 = group[group['exchange_time'] < split_time].iloc[prev_period:-back_period]
    after_8 = group[group['exchange_time'] >= split_time].iloc[prev_period:-back_period]
    return pd.concat([before_8, after_8])


In [36]:
def calculate_ic_multiple_factors(factors, returns):
    
    # 计算多因子模型的IC，通过回归预测值与实际收益的相关系数。
    
    # 参数:
    # factors (DataFrame): 每列为一个因子，行是样本（股票、日期等）。
    # returns (array-like): 实际的未来收益，和因子数据对应。
    
    # 返回:
    # float: 信息系数（IC），即回归预测值与实际收益的皮尔逊相关系数。
    
    # 创建回归模型
    model = LinearRegression()
    
    # 进行回归训练：因子作为自变量，收益作为因变量
    model.fit(factors, returns)
    
    # 获取回归预测值
    predicted_returns = model.predict(factors)
    
    # 计算预测值与 实际收益的相关系数（即IC）
    ic, _ = pearsonr(predicted_returns, returns)
    
    return ic


In [37]:
"""def calculate_ic_multiple_factors(factors, returns):
    
    # 计算多因子模型的IC，通过回归预测值与实际收益的相关系数。
    
    # 参数:
    # factors (DataFrame): 每列为一个因子，行是样本（股票、日期等）。
    # returns (array-like): 实际的未来收益，和因子数据对应。
    
    # 返回:
    # float: 信息系数（IC），即回归预测值与实际收益的皮尔逊相关系数。
    
    # 创建弹性网回归模型，并通过交叉验证自动选择最优的 alpha 和 l1_ratio
    elastic_net_model = ElasticNetCV(alphas=np.logspace(-6, 6, 13), l1_ratio=np.linspace(0, 1, 11), cv=5)

    # 进行回归训练：因子作为自变量，收益作为因变量
    elastic_net_model.fit(factors, returns)

    # 获取回归预测值
    predicted_returns = elastic_net_model.predict(factors)

    # 计算预测值与实际收益的相关系数（即IC）
    ic, _ = pearsonr(predicted_returns, returns)

    # 输出结果
    print(f"最优的正则化参数 alpha: {elastic_net_model.alpha_}")
    print(f"最优的 L1/L2 比例 (l1_ratio): {elastic_net_model.l1_ratio_}")
    print(f"信息系数（IC）：{ic}")

    return ic"""

'def calculate_ic_multiple_factors(factors, returns):\n    \n    # 计算多因子模型的IC，通过回归预测值与实际收益的相关系数。\n    \n    # 参数:\n    # factors (DataFrame): 每列为一个因子，行是样本（股票、日期等）。\n    # returns (array-like): 实际的未来收益，和因子数据对应。\n    \n    # 返回:\n    # float: 信息系数（IC），即回归预测值与实际收益的皮尔逊相关系数。\n    \n    # 创建弹性网回归模型，并通过交叉验证自动选择最优的 alpha 和 l1_ratio\n    elastic_net_model = ElasticNetCV(alphas=np.logspace(-6, 6, 13), l1_ratio=np.linspace(0, 1, 11), cv=5)\n\n    # 进行回归训练：因子作为自变量，收益作为因变量\n    elastic_net_model.fit(factors, returns)\n\n    # 获取回归预测值\n    predicted_returns = elastic_net_model.predict(factors)\n\n    # 计算预测值与实际收益的相关系数（即IC）\n    ic, _ = pearsonr(predicted_returns, returns)\n\n    # 输出结果\n    print(f"最优的正则化参数 alpha: {elastic_net_model.alpha_}")\n    print(f"最优的 L1/L2 比例 (l1_ratio): {elastic_net_model.l1_ratio_}")\n    print(f"信息系数（IC）：{ic}")\n\n    return ic'

In [38]:
table = pd.read_parquet('ag')

target_date = '2023-06-01'
# 将目标日期转换为 datetime 类型
target_date = pd.to_datetime(target_date)

table = table[table['trading_date'] > target_date]

# 当 AskPrice1 为 0 时，用 BidPrice1 替换
table['AskPrice1'] = table['AskPrice1'].where(table['AskPrice1'] != 0, table['BidPrice1'])

# 当 BidPrice1 为 0 时，用 AskPrice1 替换
table['BidPrice1'] = table['BidPrice1'].where(table['BidPrice1'] != 0, table['AskPrice1'])

In [39]:
table['current_volume'] = table['Volume'].diff()
table['Position Increase'] = table['OpenInterest'].diff()
table['current_turnover'] = table['Turnover'].diff()
table['current_avg_price'] = table['current_turnover']/(table['current_volume']*15)
table['weighted_price'] = weighted_price(table)
table['mid_price'] = mid_price(table)


In [40]:
#判断是否主动买卖
table['主动买or主动卖'] = '被动买卖'
table.loc[table['last'] >= table['AskPrice1'].shift(1),'主动买or主动卖'] = '主动买'
table.loc[table['last'] <= table['BidPrice1'].shift(1),'主动买or主动卖'] = '主动卖'

In [41]:
#这500ms中成交的价格对为Low Price和High Price（假设只有两个价格成交，这里的价格对指的是当期的价格）
#思考：用当期数据比较合理还是用前一期数据比较合理？
#table['Low Price比例'] = (table['current_avg_price'] - table['AskPrice1'])/(table['BidPrice1'] - table['AskPrice1'])
#table['High Price比例'] = 1 - table['Low Price比例']
#'High Price比例'反映高价交易的成交比例；'Low Price比例'反映低价交易的成交比例
#主动买：当High Price比例大于1时，说明交易以超过卖一价达成，即交易的买一价可能是卖二、卖三，反映市场的看多力量；如果出现Low Price比例大于1，可能是因为出现大单，在切片时间段内价格持续快速上涨，导致此时的买二价可能是原来的卖二、卖三等
#主动卖：当Low Price比例大于1时，说明交易以低于买一价达成，即交易的卖一价可能是买二、买三，反映市场的看空力量；如果出现High Price比例大于1，也可能是因为出现大单，在切片时间段内价格持续快速下跌，导致此时的卖二价可能是原来的买二、买三等

In [42]:
#构造基础信息：成交价格对，以及以高价成交和低价成交的比例
table['前一期成交价格对'] = 'BidPrice1 and AskPrice1'
table['前一期Low Price比例'] = (table['current_avg_price'] - table['AskPrice1'].shift(1))/(table['BidPrice1'].shift(1) - table['AskPrice1'].shift(1))
table['前一期High Price比例'] = 1 - table['前一期Low Price比例']
table.loc[(table['前一期High Price比例'] > 1),'前一期成交价格对'] = 'AskPrice1 and AskPrice2'
table.loc[(table['前一期Low Price比例'] > 1),'前一期成交价格对'] = 'BidPrice2 and BidPrice1'
table['前一期Low Price比例'].loc[(table['前一期High Price比例'] > 1)] = (table['current_avg_price'] - table['AskPrice2'].shift(1))/(table['AskPrice1'].shift(1) - table['AskPrice2'].shift(1))
table['前一期Low Price比例'].loc[(table['前一期Low Price比例'] > 1)] = (table['current_avg_price'] - table['BidPrice1'].shift(1))/(table['BidPrice2'].shift(1) - table['BidPrice1'].shift(1))
table['前一期High Price比例'] = 1 - table['前一期Low Price比例']
table.loc[(table['前一期High Price比例'] > 1),'前一期成交价格对'] = 'AskPrice2 and AskPrice3'
table.loc[(table['前一期Low Price比例'] > 1),'前一期成交价格对'] = 'BidPrice3 and BidPrice2'
table['前一期Low Price比例'].loc[(table['前一期High Price比例'] > 1)] = (table['current_avg_price'] - table['AskPrice3'].shift(1))/(table['AskPrice2'].shift(1) - table['AskPrice3'].shift(1))
table['前一期Low Price比例'].loc[(table['前一期Low Price比例'] > 1)] = (table['current_avg_price'] - table['BidPrice2'].shift(1))/(table['BidPrice3'].shift(1) - table['BidPrice2'].shift(1))
table['前一期High Price比例'] = 1 - table['前一期Low Price比例']
table.loc[(table['前一期High Price比例'] > 1),'前一期成交价格对'] = 'AskPrice3 and AskPrice4'
table['前一期Low Price比例'].loc[(table['前一期High Price比例'] > 1)] = (table['current_avg_price'] - table['AskPrice4'].shift(1))/(table['AskPrice3'].shift(1) - table['AskPrice4'].shift(1))
table['前一期High Price比例'] = 1-table['前一期Low Price比例']

In [43]:
#计算收益，即y值
table['return'] = -mid_price(table).diff(-120)


In [44]:
#处理缺失值
table['前一期Low Price比例'].replace([np.inf, -np.inf], np.nan, inplace=True)
table['前一期High Price比例'].replace([np.inf, -np.inf], np.nan, inplace=True)

table['current_volume'].fillna(0, inplace=True)
table['Position Increase'].fillna(0, inplace=True)
table['current_turnover'].fillna(0, inplace=True)
table['current_avg_price'].fillna(method='ffill',inplace=True)
table['weighted_price'].fillna(method='ffill',inplace=True)
#table['Low Price比例'].fillna(0, inplace=True)
#table['High Price比例'].fillna(0, inplace=True)
table['前一期Low Price比例'].fillna(0, inplace=True)
table['前一期High Price比例'].fillna(0, inplace=True)
table['return'].fillna(0, inplace=True)



In [45]:
# table['Difference in Trading Volume'] = table['current_volume']*(table['High Price比例'] -table['Low Price比例'])
# table['factor3'] = (table['Difference in Trading Volume'] - table['Difference in Trading Volume'].mean())/table['Difference in Trading Volume'].std()

In [46]:
#因子：前一期不同成交价格比例之差
#同时我们对成交价格对不为“AskPrice1 and BidPrice1”的数据进行惩罚。
#如果全都以买价成交，说明有下降趋势，我们给他一个负的惩罚项；如果全都以卖价成交，说明有上升趋势，我们给一个正的惩罚项。

table['Difference in Trading Pct'] = table['前一期High Price比例'] - table['前一期Low Price比例']
table['New_Difference in Trading Pct'] = 0
table['New_Difference in Trading Pct'].loc[table['前一期成交价格对'] == 'BidPrice1 and AskPrice1'] = table['Difference in Trading Pct']
table['New_Difference in Trading Pct'].loc[(table['前一期成交价格对'] == 'AskPrice1 and AskPrice2')] = 0.5 + table['Difference in Trading Pct']
table['New_Difference in Trading Pct'].loc[(table['前一期成交价格对'] == 'AskPrice2 and AskPrice3')] = 0.8 + table['Difference in Trading Pct']
table['New_Difference in Trading Pct'].loc[(table['前一期成交价格对'] == 'AskPrice3 and AskPrice4')] = 2.7 + table['Difference in Trading Pct']
table['New_Difference in Trading Pct'].loc[(table['前一期成交价格对'] == 'BidPrice2 and BidPrice1')] = -0.5 + table['Difference in Trading Pct']
table['New_Difference in Trading Pct'].loc[(table['前一期成交价格对'] == 'BidPrice3 and BidPrice2')] = -0.8 + table['Difference in Trading Pct']
table['New_Difference in Trading Pct'].loc[(table['前一期成交价格对'] == 'BidPrice4 and BidPrice3')] = -2.7 + table['Difference in Trading Pct']
table['factor'] = table['New_Difference in Trading Pct']
# table['factor'] = (table['Difference in Trading Pct'] -  table['Difference in Trading Pct'].mean())/table['Difference in Trading Pct'].std()
# table['factor'] = table['前一期High Price比例'] - table['前一期Low Price比例']

In [47]:
#因子：主动买卖信号
table['factor1_weight'] = 0
table['factor1_weight'].loc[table['主动买or主动卖']=='主动买'] = 0.2
table['factor1_weight'].loc[(table['前一期成交价格对'] == 'AskPrice1 and AskPrice2')] = 1.2
table['factor1_weight'].loc[(table['前一期成交价格对'] == 'AskPrice2 and AskPrice3')] = 1.8
table['factor1_weight'].loc[table['主动买or主动卖']=='主动卖'] = -0.2
table['factor1_weight'].loc[(table['前一期成交价格对'] == 'BidPrice2 and BidPrice1')] = -1.2
table['factor1_weight'].loc[(table['前一期成交价格对'] == 'BidPrice3 and BidPrice2')] = -1.8
table['factor1'] = table['factor1_weight']*table['current_volume']




In [48]:
#想法是用短期std()相对于长期std()来反映比较大的波动，但是std()无法反映变动方向，于是考虑加入主动买卖信号来判断方向
#用“-1、0、1”做解释变量还是用其他指标
#首先尝试，std（）的相对强弱* factor1
table['std_20_tick'] = table['current_volume'].rolling(20).std()
# table['std_125_tick'] = table['current_volume'].rolling(125).std()
table['factor2'] = table['std_20_tick'] * table['factor1_weight']


In [49]:
table['factor3'] = table['current_avg_price'] - table['mid_price']
# table['factor3'] = table['factor3']
# -table['factor3'].rolling(110).mean()

# table['factor3'] = table['current_avg_price'] - table['mid_price']
# table[['factor3','return']].corr()

In [50]:
cols_to_standardize = ['factor','factor1','factor2','factor3']
table[cols_to_standardize] = table[cols_to_standardize].apply(lambda x: (x - x.mean()) / x.std())

In [51]:
table['factor'].fillna(0,inplace=True)
table['factor1'].fillna(0,inplace=True)
table['factor2'].fillna(0,inplace=True)
table['factor3'].fillna(0,inplace=True)

In [52]:
table.iloc[234280:234310]


Unnamed: 0,order_book_id,exchange_time,trading_date,open,last,high,low,prev_settlement,prev_close,Volume,OpenInterest,Turnover,limit_up,limit_down,AskPrice1,AskPrice2,AskPrice3,AskPrice4,AskPrice5,BidPrice1,BidPrice2,BidPrice3,BidPrice4,BidPrice5,AskVolume1,AskVolume2,AskVolume3,AskVolume4,AskVolume5,BidVolume1,BidVolume2,BidVolume3,BidVolume4,BidVolume5,change_rate,current_volume,Position Increase,current_turnover,current_avg_price,weighted_price,mid_price,主动买or主动卖,前一期成交价格对,前一期Low Price比例,前一期High Price比例,return,Difference in Trading Pct,New_Difference in Trading Pct,factor,factor1_weight,factor1,std_20_tick,factor2,factor3
20476441,AG2308,2023-06-07 11:18:05.500,2023-06-07,5501.0,5504.0,5528.0,5469.0,5493.0,5489.0,403945.0,369340.0,33344650000.0,5987.0,4998.0,5504.0,5505.0,5506.0,5507.0,5508.0,5503.0,5502.0,5501.0,5500.0,5499.0,93.0,341.0,176.0,76.0,85.0,96.0,181.0,292.0,304.0,279.0,0.002003,15.0,-3.0,1238400.0,5504.0,5503.492063,5503.5,主动买,BidPrice1 and AskPrice1,-0.0,1.0,2.0,1.0,1.0,1.225836,0.2,-0.001696,4.027539,0.001673,0.888026
20476442,AG2308,2023-06-07 11:18:06.000,2023-06-07,5501.0,5504.0,5528.0,5469.0,5493.0,5489.0,403945.0,369340.0,33344650000.0,5987.0,4998.0,5504.0,5505.0,5506.0,5507.0,5508.0,5503.0,5502.0,5501.0,5500.0,5499.0,99.0,251.0,266.0,76.0,85.0,97.0,271.0,202.0,304.0,279.0,0.002003,0.0,0.0,0.0,5504.0,5503.505102,5503.5,主动买,BidPrice1 and AskPrice1,0.0,0.0,2.0,0.0,0.0,0.001517,0.2,-0.002293,4.027539,0.001673,0.888026
20476443,AG2308,2023-06-07 11:18:06.500,2023-06-07,5501.0,5504.0,5528.0,5469.0,5493.0,5489.0,403954.0,369339.0,33345390000.0,5987.0,4998.0,5504.0,5505.0,5506.0,5507.0,5508.0,5503.0,5502.0,5501.0,5500.0,5499.0,93.0,296.0,221.0,76.0,85.0,97.0,226.0,247.0,304.0,279.0,0.002003,9.0,-1.0,743040.0,5504.0,5503.489474,5503.5,主动买,BidPrice1 and AskPrice1,-0.0,1.0,2.0,1.0,1.0,1.225836,0.2,-0.001935,4.253484,0.001685,0.888026
20476444,AG2308,2023-06-07 11:18:07.000,2023-06-07,5501.0,5504.0,5528.0,5469.0,5493.0,5489.0,403954.0,369339.0,33345390000.0,5987.0,4998.0,5504.0,5505.0,5506.0,5507.0,5508.0,5503.0,5502.0,5501.0,5500.0,5499.0,93.0,221.0,296.0,76.0,85.0,97.0,301.0,172.0,302.0,279.0,0.002003,0.0,0.0,0.0,5504.0,5503.489474,5503.5,主动买,BidPrice1 and AskPrice1,0.0,0.0,2.0,0.0,0.0,0.001517,0.2,-0.002293,4.253484,0.001685,0.888026
20476445,AG2308,2023-06-07 11:18:07.500,2023-06-07,5501.0,5504.0,5528.0,5469.0,5493.0,5489.0,403954.0,369339.0,33345390000.0,5987.0,4998.0,5504.0,5505.0,5506.0,5507.0,5508.0,5503.0,5502.0,5501.0,5500.0,5499.0,93.0,221.0,311.0,76.0,85.0,97.0,316.0,172.0,304.0,279.0,0.002003,0.0,0.0,0.0,5504.0,5503.489474,5503.5,主动买,BidPrice1 and AskPrice1,0.0,0.0,2.0,0.0,0.0,0.001517,0.2,-0.002293,4.253484,0.001685,0.888026
20476446,AG2308,2023-06-07 11:18:09.000,2023-06-07,5501.0,5504.0,5528.0,5469.0,5493.0,5489.0,403955.0,369340.0,33345470000.0,5987.0,4998.0,5504.0,5505.0,5506.0,5507.0,5508.0,5503.0,5502.0,5501.0,5500.0,5499.0,92.0,221.0,311.0,76.0,85.0,97.0,316.0,172.0,304.0,279.0,0.002003,1.0,1.0,82560.0,5504.0,5503.486772,5503.5,主动买,BidPrice1 and AskPrice1,-0.0,1.0,2.0,1.0,1.0,1.225836,0.2,-0.002253,4.253484,0.001685,0.888026
20476447,AG2308,2023-06-07 11:18:09.500,2023-06-07,5501.0,5504.0,5528.0,5469.0,5493.0,5489.0,403955.0,369340.0,33345470000.0,5987.0,4998.0,5504.0,5505.0,5506.0,5507.0,5508.0,5503.0,5502.0,5501.0,5500.0,5499.0,97.0,221.0,311.0,76.0,85.0,97.0,316.0,172.0,304.0,279.0,0.002003,0.0,0.0,0.0,5504.0,5503.5,5503.5,主动买,BidPrice1 and AskPrice1,0.0,0.0,2.0,0.0,0.0,0.001517,0.2,-0.002293,4.253484,0.001685,0.888026
20476448,AG2308,2023-06-07 11:18:10.000,2023-06-07,5501.0,5504.0,5528.0,5469.0,5493.0,5489.0,403955.0,369340.0,33345470000.0,5987.0,4998.0,5504.0,5505.0,5506.0,5507.0,5508.0,5503.0,5502.0,5501.0,5500.0,5499.0,97.0,236.0,296.0,76.0,85.0,102.0,301.0,187.0,304.0,279.0,0.002003,0.0,0.0,0.0,5504.0,5503.487437,5503.5,主动买,BidPrice1 and AskPrice1,0.0,0.0,2.0,0.0,0.0,0.001517,0.2,-0.002293,4.295348,0.001688,0.888026
20476449,AG2308,2023-06-07 11:18:10.500,2023-06-07,5501.0,5504.0,5528.0,5469.0,5493.0,5489.0,403955.0,369340.0,33345470000.0,5987.0,4998.0,5504.0,5505.0,5506.0,5507.0,5508.0,5503.0,5502.0,5501.0,5500.0,5499.0,97.0,221.0,311.0,76.0,85.0,102.0,316.0,172.0,304.0,279.0,0.002003,0.0,0.0,0.0,5504.0,5503.487437,5503.5,主动买,BidPrice1 and AskPrice1,0.0,0.0,2.0,0.0,0.0,0.001517,0.2,-0.002293,4.295348,0.001688,0.888026
20476450,AG2308,2023-06-07 11:18:11.000,2023-06-07,5501.0,5504.0,5528.0,5469.0,5493.0,5489.0,403956.0,369341.0,33345560000.0,5987.0,4998.0,5504.0,5505.0,5506.0,5507.0,5508.0,5503.0,5502.0,5501.0,5500.0,5499.0,96.0,206.0,326.0,76.0,85.0,103.0,331.0,157.0,304.0,279.0,0.002003,1.0,1.0,82560.0,5504.0,5503.482412,5503.5,主动买,BidPrice1 and AskPrice1,-0.0,1.0,2.0,1.0,1.0,1.225836,0.2,-0.002253,4.268614,0.001686,0.888026


In [53]:
# 对数据进行处理
# grouped_table = table.groupby('trading_date')
# grouped_table = grouped_table.filter(lambda x: x['trading_date'].iloc[0] != pd.to_datetime('2024-04-08') and x['trading_date'].iloc[0] != pd.to_datetime('2024-05-20'))
#删除 'trading_date' 为 '2024-04-08' 和 '2024-05-20' 的行
table['trading_date'] = pd.to_datetime(table['trading_date'])
# 删除 'trading_date' 为 '2024-04-08' 和 '2024-05-20' 的行
new_table = table[~table['trading_date'].isin([pd.to_datetime('2024-04-08'), pd.to_datetime('2024-05-20')])]
new_table = table.groupby('trading_date').apply(process_day, prev_period=120, back_period=120)


In [54]:
new_table['Base_factor'] = (new_table['AskVolume1']-new_table['BidVolume1'])/(new_table['AskVolume1']+new_table['BidVolume1'])
new_table['Base_factor'].replace([np.inf, -np.inf], np.nan, inplace=True)
new_table['Base_factor'].dropna()

ic_value = calculate_ic(new_table['Base_factor'],new_table['return'])
print(f'经典因子的 IC 值：{ic_value}')

经典因子的 IC 值：-0.04645385968463102


In [55]:
new_table['factor2'].replace([np.inf, -np.inf], np.nan, inplace=True)
new_table['factor2'].fillna(0,inplace=True)



In [56]:
factor_cols = ['factor','factor1','factor2','factor3']


# #创建滞后特征
# window_size1 = 10
# window_size2 = 10
# table = create_lagged_features(table, window_size1,window_size2)

# factor_cols.extend([col for col in table.columns if 'factor_lag' in col or 'factor1_lag' in col])

ic_value = calculate_ic_multiple_factors(new_table[factor_cols],new_table['return'])
print(f"该多因子模型 IC 值为：{ic_value}")

该多因子模型 IC 值为：0.033676915307658616


In [57]:
"""# 假设 new_table 已经包含了 'factor1', 'factor2', 'factor3', 'return' 等列
# 确保数据中没有缺失值
new_table_cleaned = new_table[['factor1', 'factor2', 'factor3', 'return']].dropna()

# 设置绘图风格
sns.set(style="whitegrid")

# 创建一个画布和子图（3行1列的图，方便展示每个因子与return的关系）
fig, axs = plt.subplots(3, 1, figsize=(10, 15))

# 绘制 factor1 与 return 的散点图
sns.scatterplot(data=new_table_cleaned, x='factor1', y='return', ax=axs[0], color='blue')
axs[0].set_title('Factor1 vs Return')

# 添加回归线
sns.regplot(x='factor1', y='return', data=new_table_cleaned, ax=axs[0], scatter=False, color='blue')

# 绘制 factor2 与 return 的散点图
sns.scatterplot(data=new_table_cleaned, x='factor2', y='return', ax=axs[1], color='green')
axs[1].set_title('Factor2 vs Return')

# 添加回归线
sns.regplot(x='factor2', y='return', data=new_table_cleaned, ax=axs[1], scatter=False, color='green')

# 绘制 factor3 与 return 的散点图
sns.scatterplot(data=new_table_cleaned, x='factor3', y='return', ax=axs[2], color='red')
axs[2].set_title('Factor3 vs Return')

# 添加回归线
sns.regplot(x='factor3', y='return', data=new_table_cleaned, ax=axs[2], scatter=False, color='red')

# 调整子图布局
plt.tight_layout()
plt.show()

# 如果需要，计算并显示每个因子与收益之间的相关性系数
correlation_factor1 = np.corrcoef(new_table_cleaned['factor1'], new_table_cleaned['return'])[0, 1]
correlation_factor2 = np.corrcoef(new_table_cleaned['factor2'], new_table_cleaned['return'])[0, 1]
correlation_factor3 = np.corrcoef(new_table_cleaned['factor3'], new_table_cleaned['return'])[0, 1]

print(f"Factor1 与 Return 的相关性: {correlation_factor1}")
print(f"Factor2 与 Return 的相关性: {correlation_factor2}")
print(f"Factor3 与 Return 的相关性: {correlation_factor3}")"""

'# 假设 new_table 已经包含了 \'factor1\', \'factor2\', \'factor3\', \'return\' 等列\n# 确保数据中没有缺失值\nnew_table_cleaned = new_table[[\'factor1\', \'factor2\', \'factor3\', \'return\']].dropna()\n\n# 设置绘图风格\nsns.set(style="whitegrid")\n\n# 创建一个画布和子图（3行1列的图，方便展示每个因子与return的关系）\nfig, axs = plt.subplots(3, 1, figsize=(10, 15))\n\n# 绘制 factor1 与 return 的散点图\nsns.scatterplot(data=new_table_cleaned, x=\'factor1\', y=\'return\', ax=axs[0], color=\'blue\')\naxs[0].set_title(\'Factor1 vs Return\')\n\n# 添加回归线\nsns.regplot(x=\'factor1\', y=\'return\', data=new_table_cleaned, ax=axs[0], scatter=False, color=\'blue\')\n\n# 绘制 factor2 与 return 的散点图\nsns.scatterplot(data=new_table_cleaned, x=\'factor2\', y=\'return\', ax=axs[1], color=\'green\')\naxs[1].set_title(\'Factor2 vs Return\')\n\n# 添加回归线\nsns.regplot(x=\'factor2\', y=\'return\', data=new_table_cleaned, ax=axs[1], scatter=False, color=\'green\')\n\n# 绘制 factor3 与 return 的散点图\nsns.scatterplot(data=new_table_cleaned, x=\'factor3\', y=\'return\', ax=axs[2], 

In [58]:
factor_cols.append("return")
new_table[factor_cols].corr()

Unnamed: 0,factor,factor1,factor2,factor3,return
factor,1.0,0.086358,0.153396,0.504772,0.019698
factor1,0.086358,1.0,0.738421,-0.08029,0.011838
factor2,0.153396,0.738421,1.0,0.008489,0.011578
factor3,0.504772,-0.08029,0.008489,1.0,0.030323
return,0.019698,0.011838,0.011578,0.030323,1.0
