In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import warnings
from scipy.stats import pearsonr
import matplotlib.pyplot as plt
from scipy import fft


from sklearn.model_selection import train_test_split
# import lightgbm as lgb
from scipy.stats import pearsonr
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from factor_repository.repositories import FactorRepositoryAG

# Notebook-wide configuration.
# NOTE(review): this silences every warning for the whole session, including
# any pandas deprecation / chained-assignment warnings.
warnings.filterwarnings("ignore")
pd.set_option('display.max_columns', None)
plt.rcParams.update({
    'font.sans-serif': ['SimHei'],  # use SimHei, or any other font available on your system
    'axes.unicode_minus': False,    # work around the minus sign not being displayed
})



# 定义所需函数

In [2]:
# Average AskPrice1 and BidPrice1 into a mid price; the result is used later to compute returns.
def mid_price(df):
    """Return the level-1 mid price.

    :param df: table providing 'AskPrice1' and 'BidPrice1' columns.
    :return: (AskPrice1 + BidPrice1) / 2, cast to float.
    """
    best_ask = df['AskPrice1']
    best_bid = df['BidPrice1']
    return ((best_ask + best_bid) / 2).astype(float)

In [3]:
def weighted_price(df):
    """Return the level-1 price weighted by the opposite side's volume.

    Computed as (AskPrice1 * BidVolume1 + BidPrice1 * AskVolume1)
    / (BidVolume1 + AskVolume1), cast to float.

    :param df: table providing 'AskPrice1', 'BidPrice1', 'AskVolume1' and
               'BidVolume1' columns.
    :return: the weighted price, one value per row.
    """
    ask_px, bid_px = df['AskPrice1'], df['BidPrice1']
    ask_qty, bid_qty = df['AskVolume1'], df['BidVolume1']
    numerator = ask_px * bid_qty + bid_px * ask_qty
    denominator = bid_qty + ask_qty
    return (numerator / denominator).astype(float)

In [4]:
def calculate_ic(factors, returns):
    """Return the information coefficient (IC) of a single factor.

    :param factors: factor values.
    :param returns: returns aligned one-to-one with ``factors``.
    :return: Pearson correlation coefficient between the two inputs.
    """
    # pearsonr returns (correlation, p-value); only the correlation is needed
    return pearsonr(factors, returns)[0]

In [5]:
def calculate_ic_multiple_factors(factors, returns):
    """Return the IC of a linear combination of several factors.

    A linear regression of ``returns`` on ``factors`` is fitted, and the IC is
    the Pearson correlation between the fitted values and ``returns``. The
    model is fitted and evaluated on the same sample.

    :param factors: factor matrix (independent variables).
    :param returns: returns (dependent variable).
    :return: Pearson correlation between predicted and actual returns.
    """
    # Regress returns on the factors, then predict on the very same rows
    fitted_model = LinearRegression().fit(factors, returns)
    in_sample_prediction = fitted_model.predict(factors)

    # IC = correlation between prediction and realised return (p-value discarded)
    return pearsonr(in_sample_prediction, returns)[0]

In [None]:
# Helper: look up the trading date that precedes a given one
def get_previous_trading_date(current_date, trading_dates):
    """Return the entry located just before ``current_date`` in ``trading_dates``.

    :param current_date: the trading date to look up.
    :param trading_dates: collection of trading dates, assumed to be in
        chronological order. Any iterable is accepted (list, tuple, NumPy
        array, pandas Index/Series, ...), not only objects that expose a
        list-style ``.index()`` method.
    :return: the element immediately before ``current_date``, or None when
        ``current_date`` is the first element.
    :raises ValueError: if ``current_date`` is not present in ``trading_dates``.
    """
    # Materialise as a list so position lookup works for every container type
    dates = list(trading_dates)
    idx = dates.index(current_date)
    return dates[idx - 1] if idx > 0 else None

# Main per-trading-day processing function
def process_day(group, prev_period, back_period, trading_dates):
    """
    Split one trading day's rows into the night and day sessions, trim the
    edges of each session, and return the trimmed sessions concatenated.

    Session windows (both ends inclusive, compared against 'exchange_time'):
      * day session:   trading date 09:00:00 .. 15:00:00
      * night session: previous trading date 21:00:00 .. 02:30:00 of the
        calendar day after the previous trading date

    :param group: DataFrame with the rows of a single trading day; must provide
        'trading_date' and 'exchange_time' columns and contain at least one row.
    :param prev_period: number of rows dropped from the start of each session.
    :param back_period: number of rows dropped from the end of each session
        (non-negative; 0 drops nothing).
    :param trading_dates: all trading dates, used to look up the previous
        trading date.
    :return: DataFrame with the trimmed night-session rows followed by the
        trimmed day-session rows, re-indexed from 0.
    """

    def _session_time(date, clock):
        # Build "date at clock" without string concatenation. A Timestamp
        # formats as "YYYY-MM-DD 00:00:00", so f"{date} 09:00:00" would yield
        # an unparseable "YYYY-MM-DD 00:00:00 09:00:00". Going through str()
        # keeps every date representation that the string form accepted.
        return pd.to_datetime(str(date)).normalize() + pd.Timedelta(clock)

    def _trim(session):
        # Explicit stop index: a slice ending at -back_period is empty when
        # back_period == 0 because -0 == 0. Clamp at 0 so that a back_period
        # larger than the session still yields an empty frame.
        stop = max(len(session) - back_period, 0)
        return session.iloc[prev_period:stop]

    # Current trading date and the one before it (None for the first date)
    trading_date = group['trading_date'].iloc[0]
    previous_trading_date = get_previous_trading_date(trading_date, trading_dates)

    exchange_time = group['exchange_time']

    # Day session
    day_start = _session_time(trading_date, "09:00:00")
    day_end = _session_time(trading_date, "15:00:00")
    day_session = group[(exchange_time >= day_start) & (exchange_time <= day_end)]

    # Night session (only exists when there is a previous trading date)
    if previous_trading_date is not None:
        night_start = _session_time(previous_trading_date, "21:00:00")
        night_end = _session_time(previous_trading_date, "02:30:00") + pd.Timedelta(days=1)
        night_session = group[(exchange_time >= night_start) & (exchange_time <= night_end)]
    else:
        # Empty slice of `group` rather than a bare DataFrame, so the columns
        # and dtypes are preserved
        night_session = group.iloc[0:0]

    # Night session first, then the day session
    return pd.concat([_trim(night_session), _trim(day_session)], ignore_index=True)


# 提取指定时间范围内的数据
同时对涨跌停价做处理

In [None]:
# Load the raw data. A forward slash is used instead of r'source\ag' because
# the backslash form is only a path separator on Windows; the forward slash
# resolves on Windows and POSIX alike.
df = pd.read_parquet('source/ag')
df['trading_date'] = pd.to_datetime(df['trading_date'])

# Start and end of the sample window (both bounds inclusive)
start_time = pd.to_datetime('2023-07-01')
end_time = pd.to_datetime('2024-07-01')

# .copy(): `table` is assigned into below, so it must be an independent frame
# rather than a filtered slice of `df`
table = df[(df['trading_date'] >= start_time) & (df['trading_date'] <= end_time)].copy()

# Where AskPrice1 is 0, substitute BidPrice1
table['AskPrice1'] = table['AskPrice1'].where(table['AskPrice1'] != 0, table['BidPrice1'])

# Where BidPrice1 is 0, substitute AskPrice1 (the already-patched ask from the line above)
table['BidPrice1'] = table['BidPrice1'].where(table['BidPrice1'] != 0, table['AskPrice1'])

# 计算tick数据中的基本信息

In [9]:
# Row-over-row differences of Volume, OpenInterest and Turnover.
# NOTE(review): diff() runs over the whole table, so the first row of each
# trading day is differenced against the last row of the previous day —
# confirm this is intended.
for increment_col, source_col in (
    ('current_volume', 'Volume'),
    ('Position Increase', 'OpenInterest'),
    ('current_turnover', 'Turnover'),
):
    table[increment_col] = table[source_col].diff()

# NOTE(review): 15 is presumably the contract multiplier — TODO confirm
scaled_volume = table['current_volume'] * 15
table['current_avg_price'] = table['current_turnover'] / scaled_volume

table['mid_price'] = mid_price(table)
table['weighted_price'] = weighted_price(table)

# Total resting bid volume, total resting ask volume (levels 1-5), and their sum
bid_depth_cols = [f'BidVolume{level}' for level in range(1, 6)]
ask_depth_cols = [f'AskVolume{level}' for level in range(1, 6)]
table['BidVolume'] = table[bid_depth_cols].sum(axis=1)
table['AskVolume'] = table[ask_depth_cols].sum(axis=1)
table['Order_Volume'] = table['BidVolume'] + table['AskVolume']

In [10]:
# Flag each row as an active buy (+1), an active sell (-1) or neither (0):
#   +1 when 'last' is at or above the previous row's AskPrice1,
#   -1 when 'last' is at or below the previous row's BidPrice1.
# The sell rule is applied last, so it wins when both conditions hold.
prev_best_ask = table['AskPrice1'].shift(1)
prev_best_bid = table['BidPrice1'].shift(1)
is_active_buy = table['last'] >= prev_best_ask
is_active_sell = table['last'] <= prev_best_bid

table['buy_sell_signal'] = 0
table.loc[is_active_buy, 'buy_sell_signal'] = 1
table.loc[is_active_sell, 'buy_sell_signal'] = -1


In [11]:
# # 构造基础信息：成交价格对，以及以高价成交和低价成交的比例
# table['前一期成交价格对'] = 'BidPrice1 and AskPrice1'
# table['前一期Low Price比例'] = (table['current_avg_price'] - table['AskPrice1'].shift(1))/(table['BidPrice1'].shift(1) - table['AskPrice1'].shift(1))
# table['前一期High Price比例'] = 1 - table['前一期Low Price比例']
# table['前一期High Price比例'] = 1 - table['前一期Low Price比例']
# table.loc[(table['前一期High Price比例'] > 1),'前一期成交价格对'] = 'AskPrice1 and AskPrice2'
# table.loc[(table['前一期Low Price比例'] > 1),'前一期成交价格对'] = 'BidPrice2 and BidPrice1'
# table['前一期Low Price比例'].loc[(table['前一期High Price比例'] > 1)] = (table['current_avg_price'] - table['AskPrice2'].shift(1))/(table['AskPrice1'].shift(1) - table['AskPrice2'].shift(1))
# table['前一期Low Price比例'].loc[(table['前一期Low Price比例'] > 1)] = (table['current_avg_price'] - table['BidPrice1'].shift(1))/(table['BidPrice2'].shift(1) - table['BidPrice1'].shift(1))
# table['前一期High Price比例'] = 1 - table['前一期Low Price比例']
# table.loc[(table['前一期High Price比例'] > 1),'前一期成交价格对'] = 'AskPrice2 and AskPrice3'
# table.loc[(table['前一期Low Price比例'] > 1),'前一期成交价格对'] = 'BidPrice3 and BidPrice2'
# table['前一期Low Price比例'].loc[(table['前一期High Price比例'] > 1)] = (table['current_avg_price'] - table['AskPrice3'].shift(1))/(table['AskPrice2'].shift(1) - table['AskPrice3'].shift(1))
# table['前一期Low Price比例'].loc[(table['前一期Low Price比例'] > 1)] = (table['current_avg_price'] - table['BidPrice2'].shift(1))/(table['BidPrice3'].shift(1) - table['BidPrice2'].shift(1))
# table['前一期High Price比例'] = 1 - table['前一期Low Price比例']
# table.loc[(table['前一期High Price比例'] > 1),'前一期成交价格对'] = 'AskPrice3 and AskPrice4'
# table.loc[(table['前一期Low Price比例'] > 1),'前一期成交价格对'] = 'BidPrice4 and BidPrice3'
# table['前一期Low Price比例'].loc[(table['前一期High Price比例'] > 1)] = (table['current_avg_price'] - table['AskPrice4'].shift(1))/(table['AskPrice3'].shift(1) - table['AskPrice4'].shift(1))
# table['前一期Low Price比例'].loc[(table['前一期Low Price比例'] > 1)] = (table['current_avg_price'] - table['BidPrice3'].shift(1))/(table['BidPrice4'].shift(1) - table['BidPrice3'].shift(1))
# table['前一期High Price比例'] = 1 - table['前一期Low Price比例']


In [12]:
# Forward returns (the y values): frt_N = mid_price N rows ahead minus the
# current mid_price, for N = 20, 40, ..., 200.
time_list = list(range(20, 201, 20))
mid_series = table['mid_price']
for horizon in time_list:
    # diff(-N) is current minus N-rows-ahead, hence the sign flip
    table[f'frt_{horizon}'] = -mid_series.diff(periods=-horizon)

In [13]:
# Handle missing values.
# Results are assigned back to the column instead of calling
# `table[col].fillna(..., inplace=True)`: that form mutates a temporary Series,
# and under pandas Copy-on-Write it leaves `table` unchanged without raising.

# Increment columns and the trade-direction flag: missing -> 0
for column in ('current_volume', 'Position Increase', 'current_turnover', 'buy_sell_signal'):
    table[column] = table[column].fillna(0)

# Average trade price: +/-inf (division by a zero volume increment) -> NaN,
# then carry the last valid price forward
table['current_avg_price'] = (
    table['current_avg_price']
    .replace([np.inf, -np.inf], np.nan)
    .ffill()
)

# Forward returns: the trailing rows without a future price -> 0
for i in time_list:
    table[f'frt_{i}'] = table[f'frt_{i}'].fillna(0)

# Disabled: cleaning for the columns of the commented-out price-pair cell.
# table['前一期Low Price比例'] = table['前一期Low Price比例'].replace([np.inf, -np.inf], np.nan).fillna(0)
# table['前一期High Price比例'] = table['前一期High Price比例'].replace([np.inf, -np.inf], np.nan).fillna(0)


In [14]:
# table['std_price_diff'] = table['mid_price'].diff()/table['mid_price'].rolling(5).std()
# table['std_price_diff'].replace([np.inf, -np.inf], np.nan, inplace=True)
# table['std_price_diff'].fillna(0, inplace=True)

# table['bid_volume'] = table['current_volume'] * norm.pdf(table['std_price_diff'])
# table['ask_volume'] = table['current_volume']  - table['bid_volume']

In [15]:
# Replace the current index with a fresh 0..n-1 index; the old index values are discarded (drop=True).
table = table.reset_index(drop=True)

# 构造因子

In [16]:
# # 应用傅里叶变换
# fft_result = np.fft.fft(table['current_volume'])
# freqs = np.fft.fftfreq(len(table['current_volume']))

# # 逆傅里叶变换
# filtered = fft_result.copy()
# filtered[np.abs(freqs) < 0.01] = 0
# inverse_fft = np.fft.ifft(filtered)

# # 绘制原始信号和分解后的信号
# plt.figure(figsize=(14, 7))
# plt.plot(table['current_volume'], label='Original current_volume')
# plt.plot(np.real(inverse_fft), label='Reconstructed from Sine Waves')
# plt.legend()

In [17]:
# abc因子
# table['factor'] = -np.log((table['BidVolume1'] - table[['BidVolume1', 'BidVolume2', 'BidVolume3', 'BidVolume4', 'BidVolume5']].max(axis=1))/(table['AskVolume1'] - table[['AskVolume1', 'AskVolume2', 'AskVolume3', 'AskVolume4', 'AskVolume5']].max(axis=1)))
# # table['factor']= -np.log((table['BidVolume1'] - table['BidVolume2'])/(table['AskVolume1'] - table['AskVolume2']))
# table['factor'].replace([np.inf, -np.inf], np.nan, inplace=True)
# table['factor'].fillna(0,inplace=True)

In [18]:
# table['factor'] = table['weighted_price'].rolling(120).corr(table['current_volume']/table['current_volume'].rolling(120).sum())
# table['factor'].replace([np.inf, -np.inf], np.nan, inplace=True)
# table['factor'].fillna(0,inplace=True)

In [19]:
# table['主动买'] = 0
# table.loc[table['last'] >= table['AskPrice1'].shift(1),'主动买'] = 1
# table['主动卖'] = 0
# table.loc[table['last'] <= table['BidPrice1'].shift(1),'主动卖'] = 1
# table['factor'] = np.log((table['主动买'].rolling(120).sum() + 0.00001)/(table['主动卖'].rolling(120).sum()+0.00001))
# table['factor'].fillna(0,inplace=True)

In [20]:
# table['Bid_factor'] = table['BidVolume1']/table['BidVolume1'].rolling(60).mean()
# table['Ask_factor'] = table['AskVolume1']/table['AskVolume1'].rolling(60).mean()
# table['Ln'] = np.log(table['Bid_factor']/table['Ask_factor'])
# # table['factor'] = (1-0.3 * (np.abs(table['Ln']) >= 2.5)*np.log(np.abs(table['Ln'])))*table['Ln']
# table['factor'] = table['Ln']
# table['factor'].fillna(0,inplace=True)

In [21]:
# table = create_lagged_features(table,120)
# columns_to_adjust_Bid = [col for col in table.columns if 'BidVolume1' in col]
# columns_to_adjust_Ask = [col for col in table.columns if 'AskVolume1' in col]
# table[columns_to_adjust_Bid].fillna(method='bfill',inplace=True)
# table[columns_to_adjust_Ask].fillna(method='bfill',inplace=True)

# Bid_volumes = table[columns_to_adjust_Bid]
# values = Bid_volumes[columns_to_adjust_Bid].values

# # 计算均值、标准差和偏度
# means = np.mean(values, axis=1)
# stds = np.std(values, axis=1, ddof=1)
# skewness_Bid = np.sum(((values - means[:, np.newaxis]) / stds[:, np.newaxis])**3, axis=1) / values.shape[1]

# # 将偏度结果存入 DataFrame
# Bid_volumes['skewness'] = skewness_Bid
# Bid_volumes.fillna(0,inplace=True)


# Ask_volumes = table[columns_to_adjust_Ask]
# values = Ask_volumes[columns_to_adjust_Ask].values

# # 计算均值、标准差和偏度
# means = np.mean(values, axis=1)
# stds = np.std(values, axis=1, ddof=1)
# skewness_Ask = np.sum(((values - means[:, np.newaxis]) / stds[:, np.newaxis])**3, axis=1) / values.shape[1]

# # 将偏度结果存入 DataFrame
# Ask_volumes['skewness'] = skewness_Ask
# Ask_volumes.fillna(0, inplace=True)



# table['factor'] = Bid_volumes['skewness']-Ask_volumes['skewness']
# table['factor'].replace([np.inf,-np.inf],0,inplace=True)
# table['factor'].fillna(0,inplace=True)

In [22]:
# # 买卖斜率因子，在AG上IC：0.035
# table['Bid_Slope'] = table['BidVolume1'].diff()/(table['BidPrice1'].diff()+0.001)
# table['std_Bid_Slope'] = (table['Bid_Slope'] - table['Bid_Slope'].rolling(120).mean())/table['Bid_Slope'].rolling(120).std()
# table['Ask_Slope'] = table['AskVolume1'].diff()/(table['AskPrice1'].diff()+0.001)
# table['std_Ask_Slope'] = (table['Ask_Slope'] - table['Ask_Slope'].rolling(120).mean())/table['Ask_Slope'].rolling(120).std()
# table['Slope_factor'] = table['std_Bid_Slope'] - table['std_Ask_Slope']
# table['Slope_factor'].replace([np.inf, -np.inf], np.nan, inplace=True)
# table['Slope_factor'].fillna(0,inplace=True)


In [23]:
#在AG上的IC:0.018
# table['Bid_Slope'] = (table['BidVolume1'] - table['BidVolume2'])/(table['BidPrice1'] - table['BidPrice2'])
# table['Ask_Slope'] = (table['AskVolume1'] - table['AskVolume2'])/(table['AskPrice1'] - table['AskPrice2'])
# table['std_factor'] = table['Bid_Slope'] + table['Ask_Slope']
# table['factor'] = (
# 1
# + 0.2 * (np.abs(table['std_factor']) >= 90)
# * np.log(np.abs(table['std_factor'] / 90))
# ) * (1/(1+np.exp(-table['std_factor']/15)) - 0.5)
# table['factor'].fillna(0,inplace=True)

In [24]:
# # 偏度因子
# columns_to_adjust = ['BidVolume1', 'BidVolume2', 'BidVolume3', 'BidVolume4', 'BidVolume5']
# Bid_volumes = table[columns_to_adjust]
# # Bid_volumes['mean'] = Bid_volumes.mean(axis=1)
# # Bid_volumes[columns_to_adjust] = Bid_volumes[columns_to_adjust].subtract(Bid_volumes['mean'],axis=0)
# # 提取数值数据为 NumPy 数组
# values = Bid_volumes[columns_to_adjust].values

# # 计算均值、标准差和偏度
# means = np.mean(values, axis=1)
# stds = np.std(values, axis=1, ddof=1)
# skewness = np.sum(((values - means[:, np.newaxis]) / stds[:, np.newaxis])**3, axis=1) / values.shape[1]


# # 将偏度结果存入 DataFrame
# Bid_volumes['skewness'] = skewness



In [25]:
# columns_to_adjust = ['AskVolume1', 'AskVolume2', 'AskVolume3', 'AskVolume4', 'AskVolume5']
# Ask_volumes = table[columns_to_adjust]
# # Ask_volumes['mean'] = Ask_volumes.mean(axis=1)
# # Ask_volumes[columns_to_adjust] = Ask_volumes[columns_to_adjust].subtract(Ask_volumes['mean'],axis=0)
# # 提取数值数据为 NumPy 数组
# values = Ask_volumes[columns_to_adjust].values

# # 计算均值、标准差和偏度
# means = np.mean(values, axis=1)
# stds = np.std(values, axis=1, ddof=1)
# skewness = np.sum(((values - means[:, np.newaxis]) / stds[:, np.newaxis])**3, axis=1) / values.shape[1]


# # 将偏度结果存入 DataFrame
# Ask_volumes['skewness'] = skewness


# table['skew_factor'] = Bid_volumes['skewness'] - Ask_volumes['skewness']
# # table['factor'] = (table['skew_factor'] - table['skew_factor'].rolling(20).mean())/table['skew_factor'].rolling(20).std()
# table['factor'] = table['skew_factor']
# table['factor'].replace([np.inf,-np.inf],np.nan,inplace=True)
# table['factor'].fillna(0,inplace=True)

In [26]:
# table['Difference in Trading Pct'] = table['前一期High Price比例'] - table['前一期Low Price比例']
# table['factor'] =  table['Difference in Trading Pct']
# table['New_Difference in Trading Pct'] = 0
# table['New_Difference in Trading Pct'].loc[table['前一期成交价格对'] == 'BidPrice1 and AskPrice1'] = table['Difference in Trading Pct']
# table['New_Difference in Trading Pct'].loc[(table['前一期成交价格对'] == 'AskPrice1 and AskPrice2')] = 1.0 + table['Difference in Trading Pct']
# table['New_Difference in Trading Pct'].loc[(table['前一期成交价格对'] == 'AskPrice2 and AskPrice3')] = 2.0 + table['Difference in Trading Pct']
# table['New_Difference in Trading Pct'].loc[(table['前一期成交价格对'] == 'AskPrice3 and AskPrice4')] = 2.8 + table['Difference in Trading Pct']
# table['New_Difference in Trading Pct'].loc[(table['前一期成交价格对'] == 'BidPrice2 and BidPrice1')] = -1.0 + table['Difference in Trading Pct']
# table['New_Difference in Trading Pct'].loc[(table['前一期成交价格对'] == 'BidPrice3 and BidPrice2')] = -2.0 + table['Difference in Trading Pct']
# table['New_Difference in Trading Pct'].loc[(table['前一期成交价格对'] == 'BidPrice4 and BidPrice3')] = -2.8 + table['Difference in Trading Pct']
# table['factor'] = table['New_Difference in Trading Pct']

In [27]:
# IC:0.058
# table['price_diff'] = table['current_avg_price'] - table['mid_price']
# table['std_factor'] = table['current_volume'].rolling(5).std() * table['price_diff']
# table['factor'] = (
# 1
# + 0.2 * (np.abs(table['std_factor']) >= 25)
# * np.log(np.abs(table['std_factor'] / 25))
# ) * (1/(1+np.exp(-table['std_factor']/5)) - 0.5)
# table['factor'].fillna(0,inplace=True)


In [28]:
# # 压力因子
# ask_weights = [f'Askweight{i}' for i in range(1, 6)]
# bid_weights = [f'Bidweight{i}' for i in range(1, 6)]
# ask_volumes = [f'AskVolume{i}' for i in range(1, 6)]
# bid_volumes = [f'BidVolume{i}' for i in range(1, 6)]

# # 计算 Askweight 和 Bidweight
# for i in range(5):
#     table[ask_weights[i]] = table['current_avg_price'] / (table[f'AskPrice{i+1}'] - table['current_avg_price'] + 0.00000123787)
#     table[bid_weights[i]] = -table['current_avg_price'] / (table[f'BidPrice{i+1}'] - table['current_avg_price'] + 0.00000123787)

# # 计算卖盘压力 press_ask
# ask_weight_sum = table[ask_weights].sum(axis=1)
# table['press_ask'] = sum(table[ask_volumes[i]] * table[ask_weights[i]] for i in range(5)) / ask_weight_sum

# # 计算买盘压力 press_bid
# bid_weight_sum = table[bid_weights].sum(axis=1)
# table['press_bid'] = sum(table[bid_volumes[i]] * table[bid_weights[i]] for i in range(5)) / bid_weight_sum

# table['Press_factor'] = -np.log(table['press_ask']/table['press_bid'])
# table['Press_factor'].replace([np.inf, -np.inf], np.nan, inplace=True)
# table['Press_factor'].fillna(0,inplace=True)
# # # 计算因子 factor
# # table['factor'] = table['Press_factor']
# # # 填充缺失值
# # table['factor'].fillna(0, inplace=True)

In [29]:
# table['BAV_diff'] = table['BidVolume1'].diff() - table['AskVolume1'].diff()
# table['BAV_Diff_Transform'] = (
# 1
# + 0.2 * (np.abs(table['BAV_diff']) >= 6*15)
# * np.log(np.abs(table['BAV_diff'] / (6*15)))
# ) * (1 / (1 + np.exp(-table['BAV_diff'] / 15)) - 0.5)
# table['BAV_Diff_Transform'].fillna(0,inplace=True)

In [30]:
# table['delta_vol_Bid'] = table['BidVolume1'].diff() * (table['BidPrice1'] == table['BidPrice1'].shift(1)) + table['BidVolume1'] * (table['BidPrice1'] > table['BidPrice1'].shift(1)) + (table['BidVolume1']-table['BidVolume2'].shift(1)) * (table['BidPrice1'] < table['BidPrice1'].shift(1))
# table['delta_vol_Ask'] = table['AskVolume1'].diff() * (table['AskPrice1'] == table['AskPrice1'].shift(1)) + table['AskVolume1'] * (table['AskPrice1'] < table['AskPrice1'].shift(1)) + (table['AskVolume1']-table['AskVolume2'].shift(1)) * (table['AskPrice1'] > table['AskPrice1'].shift(1))
# table['volume_order_imbalance'] = (table['delta_vol_Bid'] - table['delta_vol_Ask'])
# # table['factor'] = (
# # 1
# # + 0.2 * (np.abs(table['std_factor']) >= 25)
# # * np.log(np.abs(table['std_factor'] / 25))
# # ) * (1/(1+np.exp(-table['std_factor']/5)) - 0.5)
# table['volume_order_imbalance'].fillna(0,inplace=True)

In [31]:
# # 根据 BidPrice1 和 BidPrice1.shift(1) 创建分类变量
# conditions = (table['BidPrice1'] < table['BidPrice1'].shift(1)) * -1 + \
#              (table['BidPrice1'] == table['BidPrice1'].shift(1)) * 0 + \
#              (table['BidPrice1'] > table['BidPrice1'].shift(1)) * 1
             
# # 绘制直方图
# plt.figure(figsize=(10, 6))
# plt.hist(conditions, bins=3, alpha=0.7, edgecolor='black', rwidth=0.8)

# # 设置标签和标题
# plt.xlabel('Condition Value')
# plt.ylabel('Frequency')
# plt.title('Frequency of Conditions: BidPrice1 vs BidPrice1.shift(1)')
# plt.xticks([-1, 0, 1], ['BidPrice1 < BidPrice1.shift(1)', 'BidPrice1 == BidPrice1.shift(1)', 'BidPrice1 > BidPrice1.shift(1)'])
# plt.grid(True)
# plt.show()


In [32]:
# # 仅筛选出 -1 和 1 的条件
# filtered_conditions = conditions[(conditions == -1) | (conditions == 1)]

# # 绘制直方图
# plt.figure(figsize=(10, 6))
# plt.hist(filtered_conditions, bins=2, alpha=0.7, edgecolor='black', rwidth=0.8)

# # 设置标签和标题
# plt.xlabel('Condition Value')
# plt.ylabel('Frequency')
# plt.title('Frequency of Conditions: BidPrice1 < BidPrice1.shift(1) vs BidPrice1 > BidPrice1.shift(1)')
# plt.xticks([-1, 1], ['BidPrice1 < BidPrice1.shift(1)', 'BidPrice1 > BidPrice1.shift(1)'])
# plt.grid(True)
# plt.show()

In [33]:
# IC:0.056
# table['Ask_diff'] = table['AskVolume1'] - table['AskVolume2']
# table['Bid_diff'] = table['BidVolume1'] - table['BidVolume2']
# table['factor'] = table['Bid_diff'].diff() - table['Ask_diff'].diff()
# table['factor'] = (
# 1
# + 0.2 * (np.abs(table['factor']) >= 25)
# * np.log(np.abs(table['factor'] / 25))
# ) * (1/(1+np.exp(-table['factor']/5)) - 0.5)
# table['factor'].fillna(0,inplace=True)



In [34]:
# table = table[(table['BAV_diff']<-50) | (table['BAV_diff']>50)]
# table = table[(table['BAV_diff']>=-100) & (table['BAV_diff']<=100)]


In [35]:
# table['factor1_weight'] = 0
# table['factor1_weight'].loc[table['主动买or主动卖']=='主动买'] = 0.2
# table['factor1_weight'].loc[(table['前一期成交价格对'] == 'AskPrice1 and AskPrice2')] = 1.2
# table['factor1_weight'].loc[(table['前一期成交价格对'] == 'AskPrice2 and AskPrice3')] = 1.8
# table['factor1_weight'].loc[table['主动买or主动卖']=='主动卖'] = -0.2
# table['factor1_weight'].loc[(table['前一期成交价格对'] == 'BidPrice2 and BidPrice1')] = -1.2
# table['factor1_weight'].loc[(table['前一期成交价格对'] == 'BidPrice3 and BidPrice2')] = -1.8
# table['factor'] = table['factor1_weight']*table['current_volume']

In [36]:
# table['factor'] = table['Position Increase'].rolling(120).mean()/table['current_volume'].rolling(120).mean()
# table['factor'].fillna(0,inplace=True)

In [37]:
# ask_weights = [f'Askweight{i}' for i in range(1, 6)]
# bid_weights = [f'Bidweight{i}' for i in range(1, 6)]
# ask_volumes = [f'AskVolume{i}' for i in range(1, 6)]
# bid_volumes = [f'BidVolume{i}' for i in range(1, 6)]
# ask_price = [f'AskPrice{i}' for i in range(1, 6)]
# bid_price = [f'BidPrice{i}' for i in range(1, 6)]

# # 计算 Askweight 和 Bidweight
# for i in range(5):
#     table[ask_weights[i]] = table[f'AskVolume{i+1}'] / table[ask_volumes].sum(axis=1)
#     table[bid_weights[i]] = table[f'BidVolume{i+1}'] / table[bid_volumes].sum(axis=1)

# # 计算卖盘压力 press_ask
# table['press_ask'] = sum(table[ask_price[i]] * table[ask_weights[i]] for i in range(5)) 
# # 计算买盘压力 press_bid
# table['press_bid'] = sum(table[bid_price[i]] * table[bid_weights[i]] for i in range(5))

# table['factor'] = table['press_ask'] + table['press_bid'] - 2*table['weighted_price']

# # 填充缺失值
# table['factor'].fillna(0, inplace=True)

In [38]:
# table['factor'] = np.log(table['current_volume'].rolling(5).std()+1)
# table['factor'] = np.log(table['factor']+1)
# table['factor'] = np.log(table['BidVolume1'].rolling(5).std())/np.log(table['AskVolume1'].rolling(5).std())

In [39]:
# table['factor'] = table['mid_price'].diff(120)/np.abs(table['mid_price'].diff()).rolling(120).sum()
# table['factor'].replace([np.inf, -np.inf], np.nan, inplace=True)
# table['factor'].fillna(0,inplace=True)

In [40]:
# def rolling_fft(fft_current_volume):
#     # 计算每个窗口的 FFT
#     data = fft_current_volume.copy()
#     data.reset_index(drop=True)
#     # 计算每个窗口的频率
#     freqs = np.fft.fftfreq(len(data))
#     pows = np.abs(data)*(freqs > 0)
#     # 留下能量最高的频率
#     fund_freq = freqs[pows.argmax()]
#     # 寻找需要抹掉的噪声频率的索引
#     noised_indices = np.where(freqs != fund_freq)

#     # 复制一个数组以避免修改原始数据
#     filter_complex_array = data.copy()
#     filter_complex_array.reset_index(drop=True, inplace=True)

#     # 将噪声频率的复数部分置为零
#     filter_complex_array[noised_indices] = 0

#     filter_sigs = fft.ifft(filter_complex_array)

#     return filter_sigs 





In [41]:
# import numpy as np
# import pandas as pd
# from scipy import fft

# # 定义 rolling_fft 函数，用于处理每个滚动窗口的数据
# def rolling_fft(data_window):
#     # 对滚动窗口的数据进行 FFT
#     fft_data = np.fft.fft(data_window)

#     freqs = np.fft.fftfreq(len(data_window))
#     pows = np.abs(fft_data)**2
    
#     # 寻找需要抹掉的噪声频率的索引
#     noised_indices = np.where(np.argsort(pows)[:-10])

#     # 复制一个数组以避免修改原始数据
#     filter_complex_array = fft_data.copy()

#     # 将噪声频率的复数部分置为零
#     filter_complex_array[noised_indices] = 0

#     filtered_signal = np.fft.ifft(filter_complex_array)
    
#     return np.mean(filtered_signal.real)

# # 现在我们可以对每个滚动窗口进行 FFT 和滤波
# # 使用 rolling().apply() 处理 'current_volume' 列
# table['rolling_fft_current_volume'] = np.log(table['current_volume']+1).rolling(window=60).apply(lambda x: rolling_fft(x))
# table['rolling_fft_current_volume'].fillna(0, inplace=True)
# # 删除辅助列 'max_volume_idx'，如果不需要
# # table = table.drop(columns=['max_volume_idx'])
# table['factor'] = table['rolling_fft_current_volume']
# table['factor'].fillna(0,inplace=True)


In [42]:
# import numpy as np
# import pandas as pd
# from scipy import fft

# # 定义 rolling_fft 函数，用于处理每个滚动窗口的数据
# def rolling_fft(data_window):
#     # 对滚动窗口的数据进行 FFT
#     fft_data = np.fft.fft(data_window)

#     freqs = np.fft.fftfreq(len(data_window))
#     pows = np.abs(fft_data)**2

#     # 留下能量最高的频率
#     fund_freq = freqs[pows.argmax()]
#     noised_indices = np.where(freqs != fund_freq)

#     # 复制一个数组以避免修改原始数据
#     filter_complex_array = fft_data.copy()

#     # 将噪声频率的复数部分置为零
#     filter_complex_array[noised_indices] = 0

#     filter_sigs = fft.ifft(filter_complex_array)

#     return np.mean(filter_sigs)


# # 假设 'table' 是一个包含 'current_volume' 列的 DataFrame

# # 现在我们可以对每个滚动窗口进行 FFT 和滤波
# # 使用 rolling().apply() 处理 'current_volume' 列
# table['rolling_fft_current_volume'] = np.log(table['current_volume']+1).rolling(window=120).apply(lambda x: rolling_fft(x))
# table['rolling_fft_current_volume'].fillna(0, inplace=True)

# table['factor'] = table['rolling_fft_current_volume']

# table['sign'] = np.sign(2*table['current_avg_price'] - table['BidPrice1'] - table['AskPrice1'])
# table['max_volume_idx'] = table['current_volume'].rolling(window=20, min_periods=1).apply(lambda x: x.idxmax(), raw=False)
# # 用这些索引来获取对应的 sign
# table['max_sign'] = table['max_volume_idx'].map(table['sign'])
# 删除辅助列 'max_volume_idx'，如果不需要
# table = table.drop(columns=['max_volume_idx'])
# table['factor'] = table['max_sign'] * table['rolling_fft_current_volume']

In [43]:
# table['factor'] = np.log(1+table['current_volume'])

In [44]:
# table['factor'] = (table['mid_price'].rolling(60).max()-table['mid_price'].shift(60))/(table['mid_price'].rolling(60).max() - table['mid_price'].rolling(60).min())
# table['factor'].replace([np.inf, -np.inf], np.nan, inplace=True)
# table['factor'].fillna(0, inplace=True)

In [45]:
# table['factor'] = np.log(table['current_avg_price']/table['mid_price'])

In [46]:
# table['factor'] = table['current_avg_price'] - table['mid_price']
# table['factor'] = table['weighted_price'] - table['mid_price']

In [47]:
# window_size = 2 
# scalar = 0.2
# table['ratio'] = table['BidVolume1']/(table['BidVolume1'] + table['AskVolume1'])
# table['pending_vol_ratio_factor'] = scalar * table['BidPrice1'].diff(window_size)/(table['BidPrice1'] - table['BidPrice2']) + table['ratio'].diff(window_size)
# table['pending_vol_ratio_factor'].fillna(0,inplace=True)

In [48]:
# table['sign'] = 1*(table['BidPrice1'] < table['BidPrice1'].rolling(8,min_periods=1).min()) - 1*(table['AskPrice1'].rolling(8,min_periods=1).max() <= table['AskPrice1'])
# table['sign'].fillna(0,inplace=True)
# table['factor'] = table['sign']
# table['factor'].fillna(0,inplace=True)


In [49]:
# window_size = 10
# table['Bid_submit_price'] = (table['BidPrice1'] * table['BidVolume1'] + table['BidPrice2'] * table['BidVolume2'] + table['BidPrice3'] * table['BidVolume3'] + table['BidPrice4'] * table['BidVolume4'] + table['BidPrice5'] * table['BidVolume5'])/table['BidVolume']
# table['Ask_submit_price'] = (table['AskPrice1'] * table['AskVolume1'] + table['AskPrice2'] * table['AskVolume2'] + table['AskPrice3'] * table['AskVolume3'] + table['AskPrice4'] * table['AskVolume4'] + table['AskPrice5'] * table['AskVolume5'])/table['AskVolume']
# table['std_factor'] = (table['Ask_submit_price'] - table['mid_price']) - (table['mid_price'] - table['Bid_submit_price'])
# # table['std_factor'] = (table['Ask_submit_price'] - table['weighted_price']) - (table['weighted_price'] - table['Bid_submit_price'])

# # table['factor'] = table['std_factor']
# table['factor'] = (table['std_factor'] - table['std_factor'].rolling(window_size,min_periods=1).mean())/table['std_factor'].rolling(window_size,min_periods=1).std()
# table['factor'].replace([np.inf, -np.inf], np.nan, inplace=True)
# table['factor'].fillna(0, inplace=True)

In [50]:
# table[['factor','std_factor']].corr()

In [51]:
# window_size = 20
# # table['Bid_ratio'] = table['BidVolume']/table['BidVolume1']
# # table['Ask_ratio'] = table['AskVolume']/table['AskVolume1']
# table['Bid_ratio'] = table['BidVolume']/table['BidVolume1']
# table['Ask_ratio'] = table['AskVolume']/table['AskVolume1']
# table['std_factor'] = table['Ask_ratio'] - table['Bid_ratio']
# # table['factor'] = table['std_factor']
# table['relative_vol_ratio_imbalance'] = (table['std_factor'] - table['std_factor'].rolling(window_size,min_periods=1).mean())/table['std_factor'].rolling(window_size,min_periods=1).std()
# table['relative_vol_ratio_imbalance'].replace([np.inf, -np.inf], np.nan, inplace=True)
# table['relative_vol_ratio_imbalance'].fillna(0,inplace=True)

In [None]:
# table['BAV_diff'] = table['BidVolume1'].diff() - table['AskVolume1'].diff()
# # table['BAV_diff_transform'] = table['BAV_diff']
# # table['BAV_diff_transform'].fillna(0,inplace=True)
# table['BAV_diff_transform'] = (
# 1
# + 0.2 * (np.abs(table['BAV_diff']) >= 6*15)
# * np.log(np.abs(table['BAV_diff'] / (6*15)))
# ) * (1 / (1 + np.exp(-table['BAV_diff'] / 15)) - 0.5)
# table['BAV_diff_transform'].fillna(0,inplace=True)

# table['Bid_Slope'] = table['BidVolume1'].diff()/(table['BidPrice1'].diff()+0.001)
# table['std_Bid_Slope'] = (table['Bid_Slope'] - table['Bid_Slope'].rolling(120).mean())/table['Bid_Slope'].rolling(120).std()
# table['Ask_Slope'] = table['AskVolume1'].diff()/(table['AskPrice1'].diff()+0.001)
# table['std_Ask_Slope'] = (table['Ask_Slope'] - table['Ask_Slope'].rolling(120).mean())/table['Ask_Slope'].rolling(120).std()
# table['slope_factor'] = table['std_Bid_Slope'] - table['std_Ask_Slope']
# table['slope_factor'].replace([np.inf, -np.inf], np.nan, inplace=True)
# table['slope_factor'].fillna(0,inplace=True)

# table['delta_vol_Bid'] = table['BidVolume1'].diff() * (table['BidPrice1'] == table['BidPrice1'].shift(1)) + table['BidVolume1'] *(table['BidPrice1'] > table['BidPrice1'].shift(1)) + (table['BidVolume1']-table['BidVolume2'].shift(1)) * (table['BidPrice1'] < table['BidPrice1'].shift(1))
# table['delta_vol_Ask'] = table['AskVolume1'].diff() * (table['AskPrice1'] == table['AskPrice1'].shift(1)) + table['AskVolume1'] *(table['AskPrice1'] < table['AskPrice1'].shift(1)) + (table['AskVolume1']-table['AskVolume2'].shift(1)) * (table['AskPrice1'] > table['AskPrice1'].shift(1))
# table['Volume_Order_Imbalance'] = table['delta_vol_Bid'] - table['delta_vol_Ask']
# table['Volume_Order_Imbalance'].fillna(0,inplace=True)

# table['Base_factor'] = -(table['AskVolume1']-table['BidVolume1'])/(table['AskVolume1']+table['BidVolume1'])

# table['ratio'] = table['BidVolume1']/(table['BidVolume1'] + table['AskVolume1'])
# table['pending_vol_ratio_factor'] = 0.1 * table['BidPrice1'].diff(2)/(table['BidPrice1'] - table['BidPrice2']) + table['ratio'].diff(2)
# table['pending_vol_ratio_factor'].fillna(0,inplace=True)

# table['BidVolume'] = table[['BidVolume1','BidVolume2','BidVolume3','BidVolume4','BidVolume5']].sum(axis=1)
# table['AskVolume'] = table[['AskVolume1','AskVolume2','AskVolume3','AskVolume4','AskVolume5']].sum(axis=1)
# table['Bid_ratio'] = table['BidVolume']/table['BidVolume1']
# table['Ask_ratio'] = table['AskVolume']/table['AskVolume1']
# table['relative_vol_ratio_diff'] = table['Ask_ratio'] - table['Bid_ratio']
# table['relative_vol_ratio_imbalance'] = (table['relative_vol_ratio_diff'] - table['relative_vol_ratio_diff'].rolling(20,min_periods=1).mean())/table['relative_vol_ratio_diff'].rolling(20,min_periods=1).std()
# table['relative_vol_ratio_imbalance'].replace([np.inf, -np.inf], np.nan, inplace=True)
# table['relative_vol_ratio_imbalance'].fillna(0,inplace=True)

# table['Bid_submit_price'] = (table['BidPrice1'] * table['BidVolume1'] + table['BidPrice2'] * table['BidVolume2'] + table['BidPrice3'] * table['BidVolume3'] + table['BidPrice4'] * table['BidVolume4'] + table['BidPrice5'] * table['BidVolume5'])/table['BidVolume']
# table['Ask_submit_price'] = (table['AskPrice1'] * table['AskVolume1'] + table['AskPrice2'] * table['AskVolume2'] + table['AskPrice3'] * table['AskVolume3'] + table['AskPrice4'] * table['AskVolume4'] + table['AskPrice5'] * table['AskVolume5'])/table['AskVolume']
# table['std_factor'] = (table['Ask_submit_price'] - table['mid_price']) - (table['mid_price'] - table['Bid_submit_price'])
# # table['std_factor'] = (table['Ask_submit_price'] - table['weighted_price']) - (table['weighted_price'] - table['Bid_submit_price'])

# # table['factor'] = table['std_factor']
# table['submit_price_imbalance'] = (table['std_factor'] - table['std_factor'].rolling(10,min_periods=1).mean())/table['std_factor'].rolling(10,min_periods=1).std()
# table['submit_price_imbalance'].replace([np.inf, -np.inf], np.nan, inplace=True)
# table['submit_price_imbalance'].fillna(0, inplace=True)

In [53]:
def std_factor(X, window=10):
    """Smooth a factor (or a frame of factors) with a trailing rolling mean.

    Despite the name, no standardisation (z-scoring) is performed: the
    result is simply the rolling mean of ``X``.

    Parameters
    ----------
    X : pandas.Series or pandas.DataFrame
        Factor values ordered in time.
    window : int, default 10
        Number of trailing rows in each rolling window. The default keeps
        the behaviour of the original hard-coded implementation.

    Returns
    -------
    pandas.Series or pandas.DataFrame
        Rolling mean of ``X``; the first ``window - 1`` rows are NaN because
        the window is not yet full.
    """
    return X.rolling(window).mean()

In [54]:
def create_lagged_features(df, window, factor):
    """Add rolling-mean versions of the given factor columns to ``df``.

    For every window length ``i`` in ``window`` and every column name ``j``
    in ``factor``, a column named ``f'{j}_lag{i}'`` is written to ``df``
    holding the trailing ``i``-row rolling mean of ``df[j]``. Rows where the
    rolling mean is NaN (e.g. the warm-up rows before the window is full)
    are filled with 0.

    Note that the new columns are rolling means, not shifted values, even
    though they are named ``*_lag*``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns listed in ``factor``. It is modified
        in place and also returned.
    window : iterable of int
        Rolling window lengths.
    factor : iterable of str
        Names of the columns to smooth.

    Returns
    -------
    tuple
        ``(df, new_factor_list)`` where ``new_factor_list`` lists the names
        of the created columns, ordered by window first and factor second.
    """
    new_factor_list = []
    for i in window:
        for j in factor:
            name = f'{j}_lag{i}'
            # Assign the filled Series directly. Calling
            # ``df[name].fillna(0, inplace=True)`` is chained assignment on a
            # temporary object: pandas deprecates it and it leaves ``df``
            # unchanged when Copy-on-Write is active.
            df[name] = df[j].rolling(i).mean().fillna(0)
            new_factor_list.append(name)
    return df, new_factor_list

In [55]:
# import pandas as pd
# from sklearn.linear_model import LinearRegression

# # 提取因子和目标变量
# factor_columns = ['Base_factor','BAV_diff_transform', 'Volume_Order_Imbalance','pending_vol_ratio_factor', 'relative_vol_ratio_imbalance']

# table,lag_factor = create_lagged_features(table,window = [10,26,40,60,120], factor= factor_columns)
# factor_columns = lag_factor

# table[factor_columns] = std_factor(table[factor_columns])
# table[factor_columns] = table[factor_columns].fillna(0)


# X = table[factor_columns]
# y = table['frt_120']


# # 线性回归模型（不含截距项）
# model = LinearRegression(fit_intercept=False)
# model.fit(X, y)

# # 输出回归系数
# coefficients = pd.DataFrame(model.coef_, index=factor_columns, columns=['Coefficient']).to_numpy().flatten().tolist()
# print(coefficients)

# table['factor'] = np.dot(table[factor_columns],coefficients)

[-0.6396554882852781, 22.253634119619523, 0.02557680223038756, 6.180723252721563, 0.2406336044104225, 1.401066194290231, 2.6773355244914554, -0.05374589761312223, 24.810935301753553, 0.12425760428760163, 2.8581712608490206, -0.4689789691669593, 0.046064693187394735, 3.8579606711137724, -0.2121088739063698, -0.6903135939349665, -14.830210615958467, 0.20106455713946905, -31.95226026811632, 0.41806671309642696, -2.59446086719592, 46.032620512199536, -0.05206817499860075, -125.74806745331595, -0.09974847669035959]


In [56]:
# import numpy as np
# import pandas as pd
# import statsmodels.api as sm
# import matplotlib.pyplot as plt

# from scipy.optimize import minimize

# # 提取因子和目标变量
# factor_columns = ['Base_factor','BAV_diff_transform', 'Volume_Order_Imbalance',
#                   'pending_vol_ratio_factor', 'submit_price_imbalance', 'relative_vol_ratio_imbalance']

# table[factor_columns] = std_factor(table[factor_columns])

# X = table[factor_columns].to_numpy()
# y = table['frt_120'].to_numpy()

# # Cauchy负对数似然函数
# def cauchy_loss(params, X, y):
#     y_pred = np.dot(X, params)
#     residuals = y - y_pred
#     return np.sum(np.log(1 + (residuals ** 2)))

# # 初始参数（全为0）
# initial_params = np.zeros(X.shape[1])

# # 极大似然估计
# result = minimize(cauchy_loss, initial_params, args=(X, y))

# # 回归系数
# coefficients = pd.DataFrame(result.x, index=factor_columns, columns=['Coefficient'])
# print(coefficients)

# # 计算因子值
# table['factor'] = np.dot(X, result.x)



In [57]:
# table['factor'].quantile([0.01,0.99])

In [58]:
# table['std_factor'] = (table['BidVolume'].diff() - table['AskVolume'].diff())/table['current_volume']
# table['factor'] = (table['std_factor'] - table['std_factor'].rolling(10).mean())/table['std_factor'].rolling(10).std()
# table['factor'].replace([np.inf,-np.inf],0,inplace=True)
# table['factor'].fillna(0,inplace=True)

In [59]:
# table['factor'] = table['BidPrice1'] + table['AskPrice1'] - 2*table['current_avg_price']
# table['factor'].fillna(0,inplace=True)

In [60]:
# # window_size = 15
# table['max_vol_ratio_imbalance'] = -table[['BidVolume1', 'BidVolume2', 'BidVolume3', 'BidVolume4', 'BidVolume5']].max(axis=1)/table['BidVolume1'] + table[['AskVolume1', 'AskVolume2', 'AskVolume3', 'AskVolume4', 'AskVolume5']].max(axis=1)/table['AskVolume1']
# # table['factor'] = (table['std_factor'] - table['std_factor'].rolling(window_size,min_periods=1).mean())/table['std_factor'].rolling(window_size,min_periods=1).std()
# table['max_vol_ratio_imbalance'].fillna(0,inplace=True)

In [61]:
# table['Base_factor'] = (table['BidVolume1'] - table['AskVolume1'])/(table['BidVolume1'] + table['AskVolume1'])

In [62]:
# from hyperopt import fmin, tpe, hp, Trials
# from hyperopt import anneal
# import numpy as np


# # 超参数调整强度
# LAMBDA = 0.06  # 可调整，越大代表越强调降低相关性
# # 目标函数
# def objective(weights):
#     table['Bid_entropy'] = 0
#     table['Ask_entropy'] = 0
    
#     for i in range(1, 6):
#         weight = weights[f'weight_{i}']
#         p = table[f'BidVolume{i}'] / table['BidVolume']
#         q = table[f'AskVolume{i}'] / table['AskVolume']
        
#         table['Bid_entropy'] -= weight * p * np.log2(p)
#         table['Ask_entropy'] -= weight * q * np.log2(q)

#     # 计算盘口买卖量熵差
#     table['vol_entropy_diff'] = table['Bid_entropy'] - table['Ask_entropy']
#     table['vol_entropy_diff'].fillna(0, inplace=True)
#     # 计算IC（信息系数）
#     ic = table['vol_entropy_diff'].corr(table['frt_120'])

#     # 计算新因子和Base_factor的相关性
#     corr_with_base = table['vol_entropy_diff'].corr(table['Base_factor'])
#     corr = corr_with_base * (abs(corr_with_base)>0.55) + 0.55 * (abs(corr_with_base)<=0.55)
#         # 计算新因子和Base_factor的相关性
#     # corr_with_base = table['vol_entropy_diff'].corr(table['BAV_Diff_Transform'])
#     # corr = corr_with_base * (abs(corr_with_base)>0.55) + 0.55 * (abs(corr_with_base)<=0.55)
    
#     # 目标函数 = 最大化IC，同时惩罚与Base_factor的相关性
#     loss = -ic + LAMBDA * abs(corr)
    
#     return loss

# # 定义超参数搜索空间
# space = {f'weight_{i}': hp.uniform(f'weight_{i}', 0, 1) for i in range(1, 6)}

# # 超参数优化
# trials = Trials()
# best = fmin(fn=objective,
#             space=space,
#             algo=tpe.suggest,
#             max_evals=200,
#             trials=trials)

# print("最优权重:", best)

# table['Bid_entropy'] = 0
# table['Ask_entropy'] = 0
# for i in range(1,6):
#     weight = best[f'weight_{i}']
#     p = table[f'BidVolume{i}']/table['BidVolume']
#     table['Bid_entropy'] -= weight * p * np.log2(p)
#     q = table[f'AskVolume{i}']/table['AskVolume']
#     table['Ask_entropy'] -= weight * q * np.log2(q)

# # table['factor'] = (table['Bid_entropy'].diff() - table['Bid_entropy'].diff().rolling(window_size).mean()) - (table['Ask_entropy'].diff() - table['Ask_entropy'].diff().rolling(window_size).mean())
# # table['factor'] = table['Bid_entropy'].diff()/table['Bid_entropy'].shift(1) - table['Ask_entropy'].diff()/table['Ask_entropy'].shift(1)
# table['factor'] = table['Bid_entropy']- table['Ask_entropy']
# table['factor'].fillna(0,inplace=True)

In [63]:
# table['Bid_entropy'] = 0
# table['Ask_entropy'] = 0
# for i in range(1,6):
#     p = table[f'BidVolume{i}']/table['BidVolume']
#     table['Bid_entropy'] -= p * np.log2(p)
#     q = table[f'AskVolume{i}']/table['AskVolume']
#     table['Ask_entropy'] -= q * np.log2(q)

# # table['factor'] = (table['Bid_entropy'].diff() - table['Bid_entropy'].diff().rolling(window_size).mean()) - (table['Ask_entropy'].diff() - table['Ask_entropy'].diff().rolling(window_size).mean())
# # table['factor'] = table['Bid_entropy'].diff()/table['Bid_entropy'].shift(1) - table['Ask_entropy'].diff()/table['Ask_entropy'].shift(1)
# table['factor'] = table['Bid_entropy'].diff() - table['Ask_entropy'].diff()
# table['factor'].fillna(0,inplace=True)

In [64]:
# table['factor'] = table['BidPrice1']*table[['BidVolume1', 'BidVolume2', 'BidVolume3', 'BidVolume4', 'BidVolume5']].max(axis=1) + \
#     table['AskPrice1']*table[['AskVolume1', 'AskVolume2', 'AskVolume3', 'AskVolume4', 'AskVolume5']].max(axis=1) -\
#           2 * table['mid_price'] * (table['BidVolume1'] + table['AskVolume1'])
# table['factor'].fillna(0,inplace=True)


In [65]:
# table['factor'] = (2*table['weighted_price'] - table['BidPrice1'] - table['AskPrice1'])
# table['factor'].fillna(0,inplace=True)

In [66]:
# window_size = 5
# table['factor'] = table['current_volume'].diff().rolling(window_size).corr(table['mid_price'].diff())
# table['factor'].replace([np.inf, -np.inf], np.nan, inplace=True)
# table['factor'].fillna(0,inplace=True)


# 处理数据
由于日盘拿不到夜盘的数据，夜盘也拿不到日盘的数据，所以我们要将每一日数据中日盘和夜盘的前prev_period个数据和后back_period个数据删掉

In [None]:
# Full list of trading dates, taken from the unfiltered table; it is passed to
# process_day as its `trading_dates` argument.
unique_trading_dates = sorted(table['trading_date'].unique())
# Drop the rows whose 'trading_date' is 2024-04-08 or 2024-05-20.
new_table = table[
    ~table['trading_date'].isin(
        [pd.to_datetime('2024-04-08'), pd.to_datetime('2024-05-20')]
    )
]
# Run process_day on each trading day: keep every leading record
# (prev_period=0) and trim the last 120 records (back_period=120).
new_table = new_table.groupby('trading_date').apply(
    process_day,
    prev_period=0,
    back_period=120,
    trading_dates=unique_trading_dates,
)


In [68]:
# new_table = table[~table['trading_date'].isin([pd.to_datetime('2023-12-08')])]
# # new_table = table[~table['trading_date'].isin([pd.to_datetime('2024-04-08'),pd.to_datetime('2024-05-20')])]
# unique_trading_dates = sorted(table['trading_date'].unique())
# new_table = new_table.groupby('trading_date').apply(process_day, prev_period=0, back_period=120,trading_dates=unique_trading_dates)


# 单因子IC的计算

In [None]:
# Single-factor IC: Pearson correlation between the 'factor' column and the
# 'frt_120' column of the processed table.
ic_value1 = calculate_ic(
    factors=new_table['factor'],
    returns=new_table['frt_120'],
)
print(f'单因子的 IC 值：{ic_value1}')

In [70]:
# import math
# def compute_entropy(series):
#     # 统计每个值的出现次数
#     counts = series.value_counts()
#     total = len(series)
    
#     entropy = 0.0
#     for count in counts:
#         p = count / total
#         if p > 0:
#             entropy -= p * math.log2(p)
#     return entropy

# entropy_category_list = []
# IC_list = []
# std_list = []
# for i in time_list:
#     ic_value = calculate_ic(new_table[f'frt_{i}'], new_table['factor'])
#     IC_list.append(ic_value)
#     print(f"单因子与frt_{i} 列的IC为: {ic_value:.3f} ")
#     entropy_category = compute_entropy(table[f'frt_{i}'])
#     entropy_category_list.append(entropy_category)
#     print(f"frt_{i} 列的熵为: {entropy_category:.3f} bits")
#     std = table[f'frt_{i}'].std()
#     std_list.append(std)
#     print(f"frt_{i} 列的标准差为: {std:.3f} ")
#     print("--------------------------------------")

In [71]:
# IC_diffs = [-(IC_list[i] - IC_list[i-1])/IC_list[i-1] for i in range(1, len(IC_list))]
# entropy_category_diffs = [(entropy_category_list[i] - entropy_category_list[i-1])/entropy_category_list[i-1] for i in range(1, len(entropy_category_list))]
# std_diffs = [(std_list[i] - std_list[i-1])/std_list[i-1] for i in range(1, len(std_list))]
# ic_ent_diffs = [IC_diffs[i] - entropy_category_diffs[i] for i in range(len(IC_diffs))]
# ic_std_diffs = [IC_diffs[i] - std_list[i] for i in range(len(IC_diffs))]

In [72]:
# import csv
# time_list = []
# for i in range(1,11):
#     time_list.append(f"{i*20}-{i*20+20}")
# with open("change_rate.csv",'w',newline='') as csvfile:
#     writer = csv.writer(csvfile)
#     writer.writerow(['time','ic_change_rate', 'ent_change_rate', 'std_change_rate'])
#     for row in zip(time_list,IC_diffs,entropy_category_diffs,std_diffs):
#         writer.writerow(row)
# print("数据已写入 change_rate.csv 文件")

In [73]:
# ic_ent_diffs

In [74]:
# ic_ent_did = [abs(ic_ent_diffs[i] - ic_ent_diffs[i-1]) for i in range(1, len(ic_ent_diffs))]
# ic_ent_did

In [75]:
# max_value = max(ic_ent_diffs)
# max_index = ic_ent_diffs.index(max_value)
# print("最大值:", max_value)
# print("最大值的下标:", max_index)
# print(f'ic与ent变化率差值最大的time为：{(max_index+1)*20}-->{(max_index+2)*20}')

# 单因子分段画图看趋势

In [None]:
# Sort the rows by factor value and split them into (up to) 100 equal-count
# quantile bins; duplicate bin edges are dropped, so fewer bins may result.
new_table_sorted = new_table.sort_values(by='factor', ascending=True)
# Bin the sorted frame's own column. Binning `new_table` and assigning the
# result here would rely on index alignment between the two frames, which
# raises when the index contains duplicate labels.
new_table_sorted['group'] = pd.qcut(new_table_sorted['factor'], 100, labels=False, duplicates='drop')

# Mean of 'factor' and of 'frt_120' within each bin.
grouped = new_table_sorted.groupby('group').agg(
    factor_mean=('factor', 'mean'),
    frt_120_mean=('frt_120', 'mean')
).reset_index()

sns.set(style="whitegrid")

fig, ax = plt.subplots(1, 1, figsize=(10, 15))
sns.scatterplot(data=grouped, x='factor_mean', y='frt_120_mean', ax=ax, color='blue')

ax.set_title('Factor vs Frt_120')
# Overlay a linear regression fit of the per-bin means (line only, no points).
sns.regplot(x='factor_mean', y='frt_120_mean', data=grouped, ax=ax, scatter=False, color='blue')

In [None]:
# Display summary statistics for the numeric columns of `grouped`.
grouped.describe()

In [78]:
# new_table[new_table['frt_120'] > 70]['factor'].describe()

In [79]:
# new_table['factor'].quantile([0.01, 0.99])

In [80]:
# # 将因子分段排序
# new_table_sorted = new_table.sort_values(by='factor', ascending=True)
# new_table_sorted['group'] = pd.qcut(new_table['factor'], 100, labels=False,duplicates='drop')

# # 获取实际的最大组和最小组编号
# max_group = new_table_sorted['group'].max()
# min_group = new_table_sorted['group'].min()

# # 提取最大组和最小组的数据
# top_group = new_table_sorted[new_table_sorted['group'] == max_group]
# bottom_group = new_table_sorted[new_table_sorted['group'] == min_group]

# top_std = top_group['frt_120'].std()
# bottom_std = bottom_group['frt_120'].std()

# # 去掉超过若干个标准差的异常值
# def remove_outliers(df, column):
#     mean = df[column].mean()
#     std = df[column].std()
#     return df[(df[column] >= mean - 1 * std) & (df[column] <= mean + 1 * std)]

# top_group = remove_outliers(top_group, 'frt_120')
# bottom_group = remove_outliers(bottom_group, 'frt_120')

# # 计算标准差
# top_group_std = top_group['frt_120'].std()
# bottom_group_std = bottom_group['frt_120'].std()

# # 可视化：箱型图
# sns.set(style="whitegrid")
# fig, ax = plt.subplots(1, 2, figsize=(15, 8))

# sns.boxplot(y=top_group['frt_120'], ax=ax[0], color='skyblue')
# ax[0].set_title(f'Max Group (original_std: {top_std:.2f},std: {top_group_std:.2f})')

# sns.boxplot(y=bottom_group['frt_120'], ax=ax[1], color='salmon')
# ax[1].set_title(f'Min Group (original_std: {bottom_std:.2f},std: {bottom_group_std:.2f})')

# plt.tight_layout()
# plt.show()

In [81]:
# top_group['frt_120'].describe()

In [82]:
# # 将因子分段排序
# new_table_sorted = new_table.sort_values(by='vol_entropy_diff', ascending=True)
# new_table_sorted['group'] = pd.qcut(new_table['vol_entropy_diff'], 100, labels=False,duplicates='drop')

# # 获取实际的最大组和最小组编号
# max_group = new_table_sorted['group'].max()
# min_group = new_table_sorted['group'].min()

# # 提取最大组和最小组的数据
# top_group = new_table_sorted[new_table_sorted['group'] == max_group]
# bottom_group = new_table_sorted[new_table_sorted['group'] == min_group]

# top_std = top_group['frt_120'].std()
# bottom_std = bottom_group['frt_120'].std()

# # 去掉超过若干个标准差的异常值
# def remove_outliers(df, column):
#     mean = df[column].mean()
#     std = df[column].std()
#     return df[(df[column] >= mean - 2 * std) & (df[column] <= mean + 2 * std)]

# top_group = remove_outliers(top_group, 'frt_120')
# bottom_group = remove_outliers(bottom_group, 'frt_120')

# # 计算标准差
# top_group_std = top_group['frt_120'].std()
# bottom_group_std = bottom_group['frt_120'].std()

# # 可视化：箱型图
# sns.set(style="whitegrid")
# fig, ax = plt.subplots(1, 2, figsize=(15, 8))

# sns.boxplot(y=top_group['frt_120'], ax=ax[0], color='skyblue')
# ax[0].set_title(f'Max Group (original_std: {top_std:.2f},std: {top_group_std:.2f})')

# sns.boxplot(y=bottom_group['frt_120'], ax=ax[1], color='salmon')
# ax[1].set_title(f'Min Group (original_std: {bottom_std:.2f},std: {bottom_group_std:.2f})')

# plt.tight_layout()
# plt.show()

In [83]:
# # 将因子分段排序
# new_table_sorted = new_table.sort_values(by='volume_order_imbalance', ascending=True)
# new_table_sorted['group'] = pd.qcut(new_table['volume_order_imbalance'], 100, labels=False,duplicates='drop')

# # 获取实际的最大组和最小组编号
# max_group = new_table_sorted['group'].max()
# min_group = new_table_sorted['group'].min()

# # 提取最大组和最小组的数据
# top_group = new_table_sorted[new_table_sorted['group'] == max_group]
# bottom_group = new_table_sorted[new_table_sorted['group'] == min_group]

# top_std = top_group['frt_120'].std()
# bottom_std = bottom_group['frt_120'].std()

# # 去掉超过若干个标准差的异常值
# def remove_outliers(df, column):
#     mean = df[column].mean()
#     std = df[column].std()
#     return df[(df[column] >= mean - 1 * std) & (df[column] <= mean + 1 * std)]

# top_group = remove_outliers(top_group, 'frt_120')
# bottom_group = remove_outliers(bottom_group, 'frt_120')

# # 计算标准差
# top_group_std = top_group['frt_120'].std()
# bottom_group_std = bottom_group['frt_120'].std()

# # 可视化：箱型图
# sns.set(style="whitegrid")
# fig, ax = plt.subplots(1, 2, figsize=(15, 8))

# sns.boxplot(y=top_group['frt_120'], ax=ax[0], color='skyblue')
# ax[0].set_title(f'Max Group (original_std: {top_std:.2f},std: {top_group_std:.2f})')

# sns.boxplot(y=bottom_group['frt_120'], ax=ax[1], color='salmon')
# ax[1].set_title(f'Min Group (original_std: {bottom_std:.2f},std: {bottom_group_std:.2f})')

# plt.tight_layout()
# plt.show()

In [84]:
# # 将因子分段排序
# new_table_sorted = new_table.sort_values(by='pending_vol_ratio_factor', ascending=True)
# new_table_sorted['group'] = pd.qcut(new_table['pending_vol_ratio_factor'], 100, labels=False,duplicates='drop')

# # 获取实际的最大组和最小组编号
# max_group = new_table_sorted['group'].max()
# min_group = new_table_sorted['group'].min()

# # 提取最大组和最小组的数据
# top_group = new_table_sorted[new_table_sorted['group'] == max_group]
# bottom_group = new_table_sorted[new_table_sorted['group'] == min_group]

# top_std = top_group['frt_120'].std()
# bottom_std = bottom_group['frt_120'].std()

# # 去掉超过若干个标准差的异常值
# def remove_outliers(df, column):
#     mean = df[column].mean()
#     std = df[column].std()
#     return df[(df[column] >= mean - 1 * std) & (df[column] <= mean + 1 * std)]

# top_group = remove_outliers(top_group, 'frt_120')
# bottom_group = remove_outliers(bottom_group, 'frt_120')

# # 计算标准差
# top_group_std = top_group['frt_120'].std()
# bottom_group_std = bottom_group['frt_120'].std()

# # 可视化：箱型图
# sns.set(style="whitegrid")
# fig, ax = plt.subplots(1, 2, figsize=(15, 8))

# sns.boxplot(y=top_group['frt_120'], ax=ax[0], color='skyblue')
# ax[0].set_title(f'Max Group (original_std: {top_std:.2f},std: {top_group_std:.2f})')

# sns.boxplot(y=bottom_group['frt_120'], ax=ax[1], color='salmon')
# ax[1].set_title(f'Min Group (original_std: {bottom_std:.2f},std: {bottom_group_std:.2f})')

# plt.tight_layout()
# plt.show()

In [85]:
# # 将因子分段排序
# new_table_sorted = new_table.sort_values(by='relative_vol_ratio_imbalance', ascending=True)
# new_table_sorted['group'] = pd.qcut(new_table['relative_vol_ratio_imbalance'], 100, labels=False,duplicates='drop')

# # 获取实际的最大组和最小组编号
# max_group = new_table_sorted['group'].max()
# min_group = new_table_sorted['group'].min()

# # 提取最大组和最小组的数据
# top_group = new_table_sorted[new_table_sorted['group'] == max_group]
# bottom_group = new_table_sorted[new_table_sorted['group'] == min_group]

# top_std = top_group['frt_120'].std()
# bottom_std = bottom_group['frt_120'].std()

# # 去掉超过若干个标准差的异常值
# def remove_outliers(df, column):
#     mean = df[column].mean()
#     std = df[column].std()
#     return df[(df[column] >= mean - 1 * std) & (df[column] <= mean + 1 * std)]

# top_group = remove_outliers(top_group, 'frt_120')
# bottom_group = remove_outliers(bottom_group, 'frt_120')

# # 计算标准差
# top_group_std = top_group['frt_120'].std()
# bottom_group_std = bottom_group['frt_120'].std()

# # 可视化：箱型图
# sns.set(style="whitegrid")
# fig, ax = plt.subplots(1, 2, figsize=(15, 8))

# sns.boxplot(y=top_group['frt_120'], ax=ax[0], color='skyblue')
# ax[0].set_title(f'Max Group (original_std: {top_std:.2f},std: {top_group_std:.2f})')

# sns.boxplot(y=bottom_group['frt_120'], ax=ax[1], color='salmon')
# ax[1].set_title(f'Min Group (original_std: {bottom_std:.2f},std: {bottom_group_std:.2f})')

# plt.tight_layout()
# plt.show()

In [86]:
# # 将因子分段排序
# new_table_sorted = new_table.sort_values(by='Slope_factor', ascending=True)
# new_table_sorted['group'] = pd.qcut(new_table['Slope_factor'], 100, labels=False,duplicates='drop')

# # 获取实际的最大组和最小组编号
# max_group = new_table_sorted['group'].max()
# min_group = new_table_sorted['group'].min()

# # 提取最大组和最小组的数据
# top_group = new_table_sorted[new_table_sorted['group'] == max_group]
# bottom_group = new_table_sorted[new_table_sorted['group'] == min_group]

# top_std = top_group['frt_120'].std()
# bottom_std = bottom_group['frt_120'].std()

# # 去掉超过若干个标准差的异常值
# def remove_outliers(df, column):
#     mean = df[column].mean()
#     std = df[column].std()
#     return df[(df[column] >= mean - 1 * std) & (df[column] <= mean + 1 * std)]

# top_group = remove_outliers(top_group, 'frt_120')
# bottom_group = remove_outliers(bottom_group, 'frt_120')

# # 计算标准差
# top_group_std = top_group['frt_120'].std()
# bottom_group_std = bottom_group['frt_120'].std()

# # 可视化：箱型图
# sns.set(style="whitegrid")
# fig, ax = plt.subplots(1, 2, figsize=(15, 8))

# sns.boxplot(y=top_group['frt_120'], ax=ax[0], color='skyblue')
# ax[0].set_title(f'Max Group (original_std: {top_std:.2f},std: {top_group_std:.2f})')

# sns.boxplot(y=bottom_group['frt_120'], ax=ax[1], color='salmon')
# ax[1].set_title(f'Min Group (original_std: {bottom_std:.2f},std: {bottom_group_std:.2f})')

# plt.tight_layout()
# plt.show()

In [87]:
# import numpy as np
# import seaborn as sns
# import matplotlib.pyplot as plt
# import pandas as pd

# # 假设你的数据已经准备好了，并且 grouped 是你的数据框

# # 拟合三次多项式
# coefficients = np.polyfit(grouped['factor_mean'], grouped['return_mean'], 3)

# # 使用多项式系数创建一个多项式函数
# polynomial = np.poly1d(coefficients)

# # 创建拟合曲线的 x 轴数据（我们可以在 factor_mean 的范围内生成一些点）
# x_fit = np.linspace(grouped['factor_mean'].min(), grouped['factor_mean'].max(), 100)
# y_fit = polynomial(x_fit)

# # 打印三次多项式的表达式
# print("拟合的三次函数表达式：")
# print(f"f(x) = {coefficients[0]:.4f}x^3 + {coefficients[1]:.4f}x^2 + {coefficients[2]:.4f}x + {coefficients[3]:.4f}")

# # 绘图
# sns.set(style="whitegrid")
# fig, ax = plt.subplots(1, 1, figsize=(10, 15))

# # 绘制散点图
# sns.scatterplot(data=grouped, x='factor_mean', y='return_mean', ax=ax, color='blue')

# # 绘制三次拟合曲线
# ax.plot(x_fit, y_fit, color='red', label='Cubic Fit')

# ax.set_title('Factor vs Return with Cubic Fit')
# ax.legend()

# plt.show()


In [88]:
# grouped

In [89]:
# # X 和 y 是从 DataFrame 中提取的
# X = grouped['factor_mean'].values.reshape(-1, 1)  # 确保 X 是二维数组
# y = grouped['return_mean']  # y 是一维数组

# # 创建并训练线性回归模型
# model = LinearRegression()
# model.fit(X, y)

# # 获取回归参数
# slope = model.coef_[0]  # 回归系数
# intercept = model.intercept_  # 截距

# # 打印回归参数
# print(f"回归系数 (slope): {slope}")
# print(f"截距 (intercept): {intercept}")

# # 可视化回归结果
# plt.scatter(X, y, color='blue', label='数据点')
# plt.plot(X, model.predict(X), color='red', label=f'y = {slope:.2f} * X + {intercept:.2f}')
# plt.xlabel('Factor')
# plt.ylabel('Return')
# plt.title('线性回归')
# plt.legend()
# plt.show()

# 基准因子的IC值

In [90]:
# new_table['Base_factor'] = -(new_table['AskVolume1']-new_table['BidVolume1'])/(new_table['AskVolume1']+new_table['BidVolume1'])
# # new_table['Base_factor'].replace([np.inf, -np.inf], np.nan, inplace=True)
# # new_table['Base_factor'].fillna(0,inplace=True)
# ic_value = calculate_ic(new_table['Base_factor'],new_table['frt_120'])
# print(f'经典因子的 IC 值：{ic_value}')

In [91]:
# # 将因子分段排序
# new_table_sorted = new_table.sort_values(by='Base_factor', ascending=True)
# new_table_sorted['group'] = pd.qcut(new_table['Base_factor'], 100, labels=False,duplicates='drop')

# # 计算每组内 Factor1 和 return 的均值
# grouped = new_table_sorted.groupby('group').agg(
#     Base_factor_mean=('Base_factor', 'mean'),
#     return_mean=('return', 'mean')
# ).reset_index()


# sns.set(style="whitegrid")

# fig, ax = plt.subplots(1, 1, figsize=(10, 15))
# sns.scatterplot(data=grouped, x='Base_factor_mean', y='return_mean', ax=ax, color='blue')
# ax.set_title('Factor vs Return')
# # 添加回归线
# sns.regplot(x='Base_factor_mean', y='return_mean', data=grouped, ax=ax, scatter=False, color='blue')

In [92]:
# # 将因子分段排序
# new_table_sorted = new_table.sort_values(by='Base_factor', ascending=True)
# new_table_sorted['group'] = pd.qcut(new_table['Base_factor'], 100, labels=False,duplicates='drop')

# # 获取实际的最大组和最小组编号
# max_group = new_table_sorted['group'].max()
# min_group = new_table_sorted['group'].min()

# # 提取最大组和最小组的数据
# top_group = new_table_sorted[new_table_sorted['group'] == max_group]
# bottom_group = new_table_sorted[new_table_sorted['group'] == min_group]


# # # 去掉超过若干个标准差的异常值
# # def remove_outliers(df, column):
# #     mean = df[column].mean()
# #     std = df[column].std()
# #     return df[(df[column] >= mean -  2 * std) & (df[column] <= mean + 2 * std)]

# # top_group = remove_outliers(top_group, 'frt_120')
# # bottom_group = remove_outliers(bottom_group, 'frt_120')

# # 计算标准差
# top_group_std = top_group['frt_120'].std()
# bottom_group_std = bottom_group['frt_120'].std()

# # 可视化：箱型图
# sns.set(style="whitegrid")
# fig, ax = plt.subplots(1, 2, figsize=(15, 8))

# sns.boxplot(y=top_group['frt_120'], ax=ax[0], color='skyblue')
# ax[0].set_title(f'Max Group (std: {top_group_std:.2f})')

# sns.boxplot(y=bottom_group['frt_120'], ax=ax[1], color='salmon')
# ax[1].set_title(f'Min Group (std: {bottom_group_std:.2f})')

# plt.tight_layout()
# plt.show()

In [93]:
# new_table[['factor','Base_factor']].corr()