In [7]:
import pandas as pd
import numpy as np
import os
import pickle
import statsmodels.api as sm
from sklearn.metrics import mean_squared_error, mean_absolute_error
import matplotlib.pyplot as plt

### 因子构建

In [None]:
# Build the "group 1" factors for each contract and write the augmented table back to disk.
futures_type_list = ['rb2310', 'ag2308', 'fu2309', 'ni2306', 'sn2306', 'au2308']

# n is the short rolling window and n2 the long one (both counted in rows).
n, n2 = 60, 600
# Minimum number of observations the short price window must hold before a statistic is emitted.
min_obs = int(1 / 2 * (n + 1))

for futures_type in futures_type_list:
    df_name = f"data_all_{futures_type}"
    df = pd.read_csv(f'/disk1/imb/202305_all/{df_name}.csv')

    # Every factor is shifted by one row, so the value stored on a row is
    # computed from windows that end on the previous row.
    mid_price_window = df['mid_price'].rolling(n, min_periods=min_obs)
    df['mid_price_skew'] = mid_price_window.skew().shift(periods=1)
    df['mid_price_kurt'] = mid_price_window.kurt().shift(periods=1)
    df['mid_price_std'] = mid_price_window.std().shift(periods=1)

    # Ratio of the short-window to the long-window sum of log_volume;
    # gaps in the shifted ratio are forward-filled.
    log_volume = df['log_volume']
    short_volume_sum = log_volume.rolling(n).sum()
    long_volume_sum = log_volume.rolling(n2).sum()
    df['volume_pct'] = (short_volume_sum / long_volume_sum).shift(periods=1).ffill()

    # Difference of the two level-1 "diff" price columns, scaled by the mid price.
    # NOTE(review): presumably a proportional quoted spread -- confirm that the
    # diff_* columns are the intended inputs rather than the raw quote prices.
    level1_gap = df['diff_ask_price1'] - df['diff_bid_price1']
    df['prop_quoted_spread'] = (level1_gap / df['mid_price']).shift(periods=1)

    df_name_to_write = f"{df_name}_with_factor_group1"
    df.to_csv(f'/disk1/imb/202305_all/{df_name_to_write}.csv', index=False)

### 因子效果检测

- f8: `factor_cols = ['mid_price_skew', 'mid_price_kurt', 'mid_price_std', 'volume_pct', 'prop_quoted_spread', 'mid_price_mean', 'beta', 'illiquidity']`
- f7: `factor_cols = ['mid_price_skew', 'mid_price_kurt', 'mid_price_std', 'volume_pct', 'prop_quoted_spread', 'mid_price_mean', 'beta']`（与 f8 相比去掉了 `illiquidity`）

#### f8 绘图

In [None]:
# f8 plots: for every contract, regress `return` on the eight factors and plot
# the cumulative sum of (factor * return) and of (predicted return * return).
futures_type_list = ['rb2310', 'ag2308', 'fu2309', 'ni2306', 'sn2306', 'au2308']
factor_cols = ['mid_price_skew', 'mid_price_kurt', 'mid_price_std', 'volume_pct', 'prop_quoted_spread', 'mid_price_mean', 'beta', 'illiquidity']

for futures_type in futures_type_list:
    df_name = f"data_all_{futures_type}_with_factor_all"
    df = pd.read_csv(f'/disk1/imb/202305_all/{df_name}.csv')

    # Work on an explicit copy: assigning into a column slice of `df` raises
    # SettingWithCopyWarning and is ambiguous about which object gets modified.
    sub_df = df[['TimeStamp'] + factor_cols + ['return']].copy()

    # Missing-value handling: NaN / +-inf in illiquidity become 0, then every
    # row that still has a NaN in any selected column is dropped.
    sub_df['illiquidity'] = sub_df['illiquidity'].replace([np.nan, np.inf, -np.inf], 0)
    sub_df = sub_df.dropna()

    # Regressors and target.
    X = sub_df[factor_cols]
    y = sub_df['return']

    # Add the intercept column and fit ordinary least squares.
    X = sm.add_constant(X)
    model = sm.OLS(y, X, missing='drop').fit()

    # In-sample fitted values.
    y_pred = model.predict(X)

    # Running sums of x * y for each factor, plus the fitted value times y.
    factor_cols_multiply = [f"{factor_col}_dot_return" for factor_col in factor_cols] + ["pred_return_dot_return"]
    df_multiply = pd.DataFrame(index=X.index, columns=factor_cols_multiply)
    for factor_col in factor_cols:
        df_multiply[f"{factor_col}_dot_return"] = (X[factor_col] * y).cumsum()
    df_multiply["pred_return_dot_return"] = (y_pred * y).cumsum()

    # 3 x 3 grid: eight factor panels plus one panel for the prediction.
    fig, axs = plt.subplots(3, 3, figsize=(10, 10))
    fig_name = f'fig_xm_f8_{futures_type}'
    fig.suptitle(fig_name)

    # zip() pairs each axis with one series and cannot index past the last
    # column if the grid size and the number of series ever diverge.
    for ax, col_name in zip(axs.flatten(), df_multiply.columns):
        ax.plot(df_multiply.index, df_multiply[col_name])
        ax.set_title(col_name)
        ax.grid(True)

    # Tidy the layout, save, display, then release the figure so figures do
    # not pile up in memory across loop iterations.
    fig.tight_layout()
    fig.savefig(f'/disk1/imb/202305_figure/{fig_name}.png')
    plt.show()
    plt.close(fig)


#### f7 绘图

In [None]:
# f7 plots: for every contract, regress `return` on the seven factors (the f8
# set without illiquidity) and plot the cumulative sum of (factor * return)
# and of (predicted return * return).
futures_type_list = ['rb2310', 'ag2308', 'fu2309', 'ni2306', 'sn2306', 'au2308']
factor_cols = ['mid_price_skew', 'mid_price_kurt', 'mid_price_std', 'volume_pct', 'prop_quoted_spread', 'mid_price_mean', 'beta']

for futures_type in futures_type_list:
    df_name = f"data_all_{futures_type}_with_factor_all"
    df = pd.read_csv(f'/disk1/imb/202305_all/{df_name}.csv')

    # Missing-value handling: drop every row with a NaN in a selected column.
    # dropna() returns a new frame, which avoids the SettingWithCopyWarning
    # that an in-place dropna on a column slice of `df` raises.
    sub_df = df[['TimeStamp'] + factor_cols + ['return']].dropna()

    # Regressors and target.
    X = sub_df[factor_cols]
    y = sub_df['return']

    # Add the intercept column and fit ordinary least squares.
    X = sm.add_constant(X)
    model = sm.OLS(y, X, missing='drop').fit()

    # In-sample fitted values.
    y_pred = model.predict(X)

    # Running sums of x * y for each factor, plus the fitted value times y.
    factor_cols_multiply = [f"{factor_col}_dot_return" for factor_col in factor_cols] + ["pred_return_dot_return"]
    df_multiply = pd.DataFrame(index=X.index, columns=factor_cols_multiply)
    for factor_col in factor_cols:
        df_multiply[f"{factor_col}_dot_return"] = (X[factor_col] * y).cumsum()
    df_multiply["pred_return_dot_return"] = (y_pred * y).cumsum()

    # 2 x 4 grid: seven factor panels plus one panel for the prediction.
    fig, axs = plt.subplots(2, 4, figsize=(15, 8))
    fig_name = f'fig_xm_f7_{futures_type}'
    fig.suptitle(fig_name)

    # zip() pairs each axis with one series and cannot index past the last
    # column if the grid size and the number of series ever diverge.
    for ax, col_name in zip(axs.flatten(), df_multiply.columns):
        ax.plot(df_multiply.index, df_multiply[col_name])
        ax.set_title(col_name)
        ax.grid(True)

    # Tidy the layout, save, display, then release the figure so figures do
    # not pile up in memory across loop iterations.
    fig.tight_layout()
    fig.savefig(f'/disk1/imb/202305_figure/{fig_name}.png')
    plt.show()
    plt.close(fig)


#### f8: 统计量计算（R2、MSE、MAE 等）

In [None]:
# f8 statistics: fit the eight-factor OLS per contract and collect the
# in-sample R2, MSE and MAE into df_results (one row per contract).
futures_type_list = ['rb2310', 'ag2308', 'fu2309', 'ni2306', 'sn2306', 'au2308']
column_list = ['R2', 'MSE', 'MAE']
f8_factor_cols = ['mid_price_skew', 'mid_price_kurt', 'mid_price_std', 'volume_pct', 'prop_quoted_spread', 'mid_price_mean', 'beta', 'illiquidity']

df_results = pd.DataFrame(index=futures_type_list, columns=column_list)

for futures_type in futures_type_list:
    df_name = f"data_all_{futures_type}_with_factor_all"
    df = pd.read_csv(f'/disk1/imb/202305_all/{df_name}.csv')

    # Work on an explicit copy: assigning into a column slice of `df` raises
    # SettingWithCopyWarning and is ambiguous about which object gets modified.
    sub_df = df[['TimeStamp'] + f8_factor_cols + ['return']].copy()

    # Missing-value handling: NaN / +-inf in illiquidity become 0, then every
    # row that still has a NaN in any selected column is dropped.
    sub_df['illiquidity'] = sub_df['illiquidity'].replace([np.nan, np.inf, -np.inf], 0)
    sub_df = sub_df.dropna()

    # Regressors and target.
    X = sub_df[f8_factor_cols]
    y = sub_df['return']

    # Add the intercept column and fit ordinary least squares.
    X = sm.add_constant(X)
    model = sm.OLS(y, X, missing='drop').fit()

    # Goodness of fit reported by the model.
    r_squared = model.rsquared
    # In-sample fitted values and their errors against the observed returns.
    y_pred = model.predict(X)
    mse = mean_squared_error(y, y_pred)
    mae = mean_absolute_error(y, y_pred)

    df_results.loc[futures_type, 'R2'] = r_squared
    df_results.loc[futures_type, 'MSE'] = mse
    df_results.loc[futures_type, 'MAE'] = mae
print(df_results)

#### f7: 统计量计算（R2、MSE、MAE 等）

In [None]:
# f7 statistics: fit the seven-factor OLS (the f8 set without illiquidity) per
# contract and collect the in-sample R2, MSE and MAE into df_results.
futures_type_list = ['rb2310', 'ag2308', 'fu2309', 'ni2306', 'sn2306', 'au2308']
column_list = ['R2', 'MSE', 'MAE']
f7_factor_cols = ['mid_price_skew', 'mid_price_kurt', 'mid_price_std', 'volume_pct', 'prop_quoted_spread', 'mid_price_mean', 'beta']

df_results = pd.DataFrame(index=futures_type_list, columns=column_list)

for futures_type in futures_type_list:
    # Load the per-contract table that already contains every factor.
    df_name = f"data_all_{futures_type}_with_factor_all"
    df = pd.read_csv(f'/disk1/imb/202305_all/{df_name}.csv')

    # Missing-value handling: drop every row with a NaN in a selected column.
    # dropna() returns a new frame, which avoids the SettingWithCopyWarning
    # that an in-place dropna on a column slice of `df` raises.
    sub_df = df[['TimeStamp'] + f7_factor_cols + ['return']].dropna()

    # Regressors and target.
    X = sub_df[f7_factor_cols]
    y = sub_df['return']

    # Add the intercept column and fit ordinary least squares.
    X = sm.add_constant(X)
    model = sm.OLS(y, X, missing='drop').fit()

    # Goodness of fit reported by the model.
    r_squared = model.rsquared
    # In-sample fitted values and their errors against the observed returns.
    y_pred = model.predict(X)
    mse = mean_squared_error(y, y_pred)
    mae = mean_absolute_error(y, y_pred)

    df_results.loc[futures_type, 'R2'] = r_squared
    df_results.loc[futures_type, 'MSE'] = mse
    df_results.loc[futures_type, 'MAE'] = mae

print(df_results)