# 因子计算
1. 利用商品期货1min数据进行高频数据低频化，得到日频因子，预测t+1开盘到t+2开盘的收益;
2. 原始数据已经分品种经过换月处理，并且合约只做主力合约;
3. 因子的参考主要来源是券商金工研报;
4. 此Jupyter Notebook仅计算因子，计算好因子后统一进行因子测试。

In [2]:
import os
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import warnings

warnings.filterwarnings('ignore')

## 加载数据

In [3]:
# Load every per-file CSV of 1-minute bars found in the data folder and stack
# them into one long frame.
folder_path = '合约数据'
# os.listdir returns entries in arbitrary order; sort so that the row order
# (and therefore the integer index) of df_futures is reproducible across runs.
csv_files = sorted(f for f in os.listdir(folder_path) if f.endswith('.csv'))

dataframes = [pd.read_csv(os.path.join(folder_path, file)) for file in csv_files]

# ignore_index=True gives the combined frame a fresh 0..N-1 index.
df_futures = pd.concat(dataframes, ignore_index=True)
df_futures

Unnamed: 0,order_book_id,datetime,open,high,low,close,volume,open_interest,trading_date,future,session
0,V1405,2014-01-06 09:01:00,6375.0,6375.0,6370.0,6375.0,100.0,29968.0,2014-01-06,V,Day
1,V1405,2014-01-06 09:02:00,6380.0,6380.0,6375.0,6375.0,58.0,29982.0,2014-01-06,V,Day
2,V1405,2014-01-06 09:03:00,6375.0,6380.0,6375.0,6380.0,10.0,29982.0,2014-01-06,V,Day
3,V1405,2014-01-06 09:04:00,6380.0,6380.0,6375.0,6375.0,40.0,30012.0,2014-01-06,V,Day
4,V1405,2014-01-06 09:05:00,6375.0,6375.0,6375.0,6375.0,10.0,30016.0,2014-01-06,V,Day
...,...,...,...,...,...,...,...,...,...,...,...
30032740,M2305,2022-12-30 14:56:00,3931.0,3934.0,3930.0,3930.0,5741.0,1349591.0,2022-12-30,M,Day
30032741,M2305,2022-12-30 14:57:00,3930.0,3934.0,3930.0,3933.0,3478.0,1349378.0,2022-12-30,M,Day
30032742,M2305,2022-12-30 14:58:00,3932.0,3938.0,3932.0,3936.0,4894.0,1348904.0,2022-12-30,M,Day
30032743,M2305,2022-12-30 14:59:00,3937.0,3938.0,3932.0,3933.0,6123.0,1348085.0,2022-12-30,M,Day


## 量价相关性因子
参考研报: 【东吴证券】“技术分析拥抱选股因子”系列研究（五）：CPV因子移位版，价量自相关性中蕴藏的选股信息

In [4]:
def cal_corr_price_N_vol(df):
    """Correlation between each bar's close and the same bar's volume.

    Args:
        df: frame with ``close`` and ``volume`` columns (one row per bar).

    Returns:
        The correlation coefficient as computed by ``Series.corr``.
    """
    close, volume = df['close'], df['volume']
    return close.corr(volume)

def cal_corr_price_l1_N_vol(df):
    """Correlation between the previous bar's close and the current bar's volume.

    Args:
        df: frame with ``close`` and ``volume`` columns (one row per bar).

    Returns:
        The correlation coefficient of close(t-1) against volume(t).
    """
    # shift(1) moves each close down one row, so row t holds close(t-1).
    lagged_close = df['close'].shift(1)
    return lagged_close.corr(df['volume'])

def cal_corr_price_1_N_vol(df):
    """Correlation between the next bar's close and the current bar's volume.

    Args:
        df: frame with ``close`` and ``volume`` columns (one row per bar).

    Returns:
        The correlation coefficient of close(t+1) against volume(t).
    """
    # shift(-1) moves each close up one row, so row t holds close(t+1).
    lead_close = df['close'].shift(-1)
    return lead_close.corr(df['volume'])

def cal_corr_ret_N_vol(df):
    """Correlation between each bar's 1-bar return and the same bar's volume.

    The return series is kept as a local variable instead of being written
    back into ``df``: the frame is a group handed in by ``groupby.apply`` and
    mutating it inside the applied function is not supported by pandas.

    Args:
        df: frame with ``close`` and ``volume`` columns (one row per bar).

    Returns:
        The correlation coefficient of ret(t) against volume(t).
    """
    one_min_ret = df['close'].pct_change(1)
    return one_min_ret.corr(df['volume'])

def cal_corr_ret_l1_N_vol(df):
    """Correlation between the previous bar's return and the current bar's volume.

    The return series is kept as a local variable instead of being written
    back into ``df``: the frame is a group handed in by ``groupby.apply`` and
    mutating it inside the applied function is not supported by pandas.

    Args:
        df: frame with ``close`` and ``volume`` columns (one row per bar).

    Returns:
        The correlation coefficient of ret(t-1) against volume(t).
    """
    one_min_ret = df['close'].pct_change(1)
    # shift(1) aligns ret(t-1) with row t.
    return one_min_ret.shift(1).corr(df['volume'])

def cal_corr_ret_1_N_vol(df):
    """Correlation between the next bar's return and the current bar's volume.

    The return series is kept as a local variable instead of being written
    back into ``df``: the frame is a group handed in by ``groupby.apply`` and
    mutating it inside the applied function is not supported by pandas.

    Args:
        df: frame with ``close`` and ``volume`` columns (one row per bar).

    Returns:
        The correlation coefficient of ret(t+1) against volume(t).
    """
    one_min_ret = df['close'].pct_change(1)
    # shift(-1) aligns ret(t+1) with row t.
    return one_min_ret.shift(-1).corr(df['volume'])

In [5]:
# One value per (future, trading_date): same-bar close/volume correlation.
df_corr_price_N_vol = (
    df_futures
    .groupby(['future','trading_date'],as_index=False,group_keys=False)
    .apply(cal_corr_price_N_vol)
    .rename(columns={None:'corr_price_N_vol'})
)
# Reshape to a trading_date x future table and save it.
(
    df_corr_price_N_vol
    .pivot(index='trading_date',columns='future',values='corr_price_N_vol')
    .to_csv("Factor_csv/CPV_corr_price_N_vol.csv")
)

In [6]:
# One value per (future, trading_date): close(t-1) vs volume(t) correlation.
df_corr_price_l1_N_vol = (
    df_futures
    .groupby(['future','trading_date'],as_index=False,group_keys=False)
    .apply(cal_corr_price_l1_N_vol)
    .rename(columns={None:'corr_price_l1_N_vol'})
)
# Reshape to a trading_date x future table and save it.
(
    df_corr_price_l1_N_vol
    .pivot(index='trading_date',columns='future',values='corr_price_l1_N_vol')
    .to_csv("Factor_csv/CPV_corr_price_l1_N_vol.csv")
)

In [7]:
# One value per (future, trading_date): close(t+1) vs volume(t) correlation.
df_corr_price_1_N_vol = (
    df_futures
    .groupby(['future','trading_date'],as_index=False,group_keys=False)
    .apply(cal_corr_price_1_N_vol)
    .rename(columns={None:'corr_price_1_N_vol'})
)
# Reshape to a trading_date x future table and save it.
(
    df_corr_price_1_N_vol
    .pivot(index='trading_date',columns='future',values='corr_price_1_N_vol')
    .to_csv("Factor_csv/CPV_corr_price_1_N_vol.csv")
)

In [8]:
# One value per (future, trading_date): ret(t) vs volume(t) correlation.
df_corr_ret_N_vol = (
    df_futures
    .groupby(['future','trading_date'],as_index=False,group_keys=False)
    .apply(cal_corr_ret_N_vol)
    .rename(columns={None:'corr_ret_N_vol'})
)
# Reshape to a trading_date x future table and save it.
(
    df_corr_ret_N_vol
    .pivot(index='trading_date',columns='future',values='corr_ret_N_vol')
    .to_csv("Factor_csv/CPV_corr_ret_N_vol.csv")
)

In [9]:
# One value per (future, trading_date): ret(t-1) vs volume(t) correlation.
df_corr_ret_l1_N_vol = (
    df_futures
    .groupby(['future','trading_date'],as_index=False,group_keys=False)
    .apply(cal_corr_ret_l1_N_vol)
    .rename(columns={None:'corr_ret_l1_N_vol'})
)
# Reshape to a trading_date x future table and save it.
(
    df_corr_ret_l1_N_vol
    .pivot(index='trading_date',columns='future',values='corr_ret_l1_N_vol')
    .to_csv("Factor_csv/CPV_corr_ret_l1_N_vol.csv")
)

In [10]:
# One value per (future, trading_date): ret(t+1) vs volume(t) correlation.
df_corr_ret_1_N_vol = (
    df_futures
    .groupby(['future','trading_date'],as_index=False,group_keys=False)
    .apply(cal_corr_ret_1_N_vol)
    .rename(columns={None:'corr_ret_1_N_vol'})
)
# Reshape to a trading_date x future table and save it.
(
    df_corr_ret_1_N_vol
    .pivot(index='trading_date',columns='future',values='corr_ret_1_N_vol')
    .to_csv("Factor_csv/CPV_corr_ret_1_N_vol.csv")
)

## 成交量潮汐因子
参考研报：【方正证券】多因子选股系列研究之二：个股成交量的潮汐变化及“潮汐”因子构建

In [11]:
def cal_tide(df,lenth):
    """Compute the volume-"tide" price change rates for one run of bars.

    A centred rolling sum of ``volume`` over ``lenth`` bars ("neighbourhood
    volume") is computed. The bar with the largest neighbourhood volume is the
    peak; the bar with the smallest neighbourhood volume before the peak is
    the flood start (m) and the one after the peak is the ebb end (n). Each
    rate is the relative close-price change between two of these bars divided
    by the number of rows separating them.

    Args:
        df: frame with ``volume`` and ``close`` columns whose index is
            increasing in time order (rows are compared by index label).
            The frame is not modified.
        lenth: window size of the centred rolling volume sum.

    Returns:
        Tuple ``(whole_tide, flood_tide, ebb_tide)``:
        m -> n, m -> peak and peak -> n rates respectively. A leg of zero
        length (no bar before/after the peak) yields NaN. All three are NaN
        when no row survives the rolling window (fewer than ``lenth`` bars).
    """
    # Build a derived frame instead of writing a column into the caller's
    # data. dropna() removes the window edges (and any row with other NaNs).
    tide = df.assign(
        neighbor_volume=df['volume'].rolling(window=lenth, center=True).sum()
    ).dropna()

    # Too few bars for a single full window: idxmax would raise on an empty
    # frame, so report the factor as missing instead.
    if tide.empty:
        return np.nan, np.nan, np.nan

    def _rate(c_start, c_end, steps):
        # Relative price change per row; undefined for a zero-length leg.
        if steps == 0:
            return np.nan
        return (c_end - c_start) / c_start / steps

    peak_idx = tide['neighbor_volume'].idxmax()
    peak_time = tide.index.get_loc(peak_idx)
    C_peak = tide.loc[peak_idx, 'close']

    # Flood start: lowest neighbourhood volume strictly before the peak.
    flood_data = tide[tide.index < peak_idx]
    if not flood_data.empty:
        flood_idx = flood_data['neighbor_volume'].idxmin()
        Cm = tide.loc[flood_idx, 'close']
        m = tide.index.get_loc(flood_idx)
    else:
        # No bar before the peak: the flood leg collapses onto the peak.
        Cm = C_peak
        m = peak_time

    # Ebb end: lowest neighbourhood volume strictly after the peak.
    ebb_data = tide[tide.index > peak_idx]
    if not ebb_data.empty:
        ebb_idx = ebb_data['neighbor_volume'].idxmin()
        Cn = tide.loc[ebb_idx, 'close']
        n = tide.index.get_loc(ebb_idx)
    else:
        # No bar after the peak: the ebb leg collapses onto the peak.
        Cn = C_peak
        n = peak_time

    whole_tide = _rate(Cm, Cn, n - m)
    flood_tide = _rate(Cm, C_peak, peak_time - m)
    ebb_tide = _rate(C_peak, Cn, n - peak_time)

    return whole_tide, flood_tide, ebb_tide

In [12]:
def cal_factor_1(df,lenth):
    """Compute the daily tide factors for one (future, trading_date) group.

    When the group contains more than one session, the tide is computed
    separately on the ``'Night'`` and ``'Day'`` rows and the three rates are
    averaged. Otherwise the tide is computed on the whole group.

    Args:
        df: bars of one group with ``session``, ``volume`` and ``close``
            columns; sorted by index before use.
        lenth: rolling window size forwarded to ``cal_tide``.

    Returns:
        One-row DataFrame with columns ``night_whole_tide``,
        ``day_whole_tide``, ``whole_tide``, ``flood_tide``, ``ebb_tide``.
    """
    df = df.sort_index()
    sessions = df['session'].unique()

    if len(sessions) > 1:
        df_night = df[df['session']=='Night']
        df_day = df[df['session']=='Day']

        night_whole_tide, night_flood_tide, night_ebb_tide = cal_tide(df_night,lenth)
        day_whole_tide, day_flood_tide, day_ebb_tide = cal_tide(df_day,lenth)

        # Equal-weight average of the two sessions; NaN if either is NaN.
        whole_tide = (night_whole_tide + day_whole_tide)/2
        flood_tide = (night_flood_tide + day_flood_tide)/2
        ebb_tide = (night_ebb_tide + day_ebb_tide)/2
    else:
        whole_tide, flood_tide, ebb_tide = cal_tide(df,lenth)
        # Attribute the single-session result to the session it came from
        # rather than always reporting it under the day column.
        if len(sessions) == 1 and sessions[0] == 'Night':
            night_whole_tide, day_whole_tide = whole_tide, np.nan
        else:
            night_whole_tide, day_whole_tide = np.nan, whole_tide

    return pd.DataFrame([[night_whole_tide, day_whole_tide, whole_tide, flood_tide, ebb_tide]],
                        columns=['night_whole_tide', 'day_whole_tide', 'whole_tide', 'flood_tide', 'ebb_tide'])


In [13]:
from pandarallel import pandarallel
pandarallel.initialize(nb_workers=6,progress_bar=True)

# Tide factors per (future, trading_date) for rolling windows of 9, 7, 5 and 3
# bars. The inner index level produced by the one-row frames is dropped, then
# the group keys are turned back into columns.
df_daily_factor_1 = (
    df_futures.groupby(['future','trading_date'])
    .parallel_apply(cal_factor_1,9)
    .reset_index(level=2,drop=True)
    .reset_index()
)
df_daily_factor_2 = (
    df_futures.groupby(['future','trading_date'])
    .parallel_apply(cal_factor_1,7)
    .reset_index(level=2,drop=True)
    .reset_index()
)
df_daily_factor_3 = (
    df_futures.groupby(['future','trading_date'])
    .parallel_apply(cal_factor_1,5)
    .reset_index(level=2,drop=True)
    .reset_index()
)
df_daily_factor_4 = (
    df_futures.groupby(['future','trading_date'])
    .parallel_apply(cal_factor_1,3)
    .reset_index(level=2,drop=True)
    .reset_index()
)

INFO: Pandarallel will run on 6 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=14219), Label(value='0 / 14219')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=14219), Label(value='0 / 14219')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=14219), Label(value='0 / 14219')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=14219), Label(value='0 / 14219')))…

In [14]:
# Save the whole-tide factor of each window size as a trading_date x future table.
(
    df_daily_factor_1
    .pivot(index='trading_date',columns='future',values='whole_tide')
    .to_csv("Factor_csv/Whole_tide_9.csv")
)
(
    df_daily_factor_2
    .pivot(index='trading_date',columns='future',values='whole_tide')
    .to_csv("Factor_csv/Whole_tide_7.csv")
)
(
    df_daily_factor_3
    .pivot(index='trading_date',columns='future',values='whole_tide')
    .to_csv("Factor_csv/Whole_tide_5.csv")
)
(
    df_daily_factor_4
    .pivot(index='trading_date',columns='future',values='whole_tide')
    .to_csv("Factor_csv/Whole_tide_3.csv")
)

## 成交量激增时段的跳跃因子

In [15]:
def cal_factor_ret_res(df):
    """Compute return-residual statistics on high-volume bars for one group.

    For every bar, ``ret_res = 2 * (simple_ret - log_ret) - log_ret ** 2``
    where both returns are 1-bar returns of ``close``. The function then
    averages ``ret_res`` over all bars, over the top 10% / top 20% of bars by
    ``volume`` and over the remaining bottom 90%, and reports the averages
    and their differences.

    Args:
        df: bars of one group with ``close`` and ``volume`` columns; sorted by
            index before use. The input frame is not modified because
            ``sort_index`` returns a new frame.

    Returns:
        One-row DataFrame with columns ``mean_ret_res``,
        ``top10_mean_ret_res``, ``top20_mean_ret_res``,
        ``Diff_10_mean_ret_res``, ``Diff_10_20_ret_res``, ``Diff_top10_bot90``.
    """
    df = df.sort_index()

    df['1min_ret'] = df['close'].pct_change()
    df['1min_log_ret'] = np.log(df['close'] / df['close'].shift(1))

    # The first bar has no previous close; treat its returns as 0.
    # Note that this also zero-fills NaNs in every other column.
    df = df.fillna(0)

    df['diff_ret'] = df['1min_ret'] - df['1min_log_ret']
    df['ret_res'] = df['diff_ret']*2 - df['1min_log_ret']**2

    # Bucket sizes, rounded up so that even a short group has one "top" bar.
    n_rows = len(df)
    n_top10 = int(np.ceil(n_rows/10))
    n_top20 = int(np.ceil(n_rows/5))

    mean_ret_res = df['ret_res'].mean()
    top10_mean_ret_res = df.nlargest(n_top10,'volume')['ret_res'].mean()
    top20_mean_ret_res = df.nlargest(n_top20,'volume')['ret_res'].mean()
    # Complement of the top-10% bucket.
    bot90_mean_ret_res = df.nsmallest(n_rows - n_top10,'volume')['ret_res'].mean()

    Diff_10_mean_ret_res = top10_mean_ret_res - mean_ret_res
    Diff_10_20_ret_res = top10_mean_ret_res - top20_mean_ret_res
    Diff_top10_bot90 = top10_mean_ret_res - bot90_mean_ret_res

    return pd.DataFrame(
        [[mean_ret_res,top10_mean_ret_res,top20_mean_ret_res,Diff_10_mean_ret_res,Diff_10_20_ret_res,Diff_top10_bot90]],
        columns = ['mean_ret_res','top10_mean_ret_res','top20_mean_ret_res','Diff_10_mean_ret_res','Diff_10_20_ret_res','Diff_top10_bot90']
    )

In [16]:
# Return-residual statistics per (future, trading_date); drop the inner index
# level of the one-row frames, then turn the group keys back into columns.
df_daily_ret_res = (
    df_futures.groupby(['future','trading_date'])
    .parallel_apply(cal_factor_ret_res)
    .reset_index(level=2,drop=True)
    .reset_index()
)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=14219), Label(value='0 / 14219')))…

In [17]:
# Save each return-residual statistic as its own trading_date x future table.
(
    df_daily_ret_res
    .pivot(index='trading_date',columns='future',values='mean_ret_res')
    .to_csv("Factor_csv/Mean_ret_res.csv")
)
(
    df_daily_ret_res
    .pivot(index='trading_date',columns='future',values='top10_mean_ret_res')
    .to_csv("Factor_csv/Top10_mean_ret_res.csv")
)
(
    df_daily_ret_res
    .pivot(index='trading_date',columns='future',values='top20_mean_ret_res')
    .to_csv("Factor_csv/Top20_mean_ret_res.csv")
)
(
    df_daily_ret_res
    .pivot(index='trading_date',columns='future',values='Diff_10_mean_ret_res')
    .to_csv("Factor_csv/Diff_10_mean_ret_res.csv")
)
(
    df_daily_ret_res
    .pivot(index='trading_date',columns='future',values='Diff_10_20_ret_res')
    .to_csv("Factor_csv/Diff_10_20_ret_res.csv")
)
(
    df_daily_ret_res
    .pivot(index='trading_date',columns='future',values='Diff_top10_bot90')
    .to_csv("Factor_csv/Diff_top10_bot90.csv")
)

## 极端收益因子


In [18]:
def cal_ext_ret(df):
    """Return the 1-bar return with the largest magnitude in the group.

    The return series is kept as a local variable instead of being written
    back into ``df``: the frame is a group handed in by ``groupby.apply`` and
    mutating it inside the applied function is not supported by pandas.

    Args:
        df: frame with a ``close`` column (one row per bar).

    Returns:
        The maximum return if its absolute value is strictly greater than
        that of the minimum return, otherwise the minimum return (so ties go
        to the negative side). The first bar's return counts as 0.
    """
    one_min_ret = df['close'].pct_change().fillna(0)
    max_1min_ret = one_min_ret.max()
    min_1min_ret = one_min_ret.min()
    return max_1min_ret if abs(max_1min_ret) > abs(min_1min_ret) else min_1min_ret

In [19]:
# One value per (future, trading_date): the largest-magnitude 1-bar return.
df_ext_ret = (
    df_futures
    .groupby(['future','trading_date'],as_index=False,group_keys=False)
    .apply(cal_ext_ret)
    .rename(columns={None:'ext_ret'})
)
# Reshape to a trading_date x future table and save it.
(
    df_ext_ret
    .pivot(index='trading_date',columns='future',values='ext_ret')
    .to_csv("Factor_csv/ext_ret.csv")
)