# 手工挖掘因子

## 1. 环境配置

In [1]:
import os
import pandas as pd
from tqdm import tqdm

## 2. 数据读取

In [2]:
# Load every base-factor CSV found in `data_dir` into `factors`, keyed by the
# integer suffix of the file name (e.g. 'factor27.csv' -> 27).
factors = {}  # factor index -> DataFrame indexed by the CSV's first column
data_dir = '/share1/home/daitianyang/Quant/factor_readable/base_factor'  # base-factor directory

for csv_name in tqdm(sorted(os.listdir(data_dir)), desc="基础因子"):
    if not csv_name.endswith('.csv'):
        continue  # ignore anything that is not a CSV file

    factor_name = csv_name[:-4]  # file name without the '.csv' extension, e.g. 'factor1'
    factor_frame = pd.read_csv(os.path.join(data_dir, csv_name))
    # The first CSV column is renamed to 'index' and becomes the row index.
    first_column = factor_frame.columns[0]
    factor_frame = factor_frame.rename(columns={first_column: 'index'}).set_index('index')
    # The characters after the 6-letter 'factor' prefix are the numeric factor index.
    factors[int(factor_name[6:])] = factor_frame

基础因子: 100%|██████████| 8/8 [00:00<00:00, 27.97it/s]


标准格式如下：

In [3]:
# Summarise what was loaded, using factor 27 as the reference table.
reference_factor = factors[27]
future_names = reference_factor.columns  # column labels = futures codes
factor_num, future_num = len(factors), len(future_names)  # number of factors / futures

for label, value in (('因子数目:', factor_num),
                     ('期货名称:', future_names.values),
                     ('期货数目:', future_num),
                     ('shape:', reference_factor.shape)):
    print(label, value)
reference_factor.head(5)

因子数目: 8
期货名称: ['A' 'AG' 'AL' 'AP' 'AU' 'B' 'BB' 'BC' 'BU' 'C' 'CF' 'CJ' 'CS' 'CU' 'CY'
 'EB' 'EG' 'ER' 'FB' 'FG' 'FU' 'HC' 'I' 'IC' 'IF' 'IH' 'IM' 'J' 'JD' 'JM'
 'JR' 'L' 'LH' 'LR' 'LU' 'M' 'MA' 'ME' 'NI' 'NR' 'OI' 'P' 'PB' 'PF' 'PG'
 'PK' 'PM' 'PP' 'RB' 'RI' 'RM' 'RO' 'RR' 'RS' 'RU' 'SA' 'SC' 'SF' 'SM'
 'SN' 'SP' 'SR' 'SS' 'T' 'TA' 'TC' 'TF' 'TS' 'UR' 'V' 'WH' 'WR' 'WS' 'WT'
 'Y' 'ZC' 'ZN']
期货数目: 77
shape: (2430, 77)


Unnamed: 0_level_0,A,AG,AL,AP,AU,B,BB,BC,BU,C,...,TS,UR,V,WH,WR,WS,WT,Y,ZC,ZN
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
20130104,4740.0,6226.0,15315.0,,336.839996,4674.0,,,,2446.0,...,,,6690.0,2688.0,3981.0,2572.0,,8768.0,,15540.0
20130107,4731.0,6392.0,15265.356593,,338.540009,4667.0,,,,2445.0,...,,,6705.0,2694.0,4046.0,,,8688.0,,15376.002772
20130108,4739.0,6371.0,15240.356848,,336.529999,4664.0,,,,2456.0,...,,,6685.0,2707.0,4020.0,,,8660.0,,15311.397718
20130109,4737.0,6429.0,15245.356965,,339.320007,4655.0,,,,2444.0,...,,,6700.0,2689.0,4020.0,,,8600.0,,15321.336957
20130110,4750.0,6425.0,15270.357551,,338.679993,4640.0,,,,2445.0,...,,,6705.0,2677.0,4025.0,,,8562.0,,15430.668586


因子预处理

In [4]:
# Preprocess every factor table: fill missing values, parse the yyyymmdd index
# into datetimes, and expose the dates as a regular 'date' column.
for factor_key, factor_value in factors.items():  # e.g. factor_key=27
    if 'date' in factor_value.columns:
        # Already preprocessed by an earlier run of this cell. Without this guard a
        # re-run would try to parse the positional 0..N-1 index as yyyymmdd dates.
        continue

    # NOTE(review): missing values are replaced with 0, so any indicator later
    # computed on these columns treats "no data" as a real value of 0 (a close
    # series like [2572, 0, 0, 0, 0] has a 5-day regression slope of -514.4).
    # Confirm that this is intended.
    data = factor_value.fillna(0.00)
    data.index = pd.to_datetime(data.index, format='%Y%m%d')  # yyyymmdd -> datetime
    data = data.reset_index().rename(columns={'index': 'date'})  # index becomes the 'date' column
    factors[factor_key] = data  # replacing the value of an existing key is safe while iterating

In [5]:
# Pivot the data from "one table per factor" to "one table per future":
# FACTORS maps a futures code to a frame with columns date, code, factor<N>...
FACTORS = {}

for future_name in future_names:  # e.g. 'A'
    per_future = factors[27][['date']].copy()  # start from the date column of factor 27
    per_future['code'] = future_name  # same futures code on every row

    # One column per factor, taken from this future's column in each factor table.
    for factor_key, factor_frame in factors.items():
        per_future[f'factor{factor_key}'] = factor_frame[future_name]

    FACTORS[future_name] = per_future

预处理后格式如下：

In [6]:
# Preview the per-future table for 'A'.
sample_frame = FACTORS['A']
print('shape:', sample_frame.shape)
sample_frame.head(5)

shape: (2430, 10)


Unnamed: 0,date,code,factor27,factor28,factor29,factor30,factor31,factor32,factor33,factor34
0,2013-01-04,A,4740.0,4720.0,4752.0,4682.0,4730.0,41649.0,113783.0,4730.21273
1,2013-01-07,A,4731.0,4721.0,4765.0,4718.0,4741.0,52936.0,111612.0,4741.153279
2,2013-01-08,A,4739.0,4753.0,4753.0,4706.0,4729.0,53321.0,111279.0,4729.124735
3,2013-01-09,A,4737.0,4739.0,4754.0,4734.0,4743.0,22009.0,109914.0,4743.957018
4,2013-01-10,A,4750.0,4740.0,4751.0,4735.0,4744.0,17663.0,107208.0,4744.473759


## 3. 准备数据

在这里基于基础因子额外计算 7 个技术指标因子，依次保存为 factor35–factor41：收盘价的 5 日线性回归斜率（slope）、14 日相对强弱指标（rsi）、7 日威廉指标（wr）、MACD 快线（dif）、MACD 慢线（dea）、MACD 柱（macd，MACD 参数为 12/26/9）和抛物线指标（sar）。

In [7]:
import talib

# Readable factor name -> 'factorN' column name used in the FACTORS frames.
factor_alias = {'close': 'factor27', 'open': 'factor28', 'high': 'factor29', 'low': 'factor30', 'settle': 'factor31',
               'volume': 'factor32', 'opi': 'factor33', 'vwap': 'factor34', 'slope': 'factor35', 'rsi': 'factor36',
               'wr': 'factor37', 'dif': 'factor38', 'dea': 'factor39', 'macd': 'factor40', 'sar': 'factor41'}

# Add the seven TA-Lib indicator columns to every per-future frame, in place.
for future_name in future_names:  # e.g. 'A'
    future_frame = FACTORS[future_name]

    # Price arrays shared by the indicators below.
    close_values = future_frame[factor_alias['close']].values
    high_values = future_frame[factor_alias['high']].values
    low_values = future_frame[factor_alias['low']].values

    # Linear-regression slope of the close over 5 rows.
    future_frame[factor_alias['slope']] = talib.LINEARREG_SLOPE(close_values, timeperiod=5)
    # Relative strength index over 14 rows.
    future_frame[factor_alias['rsi']] = talib.RSI(close_values, timeperiod=14)
    # Williams %R over 7 rows.
    future_frame[factor_alias['wr']] = talib.WILLR(high_values, low_values, close_values, timeperiod=7)
    # The three MACD outputs are stored, in order, as dif, dea and macd.
    dif_values, dea_values, macd_values = talib.MACD(
        close_values, fastperiod=12, slowperiod=26, signalperiod=9)
    future_frame[factor_alias['dif']] = dif_values
    future_frame[factor_alias['dea']] = dea_values
    future_frame[factor_alias['macd']] = macd_values
    # Parabolic SAR with TA-Lib's default parameters.
    future_frame[factor_alias['sar']] = talib.SAR(high_values, low_values)

In [8]:
# Preview the 'A' table again, now including the indicator columns.
sample_frame = FACTORS['A']
print('shape:', sample_frame.shape)
sample_frame.head(5)

shape: (2430, 17)


Unnamed: 0,date,code,factor27,factor28,factor29,factor30,factor31,factor32,factor33,factor34,factor35,factor36,factor37,factor38,factor39,factor40,factor41
0,2013-01-04,A,4740.0,4720.0,4752.0,4682.0,4730.0,41649.0,113783.0,4730.21273,,,,,,,
1,2013-01-07,A,4731.0,4721.0,4765.0,4718.0,4741.0,52936.0,111612.0,4741.153279,,,,,,,4682.0
2,2013-01-08,A,4739.0,4753.0,4753.0,4706.0,4729.0,53321.0,111279.0,4729.124735,,,,,,,4683.66
3,2013-01-09,A,4737.0,4739.0,4754.0,4734.0,4743.0,22009.0,109914.0,4743.957018,,,,,,,4685.2868
4,2013-01-10,A,4750.0,4740.0,4751.0,4735.0,4744.0,17663.0,107208.0,4744.473759,2.6,,,,,,4686.881064


## 4. 数据导出

In [9]:
# Read the 'index' column of an existing common-factor CSV; it is used as the
# date column of the exported files.
# NOTE(review): presumably these dates match the base-factor rows one-to-one,
# in the same order -- confirm.
data_dir = '/share1/home/daitianyang/Quant/factor_readable/com_factor'  # common-factor directory
data_path = os.path.join(data_dir, 'factor1.csv')
data_df = pd.read_csv(data_path)
date_sr = data_df.loc[:, 'index']  # date column

In [10]:
# Export the derived factors (factor35..factor41) in the wide layout of the
# input files: one CSV per factor, one row per date, one column per future.
data_dir = '/share1/home/daitianyang/Quant/factor_readable/extra_factor'  # extra-factor directory
begin_index = 35  # first derived factor
end_index = 41  # last derived factor (inclusive)

# DataFrame.to_csv does not create missing directories, so make sure it exists.
os.makedirs(data_dir, exist_ok=True)

for factor_index in range(begin_index, end_index + 1):
    data = pd.DataFrame({'index': date_sr})  # dates, e.g. 20130104

    # Columns are aligned on the row index shared by date_sr and the FACTORS frames.
    # NOTE(review): assumes date_sr has one entry per FACTORS row, in the same
    # order -- confirm.
    for future_name in future_names:  # e.g. 'A'
        data[future_name] = FACTORS[future_name][f'factor{factor_index}']

    data = data.set_index('index')  # dates become the row index
    factors[factor_index] = data  # kept in `factors` for later inspection

    data.to_csv(os.path.join(data_dir, f'factor{factor_index}.csv'))  # write the file

In [11]:
# Preview the exported slope factor (factor 35).
exported_frame = factors[35]
print('shape:', exported_frame.shape)
exported_frame.head(5)

shape: (2430, 77)


Unnamed: 0_level_0,A,AG,AL,AP,AU,B,BB,BC,BU,C,...,TS,UR,V,WH,WR,WS,WT,Y,ZC,ZN
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
20130104,,,,,,,,,,,...,,,,,,,,,,
20130107,,,,,,,,,,,...,,,,,,,,,,
20130108,,,,,,,,,,,...,,,,,,,,,,
20130109,,,,,,,,,,,...,,,,,,,,,,
20130110,2.6,43.5,-10.928453,-0.0,0.445999,-8.0,-0.0,-0.0,-0.0,-0.3,...,-0.0,-0.0,2.5,-2.7,6.2,-514.4,-0.0,-50.0,-0.0,-27.332864
