# 因子检测数据获取

In [80]:
import pandas as pd 
import numpy as np
from scipy import stats
from sklearn import linear_model
import statsmodels.api as sm
from rqdatac import *
from rqfactor import *
from rqfactor.notebook import *
from rqfactor.extension import *
init()
import rqdatac



# 券池过滤数据

In [95]:
# Research window
start_date = '20140101'   # example: 2020-01-01
end_date = '20221214'       # example: 2022-11-01
# Index whose constituents define the stock pool.
# Examples: all A-shares 000985.XSHG, CSI 1000 000852.XSHG, CNI 2000 399303.XSHE,
# CSI 500 000905.XSHG, CSI 300 000300
index_item = '399303.XSHE'

# Stock pool: one single-row frame per trading day holding that day's constituents.
trade_days = get_trading_dates(start_date, end_date, market='cn')   # trading days in the window
daily_components = []
for i in trade_days:
    index_day = pd.DataFrame(rqdatac.index_components(index_item, i),columns = [i]).T
    daily_components.append(index_day)
    print('\r 当前：{} / 总量：{}'.format(i,trade_days[-1]),end='')
# Concatenate once at the end: growing the frame with pd.concat inside the loop
# re-copies every earlier row on each iteration.
index = pd.concat(daily_components, axis = 0) if daily_components else pd.DataFrame()

# Dynamic stock pool: reshape to a date x stock boolean membership mask.
index_fix = index.unstack().reset_index().iloc[:,-2:]
index_fix.columns = ['date','stock']
index_fix = (
    index_fix
    .dropna()   # days with fewer constituents than the widest day leave empty slots
    .assign(date=lambda frame: pd.to_datetime(frame['date']), level=True)
    .set_index(['date','stock'])['level']
    .unstack(fill_value=False)   # non-members become False, keeping a bool dtype
)

 当前：2022-12-14 / 总量：2022-12-14

In [89]:
# Persist the CSI 1000 (000852) pool mask.
# index_fix reflects whichever index_item the pool-building cell last ran with,
# so only write under this name when the two agree; otherwise another index's
# constituents would be stored as index_filter_000852.pkl.
if index_item.startswith('000852'):
    index_fix.to_pickle('index_filter_000852.pkl')
else:
    print('skipped index_filter_000852.pkl: index_item is {}'.format(index_item))

In [91]:
# Persist the CSI 300 (000300) pool mask.
# index_fix reflects whichever index_item the pool-building cell last ran with,
# so only write under this name when the two agree; otherwise another index's
# constituents would be stored as index_filter_000300.pkl.
if index_item.startswith('000300'):
    index_fix.to_pickle('index_filter_000300.pkl')
else:
    print('skipped index_filter_000300.pkl: index_item is {}'.format(index_item))

In [93]:
# Persist the CSI 500 (000905) pool mask.
# index_fix reflects whichever index_item the pool-building cell last ran with,
# so only write under this name when the two agree; otherwise another index's
# constituents would be stored as index_filter_000905.pkl.
if index_item.startswith('000905'):
    index_fix.to_pickle('index_filter_000905.pkl')
else:
    print('skipped index_filter_000905.pkl: index_item is {}'.format(index_item))

In [96]:
# Persist the CNI 2000 (399303) pool mask.
# index_fix reflects whichever index_item the pool-building cell last ran with,
# so only write under this name when the two agree; otherwise another index's
# constituents would be stored as index_filter_399303.pkl.
if index_item.startswith('399303'):
    index_fix.to_pickle('index_filter_399303.pkl')
else:
    print('skipped index_filter_399303.pkl: index_item is {}'.format(index_item))

# 新股 涨停 ST 停牌筛选

In [65]:
def get_new_stock_filter(stock_list,date_list, newly_listed_threshold=21):
    """Build a date x stock mask marking stocks that still count as newly listed.

    For each stock the window end is
    ``rqdatac.get_next_trading_date(listed_date, n=newly_listed_threshold)``.
    Every row of ``date_list`` up to and including that date is flagged, which
    also covers rows before the listing date. The frame is then shifted up by
    one row, so the row for a date carries the flag of the following row; the
    last row, emptied by the shift, is forward-filled from the row above it.

    Args:
        stock_list: order_book_ids; they become the columns.
        date_list: dates that become the index. The label slice assumes they
            are sorted ascending — TODO confirm with the caller.
        newly_listed_threshold: passed as ``n`` to ``get_next_trading_date``.

    Returns:
        DataFrame of True/False flags, True meaning "newly listed".
    """
    # One listing date per stock, in stock_list order.
    listed_date_list = [rqdatac.instruments(stock).listed_date for stock in stock_list]
    # Last date on which each stock is still treated as newly listed.
    newly_listed_window = pd.Series(
        index=stock_list,
        data=[rqdatac.get_next_trading_date(listed_date, n=newly_listed_threshold) for listed_date in listed_date_list],
    )
    newly_listed_label = pd.DataFrame(index=date_list, columns=stock_list, data=0.0)

    # Stocks whose listing age is shorter than the window are marked 1, others stay 0.
    for stock, window_end in newly_listed_window.items():
        newly_listed_label.loc[:window_end, stock] = 1.0

    # Turn the 1/0 flags into True/False without mutating in place.
    newly_listed_label = newly_listed_label.replace(1,True).replace(0,False)
    # .ffill() replaces the deprecated fillna(method='ffill').
    newly_listed_label = newly_listed_label.shift(-1).ffill()
    print('剔除新股已构建')

    return newly_listed_label

def get_st_filter(stock_list,date_list):
    """Build a date x stock mask marking ST stocks (True = ST, False = not ST).

    The result of ``rqdatac.is_st_stock`` for the span ``date_list[0]`` to
    ``date_list[-1]`` is reindexed to exactly ``date_list`` x ``stock_list``;
    stocks missing from the API result therefore become all-NaN columns.
    The frame is then shifted up by one row, so the row for a date carries the
    flag of the following row; the last row is forward-filled from the row
    above it.

    Args:
        stock_list: order_book_ids; they become the columns.
        date_list: dates that become the index; first and last bound the query.

    Returns:
        DataFrame of True/False flags (NaN where no data was returned).
    """
    st_filter = (
        rqdatac.is_st_stock(stock_list, date_list[0], date_list[-1])
        .astype('float')
        .reindex(columns=stock_list, index=date_list)
    )
    # Turn the 1/0 flags into True/False without mutating in place.
    st_filter = st_filter.replace(1,True).replace(0,False)
    # .ffill() replaces the deprecated fillna(method='ffill').
    st_filter = st_filter.shift(-1).ffill()
    print('剔除ST已构建')

    return st_filter

def get_suspended_filter(stock_list,date_list):
    """Build a date x stock mask marking suspended stocks (True = suspended).

    The result of ``rqdatac.is_suspended`` for the span ``date_list[0]`` to
    ``date_list[-1]`` is reindexed to exactly ``date_list`` x ``stock_list``;
    stocks missing from the API result therefore become all-NaN columns.
    The frame is then shifted up by one row, so the row for a date carries the
    flag of the following row; the last row is forward-filled from the row
    above it.

    Args:
        stock_list: order_book_ids; they become the columns.
        date_list: dates that become the index; first and last bound the query.

    Returns:
        DataFrame of True/False flags (NaN where no data was returned).
    """
    suspended_filter = (
        rqdatac.is_suspended(stock_list, date_list[0], date_list[-1])
        .astype('float')
        .reindex(columns=stock_list, index=date_list)
    )

    # Turn the 1/0 flags into True/False without mutating in place.
    suspended_filter = suspended_filter.replace(1,True).replace(0,False)
    # .ffill() replaces the deprecated fillna(method='ffill').
    suspended_filter = suspended_filter.shift(-1).ffill()
    print('剔除停牌已构建')

    return suspended_filter

def get_limit_up_down_filter(stock_list,date_list):
    """Build a date x stock mask marking days on which a stock opens at limit-up.

    Despite the function name, only ``open == limit_up`` is tested; limit-down
    days are not flagged. Prices come from one unadjusted ``rqdatac.get_price``
    call spanning ``date_list[0]`` to ``date_list[-1]``. Stocks with no price
    rows are reported on stdout and become all-NaN columns, so callers can
    remove them with ``dropna(axis=1, how='all')``. The frame is then shifted
    up by one row, so the row for a date carries the flag of the following
    row; the last row is forward-filled from the row above it.

    Args:
        stock_list: order_book_ids; they become the columns.
        date_list: dates that become the index; first and last bound the query.

    Returns:
        DataFrame of True/False flags (NaN for stocks without price data).
    """
    # 1.0 marks an opening limit-up, 0.0 anything else.
    df = pd.DataFrame(index = date_list,columns=stock_list,data=0.0)
    total_price = rqdatac.get_price(stock_list,date_list[0],date_list[-1],adjust_type='none')

    for stock in stock_list:

        try:
            price = total_price.loc[stock]
        except (KeyError, AttributeError):
            # KeyError: the stock has no rows in the price table.
            # AttributeError: presumably get_price returned no table at all — TODO confirm.
            # A bare except would also swallow KeyboardInterrupt.
            print('no stock data:',stock)
            df[stock] = np.nan
            continue

        # The stock opens at limit-up when open equals the limit-up price.
        condition = price['open'] == price['limit_up']
        if condition.any():
            df.loc[condition[condition].index,stock] = 1.0

    # Turn the 1/0 flags into True/False without mutating in place.
    df = df.replace(1.0,True).replace(0.0,False)
    # .ffill() replaces the deprecated fillna(method='ffill').
    df = df.shift(-1).ffill()
    print('剔除开盘涨停已构建')

    return df

In [40]:
start_date = '2014-01-01'
end_date = '2022-12-15'

# All A-shares (instrument type 'CS').
insts = all_instruments('CS')

# Stock pool for the research window: keep instruments whose de-listing date is
# the '0000-00-00' placeholder (presumably "never de-listed" — verify against the
# data source) or falls after the window start, and that were listed by the window end.
not_delisted = insts['de_listed_date'] == '0000-00-00'
delisted_after_start = insts['de_listed_date'] > start_date
listed_by_end = insts['listed_date'] <= end_date
in_window = (not_delisted | delisted_after_start) & listed_by_end
stock_list = sorted(insts.loc[in_window, 'order_book_id'].tolist())

# Trading days of the research window.
date_list = get_trading_dates(start_date, end_date, market='cn')

## 过滤mask获取

In [66]:
# Build the opening-limit-up mask for the whole stock pool over the research window.
# NOTE(review): the cell that builds all four masks repeats this exact call, so one of the two is redundant.
limit_up_down_filter = get_limit_up_down_filter(stock_list,date_list)

no stock data: 301368.XSHE
no stock data: 301398.XSHE
剔除开盘涨停已构建


In [75]:
# Summarise the mask after dropping all-NaN columns, i.e. the stocks reported as 'no stock data'.
limit_up_down_filter.dropna(axis = 1,how = 'all').info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 2180 entries, 2014-01-02 to 2022-12-14
Columns: 4998 entries, 000001.XSHE to 689009.XSHG
dtypes: bool(4998)
memory usage: 10.5 MB


In [41]:
# Build the four exclusion masks over the same stock pool and trading days.
new_stock_filter = get_new_stock_filter(stock_list,date_list)
st_filter = get_st_filter(stock_list,date_list)
suspended_filter = get_suspended_filter(stock_list,date_list)
# NOTE(review): limit_up_down_filter is also computed in a separate cell with identical arguments.
limit_up_down_filter = get_limit_up_down_filter(stock_list,date_list)

剔除新股已构建
剔除ST已构建
剔除停牌已构建
no stock data: 301368.XSHE
no stock data: 301398.XSHE
剔除开盘涨停已构建


In [44]:
# Persist each mask, leaving out stocks whose column is entirely NaN (no source data).
for _mask, _path in (
    (new_stock_filter, 'new_stock_filter.pkl'),
    (st_filter, 'st_filter.pkl'),
    (suspended_filter, 'suspended_filter.pkl'),
    (limit_up_down_filter, 'limit_up_down_filter.pkl'),
):
    _mask.dropna(axis = 1,how = 'all').to_pickle(_path)

# 行业分类

In [77]:
def get_industry_exposure(order_book_ids,datetime_period):
    """Build industry dummy variables for every (date, stock) pair.

    Industry records are fetched through the rqdatac client call
    ``__internal__zx2019_industry`` (presumably the CITIC 2019 classification,
    going by the log message — TODO confirm). For each requested pair the
    latest record of that stock whose ``start_date`` is not after the date is
    used: an as-of lookup via ``searchsorted`` on the sorted
    ``(order_book_id, start_date)`` index.

    Pairs with no such record (the stock has no record at all, or its first
    record starts later) get no industry and therefore an all-zero dummy row.

    Args:
        order_book_ids: stocks to look up.
        datetime_period: dates to look up; must be comparable with the
            ``start_date`` values of the records.

    Returns:
        DataFrame indexed by (datetime, order_book_id), sorted, with one
        dummy column per ``first_industry_name`` value.
    """
    zx2019_industry = rqdatac.client.get_client().execute('__internal__zx2019_industry')
    df = pd.DataFrame(zx2019_industry).set_index(['order_book_id', 'start_date'])
    df = df['first_industry_name'].sort_index()
    print('中信行业数据已获取')

    # Build the dynamic industry table.
    index = pd.MultiIndex.from_product([order_book_ids, datetime_period], names=['order_book_id', 'datetime'])
    pos = df.index.searchsorted(index, side='right') - 1

    # searchsorted only knows the global ordering: when a stock has no record on
    # or before the date, pos lands on the previous stock's last record (or wraps
    # to -1). Accept a hit only if it belongs to the requested stock.
    record_ids = np.asarray(df.index.get_level_values('order_book_id'), dtype=object)
    wanted_ids = np.asarray(index.get_level_values('order_book_id'), dtype=object)
    matched = (pos >= 0) & (record_ids[pos] == wanted_ids)

    index = index.swaplevel()   # level change (oid, datetime) --> (datetime, oid)
    result = pd.Series(df.values[pos], index=index).where(matched)
    result = result.sort_index()
    print('动态行业数据已构建')

    # Industry dummies; rows without an industry stay all zero.
    return pd.get_dummies(result)

# Industry dummies for every (date, stock) pair of the stock pool.
industry_exposure = get_industry_exposure(stock_list,date_list)

中信行业数据已获取
动态行业数据已构建


In [78]:
# Cache the industry dummy matrix on disk.
industry_exposure.to_pickle('industry_exposure.pkl')

In [79]:
# Display the dummy matrix (the notebook shows a truncated view of the frame).
industry_exposure

Unnamed: 0_level_0,Unnamed: 1_level_0,交通运输,传媒,农林牧渔,医药,商贸零售,国防军工,基础化工,家电,建材,建筑,...,综合,综合金融,计算机,轻工制造,通信,钢铁,银行,非银行金融,食品饮料,餐饮旅游
datetime,order_book_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2014-01-02,000001.XSHE,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2014-01-02,000002.XSHE,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2014-01-02,000004.XSHE,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2014-01-02,000005.XSHE,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2014-01-02,000006.XSHE,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-12-14,688799.XSHG,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2022-12-14,688800.XSHG,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2022-12-14,688819.XSHG,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2022-12-14,688981.XSHG,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# 市值数据

In [85]:
# Sanity check: is this order_book_id part of the stock pool?
'000043.XSHE' in stock_list

True

In [81]:
# Natural log of the 'market_cap_3' factor for the stock pool, reshaped from a
# wide frame into a long Series indexed by (date, order_book_id).
f = Factor('market_cap_3')
df_market_cap_whole = np.log(
    execute_factor(f, stock_list, start_date, end_date).stack()
)
df_market_cap_whole

date        order_book_id
2014-01-02  000001.XSHE      25.330970
            000002.XSHE      25.200703
            000004.XSHE      20.714996
            000005.XSHE      21.545989
            000006.XSHE      22.606461
                               ...    
2022-12-15  688799.XSHG      21.988720
            688800.XSHG      23.259029
            688819.XSHG      24.319310
            688981.XSHG      26.547552
            689009.XSHG      23.878441
Length: 7645226, dtype: float64

In [83]:
# Cache the log market-cap series on disk.
df_market_cap_whole.to_pickle('market_cap.pkl')