生成几个pandas数组，从数组中寻找最佳的参数

In [12]:
import numpy as np
from matplotlib import cm, pyplot as plt
import matplotlib.dates as dates
import pandas as pd
from datetime import timedelta,date

import sklearn

import warnings
warnings.filterwarnings("ignore")

一、组成最基础的价格、成交额数据

In [13]:
################# Base data ###################
# price_data_pd: raw data pulled from the JoinQuant platform -- daily close
# price and turnover ("money") for the CSI 300 constituents over the window.

# Start and end of the simulation window.
beginDate = '2014-1-1'
endDate = '2017-12-30'

# Constituents of the CSI 300 index.
Secs = get_index_stocks('000300.XSHG')

# Fetch the close price and turnover for every constituent.
price_data_pd = get_price(
    Secs,
    start_date=beginDate,
    end_date=endDate,
    frequency='daily',
    fields=['close', 'money'],
    fq='pre',
)


二、搜集相应的数据进行数据的组装，生成收益率排名、成交额排名

In [14]:

#################  Standard parameters  #################
# Look-back window: number of consecutive daily rows summed for each factor.
n_period = 2


#################### Return rank  ############
# data_yield_pd: rolling n_period-row sum of daily log returns per stock.
# data_yield_rank_pd: per-day rank of that rolling return.
#
# pd.rolling_sum() was deprecated in pandas 0.18 and removed in 0.23; the
# .rolling(window).sum() form below is its documented replacement and keeps
# the same default (a full window is required, otherwise the result is NaN).
data_yield_pd = (
    np.log(price_data_pd['close'] / price_data_pd['close'].shift(1))
    .rolling(window=n_period)
    .sum()
)

# axis=1 ranks across the columns (stocks) within each row (date);
# axis=0 would rank each stock against its own history instead.
# Ranking is ascending by default, so the largest value gets the largest rank.
data_yield_rank_pd = data_yield_pd.rank(axis=1)

# Missing ranks (no data for that stock on that day) are set to 300.
data_yield_rank_pd.fillna(300, inplace=True)


#################### Turnover rank  ############
# data_money_pd: rolling n_period-row sum of the daily log change in turnover.
# data_money_rank_pd: per-day rank of that rolling turnover change.
data_money_pd = (
    np.log(price_data_pd['money'] / price_data_pd['money'].shift(1))
    .rolling(window=n_period)
    .sum()
)

# Same cross-sectional (per-row) ranking as for the return factor.
data_money_rank_pd = data_money_pd.rank(axis=1)

# Missing ranks are set to 300, matching the return rank table.
data_money_rank_pd.fillna(300, inplace=True)


按照n_period的参数，生成几个关键参数：
1. data_yield_rank_pd ：表示以过去n_period为周期的时间段内的收益率排名，每天排名
2. data_money_rank_pd ：表示以过去n_period为周期的时间段内的成交额排名，每天排名

In [15]:
# Bare expressions used for notebook display. A notebook cell renders only the
# value of its last expression, so the first line produces no visible output.
data_money_rank_pd
data_yield_rank_pd

Unnamed: 0,000001.XSHE,000002.XSHE,000063.XSHE,000069.XSHE,000100.XSHE,000157.XSHE,000166.XSHE,000333.XSHE,000338.XSHE,000402.XSHE,...,603156.XSHG,603160.XSHG,603259.XSHG,603260.XSHG,603288.XSHG,603799.XSHG,603833.XSHG,603858.XSHG,603986.XSHG,603993.XSHG
2014-01-02,300,300,300,300.0,300,300,300,300,300,300,...,300,300,300,300,300,300,300,300,300.0,300
2014-01-03,300,300,300,300.0,300,300,300,300,300,300,...,300,300,300,300,300,300,300,300,300.0,300
2014-01-06,91,30,198,71.0,186,101,300,82,62,130,...,300,300,300,300,300,300,300,300,300.0,55
2014-01-07,135,27,32,75.0,189,128,300,92,97,142,...,300,300,300,300,300,300,300,300,300.0,56
2014-01-08,170,87,51,83.0,159,148,300,110,152,77,...,300,300,300,300,300,300,300,300,300.0,52
2014-01-09,222,194,196,68.0,220,102,300,180,122,117,...,300,300,300,300,300,300,300,300,300.0,27
2014-01-10,226,187,122,149.0,237,96,300,218,84,160,...,300,300,300,300,300,300,300,300,300.0,105
2014-01-13,117,82,9,95.0,144,146,300,188,119,143,...,300,300,300,300,300,300,300,300,300.0,247
2014-01-14,56,30,7,20.0,176,184,300,221,214,80,...,300,300,300,300,300,300,300,300,300.0,220
2014-01-15,110,35,41,9.0,167,115,300,206,176,82,...,300,300,300,300,300,300,300,300,300.0,29


三、按照co_XX的系数，结合两个排名pandas数组，最终生成打分的pandas数组
1. score_pd


注意：score_pd 中的缺失值（NaN）按 0 填充，因此后续挑选股票时推荐使用 nlargest（取得分最高的若干只股票）

In [16]:
# Weight applied to the return rank.
co_yield = 1
# Weight applied to the turnover rank.
co_money = -1

#################### Build score_pd  ############
# score_pd: weighted sum of the return rank and the turnover rank.
# Any remaining NaN score is replaced with 0, which is why the later stock
# selection is expected to use nlargest(n=...).
score_pd = (
    data_yield_rank_pd * co_yield
    + data_money_rank_pd * co_money
).fillna(0)


In [17]:
# Bare expression: renders the score table as the notebook cell output.
score_pd

Unnamed: 0,000001.XSHE,000002.XSHE,000063.XSHE,000069.XSHE,000100.XSHE,000157.XSHE,000166.XSHE,000333.XSHE,000338.XSHE,000402.XSHE,...,603156.XSHG,603160.XSHG,603259.XSHG,603260.XSHG,603288.XSHG,603799.XSHG,603833.XSHG,603858.XSHG,603986.XSHG,603993.XSHG
2014-01-02,0,0,0,0.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0.0,0
2014-01-03,0,0,0,0.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0.0,0
2014-01-06,-15,-199,176,-137.0,15,-63,0,-77,-113,-12,...,0,0,0,0,0,0,0,0,0.0,18
2014-01-07,45,-179,-2,-76.0,76,13,0,54,24,-5,...,0,0,0,0,0,0,0,0,0.0,-69
2014-01-08,37,71,-47,48.0,40,127,0,-190,11,-34,...,0,0,0,0,0,0,0,0,0.0,-23
2014-01-09,49,39,28,-109.0,-12,-42,0,-120,19,-12,...,0,0,0,0,0,0,0,0,0.0,-113
2014-01-10,118,102,-74,16.0,3,-72,0,-82,-59,73,...,0,0,0,0,0,0,0,0,0.0,-82
2014-01-13,-46,-132,-194,-63.0,103,40,0,-112,-4,48,...,0,0,0,0,0,0,0,0,0.0,14
2014-01-14,-76,-201,-179,-214.0,-61,71,0,-79,146,-76,...,0,0,0,0,0,0,0,0,0.0,79
2014-01-15,66,23,-4,-230.0,-61,55,0,168,121,-57,...,0,0,0,0,0,0,0,0,0.0,21


四、挑选股票，对比自选组合和市场组合的收益率
1. market_yield_pd：市场（沪深300指数）的收益率，即从当日起连续n_lag个交易日的累计对数收益率，每天滚动计算
2. portfolio_yield_pd：组合的收益率，即当日所选股票在同一n_lag个交易日窗口内累计对数收益率的等权平均，每天滚动计算

In [18]:
#################### Key parameter ##################
# Holding horizon: returns are accumulated over n_lag consecutive daily rows.
n_lag = 5

#################### Market return (not a daily return)  ############
# market_yield_pd: n_lag-row cumulative log return of the 000300.XSHG index,
# used as the benchmark for the portfolio return computed below.

market_data_pd = get_price('000300.XSHG', start_date=beginDate, end_date=endDate,
                           frequency='daily', fields=['close'], fq="pre")

# Rolling sum first, then shift back by n_lag - 1 rows, so that row t holds the
# sum of the daily log returns of rows t .. t + n_lag - 1.
# pd.rolling_sum() no longer exists in pandas >= 0.23; .rolling().sum() is the
# equivalent replacement.
# NOTE(review): the window starts with row t's own return (close[t-1] ->
# close[t]), which is also part of the score computed on row t. Shifting by
# -n_lag instead would make the window strictly forward-looking -- confirm
# which is intended before trusting the comparison.
market_yield_pd = (
    np.log(market_data_pd['close'] / market_data_pd['close'].shift(1))
    .rolling(window=n_lag)
    .sum()
    .shift(-n_lag + 1)
)


#################### Portfolio return (not a daily return)  ############
# portfolio_yield_pd: equal-weight mean of the n_lag-row cumulative log return
# of the stocks selected on each day, to be compared with market_yield_pd.

# Number of stocks selected per day.
n_length = 30

# Daily candidate list: {date: [stock_1, stock_2, ..., stock_n_length]},
# the n_length highest-scoring columns of score_pd for that row.
candidate_secs = {
    day: list(scores.nlargest(n=n_length).index)
    for day, scores in score_pd.iterrows()
}

# Cumulative log return of every stock over the same window as the benchmark;
# each day's portfolio return is taken from the selected columns of this table.
portfolio_yield_total_pd = (
    np.log(price_data_pd['close'] / price_data_pd['close'].shift(1))
    .rolling(window=n_lag)
    .sum()
    .shift(-n_lag + 1)
)

# Per-date portfolio return. np.average is called explicitly on the raw values:
# the bare name `average` is not imported anywhere in this file, and working on
# .values means a NaN in any selected stock makes that day's result NaN (such
# rows are expected to be dropped when the comparison table is built).
portfolio_yield_dict = {
    day: np.average(forward_returns[candidate_secs[day]].values)
    for day, forward_returns in portfolio_yield_total_pd.iterrows()
}

# Assemble the portfolio return table with a single named column.
portfolio_yield_pd = pd.DataFrame.from_dict(portfolio_yield_dict, orient='index')
portfolio_yield_pd.columns = ['portfolio_yield']
portfolio_yield_pd

五、比较市场收益率与组合收益率的情况

In [71]:
# Comparison table: benchmark and portfolio returns side by side, aligned on
# the date index.
cmp_pd = pd.concat(
    [market_yield_pd.to_frame(name='market_yield'), portfolio_yield_pd],
    axis=1,
)

# Drop every row in which either return is NaN.
cmp_pd = cmp_pd.dropna(axis=0, how='any')

# Per-row excess return of the portfolio over the benchmark.
excess_yield = cmp_pd['portfolio_yield'] - cmp_pd['market_yield']

# Number of rows on which the portfolio beat the market, and the total number
# of comparable rows. These are printed explicitly because a notebook cell only
# renders its last expression, so bare intermediate expressions are lost.
n_wins = cmp_pd[cmp_pd['portfolio_yield'] > cmp_pd['market_yield']].shape[0]
n_total = cmp_pd.shape[0]
print('portfolio beat market on %d of %d rows' % (n_wins, n_total))

# Sum of the per-row excess returns (rendered as the cell output).
excess_yield.sum()

7.3078433208759