In [1]:
from __future__ import division      #除数可以显示为float

from six import StringIO    #使用聚宽readfile函数
import numpy as np
import pandas as pd

import time                 #使用time stamp
import datetime             

import matplotlib.pyplot as plt

import math

import talib


In [2]:
# 功能：输入Pandas数据，生成对应的指标数据
# 输入：
# input_data_pd：输入的原始Pandas数据
# 格式：必须包含的列包括：'code','date','close','high','low','volume','open'，用于计算指标
# 输出：
# ret_pd：输出Pandas数据
# ret_col：返回的特征向量
# 计算 15个指标，用于计算的指标包括：
# 'EMA_5', 'EMA_10', 'EMA_gap', 'KDJ_K', 'KDJ_D', 'KDJ_J', 'RSI', 'MACD_dif', 'MACD_dea', 'MACD_macd','MOM_12', 'MOM_25', 'MOM_gap', 'Long_Short_Rate_OBV', 'Volume'

def Indicator_Generate(input_data_pd):
    
    ret_pd = pd.DataFrame()
    ret_pd = input_data_pd
    
    # 返回的特征向量
    ret_col = ['EMA_5', 'EMA_10', 'EMA_gap', 'KDJ_K', 'KDJ_D', 'KDJ_J', 'RSI', 'MACD_dif', 'MACD_dea', 'MACD_macd','MOM_12', 'MOM_25', 'MOM_gap', 'Long_Short_Rate_OBV', 'Volume']
    
    stock_list = set(list(input_data_pd['code']))
    
    # 计算相应的值
    for stock_name in stock_list:
    
        ################################   1、生成MACD信息   ###################################################
        ret_pd["MACD_dif"] = None
        ret_pd["MACD_dea"] = None
        ret_pd["MACD_macd"] = None
        
        for stock_name in stock_list:
            dif = []
            dea = []
            macd = []

            macd_price = ret_pd.loc[ret_pd['code'] == stock_name,"close"]

            dif, dea, macd = talib.MACD(np.array(macd_price), fastperiod=5, slowperiod=10, signalperiod=5)

            ret_pd.loc[ret_pd['code'] == stock_name,"MACD_dif"] = np.array(dif)
            ret_pd.loc[ret_pd['code'] == stock_name,"MACD_dea"] = np.array(dea)
            ret_pd.loc[ret_pd['code'] == stock_name,"MACD_macd"] = np.array(macd)


        ################################   2、生成均线信息   ###################################################
        ret_pd["EMA_5"] = None
        ret_pd["EMA_10"] = None
        ret_pd["EMA_gap"] = None

        
        
        for stock_name in stock_list:
            ema_5 = []
            ema_10 = []
            ema_gap = []


            ema_price = ret_pd.loc[ret_pd['code'] == stock_name,"close"]
            ema_5 = talib.EMA(np.array(ema_price), timeperiod=5)
            ema_10 = talib.EMA(np.array(ema_price), timeperiod=10)
            ema_gap = talib.EMA(np.array(ema_price), timeperiod=5) - talib.EMA(np.array(ema_price), timeperiod=10)


            ret_pd.loc[ret_pd['code'] == stock_name,"EMA_5"] = np.array(ema_5)
            ret_pd.loc[ret_pd['code'] == stock_name,"EMA_10"] = np.array(ema_10)
            ret_pd.loc[ret_pd['code'] == stock_name,"EMA_gap"] = np.array(ema_gap)


        ################################   3、生成KDJ指标   ###################################################
        ret_pd["KDJ_K"] = None
        ret_pd["KDJ_D"] = None    
        ret_pd["KDJ_J"] = None
        
        
        
        for stock_name in set(stock_list):
            K_values = []
            D_values = []
            J_values = []

            kdj_price = ret_pd.loc[ret_pd['code'] == stock_name,['high','low','close']]
            kdj_price = kdj_price.fillna(0)

                #                  (Today's Close - LowestLow)
                # FASTK(Kperiod) = --------------------------- * 100
                #                   (HighestHigh - LowestLow)

                # FASTD(FastDperiod) = MA Smoothed FASTK over FastDperiod

                # SLOWK(SlowKperiod) = MA Smoothed FASTK over SlowKperiod

                # SLOWD(SlowDperiod) = MA Smoothed SLOWK over SlowDperiod

            K_values, D_values = talib.STOCH(kdj_price['high'].values,
                                               kdj_price['low'].values,
                                               kdj_price['close'].values,
                                               fastk_period=9,
                                               slowk_period=3,
                                               slowk_matype=0,
                                               slowd_period=3,
                                               slowd_matype=0)


            J_values = 3 * np.array(K_values) - 2 * np.array(D_values)

            ret_pd.loc[ret_pd['code'] == stock_name,"KDJ_K"] = np.array(K_values)
            ret_pd.loc[ret_pd['code'] == stock_name,"KDJ_D"] = np.array(D_values)
            ret_pd.loc[ret_pd['code'] == stock_name,"KDJ_J"] = np.array(J_values)


        ################################   4、开始计算RSI指标   ###################################################
        ret_pd["RSI"] = None

        
        for stock_name in set(stock_list):
            rsi_values = []

            rsi_price = ret_pd.loc[ret_pd['code'] == stock_name,['high','low','close']]

            rsi_price = rsi_price.fillna(0)

            rsi_values = talib.RSI(np.array(rsi_price['close']), 12)       #RSI的天数一般是6、12、24

            ret_pd.loc[ret_pd['code'] == stock_name,"RSI"] = np.array(rsi_values)

        ret_pd = ret_pd.fillna(0)

        ################################   5、开始计算动量指标“MOM”   ###################################################
        ret_pd["MOM_12"] = None
        ret_pd["MOM_25"] = None
        # 选择一条10日均线作为中间线，判断
        ret_pd["MOM_gap"] = None

        
        for stock_name in set(stock_list):

            MOM_values = []
            MOM_gap_values = []

            mom_price = list(ret_pd[ret_pd['code'] == stock_name]['close'])


            MOM_12_values = talib.MOM(np.array(mom_price), timeperiod = 12)

            MOM_25_values = talib.MOM(np.array(mom_price), timeperiod = 25)

            MOM_gap_values = MOM_25_values - MOM_12_values

            ret_pd.loc[ret_pd['code'] == stock_name,"MOM_12"] = np.array(MOM_12_values)
            ret_pd.loc[ret_pd['code'] == stock_name,"MOM_25"] = np.array(MOM_25_values)

            ret_pd.loc[ret_pd['code'] == stock_name,"MOM_gap"] = np.array(MOM_gap_values)


        ################################   6、开始计算能量指标OBV   ###################################################
        ret_pd["Long_Short_Rate_OBV"] = None


        
        for stock_name in set(stock_list):
            OBV_values = []


            obv_price = ret_pd.loc[ret_pd['code'] == stock_name,['high','low','close','volume']]

            obv_price = obv_price.fillna(0)

            # 通过价格进行调整
            Long_Short_Rate_OBV_values = []


            OBV_gap = 0 
            for i in range(0,len(obv_price['close'])):
                OBV_gap = ((obv_price['close'].values[i]-obv_price['low'].values[i]) \
                               -(obv_price['high'].values[i]-obv_price['close'].values[i])) \
                              /(obv_price['high'].values[i]-obv_price['low'].values[i])


                if np.isnan(OBV_gap):
                    OBV_gap = 0

                if i == 0:
                    Long_Short_Rate_OBV_values.append(obv_price['volume'].values[i])
                else:
                    Long_Short_Rate_OBV_values.append(float(OBV_gap)*float(obv_price['volume'].values[i]))

            ret_pd.loc[ret_pd['code'] == stock_name,"Long_Short_Rate_OBV"] = np.array(Long_Short_Rate_OBV_values)       


        ################################  7、开始计算成交量   ###################################################
        ret_pd["Volume"] = None

        for stock_name in set(stock_list):

            Volume_price = ret_pd.loc[ret_pd['code'] == stock_name,['high','low','close','volume']]

            ret_pd.loc[ret_pd['code'] == stock_name,"Volume"] = np.array(Volume_price['volume'])       

        ret_pd = ret_pd.fillna(0)
    
    return ret_pd,ret_col

In [3]:
# 功能：数据数据后，按照日期，进行相应的扩展，将前几天的数据进行扩展，分别显示前N天的数据
# 输入：
# input_data_pd：输入的原始Pandas数据，函数中必须包含'code'，'date','win_rate'
# day_count：需要向前扩展的天数
# Col_Name：需要扩展的列，避免将'code'、'date','win_rate'也同步扩展了
# 输出：
# ret_pd：输出扩展后的Pandas数据
# Col_Total:输出扩展后的列表列名
def Indicator_Extend(input_data_pd,day_count,Col_Name):

    # 生成总的列表
    Col_Total = []
    for col in Col_Name:
        Col_Total.append(col)
        
        # 生成前两天的值
        for day_count_i in range(1, 3):
            Col_Total.append(str(str(col) + "_pre" + str(day_count_i)))
    
    
    
    # 初始化数据
    ret_pd = pd.DataFrame(columns = (['code','date'] +Col_Total))    
    ret_pd[['code','date','win_rate']] =  input_data_pd.loc[:,['code','date','win_rate']]
    
    ret_pd[Col_Name] =  input_data_pd.loc[:,Col_Name]
    Stock_list = set(list(input_data_pd['code']))
    
    ##############################################  1、 赋值前1-2天的值      ############################################################

    for col in Col_Name:

        # 修改列属性为float,之后数据透视需要
        ret_pd[col] = ret_pd[col].astype(float)

        for day_count_i in range(1, day_count + 1):

            ret_pd.loc[:, str(str(col) + "_pre" + str(day_count_i))] = None

            # 修改列属性为float,之后数据透视需要
            ret_pd[str(str(col) + "_pre" + str(day_count_i))] = ret_pd[str(str(col) + "_pre" + str(day_count_i))].astype(float)

            for stock_name in Stock_list:
                temp = []
                temp = list(input_data_pd.loc[input_data_pd['code'] == stock_name, col])
                for i in range(0, day_count_i):
                    temp.insert(0,0)

                
                ret_pd.loc[input_data_pd['code'] == stock_name, str(str(col) + "_pre" + str(day_count_i))] = np.array(temp[:-day_count_i])


    return ret_pd,Col_Total

In [4]:
# 功能：删除不必要的数据，包括
# 1、前60个数据
# 2、后10个数据
# 3、成交量为0的数据
# 返回：
# 删除完成后的数据
def delete_data(input_data_pd):

    ret_pd = input_data_pd.copy()
    
    # 删除无效数据，准备进行数据处理
    stock_list = set(list(ret_pd['code']))
    # 更改平局为-1
    ret_pd.loc[ret_pd['win_rate'] == 0, "win_rate"] = -1
    for stock_name in stock_list:
        # 删除前60，后10的数据
        ret_pd = ret_pd.drop(ret_pd[ret_pd['code'] == stock_name].iloc[:60].index, axis=0)
        ret_pd = ret_pd.drop(ret_pd[ret_pd['code'] == stock_name].iloc[-10:].index, axis=0)
        # 统一操作，删除价格为0的数据
        ret_pd = ret_pd.drop(ret_pd[ret_pd['Volume'] == 0].index, axis=0)
    return ret_pd

In [5]:
# normalize_into_pd
# 功能：将数据进行归一化，逐股票生成相应的离散值
# 离散原则：按照正态分布的规则，按照均值、方差的值，分阶段进行离散
# input_data_pd：pandas数据结构，多行数据，记录样本归一化前的原始数据
# cols：列名，归一化的相应列名
# 返回值：
# ret_pd：按照原则离散化后的Pandas数组
# MS_pd：返回按照股票、cols列为维度的Pandas数据

def normalize_into_pd(input_data_pd,cols):
    # 定义分组函数，对输入的数列按照n组进行划分
    # 输入数据为list类型
    def get_group(sample_data):
        # 返回结果的数组
        ret_group = []
        if len(sample_data)!=0:
            # 确定最大值、最小值
            # 同时gap适当扩大，避免出现n+1的分组
            d_mean = np.mean(sample_data)
            d_std = np.std(sample_data)
            ret_group = [math.floor(vals) if abs(math.floor(vals)) <=3 else 4*abs(math.floor(vals))/math.floor(vals) for vals in (np.array(sample_data) - d_mean)/(0.5*d_std)]
        else:
            ret_group =  []
        return ret_group
    
    # 变量赋初值，其中input_data_pd必须包含code列
    Stock_list = set(list(input_data_pd['code']))
    ret_pd = pd.DataFrame(columns = ['code','date'] + cols)
    MS_pd = pd.DataFrame()

    ret_pd['code'] = np.array(input_data_pd['code'])
    ret_pd['date'] = np.array(input_data_pd['date'])
    ret_pd['win_rate'] = np.array(input_data_pd['win_rate'])
    
    # 逐列开始进行数据处理
    for col in cols:

        # 处理原始列数据

        for stock_name in Stock_list:
            ret_pd.loc[ret_pd['code'] == stock_name, col] \
                = np.array(get_group(list(input_data_pd[input_data_pd['code'] == stock_name][col])))
    
        
    MS_pd = pd.pivot_table(input_data_pd, index=["code"], values=cols, aggfunc=[np.mean,np.std])

    return ret_pd,MS_pd


In [6]:
# normalize_by_list使用索引表对数据进行归一化，生成模型可识别的样本
# file_name：索引文件名，记录均值方差，需遵守相应表格列名称规则
# sample_data_pd：pandas数据结构，仅一行，记录样本归一化前的原始数据
# stock_name：股票名，用于去索引文件中查找对应的数据
# cols：列名，归一化的相应列名

def normalize_by_list(MS_pd,sample_data_series,stock_name,cols):
    # ret_pd是返回的pd数组
    ret_pd = pd.DataFrame(columns = cols)
    
    # 创建列，赋值为None
    ret_pd[cols] = None
    
       
    # 逐列计算对应归一化的值
    for col in cols:
        mean = 0
        std = 0
        
        # 查找表中的数据
        mean = MS_pd.loc[MS_pd['code']==stock_name,str(col+"_mean")]
        std = MS_pd.loc[MS_pd['code']==stock_name,str(col+"_std")]

        # 计算
        val = (float(sample_data_series[col]) - mean)/(0.5*std)
        
        temp = 0
        if abs(math.floor(val)) <=3:
            temp = math.floor(val)
        else:
            temp = 4*val/abs(val)

        ret_pd.loc[:,col] = np.array([float(temp)])

    return ret_pd


In [7]:
# 从基础数据表中读取数据，生成指标数据
data_test = pd.DataFrame()
data_test,Col_Name = Indicator_Generate(pd.read_csv("raw_data_price1.csv"))



In [8]:
# 扩展数据，生成多列数据
data_extend_test = pd.DataFrame()
data_extend_test,Col_Total = Indicator_Extend(data_test,3,Col_Name)
data_extend_test.shape

(60000, 63)

In [9]:
# 删除无效数据，更改“平”为“负”
data_delete = delete_data(data_extend_test)
data_delete.shape

(52858, 63)

In [10]:
# 数据归一化，并且生成均方差列表
data_normalize = pd.DataFrame()
MS_pd = pd.DataFrame()
data_normalize,MS_pd = normalize_into_pd(data_delete,Col_Total)

In [17]:
MS_pd['mean']['EMA_10']['000001.XSHE']

code
000001.XSHE    13.146855
000002.XSHE    14.107902
000060.XSHE    10.988804
000063.XSHE    18.746680
000069.XSHE     7.660405
000100.XSHE     3.274741
000157.XSHE     7.258858
000166.XSHE     7.819640
000333.XSHE    34.864180
000338.XSHE    24.362258
000402.XSHE     8.285357
000413.XSHE    11.978396
000415.XSHE     9.165302
000423.XSHE    47.492618
000425.XSHE    12.258484
000503.XSHE    23.285562
000538.XSHE    72.993682
000540.XSHE     8.666715
000559.XSHE    11.131535
000568.XSHE    34.814623
000623.XSHE    24.716076
000625.XSHE    11.749457
000627.XSHE     5.898539
000630.XSHE    11.964755
000651.XSHE    28.692380
000671.XSHE     9.700301
000709.XSHE     3.278898
000723.XSHE    11.354159
000725.XSHE     2.969591
000728.XSHE    14.719109
Name: EMA_10, dtype: float64

In [27]:
MS_pd.to_csv("MS_list.csv")

# 提示：这里需要对均方差列表进行手工操作，以便之后进行操作

In [28]:
# 开始进行量化预测
from __future__ import division      #除数可以显示为float
from six import StringIO    #使用聚宽readfile函数
import numpy as np
import pandas as pd

import time                 #使用time stamp
import datetime             

import matplotlib.pyplot as plt

import math

# 最基本的KNN算法
from sklearn.neighbors import KNeighborsClassifier

# 导入样本拆分模块
from sklearn.model_selection import train_test_split

# 导入KNN半径算法
from sklearn.neighbors import RadiusNeighborsClassifier

# 评分函数
from sklearn.metrics import precision_score, recall_score, f1_score

# 交叉评分函数
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score


In [29]:
Stock_list = set(list(data_normalize['code']))
score_stock_dict = {}
k = 11
for stock_name in Stock_list:

    # 测试数据
    X =  np.array(data_normalize.loc[data_normalize['code'] == stock_name,Col_Total])
    # 测试结果
    y = np.array(data_normalize.loc[data_normalize['code'] == stock_name,"win_rate"])

    # 分别生成训练数据、测试数据
    x_train,x_test,y_train,y_test = train_test_split(X,y,test_size = 0.2)
    
    # 单一模型预测
    # 训练模型
    model = KNeighborsClassifier(n_neighbors=k)
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    
    score_stock_dict[stock_name] = [precision_score(y_test,y_pred ,average='binary'),int(len(y_test[y_pred == 1])),int(sum(y_test[y_pred == 1] ==1))]
    print "正常回归：股票名称: %s; 正确率:%f；正例个数:%d；正确个数:%d"%(str(stock_name),score_stock_dict[stock_name][0],score_stock_dict[stock_name][1],score_stock_dict[stock_name][2])


正常回归：股票名称: 000568.XSHE; 正确率:0.562500；正例个数:160；正确个数:90
正常回归：股票名称: 000100.XSHE; 正确率:0.509259；正例个数:108；正确个数:55
正常回归：股票名称: 000630.XSHE; 正确率:0.500000；正例个数:68；正确个数:34
正常回归：股票名称: 000559.XSHE; 正确率:0.507812；正例个数:128；正确个数:65
正常回归：股票名称: 000413.XSHE; 正确率:0.483607；正例个数:122；正确个数:59
正常回归：股票名称: 000728.XSHE; 正确率:0.450382；正例个数:131；正确个数:59
正常回归：股票名称: 000002.XSHE; 正确率:0.572917；正例个数:96；正确个数:55
正常回归：股票名称: 000540.XSHE; 正确率:0.590361；正例个数:83；正确个数:49
正常回归：股票名称: 000402.XSHE; 正确率:0.628319；正例个数:113；正确个数:71
正常回归：股票名称: 000060.XSHE; 正确率:0.608333；正例个数:120；正确个数:73
正常回归：股票名称: 000538.XSHE; 正确率:0.593985；正例个数:133；正确个数:79
正常回归：股票名称: 000425.XSHE; 正确率:0.403846；正例个数:104；正确个数:42
正常回归：股票名称: 000625.XSHE; 正确率:0.425926；正例个数:108；正确个数:46
正常回归：股票名称: 000069.XSHE; 正确率:0.565891；正例个数:129；正确个数:73
正常回归：股票名称: 000725.XSHE; 正确率:0.516129；正例个数:124；正确个数:64
正常回归：股票名称: 000415.XSHE; 正确率:0.520548；正例个数:73；正确个数:38
正常回归：股票名称: 000423.XSHE; 正确率:0.548148；正例个数:135；正确个数:74
正常回归：股票名称: 000503.XSHE; 正确率:0.482014；正例个数:139；正确个数:67
正常回归：股票名称: 000623.XSHE; 正确率:0.46

In [30]:
score_stock_pd = pd.DataFrame.from_dict(score_stock_dict, orient='index',columns = ['TP','Predict_counts','Correct_counts'])
score_stock_pd.to_csv("stock_rate.csv")

# 开始针对数据进行预测

In [49]:
Stock_list = set(list(data_normalize['code']))
Buy_List_Dict = {}
k = 11
MS_pd = pd.read_csv("MS_list.csv")


for stock_name in Stock_list:

    # 测试数据
    X =  np.array(data_normalize.loc[data_normalize['code'] == stock_name,Col_Total])
    # 测试结果
    y = np.array(data_normalize.loc[data_normalize['code'] == stock_name,"win_rate"])

    # 分别生成训练数据、测试数据
    x_train,x_test,y_train,y_test = train_test_split(X,y,test_size = 0)
    
    # 单一模型预测
    # 训练模型
    model = KNeighborsClassifier(n_neighbors=k)
    model.fit(x_train, y_train)
    
    predict_data_series = pd.Series(data_extend_test.loc[data_extend_test['code'] == stock_name,Col_Total].iloc[-1,:])
    
    
    predict_data_normal_pd = normalize_by_list(MS_pd,predict_data_series,stock_name,Col_Total)
    
    y_pred = model.predict([list(predict_data_normal_pd.iloc[0,:])])
    

000568.XSHE
2018-06-27
000100.XSHE
2018-06-27
000630.XSHE
2018-06-27
000559.XSHE
2018-06-27
000413.XSHE
2018-06-27
000728.XSHE
2018-06-27
000002.XSHE
2018-06-27
000540.XSHE
2018-06-27
000402.XSHE
2018-06-27
000060.XSHE
2018-06-27
000538.XSHE
2018-06-27
000425.XSHE
2018-06-27
000625.XSHE
2018-06-27
000069.XSHE
2018-06-27
000725.XSHE
2018-06-27
000415.XSHE
2018-06-27
000423.XSHE
2018-06-27
000503.XSHE
2018-06-27
000623.XSHE
2018-06-27
000001.XSHE
2018-06-27
000333.XSHE
2018-06-27
000627.XSHE
2018-06-27
000651.XSHE
2018-06-27
000063.XSHE
2018-06-27
000709.XSHE
2018-06-27
000166.XSHE
2018-06-27
000723.XSHE
2018-06-27
000338.XSHE
2018-06-27
000671.XSHE
2018-06-27
000157.XSHE
2018-06-27


In [None]:

    print stock_name 
    print str(data_extend_test.loc[data_extend_test['code'] == stock_name,'date'][-20:-19].values[0])
    predict_data_normal_pd.to_csv(stock_name+str(data_extend_test.loc[data_extend_test['code'] == stock_name,'date'][-20:-19].values[0])+".csv")


In [50]:
data_normalize.to_csv("verify.csv")

IOError: [Errno 13] Permission denied: 'verify.csv'

In [None]:
# 读取模块进行预测

In [None]:
# 进入聚宽进行验证