In [6]:
from __future__ import division      #除数可以显示为float

from six import StringIO    #使用聚宽readfile函数
import numpy as np
import pandas as pd

import time                 #使用time stamp
import datetime             

import matplotlib.pyplot as plt

import math

import talib



In [7]:
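# Purpose: take a pandas DataFrame of price data and generate the matching indicator data.
# Input:
# input_data_pd: the raw pandas price data.
# Format: it must contain the columns 'code', 'date', 'close', 'high', 'low', 'volume' and 'open', which are used to compute the indicators.
# Output:
# ret_pd: the output pandas data (the input columns plus the indicator columns).
# ret_col: the list of feature (indicator) column names.
# 15 indicators are computed:
# 'EMA_5', 'EMA_10', 'EMA_gap', 'KDJ_K', 'KDJ_D', 'KDJ_J', 'RSI', 'MACD_dif', 'MACD_dea', 'MACD_macd','MOM_12', 'MOM_25', 'MOM_gap', 'Long_Short_Rate_OBV', 'Volume'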

def Indicator_Generate(input_data_pd):
    """Add 15 technical-indicator feature columns to a multi-stock price table.

    Parameters
    ----------
    input_data_pd : pandas.DataFrame
        Price rows for one or more stocks. The columns 'code', 'high', 'low',
        'close' and 'volume' are read. Each stock's rows are processed in the
        order they appear, so they are presumably expected to be sorted by
        date (NOTE(review): confirm with the data source). The frame itself
        is not modified.

    Returns
    -------
    ret_pd : pandas.DataFrame
        A new frame holding the input columns plus the columns named in
        ``ret_col``. Every missing value (in the input columns and in the
        indicator warm-up period) is replaced with 0.
    ret_col : list of str
        Names of the 15 feature columns.
    """
    # Feature vector returned to the caller.
    ret_col = ['EMA_5', 'EMA_10', 'EMA_gap', 'KDJ_K', 'KDJ_D', 'KDJ_J', 'RSI',
               'MACD_dif', 'MACD_dea', 'MACD_macd', 'MOM_12', 'MOM_25', 'MOM_gap',
               'Long_Short_Rate_OBV', 'Volume']

    # fillna() returns a new frame, so the caller's data is never mutated, and
    # every indicator below sees the same NaN -> 0 filled prices regardless of
    # how many stocks the table contains.
    ret_pd = input_data_pd.fillna(0)

    # Create the feature columns up front as float columns so that the output
    # schema is complete even when there are no rows to process.
    for col in ret_col:
        ret_pd[col] = np.nan

    # Rows whose code is missing belong to no stock; their features stay 0.
    for stock_name in input_data_pd['code'].dropna().unique():
        mask = ret_pd['code'] == stock_name

        # TA-Lib functions work on float64 arrays, hence the explicit dtype.
        high, low, close, volume = [
            np.asarray(ret_pd.loc[mask, price_col], dtype=np.float64)
            for price_col in ('high', 'low', 'close', 'volume')
        ]

        features = _stock_indicators(high, low, close, volume)
        for col in ret_col:
            ret_pd.loc[mask, col] = features[col]

    # Indicator warm-up periods come back as NaN; report them as 0.
    ret_pd = ret_pd.fillna(0)

    return ret_pd, ret_col


def _stock_indicators(high, low, close, volume):
    """Compute every feature array for one stock.

    All arguments are 1-D float64 arrays of equal length. Returns a dict that
    maps each feature column name to an array of that same length.
    """
    features = {}

    # 1. MACD with fast/slow/signal periods 5/10/5.
    (features['MACD_dif'],
     features['MACD_dea'],
     features['MACD_macd']) = talib.MACD(close, fastperiod=5, slowperiod=10, signalperiod=5)

    # 2. 5- and 10-period exponential moving averages and their difference.
    ema_5 = talib.EMA(close, timeperiod=5)
    ema_10 = talib.EMA(close, timeperiod=10)
    features['EMA_5'] = ema_5
    features['EMA_10'] = ema_10
    features['EMA_gap'] = ema_5 - ema_10

    # 3. KDJ: K and D are the slow stochastic lines (9, 3, 3); J = 3K - 2D.
    k_values, d_values = talib.STOCH(high, low, close,
                                     fastk_period=9,
                                     slowk_period=3,
                                     slowk_matype=0,
                                     slowd_period=3,
                                     slowd_matype=0)
    features['KDJ_K'] = k_values
    features['KDJ_D'] = d_values
    features['KDJ_J'] = 3 * np.asarray(k_values) - 2 * np.asarray(d_values)

    # 4. 12-period RSI (common choices are 6, 12 and 24).
    features['RSI'] = talib.RSI(close, timeperiod=12)

    # 5. Momentum over 12 and 25 periods and the gap between them.
    mom_12 = talib.MOM(close, timeperiod=12)
    mom_25 = talib.MOM(close, timeperiod=25)
    features['MOM_12'] = mom_12
    features['MOM_25'] = mom_25
    features['MOM_gap'] = mom_25 - mom_12

    # 6. Price-adjusted volume ("long/short rate" OBV).
    features['Long_Short_Rate_OBV'] = _long_short_rate_obv(high, low, close, volume)

    # 7. Plain traded volume.
    features['Volume'] = volume

    return features


def _long_short_rate_obv(high, low, close, volume):
    """Weight each bar's volume by where the close sits inside the high-low range.

    The weight is ((close - low) - (high - close)) / (high - low). The first
    bar carries its raw volume instead of a weighted one.
    """
    with np.errstate(divide='ignore', invalid='ignore'):
        position = ((close - low) - (high - close)) / (high - low)

    # A zero high-low range makes the ratio undefined (NaN or +/-inf); such a
    # bar is given weight 0.
    position[~np.isfinite(position)] = 0.0

    weighted = position * volume
    if len(weighted):
        weighted[0] = volume[0]
    return weighted

In [8]:
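# Purpose: after the data has been loaded, extend it by date so that each row also carries the values of the previous N days.
# Input:
# input_data_pd: the raw pandas data; it must contain the 'code' and 'date' columns.
# day_count: the number of previous days to extend by.
# Col_Name: the columns to extend (so that 'code' and 'date' are not extended as well).
# Output:
# ret_pd: the extended pandas data.
# Col_Total: the list of column names after the extension.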
def Indicator_Extend(input_data_pd, day_count, Col_Name, warmup_rows=60):
    """Append, for every feature, its value on each of the previous days.

    Parameters
    ----------
    input_data_pd : pandas.DataFrame
        Must contain 'code', 'date' and every column listed in ``Col_Name``.
        Rows of one stock are lagged in the order they appear.
    day_count : int
        Number of previous days to add. For each feature ``col`` the columns
        ``col_pre1`` ... ``col_pre<day_count>`` are created.
    Col_Name : list of str
        Feature columns to extend (so 'code' and 'date' are left alone).
    warmup_rows : int, optional
        Number of leading rows dropped per stock (default 60). The start of
        each stock's history is the indicators' accumulation phase, where
        most values are not meaningful. Must be non-negative.

    Returns
    -------
    ret_pd : pandas.DataFrame
        Columns 'code', 'date', then each feature followed by its lag columns.
        The first ``k`` rows of a stock have 0 in the ``_pre<k>`` column.
    Col_Total : list of str
        The feature and lag column names, in the same order as in ``ret_pd``.
    """
    lag_days = list(range(1, day_count + 1))

    # Full list of output feature names; it must follow day_count so that it
    # always matches the columns created below.
    Col_Total = []
    for col in Col_Name:
        Col_Total.append(col)
        for lag in lag_days:
            Col_Total.append(str(col) + "_pre" + str(lag))

    # Positional row numbers of every stock, computed once and reused.
    codes = input_data_pd['code']
    stock_rows = [np.flatnonzero((codes == stock_name).values)
                  for stock_name in codes.dropna().unique()]

    ret_pd = input_data_pd.loc[:, ['code', 'date']].copy()

    ############################  1. previous-day values  ############################
    for col in Col_Name:
        # Features are stored as float; later pivoting needs a numeric dtype.
        current = np.asarray(input_data_pd[col], dtype=np.float64)
        ret_pd[col] = current

        for lag in lag_days:
            lagged = np.full(len(current), np.nan)
            for rows in stock_rows:
                history = current[rows]
                # Shift the stock's own history down by `lag` rows, padding
                # the top with zeros and discarding the overflow at the end.
                lagged[rows] = np.concatenate([np.zeros(lag), history])[:len(history)]
            ret_pd[str(col) + "_pre" + str(lag)] = lagged

    ############################  2. drop warm-up rows  ############################
    keep = np.ones(len(ret_pd), dtype=bool)
    for rows in stock_rows:
        keep[rows[:warmup_rows]] = False

    # Positional mask rather than label-based drop: safe with duplicate index labels.
    ret_pd = ret_pd.loc[keep].copy()

    return ret_pd, Col_Total

In [9]:
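# normalize_by_list normalizes a sample with an index table, producing a sample the model can recognize.
# file_name: name of the index file that records the mean and standard deviation of each column ('<col>_mean' / '<col>_std'); the file must follow this column-naming rule.
# sample_data_pd: pandas DataFrame with a single row holding the raw, un-normalized sample.
# stock_name: stock code used to look up the matching row in the index file.
# cols: names of the columns to normalize.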

def normalize_by_list(file_name, sample_data_pd, stock_name, cols):
    """Quantise one raw sample into integer levels using stored mean/std values.

    For every column the deviation from the stock's mean is measured in units
    of half a standard deviation, ``val = (x - mean) / (0.5 * std)``, and then
    mapped to a level: ``floor(val)`` when that lies within [-3, 3], otherwise
    +4 or -4 according to the sign of ``val``.

    Parameters
    ----------
    file_name : str
        CSV file with a 'code' column and, for every entry of ``cols``, the
        columns '<col>_mean' and '<col>_std'.
    sample_data_pd : pandas.DataFrame
        Raw sample; only its first row is used.
    stock_name : str
        Value looked up in the 'code' column of the CSV file.
    cols : list of str
        Columns of ``sample_data_pd`` to normalise.

    Returns
    -------
    pandas.DataFrame
        One row with the columns ``cols`` holding the levels as floats.

    Raises
    ------
    ValueError
        If the CSV file does not contain exactly one row for ``stock_name``,
        if a standard deviation is zero or NaN, or if a normalised value is
        not a finite number.
    """
    MS_pd = pd.read_csv(file_name)

    # Pick the stock's statistics once; demand an unambiguous match instead of
    # relying on implicit conversion of a Series to float.
    stock_stats = MS_pd.loc[MS_pd['code'] == stock_name]
    if len(stock_stats) != 1:
        raise ValueError("expected exactly one row for stock %r in %s, found %d"
                         % (stock_name, file_name, len(stock_stats)))
    stock_stats = stock_stats.iloc[0]

    levels = []
    for col in cols:
        mean = float(stock_stats[str(col) + "_mean"])
        std = float(stock_stats[str(col) + "_std"])
        if std == 0 or math.isnan(std):
            raise ValueError("%s_std for stock %r must be a non-zero number, got %r"
                             % (col, stock_name, std))

        raw_value = float(sample_data_pd.loc[:, col].values[0])
        val = (raw_value - mean) / (0.5 * std)
        if math.isnan(val) or math.isinf(val):
            raise ValueError("cannot normalise column %r for stock %r: value=%r, mean=%r"
                             % (col, stock_name, raw_value, mean))

        level = math.floor(val)
        if abs(level) <= 3:
            temp = level
        else:
            # Saturate everything outside the [-3, 3] levels at +/-4.
            temp = 4.0 if val > 0 else -4.0

        levels.append(float(temp))

    return pd.DataFrame([levels], columns=cols)


In [10]:
# Step 1: read the base price table and generate the indicator columns.
data_test, Col_Name = Indicator_Generate(pd.read_csv("price_list.csv"))

# Step 2: extend every indicator with its values from the previous 2 days.
data_extend_test, Col_Total = Indicator_Extend(data_test, 2, Col_Name)

# Step 3: verify the normalisation on a single sample per stock.
# normalize_by_list needs all four arguments: the index file, the one-row
# sample, the stock code and the column list.
file_name = "MS_List_Index.csv"

for stock_name in ['000002.XSHE', '000001.XSHE']:

    # All extended rows of the current stock.
    stock_rows = data_extend_test.loc[data_extend_test['code'] == stock_name]

    # Test sample: the row at position 22, kept as a one-row DataFrame.
    data_verify_pd = stock_rows.loc[:, Col_Total].iloc[22:23, :]

    # Date of that same row, with "/" replaced by "-" before it becomes part
    # of the output file name.
    date = str(stock_rows['date'].iloc[22]).replace("/", "-")
    save_file = stock_name + date

    data_normalize_pd = normalize_by_list(file_name, data_verify_pd, stock_name, Col_Total)
    data_normalize_pd.to_csv(save_file)



In [None]:
# 提取最后一行作为相应的特征向量

# 结合MS_List_Index归一化向量

In [None]:
# 读取模块进行预测

In [None]:
# 进入聚宽进行验证