In [1]:
# from hmmlearn.hmm import GaussianHMM
from hmmlearn import hmm
import numpy as np
from matplotlib import cm, pyplot as plt
import matplotlib.dates as dates
import pandas as pd
from datetime import timedelta,date

import sklearn
import warnings
warnings.filterwarnings("ignore")

def _hmm_feature_matrix(data_raw):
    """Build the 7-column observation matrix used by the HMM.

    Columns, in order: the 5-row and 20-row log differences of ``close``,
    then of ``volume``, then of ``money``, followed by the 20-row rolling
    standard deviation of the percentage change of ``close``.  Only the last
    ``len(data_raw['close']) - 50`` rows of every column are kept, which also
    drops the leading rows where the 20-row features are not defined yet.

    Returns:
        (features, data_len): the stacked 2-D array and the number of rows kept.

    Raises:
        ValueError: if ``data_raw`` has 50 rows or fewer, because the
            ``[-data_len:]`` slices would no longer select aligned rows.
    """
    n_rows = len(data_raw['close'])
    data_len = n_rows - 50
    if data_len <= 0:
        raise ValueError("need more than 50 price rows to build HMM features, got %d" % n_rows)

    columns = []
    for field in ('close', 'volume', 'money'):
        log_values = np.log(np.array(data_raw[field]))
        for lag in (5, 20):
            columns.append((log_values[lag:] - log_values[:-lag])[-data_len:])

    # NOTE(review): pd.rolling_std exists only in older pandas releases; on a
    # newer pandas the equivalent is Series.rolling(20).std().
    std = pd.rolling_std(data_raw['close'].pct_change(),20)
    columns.append(std[-data_len:])

    return np.column_stack(columns), data_len


def _dominant_state(states, start, end):
    """Return the most frequent value of ``states`` strictly between two dates.

    ``states`` is a Series of decoded hidden states indexed by date; ``start``
    and ``end`` are date strings and both bounds are exclusive.  Raises
    IndexError when no observation falls inside the period.
    """
    in_period = (states.index > pd.Timestamp(start)) & (states.index < pd.Timestamp(end))
    # value_counts() sorts by descending count, so the first label is the mode.
    return states[in_period].value_counts().index[0]


def GHMM_Generate(Sec, random_state=None):
    """Fit a 4-state Gaussian HMM on daily data of ``Sec`` and label its states.

    Daily close/volume/money between 2006-1-1 and 2014-12-30 are fetched with
    ``get_price`` (provided by the hosting environment, not defined in this
    notebook), turned into the feature matrix described in
    ``_hmm_feature_matrix`` and used to fit a full-covariance GaussianHMM.
    Each of four hard-coded calendar periods is then mapped to the hidden
    state that occurs most often in it.

    Args:
        Sec: security identifier passed straight to ``get_price``.
        random_state: optional seed forwarded to ``GaussianHMM`` so that a fit
            can be reproduced; ``None`` keeps the previous unseeded behaviour.

    Returns:
        Tuple ``(model, state_BigLoss, state_BigBonus, state_MinorLoss,
        state_MinorBonus)``.  The four state ids are not guaranteed to be
        distinct: one hidden state can dominate several periods.
    """
    beginDate = '2006-1-1'
    endDate = '2014-12-30'
    n_state = 4

    data_raw = get_price(Sec, start_date = beginDate, end_date=endDate, frequency='daily', fields=['close','volume','money'],fq = "pre")

    Train_Data, data_len = _hmm_feature_matrix(data_raw)
    Date = pd.to_datetime(data_raw.index[-data_len:])

    model = hmm.GaussianHMM(n_components= n_state, covariance_type="full", n_iter=2000, random_state=random_state).fit(Train_Data)

    states = pd.Series(model.predict(Train_Data), index=Date)

    # The original compared against datetime.datetime.strptime(...), but the
    # datetime module itself is never imported in this notebook (only
    # timedelta and date are), so the bounds are parsed with pandas instead.
    state_BigLoss = _dominant_state(states, "2008-1-1", "2008-10-31")
    state_BigBonus = _dominant_state(states, "2007-1-1", "2007-10-1")
    state_MinorLoss = _dominant_state(states, "2011-1-1", "2013-1-1")
    state_MinorBonus = _dominant_state(states, "2014-6-1", "2014-12-31")

    return model,state_BigLoss,state_BigBonus,state_MinorLoss,state_MinorBonus



def single_state_estimation(Sec,model_read,enddate = "2018-12-20"):
    """Decode the latest hidden state of ``Sec`` and translate it to a label.

    Args:
        Sec: security identifier passed to ``get_price`` (provided by the
            hosting environment, not defined in this notebook).
        model_read: sequence ``(model, state_BigLoss, state_BigBonus,
            state_MinorLoss, state_MinorBonus)`` as returned by GHMM_Generate.
        enddate: last date of the 100-row daily history that is fetched.

    Returns:
        ``'bl'``, ``'bb'``, ``'ml'`` or ``'mb'`` for the first label whose
        state id equals the state decoded for the last row, or the integer
        ``0`` when none of the four ids matches.
    """
    fitted_model = model_read[0]
    # Checked in this order, so when two labels share a state id the earlier wins.
    labelled_states = (
        (model_read[1], 'bl'),
        (model_read[2], 'bb'),
        (model_read[3], 'ml'),
        (model_read[4], 'mb'),
    )

    # Fetch the inputs the model's features are derived from.
    prices = get_price(Sec, count = 100, end_date=enddate, frequency='daily', fields=['close','volume','money'],fq = "pre")

    # Number of trailing rows kept from every feature column.
    keep = len(prices['close']) - 50

    # 5- and 20-row log differences of close, volume and money, in that order.
    feature_columns = []
    for field in ('close', 'volume', 'money'):
        for lag in (5, 20):
            later = np.log(np.array(prices[field][lag:]))
            earlier = np.log(np.array(prices[field][:-lag]))
            feature_columns.append((later - earlier)[-keep:])

    # Seventh column: 20-row rolling standard deviation of the close pct change.
    rolling_vol = pd.rolling_std(prices['close'].pct_change(),20)
    feature_columns.append(rolling_vol[-keep:])

    observations = np.column_stack(feature_columns)
    latest_state = fitted_model.predict(observations)[-1]

    for state_id, label in labelled_states:
        if latest_state == state_id:
            return label
    return 0
    


In [2]:
# Load the table from data_total_pd.csv in the working directory; the first
# CSV column is used as the row index.
# (Alternative location kept for reference: ".\\HMM\\data_total_pd.csv")
data_total_pd = pd.read_csv("data_total_pd.csv",index_col = 0)

In [3]:
# Display the loaded frame to check its columns and values.
data_total_pd

Unnamed: 0,price_raw,price_next,price_diff,state,alpha,score,market_yield,adjust_yield
2015/1/5,3641.54,3641.06,-0.48,bb,1,0,-0.000132,0
2015/1/6,3641.06,3643.79,2.73,bb,1,0,0.000750,0
2015/1/7,3643.79,3559.26,-84.53,bb,1,0,-0.023472,0
2015/1/8,3559.26,3546.72,-12.54,bb,1,0,-0.003529,0
2015/1/9,3546.72,3513.58,-33.14,bb,1,0,-0.009388,0
2015/1/12,3513.58,3514.04,0.46,bb,1,0,0.000131,0
2015/1/13,3514.04,3502.42,-11.62,bb,1,0,-0.003312,0
2015/1/14,3502.42,3604.12,101.70,bb,1,0,0.028623,0
2015/1/15,3604.12,3635.15,31.03,bb,1,0,0.008573,0
2015/1/16,3635.15,3355.16,-279.99,bb,1,0,-0.080151,0


In [4]:
def cal_alpha(data_total_pd,n,win_num):
    """Score each date by the number of bullish states in its trailing window.

    For every entry of the index, the rows whose index lies in the window
    ``(date - n days, date]`` are selected and those whose ``state`` is
    ``"bb"`` or ``"mb"`` are counted (``"bl"`` and ``"ml"`` add nothing).
    The date scores 1 when that count reaches ``win_num`` and 0 otherwise.

    Args:
        data_total_pd: frame with a datetime index and a ``state`` column.
        n: window length in calendar days.
        win_num: minimum number of bullish rows needed for a score of 1.

    Returns:
        The same frame object, with its ``score`` column overwritten in place.
    """
    stamps = data_total_pd.index
    # True where the row carries one of the two states that earn a point.
    bullish = data_total_pd['state'].isin(['bb', 'mb']).values
    span = timedelta(days = n)

    flags = []
    for day in stamps:
        in_window = (stamps <= day) & (stamps > (day - span))
        flags.append(1 if bullish[in_window].sum() >= win_num else 0)

    data_total_pd['score'] = np.array(flags)
    return data_total_pd


In [5]:
# Convert the index to datetime: parse the current index values into a
# 'date' column, then promote that column to be the index.
data_total_pd['date'] = pd.to_datetime(data_total_pd.index)
data_total_pd.set_index('date',inplace=True)

In [6]:
# Grid search for usable parameters on the positive (bullish) side.
# Every accepted candidate is stored as [n_windows, win_num, win_rate, try_count].
para_list = []
for n_windows in range(0,20):
    # win_num runs from half the window length up to, but not including, the
    # window length, so n_windows == 0 yields no candidates.
    for win_num in range(n_windows // 2, n_windows):
        # Reset the working columns for every candidate: position weight
        # alpha defaults to 1, score to 0 and the position-adjusted yield to 0.
        data_total_pd['alpha'] = np.ones(data_total_pd.shape[0])
        data_total_pd['score'] = np.zeros(data_total_pd.shape[0])
        data_total_pd['adjust_yield'] = np.zeros(data_total_pd.shape[0])

        data_total_pd = cal_alpha(data_total_pd,n_windows,win_num)

        # Replace missing values with 0 to avoid errors in the steps below.
        data_total_pd.fillna(0,inplace = True)

        # Rows on which the rule fired; their count is the number of attempts.
        signalled = data_total_pd[data_total_pd['score']>0]
        try_count = signalled.shape[0]

        # Win rate: share of signalled rows with a positive market yield.
        # Guard against a zero denominator when the rule never fired.
        if try_count == 0:
            win_rate = 0
        else:
            win_rate = float(signalled[signalled['market_yield']>0].shape[0]) / float(try_count)

        # Keep the candidate when the win rate exceeds 0.6 (0.54 is the win
        # rate obtained by always guessing a positive yield).
        if win_rate > 0.6:
            para_list.append([n_windows,win_num,win_rate,try_count])

            # Single-argument print(...) behaves the same on Python 2 and 3.
            print("找到参数组：窗口长度为%d，关键获胜次数为%d,胜率为%f，尝试次数为%d"%(n_windows,win_num,win_rate,try_count))
   
        

找到参数组：窗口长度为6，关键获胜次数为5,胜率为0.620690，尝试次数为87
找到参数组：窗口长度为12，关键获胜次数为9,胜率为0.605263，尝试次数为152
找到参数组：窗口长度为12，关键获胜次数为10,胜率为0.657143，尝试次数为70
找到参数组：窗口长度为13，关键获胜次数为10,胜率为0.657143，尝试次数为70
找到参数组：窗口长度为18，关键获胜次数为14,胜率为0.623932，尝试次数为117
找到参数组：窗口长度为19，关键获胜次数为14,胜率为0.619048，尝试次数为126
找到参数组：窗口长度为19，关键获胜次数为15,胜率为0.660714，尝试次数为56


In [7]:
# Collect the accepted positive-side parameter sets into a frame; each row is
# [n_windows, win_num, win_rate, try_count] and the columns are left unnamed.
para_pd = pd.DataFrame(para_list)

# Persist the result next to the notebook.
para_pd.to_csv("best_para_HMM_positive.csv")

In [8]:
def cal_alpha_negtive(data_total_pd,n,lose_num):
    """Score each date by the number of bearish states in its trailing window.

    For every entry of the index, the rows whose index lies in the window
    ``(date - n days, date]`` are selected and those whose ``state`` is
    ``"bl"`` or ``"ml"`` are counted (``"bb"`` and ``"mb"`` add nothing).
    The date scores -1 when that count reaches ``lose_num`` and 0 otherwise.

    Args:
        data_total_pd: frame with a datetime index and a ``state`` column.
        n: window length in calendar days.
        lose_num: minimum number of bearish rows needed for a score of -1.

    Returns:
        The same frame object, with its ``score`` column overwritten in place.
    """
    stamps = data_total_pd.index
    # True where the row carries one of the two states that earn a point.
    bearish = data_total_pd['state'].isin(['bl', 'ml']).values
    span = timedelta(days = n)

    flags = []
    for day in stamps:
        in_window = (stamps <= day) & (stamps > (day - span))
        flags.append(-1 if bearish[in_window].sum() >= lose_num else 0)

    data_total_pd['score'] = np.array(flags)
    return data_total_pd


In [9]:
# Grid search for usable parameters on the negative (bearish) side.
# Every accepted candidate is stored as [n_windows, lose_num, lose_rate, try_count].
para_list_negtive = []
for n_windows in range(0,20):
    # lose_num runs from half the window length up to, but not including, the
    # window length, so n_windows == 0 yields no candidates.
    for lose_num in range(n_windows // 2, n_windows):
        # Reset the working columns for every candidate: position weight
        # alpha defaults to 1, score to 0 and the position-adjusted yield to 0.
        data_total_pd['alpha'] = np.ones(data_total_pd.shape[0])
        data_total_pd['score'] = np.zeros(data_total_pd.shape[0])
        data_total_pd['adjust_yield'] = np.zeros(data_total_pd.shape[0])

        data_total_pd = cal_alpha_negtive(data_total_pd,n_windows,lose_num)

        # Replace missing values with 0 to avoid errors in the steps below.
        data_total_pd.fillna(0,inplace = True)

        # Rows on which the rule fired; their count is the number of attempts.
        signalled = data_total_pd[data_total_pd['score']<0]
        try_count = signalled.shape[0]

        # Hit rate: share of signalled rows with a negative market yield.
        # Guard against a zero denominator when the rule never fired.
        if try_count == 0:
            lose_rate = 0
        else:
            lose_rate = float(signalled[signalled['market_yield']<0].shape[0]) / float(try_count)

        # Keep the candidate when the hit rate exceeds 0.5. (The stray quote
        # characters that preceded this check were a syntax error and are gone.)
        if lose_rate > 0.5:
            para_list_negtive.append([n_windows,lose_num,lose_rate,try_count])

            # Single-argument print(...) behaves the same on Python 2 and 3.
            print("找到参数组：窗口长度为%d，关键获胜次数为%d,胜率为%f，尝试次数为%d"%(n_windows,lose_num,lose_rate,try_count))
   
        

找到参数组：窗口长度为3，关键获胜次数为2,胜率为0.506993，尝试次数为286
找到参数组：窗口长度为7，关键获胜次数为4,胜率为0.506452，尝试次数为310
找到参数组：窗口长度为8，关键获胜次数为5,胜率为0.501730，尝试次数为289
找到参数组：窗口长度为10，关键获胜次数为6,胜率为0.505300，尝试次数为283
