In [20]:
# import packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
%matplotlib inline
from matplotlib.pylab import rcParams
from sklearn.metrics import accuracy_score, precision_score, recall_score, log_loss
from bt_classes import my_backtest
# Default matplotlib figure size (width, height) used when a plot does not set its own.
rcParams['figure.figsize'] = 20,10
# NOTE(review): this silences every warning for the rest of the session, which also
# hides pandas' chained-assignment warnings — consider narrowing the filter.
warnings.filterwarnings('ignore')

In [7]:
orig_df = pd.read_csv('../xau_1d_20y.csv') # load the 20-year daily data set
orig_df['datetime'] = pd.to_datetime(orig_df['date']) # parse the date strings into timestamps
orig_df = orig_df.set_index('datetime') # index the frame by timestamp

df = orig_df.copy() # working copy; orig_df stays untouched
df['log_r'] = np.log(df['close']) - np.log(df['open']) # intraday (open -> close) log return
# Label = direction of the NEXT row's log return: 1.0 for up, 0.0 otherwise.
# Flat days (log return == 0) and the last row (no next row, so the shifted
# value is NaN) are labelled 0.0.
# Built in one vectorised assignment: chained forms such as
# `df['label'][mask] = 0` or `df['label'].fillna(0, inplace=True)` act on a
# temporary object and silently do nothing under pandas copy-on-write.
df['label'] = (df['log_r'].shift(-1) > 0).astype(float)

In [16]:
# Measuring model failure: a single wrong prediction is not meaningful on its own.
# Instead, measure behaviour over a time window, e.g. return, drawdown,
# precision / accuracy / recall, longest run of consecutive errors, cross entropy.
class test_indicator():
    """Window-level diagnostics for a sequence of binary predictions.

    On construction the instance computes two run-transition count matrices:

    * ``suc_fail[i][j]`` - number of times a run of ``i`` consecutive correct
      predictions was immediately followed by a run of ``j`` consecutive
      wrong predictions.
    * ``fail_suc[i][j]`` - number of times a run of ``i`` consecutive wrong
      predictions was immediately followed by a run of ``j`` consecutive
      correct predictions.

    Parameters
    ----------
    y_true : sequence
        Ground-truth labels.
    y_pred : sequence
        Predicted labels, same length as ``y_true``.

    Raises
    ------
    AssertionError
        If ``y_true`` and ``y_pred`` differ in length.
    """

    def __init__(self, y_true, y_pred):
        assert len(y_pred) == len(y_true), f'Length of y_true is {len(y_true)} while y_pred is {len(y_pred)}'
        self.y_pred = y_pred
        self.y_true = y_true
        suc_counts, fail_counts = self.successive_distribution()
        # Both matrices are kept; plot_fail_suc and callers read self.fail_suc.
        self.suc_fail = np.array(suc_counts)
        self.fail_suc = np.array(fail_counts)
        # Open question: should this ratio be weighted?
        # The attributes below are placeholders that nothing in this class fills in yet.
        self.successive_n_failure_rate = None
        self.period_entropy = None
        self.period_drawback = None
        self.volatility = None

    def successive_distribution(self):
        """Count transitions between runs of correct and wrong predictions.

        Returns
        -------
        (suc_result, fail_result) : tuple of two square list-of-lists
            ``suc_result[i][j]`` counts "``i`` correct then ``j`` wrong"
            transitions, ``fail_result[i][j]`` counts "``i`` wrong then ``j``
            correct" transitions. Side length is the longest recorded run
            plus one, so run lengths can be used directly as indices
            (row/column 0 is always empty).

        Notes
        -----
        A run is only recorded once the opposite outcome ends it, so the
        final run of the sequence (cut off by the end of the data) is not
        recorded and the transition into it is not counted.
        """
        # Convert to arrays so the comparison is positional even when the
        # inputs are pandas Series with a non-integer (e.g. datetime) index.
        predicted = np.asarray(self.y_pred)
        actual = np.asarray(self.y_true)

        # Signed run lengths in order of occurrence: +k = k correct, -k = k wrong.
        runs = []
        run_correct = 0
        run_wrong = 0
        for hit in predicted == actual:
            if hit:
                run_correct += 1
                if run_wrong:
                    runs.append(-run_wrong)
                    run_wrong = 0
            else:
                run_wrong += 1
                if run_correct:
                    runs.append(run_correct)
                    run_correct = 0

        size = max((abs(r) for r in runs), default=0) + 1
        suc_result = [[0] * size for _ in range(size)]
        fail_result = [[0] * size for _ in range(size)]

        # Runs alternate in sign, so each neighbouring pair is one transition.
        for current, following in zip(runs, runs[1:]):
            if current > 0:
                suc_result[current][-following] += 1
            else:
                fail_result[-current][following] += 1
        return suc_result, fail_result

    @staticmethod
    def _plot_transition_heatmap(counts, xlabel, ylabel):
        """Draw one transition-count matrix as an annotated heatmap."""
        plt.figure(figsize=(16,10))
        table = pd.DataFrame(counts).sort_index(ascending=False)
        # Drop row 0 and column 0: a run of length zero never occurs.
        sns.heatmap(table.iloc[:-1,1:],cmap='Blues',annot=True, fmt='.0f')
        plt.yticks(rotation=0)
        plt.xlabel(xlabel,fontsize=16)
        plt.ylabel(ylabel,fontsize=16)
        plt.show()
        plt.close()

    def plot_suc_fail(self):
        """Heatmap of correct-run length (rows) vs. following wrong-run length (columns)."""
        self._plot_transition_heatmap(self.suc_fail, 'Successive Wrong', 'Successive Correct')

    def plot_fail_suc(self):
        """Heatmap of wrong-run length (rows) vs. following correct-run length (columns)."""
        self._plot_transition_heatmap(self.fail_suc, 'Successive Correct', 'Successive Wrong')

    def get_accuracy(self):
        """Return sklearn's accuracy_score for the stored labels and predictions."""
        return accuracy_score(self.y_true, self.y_pred)

    def get_precision(self):
        """Return sklearn's precision_score for the stored labels and predictions."""
        return precision_score(self.y_true, self.y_pred)

    def get_recall(self):
        """Return sklearn's recall_score for the stored labels and predictions."""
        return recall_score(self.y_true, self.y_pred)

    def get_successive_n_failure_rate(self,n=5):
        """Share of correct->wrong transitions whose wrong run has length >= ``n``.

        Returns 0.0 when no transition was recorded at all or when no recorded
        run is as long as ``n`` (instead of dividing by zero).
        """
        total = self.suc_fail.sum()
        if total == 0 or n >= self.suc_fail.shape[1]:
            return 0.0
        return self.suc_fail[:,n:].sum() / total

    def get_entropy(self):
        """Return sklearn's log_loss of ``y_pred`` against ``y_true``.

        NOTE(review): log_loss is documented for probability estimates; if
        ``y_pred`` holds hard class labels the value may not be a meaningful
        cross entropy - confirm with the caller.
        """
        return log_loss(self.y_true, self.y_pred)

    def get_drawback(self):
        """Placeholder: always returns 0."""
        # TODO: calculate the maximum drawdown of this period.
        return 0

    def get_volatility(self):
        """Placeholder: always returns 0."""
        # TODO: not implemented yet.
        return 0

In [17]:
# Synthetic sanity-check data: two independent streams of random 0/1 values.
# A seeded generator keeps every downstream number identical across re-runs.
SEED = 42
rng = np.random.default_rng(SEED)
y_true = rng.integers(0, 2, 10000) # 10,000 random 0/1 "ground-truth" labels
y_pred = rng.integers(0, 2, 10000) # 10,000 random 0/1 "predictions"

In [18]:
# Build the run-transition diagnostics for the synthetic labels and predictions.
ti = test_indicator(y_true=y_true, y_pred=y_pred)

In [None]:
# Share of correct->wrong transitions whose wrong streak lasts at least 5 predictions.
# Uses the class method instead of repeating its slicing arithmetic inline.
ti.get_successive_n_failure_rate(n=5)

In [None]:
# Recompute both run-transition matrices and hold them as separate numpy arrays.
suc_fail, fail_suc = map(np.array, ti.successive_distribution())

In [None]:
# Heatmap of how often a correct streak of a given length is followed by a wrong streak of a given length.
ti.plot_suc_fail()

In [21]:
# NOTE(review): y_pred here holds hard 0/1 values drawn at random, while sklearn's
# log_loss is documented for probability estimates — confirm this number is meaningful.
ti.get_entropy()

17.31103454852704