# Outline

### 一. 项目数据的处理
#### 背景情况
1. 只有部分的样本含有标签，并且标签是文本类型
2. 对于每一个样本，都是一段时序的特征 

####  处理内容
1. 将文本标签进行比特位编码(六个根因)，并且扩充到所有样本上，没有标签的就按全零统计
2. 将每一个样本进行有效非空属性的提取，并提炼其序列的代表值作为最终的特征
3. 按列合并样本和标签形成训练数据

### 二. 复现：产生式模型和基于多元高斯的后验推断
#### baseline算法介绍
详见目录 `./baseline_description.pdf`
#### 主要工作
1. 从训练集中用多元高斯分布拟合 $p(x|R_i),i=1,2,3$
2. 对于待预测的每条样本计算 $3\times 2=6$ 种生成概率(三种根因且每种根因是二元变量)
3. 对统计得到的后验概率设定阈值，再与 label 进行比较并计算得分

#### 改进的想法 (详见主目录)
1. 保持生成模型，改进特征
2. 针对生成模型中的多元高斯拟合的效果进行改进
3. 其他机器学习分类模型的效果比较

# Setup

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import os
import warnings
warnings.filterwarnings('ignore')

# Prepare

In [3]:
# Load the label table for the labelled subset of samples.
df = pd.read_csv("Data/train_label.csv")

# Content

### 对标签进行数字编码

In [4]:
# Check train_label for missing values (info() prints the non-null count per column).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1407 entries, 0 to 1406
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   sample_index   1407 non-null   int64 
 1   root-cause(s)  1407 non-null   object
dtypes: int64(1), object(1)
memory usage: 22.1+ KB


In [5]:
# Preview the first rows of the label table.
df.head()

Unnamed: 0,sample_index,root-cause(s)
0,670,rootcause3
1,673,rootcause3
2,675,rootcause3
3,684,rootcause3
4,690,rootcause3


In [12]:
# Allocate the label bit matrix: one row per labelled sample and 7 columns.
# Column 0 holds the sample index; columns 1-6 are filled in by the next cell
# (one flag per root cause) and start out as zeros.
c = df.values
new = np.zeros([c.shape[0],7])
new[:,0] = c[:,0]
new

array([[ 670.,    0.,    0., ...,    0.,    0.,    0.],
       [ 673.,    0.,    0., ...,    0.,    0.,    0.],
       [ 675.,    0.,    0., ...,    0.,    0.,    0.],
       ...,
       [2981.,    0.,    0., ...,    0.,    0.,    0.],
       [2982.,    0.,    0., ...,    0.,    0.,    0.],
       [2983.,    0.,    0., ...,    0.,    0.,    0.]])

In [13]:
# Turn every textual label into a 6-wide multi-hot row.
# Each digit found in the label text selects one row of the 6x6 identity
# matrix; summing those rows sets one flag per root cause mentioned.
# NOTE(review): digits are matched one character at a time, so a digit 0 would
# map to index -1 (the last column) and 7-9 would raise IndexError — this
# presumably relies on labels only containing 1-6; confirm against the data.
identity = np.eye(6)            # hoisted: no need to rebuild it per digit
digit_pattern = re.compile(r"\d")  # raw string avoids the invalid "\d" escape
for i in range(c.shape[0]):
    new[i, 1:] = sum(identity[int(d) - 1] for d in digit_pattern.findall(c[i, 1]))
list(new)

[array([670.,   0.,   0.,   1.,   0.,   0.,   0.]),
 array([673.,   0.,   0.,   1.,   0.,   0.,   0.]),
 array([675.,   0.,   0.,   1.,   0.,   0.,   0.]),
 array([684.,   0.,   0.,   1.,   0.,   0.,   0.]),
 array([690.,   0.,   0.,   1.,   0.,   0.,   0.]),
 array([694.,   0.,   0.,   1.,   0.,   0.,   0.]),
 array([696.,   0.,   0.,   1.,   0.,   0.,   0.]),
 array([700.,   0.,   0.,   1.,   0.,   0.,   0.]),
 array([703.,   0.,   0.,   1.,   0.,   0.,   0.]),
 array([704.,   0.,   0.,   1.,   0.,   0.,   0.]),
 array([712.,   0.,   0.,   1.,   0.,   0.,   0.]),
 array([717.,   0.,   0.,   1.,   0.,   0.,   0.]),
 array([720.,   0.,   0.,   1.,   0.,   0.,   0.]),
 array([721.,   0.,   0.,   1.,   0.,   0.,   0.]),
 array([730.,   0.,   0.,   1.,   0.,   0.,   0.]),
 array([735.,   0.,   0.,   1.,   0.,   0.,   0.]),
 array([737.,   0.,   0.,   1.,   0.,   0.,   0.]),
 array([745.,   0.,   0.,   1.,   0.,   0.,   0.]),
 array([768.,   0.,   0.,   1.,   0.,   0.,   0.]),
 array([770.

In [None]:
# 使用txt保存numpy,实际上以csv保存
# np.savetxt('../Data/processed_label.csv',new,delimiter=',')

In [16]:
# header=None tells read_csv the file has no header row, so the first line is
# read as data and the columns get integer names (0..6, as the output shows).
label= pd.read_csv("Data/processed_label.csv",header=None)
label

Unnamed: 0,0,1,2,3,4,5,6
0,670.0,0.0,0.0,1.0,0.0,0.0,0.0
1,673.0,0.0,0.0,1.0,0.0,0.0,0.0
2,675.0,0.0,0.0,1.0,0.0,0.0,0.0
3,684.0,0.0,0.0,1.0,0.0,0.0,0.0
4,690.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...
1402,2979.0,0.0,0.0,1.0,0.0,0.0,0.0
1403,2980.0,0.0,0.0,1.0,0.0,0.0,0.0
1404,2981.0,0.0,0.0,1.0,0.0,0.0,0.0
1405,2982.0,0.0,0.0,1.0,0.0,0.0,0.0


### 扩充标签至整个样本集

In [17]:
# Collect the sample indices in range(2983) that have no row in the label
# table; the next cell creates an all-zero label row for each of them.
# The labelled indices are put in a set once so each membership test is O(1)
# instead of rescanning the whole label column for every candidate index.
labelled_indices = set(label.values[:, 0])
counts = [i for i in range(2983) if i not in labelled_indices]

In [18]:
# Build all zero-label rows as a single DataFrame (index followed by six zero
# flags) rather than concatenating one tiny frame per index; this also keeps
# the cell working when `counts` is empty, where pd.concat([]) would raise.
label0 = pd.DataFrame([[i, 0, 0, 0, 0, 0, 0] for i in counts])
# Merge with the real labels, order by sample index (column 0) and renumber.
label = pd.concat([label0, label])
label = label.sort_values([0]).reset_index(drop=True)
# Show only the six root-cause flag columns.
label.iloc[:, 1:]

Unnamed: 0,1,2,3,4,5,6
0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...
2979,0.0,0.0,1.0,0.0,0.0,0.0
2980,0.0,0.0,1.0,0.0,0.0,0.0
2981,0.0,0.0,1.0,0.0,0.0,0.0
2982,0.0,0.0,1.0,0.0,0.0,0.0


### 提取样本序列中的代表

In [20]:
# Column positions already known to contain no missing values (valid data).
# They form three contiguous runs: 0-6, 11-43 and 60-75.
selected_features = np.r_[0:7, 11:44, 60:76]

# Directory that holds the per-sample training CSV files.
PATH = os.path.join('Data', 'train')

In [21]:
# Build the feature row for the first sample file (0.csv).
sample = pd.read_csv(PATH +'/'+ '0.csv')
# The selected positions are shifted by one before indexing
# (presumably to skip a leading non-feature column — TODO confirm).
newdata = sample.iloc[:,selected_features+1]
# Row 1 of describe() is the "mean" row (see the output below).
sample0 = newdata.dropna().describe().iloc[[1],:]# Initialise sample0 with the per-column mean over rows without missing values

In [22]:
# Display the single mean row computed for sample 0.
sample0.head()

Unnamed: 0,feature0,feature1,feature2,feature3_1,feature3_2,feature3_3,feature3_4,feature11,feature12,feature13,...,feature61_5,feature61_6,feature61_7,feature69_0,feature69_1,feature69_2,feature69_3,feature69_4,feature69_5,feature69_6
mean,1092.5208,24.73,3.9524,20.88,0.0,0.0,1368.44,0.0152,5.2284,340252.32,...,1.9804,6.4868,4.0176,3.1656,8.148,4.704,4.5932,6.6012,1.7108,6.7084


---
**需要一定的运行时间**

In [None]:
# # 对于每个时序样本都提炼出均值代表
# # 因此，上面的初始化是为了从append开始
# for i in range(2983):
#     data = pd.read_csv(PATH +'/'+ str(i+1)+'.csv')
#     newdata = data.iloc[:,selected_features+1]
#     sample0 = sample0.append(newdata.dropna().describe().iloc[[1],:],ignore_index=True)

In [None]:
# sample0

![image.png](attachment:image.png)

In [None]:
# # 最后按列合并
# aiops_data = pd.concat([sample0,label.iloc[:,1:]],axis=1)
# # 存储
# FIN_PATH = os.path.join('Data','aiops_data.csv')
# aiops_data.to_csv(FIN_PATH,index=False)

In [23]:
# Load the merged feature/label table from Data/aiops_data.csv and display it.
FIN_PATH = os.path.join('Data','aiops_data.csv')
pd.read_csv(FIN_PATH)

Unnamed: 0,feature0,feature1,feature2,feature3_1,feature3_2,feature3_3,feature3_4,feature11,feature12,feature13,...,feature69_3,feature69_4,feature69_5,feature69_6,1,2,3,4,5,6
0,1092.520800,24.730000,3.952400,20.880000,0.000000,0.000000,1368.440000,0.015200,5.228400,340252.320000,...,4.593200,6.601200,1.710800,6.708400,0.0,0.0,0.0,0.0,0.0,0.0
1,684.389444,18.527222,3.968333,12.333333,0.000000,0.000000,1290.888889,0.005000,8.860000,325444.444444,...,2.163333,0.475556,-0.339444,3.132778,0.0,0.0,0.0,0.0,0.0,0.0
2,369.106667,12.888889,3.447778,73.666667,0.000000,479.333333,794.222222,0.146667,11.042222,331398.333333,...,-4.681111,-8.103333,-4.478889,-6.862222,0.0,0.0,0.0,0.0,0.0,0.0
3,261.204286,15.137143,2.047143,32.500000,1184.928571,93.642857,0.000000,0.907143,11.715000,320818.142857,...,-7.741429,-8.435000,-4.255000,-12.586429,0.0,0.0,0.0,0.0,0.0,0.0
4,603.202500,25.222500,2.010833,32.083333,1310.083333,45.750000,0.000000,0.012500,5.756667,339515.250000,...,12.569167,9.187500,13.438333,3.495833,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2979,263.856667,13.133333,2.343333,37.666667,1253.333333,0.000000,294.333333,0.073333,12.533333,391001.333333,...,-12.976667,-14.166667,-12.793333,-15.250000,0.0,0.0,1.0,0.0,0.0,0.0
2980,284.380000,15.090000,2.930000,53.000000,0.000000,1514.000000,0.000000,0.000000,6.730000,377350.000000,...,-18.560000,5.250000,3.000000,-8.380000,0.0,0.0,1.0,0.0,0.0,0.0
2981,215.077500,14.237500,2.970000,24.000000,0.000000,1573.000000,0.000000,0.000000,8.700000,338963.750000,...,-2.532500,-11.002500,-11.780000,-13.767500,0.0,0.0,1.0,0.0,0.0,0.0
2982,247.040000,13.620000,1.980000,33.000000,1568.000000,0.000000,0.000000,0.000000,9.430000,334564.000000,...,-5.130000,-1.310000,3.810000,0.630000,0.0,0.0,1.0,0.0,0.0,0.0


### 为机器学习算法准备数据

在数据处理方面，baseline直接将上一阶段处理好的数据`aiops_data`准备给算法。所以我们先来简单看一下baseline的表现以及原理

In [3]:
# Load the prepared dataset and report its (rows, columns) shape.
data = pd.read_csv("Data/aiops_data.csv")
data.shape

(2984, 62)

In [4]:
# Work on a copy of the raw table: drop every row that contains a missing
# value, then cast all columns to float.
_data = data.copy().dropna().astype('float')
_data.shape

(2450, 62)

In [5]:
# Hand-rolled split: draw 30 rows for each of root causes 1-3 as the test set
# (90 rows with this seed); everything else stays in the training set.
import random

sample_num = 30
random.seed(10)  # fixed seed so the split is reproducible

test_index = []
for cause in ('1', '2', '3'):
    # Index labels of all rows where this root cause is flagged.
    positives = np.array(_data[_data[cause] == 1].index.tolist())
    # Sample positions (not labels); random.sample raises ValueError when
    # fewer than sample_num rows are available.
    positions = random.sample(list(range(len(positives))), sample_num)
    test_index.append(positives[positions])

# NOTE(review): a row flagged for two root causes could be drawn twice, which
# would duplicate it in `test`; the shapes printed below show that does not
# happen with this seed.
c = np.concatenate(test_index)
test = _data.loc[c, :]
train = _data.drop(index=c)
train.shape, test.shape

((2360, 62), (90, 62))

### 从样本Data中计算似然概率：所有label(Ri=0/1)的可能性(3*2=6种)

首先讨论根因一

In [6]:
# Features used to model root cause 1.
r1_columns = ['feature15', 'feature13']
# Training rows with root cause 1 present (==1) and absent (==0), restricted
# to those features.
R1_1 = train.loc[train['1'] == 1, r1_columns]
R1_0 = train.loc[train['1'] == 0, r1_columns]
# Column-wise mean and the covariance between the features for the R1=1 rows.
np.mean(R1_1.values, axis=0), np.cov(R1_1.values.T)

(array([   610.2594892 , 115589.43657458]),
 array([[8.88444079e+04, 1.54100590e+07],
        [1.54100590e+07, 3.80097940e+09]]))

In [7]:
def NormalPara(X):
    """Estimate Gaussian parameters from a sample array.

    X is an array with one observation per row and one feature per column.
    Returns a tuple ``(mean, covariance)``: the column-wise mean and the
    covariance computed by ``np.cov`` on the transposed array.
    """
    column_means = np.mean(X, axis=0)
    # np.cov expects variables in rows, hence the transpose.
    covariance = np.cov(X.T)
    return column_means, covariance

In [8]:
# Mean vector and covariance matrix of the root-cause-1 features, estimated
# separately for the R1=0 and R1=1 training rows.
mu_R1_0,cov_R1_0 = NormalPara(R1_0.values)
mu_R1_1,cov_R1_1 = NormalPara(R1_1.values)
# Display the R1=1 parameters.
mu_R1_1,cov_R1_1

(array([   610.2594892 , 115589.43657458]),
 array([[8.88444079e+04, 1.54100590e+07],
        [1.54100590e+07, 3.80097940e+09]]))

In [10]:
# Multivariate Gaussian fitted from a mean vector and a covariance matrix.
from scipy.stats import multivariate_normal

# One frozen density per value of root cause 1: index 0 -> R1=0, index 1 -> R1=1.
R1_pdf = [
    multivariate_normal(mu_R1_0, cov_R1_0),
    multivariate_normal(mu_R1_1, cov_R1_1),
]
# Display the list of frozen density objects.
R1_pdf

[<scipy.stats._multivariate.multivariate_normal_frozen at 0x17476763550>,
 <scipy.stats._multivariate.multivariate_normal_frozen at 0x174767639a0>]

接下来同样计算`R2=1`和`R2=0`以及`R3`的参数

In [9]:
# Feature columns used to model root cause 2 (32 columns). Each list is
# written once and reused for both the =1 and =0 subsets so the two can never
# drift apart.
r2_columns = [
    "feature19",
    "feature61_0", "feature61_1", "feature61_2", "feature61_3",
    "feature61_4", "feature61_5", "feature61_6", "feature61_7",
    "feature69_0", "feature69_1", "feature69_2", "feature69_3",
    "feature69_4", "feature69_5", "feature69_6",
    "feature28_0", "feature28_1", "feature28_2", "feature28_3",
    "feature28_4", "feature28_5", "feature28_6", "feature28_7",
    "feature36_0", "feature36_1", "feature36_2", "feature36_3",
    "feature36_4", "feature36_5", "feature36_6", "feature36_7",
]
# Feature columns used to model root cause 3 (40 columns).
r3_columns = [
    "feature60",
    "feature61_0", "feature61_1", "feature61_2", "feature61_3",
    "feature61_4", "feature61_5", "feature61_6", "feature61_7",
    "feature69_0", "feature69_1", "feature69_2", "feature69_3",
    "feature69_4", "feature69_5", "feature69_6",
    "feature20_0", "feature20_1", "feature20_2", "feature20_3",
    "feature20_4", "feature20_5", "feature20_6", "feature20_7",
    "feature28_0", "feature28_1", "feature28_2", "feature28_3",
    "feature28_4", "feature28_5", "feature28_6", "feature28_7",
    "feature36_0", "feature36_1", "feature36_2", "feature36_3",
    "feature36_4", "feature36_5", "feature36_6", "feature36_7",
]

# Training rows with root cause 2 / root cause 3 present (==1) and absent
# (==0), restricted to the matching feature columns.
R2_1 = train.loc[train['2'] == 1, r2_columns]
R2_0 = train.loc[train['2'] == 0, r2_columns]
R3_1 = train.loc[train['3'] == 1, r3_columns]
R3_0 = train.loc[train['3'] == 0, r3_columns]

# Column-wise mean and feature covariance for each of the four subsets.
mu_R2_1,cov_R2_1 = NormalPara(R2_1.values)
mu_R2_0,cov_R2_0 = NormalPara(R2_0.values)
mu_R3_1,cov_R3_1 = NormalPara(R3_1.values)
mu_R3_0,cov_R3_0 = NormalPara(R3_0.values)

接下来同样计算R2和R3的密度函数向量

In [11]:
# Frozen densities for root causes 2 and 3: index 0 -> absent, index 1 -> present.
R2_pdf = [
    multivariate_normal(mu_R2_0, cov_R2_0),
    # The R2=1 covariance is singular, so a tiny multiple of the identity is
    # added to the diagonal. The identity size is taken from the covariance
    # itself instead of a hard-coded 32, so it follows the feature list.
    multivariate_normal(mu_R2_1, cov_R2_1 + np.eye(cov_R2_1.shape[0]) * 0.0000001),
]
R3_pdf = [
    multivariate_normal(mu_R3_0, cov_R3_0),
    multivariate_normal(mu_R3_1, cov_R3_1),
]

In [12]:
# Prior probability of each root cause: index 0 -> absent, index 1 -> present.
pri_R1, pri_R2, pri_R3 = [0.5,0.5],[0.5,0.5],[0.5,0.5]

# Feature columns fed to each per-root-cause density. The order must match
# the columns the corresponding densities were fitted on.
_R1_FEATURES = ['feature15', 'feature13']
_R2_FEATURES = [
    "feature19",
    "feature61_0", "feature61_1", "feature61_2", "feature61_3",
    "feature61_4", "feature61_5", "feature61_6", "feature61_7",
    "feature69_0", "feature69_1", "feature69_2", "feature69_3",
    "feature69_4", "feature69_5", "feature69_6",
    "feature28_0", "feature28_1", "feature28_2", "feature28_3",
    "feature28_4", "feature28_5", "feature28_6", "feature28_7",
    "feature36_0", "feature36_1", "feature36_2", "feature36_3",
    "feature36_4", "feature36_5", "feature36_6", "feature36_7",
]
_R3_FEATURES = [
    "feature60",
    "feature61_0", "feature61_1", "feature61_2", "feature61_3",
    "feature61_4", "feature61_5", "feature61_6", "feature61_7",
    "feature69_0", "feature69_1", "feature69_2", "feature69_3",
    "feature69_4", "feature69_5", "feature69_6",
    "feature20_0", "feature20_1", "feature20_2", "feature20_3",
    "feature20_4", "feature20_5", "feature20_6", "feature20_7",
    "feature28_0", "feature28_1", "feature28_2", "feature28_3",
    "feature28_4", "feature28_5", "feature28_6", "feature28_7",
    "feature36_0", "feature36_1", "feature36_2", "feature36_3",
    "feature36_4", "feature36_5", "feature36_6", "feature36_7",
]


def pro_product(sample, r1, r2, r3):
    """Return the unnormalised joint probability of one root-cause combination.

    Parameters:
        sample: a one-row DataFrame containing the feature columns.
        r1, r2, r3: 0 or 1, the assumed value of root cause 1, 2 and 3; each
            selects the matching entry of R1_pdf / R2_pdf / R3_pdf and of the
            corresponding prior list.

    Returns:
        The product of the three likelihoods and the three priors.
    """
    likelihood1 = R1_pdf[r1].pdf(sample[_R1_FEATURES].values)
    likelihood2 = R2_pdf[r2].pdf(sample[_R2_FEATURES].values)
    likelihood3 = R3_pdf[r3].pdf(sample[_R3_FEATURES].values)
    # Each root cause is weighted by its own prior. Reusing pri_R1 for all
    # three only gives the right answer while every prior list is identical.
    return likelihood1*likelihood2*likelihood3*pri_R1[r1]*pri_R2[r2]*pri_R3[r3]

### 测试：记录并比较生成概率之间的大小

In [13]:
# predict takes a one-row DataFrame sample.
def predict(sample):
    """Return the marginal probabilities (P(R1=1), P(R2=1), P(R3=1)) for a sample.

    Every one of the 2*2*2 root-cause combinations is scored with
    pro_product, the scores are normalised to sum to one, and for each root
    cause the normalised scores of the combinations where it equals 1 are
    added up.
    """
    # Rows are [r1, r2, r3, joint score], enumerated with r1 as the slowest
    # and r3 as the fastest varying index.
    combos = [
        [r1, r2, r3, pro_product(sample, r1, r2, r3)]
        for r1 in range(2)
        for r2 in range(2)
        for r3 in range(2)
    ]
    record = np.array(combos)

    # Normalise the score column (index 3). If every score is 0 this divides
    # by zero and the results are NaN.
    scores = record[:, 3]
    record[:, 3] = scores / sum(scores)

    # Marginalise: add the normalised scores of the rows where each cause is 1.
    posterior = record[:, 3]
    return tuple(sum(posterior[record[:, k] == 1]) for k in range(3))

In [14]:
# Six prediction columns to line up with the six label columns; only the
# first three are ever filled (root causes 4-6 are not modelled).
n_test = test.shape[0]
pred = np.zeros((n_test, 6))
for i in range(n_test):
    # test[i:i+1] keeps the row as a one-row DataFrame, as predict expects.
    pred[i, :3] = predict(test[i:i+1])

# Threshold the probabilities: a root cause is predicted when p > 0.8.
temp = pred > 0.8

In [16]:
# Ground-truth matrix for the test rows: one column per root-cause flag '1'..'6'.
label = test[['1','2','3','4','5','6']].values
label

array([[1., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0.],
       [0.

In [17]:
# temp is boolean, so the element-wise product acts as a logical AND
# (intersection of predicted and true root causes).
np.sum(temp*label,axis=1)# per-sample count of correctly predicted root causes

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       0., 0., 1., 1., 1., 1., 1., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 0., 1., 1., 0., 1., 1., 1., 0., 0., 1., 1., 1.,
       1., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0.,
       1., 1., 1., 1., 1.])

In [18]:
np.sum(temp*(1-label),axis=1)# per-sample count of predicted root causes that are not in the label

array([1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 1., 0.,
       0., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 1., 0.,
       0., 1., 0., 0., 1., 0., 0., 0., 1., 0., 1., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0.])

In [19]:
np.sum(label,axis=1)# number of true root causes per sample (one point each)

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 2., 2., 2.,
       2., 1., 2., 2., 2., 2., 2., 2., 1., 2., 1., 2., 2., 2., 2., 2., 2.,
       2., 2., 2., 2., 2., 2., 2., 2., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1.])

In [21]:
# Final score: per sample, (correct predictions - wrong predictions) divided
# by the number of true root causes, averaged over the test set.
plus = (temp * label).sum(axis=1)
minus = (temp * (1 - label)).sum(axis=1)
true_counts = label.sum(axis=1)
np.mean((plus - minus) / true_counts)

0.4222222222222222