# Setup


In [1]:
import pandas as pd
import numpy as np
import re
import os

# Prepare

In [2]:
df = pd.read_csv("Data/train_label.csv")
PATH = os.path.join('Data','train')
sample = pd.read_csv(PATH +'/'+ '0.csv')
label= pd.read_csv("Data/processed_label.csv",header=None)
data = pd.read_csv("Data/aiops_data.csv")
data = data.dropna()
_data = data.copy()
_data = _data.astype('float')
# 以下代码手动划分出90个样本作为测试集
sample_num = 30
test_index = []

import random
random.seed( 10 )

for a in range(3): 
    b = np.array( _data[_data[str(a+1)]==1].index.tolist() )
    sample_list = [i for i in range(len(b))]
    sample_list = random.sample(sample_list, sample_num)
    test_index.append(b[sample_list])
c = np.concatenate(test_index)
test = _data.loc[c,:]
train = _data.drop(index=c)

# Code

### 操纵DF中的值&提取值中的数字成分

In [6]:
# 熟悉df的格式提取
df.iloc[:1]

Unnamed: 0,sample_index,root-cause(s)
0,670,rootcause3


In [11]:
df.iloc[:1].values[0,1] # 注意取完values以后是一个二维矩阵 

'rootcause3'

In [15]:
string = df.iloc[:1].values[0,1]
re.findall("\d", string) # 要注意的是re提取的结果一般都会以列表返回

['3']

In [9]:
# 直接变换成矩阵提取
df.values

array([[670, 'rootcause3'],
       [673, 'rootcause3'],
       [675, 'rootcause3'],
       ...,
       [2981, 'rootcause3'],
       [2982, 'rootcause3'],
       [2983, 'rootcause3']], dtype=object)

In [12]:
df.values[1,1]

'rootcause3'

### 将数字编码成指定位数的比特码
### 对矩阵切片进行部分赋值

In [17]:
# 假设(6位 + 序号位 = 7位)的向量
np.zeros([1,7])

array([[0., 0., 0., 0., 0., 0., 0.]])

In [32]:
# 先为序列号赋值
new = np.zeros([1,7])
new[0,0] = df.values[0,0]
new

array([[670.,   0.,   0.,   0.,   0.,   0.,   0.]])

In [34]:
# 将1,3两位置为1
new[0,1:] = sum(map(lambda x:np.eye(6)[x-1],[1,3]))

In [35]:
new

array([[670.,   1.,   0.,   1.,   0.,   0.,   0.]])

### DF选择特定的列(特定的特征列)

In [39]:
sample.head()

Unnamed: 0,Date & Time,feature0,feature1,feature2,feature3_1,feature3_2,feature3_3,feature3_4,feature3_5,feature3_6,...,feature77_6,feature77_7,feature85_0,feature85_1,feature85_2,feature85_3,feature85_4,feature85_5,feature85_6,feature85_7
0,\t2020-09-23 23:55:55,,,,,,,,,,...,,,,,,,,,,
1,\t2020-09-23 23:55:56,,,,,,,,,,...,,,,,,,,,,
2,\t2020-09-23 23:55:57,,5.83,1.0,6.0,0.0,0.0,0.0,,,...,,,,,,,,,,
3,\t2020-09-23 23:55:58,371.29,18.74,3.9,23.0,0.0,0.0,666.0,,,...,,,,,,,,,,
4,\t2020-09-23 23:55:59,345.73,23.8,3.94,13.0,0.0,0.0,684.0,,,...,,,,,,,,,,


In [38]:
a = np.array([0,1])# 想选择0和1号特征
# 取特定的特征列不会出错，但是取行会出错:iloc[:[a,b]]
sample.iloc[:,a+1].head()# 需要加1，因为第一列是时间序列

Unnamed: 0,feature0,feature1
0,,
1,,
2,,5.83
3,371.29,18.74
4,345.73,23.8


### 自生数行数DF&按行合并DF

In [66]:
# 想要自生成特定行数的DF，内容全部保持一致
rows = [1,2,3] # 生成3行  
pd.concat([pd.DataFrame([[i,0,0,0,0,0,0]]) for i in rows])

Unnamed: 0,0,1,2,3,4,5,6
0,1,0,0,0,0,0,0
0,2,0,0,0,0,0,0
0,3,0,0,0,0,0,0


In [68]:
# 可以改变列的名称
pd.concat([pd.DataFrame([[i,0,0,0,0,0,0]],columns=range(2,9)) for i in rows])

Unnamed: 0,2,3,4,5,6,7,8
0,1,0,0,0,0,0,0
0,2,0,0,0,0,0,0
0,3,0,0,0,0,0,0


In [50]:
label

Unnamed: 0,0,1,2,3,4,5,6
0,670.0,0.0,0.0,1.0,0.0,0.0,0.0
1,673.0,0.0,0.0,1.0,0.0,0.0,0.0
2,675.0,0.0,0.0,1.0,0.0,0.0,0.0
3,684.0,0.0,0.0,1.0,0.0,0.0,0.0
4,690.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...
1402,2979.0,0.0,0.0,1.0,0.0,0.0,0.0
1403,2980.0,0.0,0.0,1.0,0.0,0.0,0.0
1404,2981.0,0.0,0.0,1.0,0.0,0.0,0.0
1405,2982.0,0.0,0.0,1.0,0.0,0.0,0.0


In [69]:
# 和label进行组合
label0 = pd.concat([pd.DataFrame([[i,0,0,0,0,0,0]],columns=range(2,9)) for i in rows])
label = pd.concat([label0,label]) # 按列合并只是需要添加axis=1

In [70]:
label

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,,,1.0,0.0,0.0,0.0,0.0,0.0,0.0
0,,,2.0,0.0,0.0,0.0,0.0,0.0,0.0
0,,,3.0,0.0,0.0,0.0,0.0,0.0,0.0
0,670.0,0.0,0.0,1.0,0.0,0.0,0.0,,
1,673.0,0.0,0.0,1.0,0.0,0.0,0.0,,
...,...,...,...,...,...,...,...,...,...
1402,2979.0,0.0,0.0,1.0,0.0,0.0,0.0,,
1403,2980.0,0.0,0.0,1.0,0.0,0.0,0.0,,
1404,2981.0,0.0,0.0,1.0,0.0,0.0,0.0,,
1405,2982.0,0.0,0.0,1.0,0.0,0.0,0.0,,


In [72]:
# 对序号排序
label.sort_values([0]).reset_index(drop=True)

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,670.0,0.0,0.0,1.0,0.0,0.0,0.0,,
1,673.0,0.0,0.0,1.0,0.0,0.0,0.0,,
2,675.0,0.0,0.0,1.0,0.0,0.0,0.0,,
3,684.0,0.0,0.0,1.0,0.0,0.0,0.0,,
4,690.0,0.0,0.0,1.0,0.0,0.0,0.0,,
...,...,...,...,...,...,...,...,...,...
1405,2982.0,0.0,0.0,1.0,0.0,0.0,0.0,,
1406,2983.0,0.0,0.0,1.0,0.0,0.0,0.0,,
1407,,,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1408,,,2.0,0.0,0.0,0.0,0.0,0.0,0.0


--- 
### 计算DF的均值和协方差

In [9]:
R1_1 = data[data['1']==1]
R1_1 = R1_1[['feature13','feature15']]

In [10]:
# 样式如下
R1_1

Unnamed: 0,feature13,feature15
915,168339.090909,839.022727
916,61464.500000,326.750000
917,138147.615385,683.410256
918,42397.454545,286.363636
919,114935.129032,529.354839
...,...,...
2620,126930.500000,794.000000
2662,147249.000000,735.000000
2725,60999.666667,431.000000
2917,202953.000000,991.000000


In [11]:
np.mean(R1_1.values,axis=0)# 传递的是values，如果不加参数，计算的就是全部了

array([114413.40244318,    597.07062204])

In [12]:
R1_1.values

array([[1.68339091e+05, 8.39022727e+02],
       [6.14645000e+04, 3.26750000e+02],
       [1.38147615e+05, 6.83410256e+02],
       [4.23974545e+04, 2.86363636e+02],
       [1.14935129e+05, 5.29354839e+02],
       [4.40628889e+04, 2.33844444e+02],
       [7.63240000e+04, 4.12000000e+02],
       [1.09013579e+05, 5.26578947e+02],
       [8.08096000e+04, 4.73800000e+02],
       [7.51096000e+04, 4.46320000e+02],
       [7.51096000e+04, 4.46320000e+02],
       [9.14098000e+04, 4.26000000e+02],
       [9.68389268e+04, 4.37780488e+02],
       [8.71610000e+04, 5.26636364e+02],
       [8.97500000e+04, 3.99000000e+02],
       [1.06110933e+05, 5.79866667e+02],
       [6.64111500e+04, 4.18475000e+02],
       [1.31774000e+05, 6.07000000e+02],
       [4.78969474e+04, 4.11052632e+02],
       [1.83863629e+04, 3.50774194e+02],
       [6.15806250e+04, 4.38187500e+02],
       [3.37114000e+05, 1.35000000e+03],
       [5.12259412e+04, 4.68941176e+02],
       [1.36009000e+05, 6.24000000e+02],
       [2.136950

In [13]:
# 或者使用df来计算再转换为values
np.mean(R1_1)

feature13    114413.402443
feature15       597.070622
dtype: float64

In [14]:
np.mean(R1_1).values

array([114413.40244318,    597.07062204])

In [15]:
# 注意协方差矩阵的定义
# 机器学习中讨论的是特征和特征之间的协方差，需要写成行的形式
R1_1.T

Unnamed: 0,915,916,917,918,919,920,921,922,924,925,...,2603,2605,2606,2613,2614,2620,2662,2725,2917,2946
feature13,168339.090909,61464.5,138147.615385,42397.454545,114935.129032,44062.888889,76324.0,109013.578947,80809.6,75109.6,...,128001.333333,212295.0,132673.0,75764.0,75285.0,126930.5,147249.0,60999.666667,202953.0,65163.5
feature15,839.022727,326.75,683.410256,286.363636,529.354839,233.844444,412.0,526.578947,473.8,446.32,...,696.333333,872.0,707.0,429.0,561.0,794.0,735.0,431.0,991.0,382.0


In [16]:
np.cov(R1_1.T)

array([[3.53637125e+09, 1.44439670e+07],
       [1.44439670e+07, 8.33537192e+04]])

In [17]:
# 或者使用array
np.cov(R1_1.values.T)

array([[3.53637125e+09, 1.44439670e+07],
       [1.44439670e+07, 8.33537192e+04]])

---
### 设置遍历DF的每一行

In [3]:
test

Unnamed: 0,feature0,feature1,feature2,feature3_1,feature3_2,feature3_3,feature3_4,feature11,feature12,feature13,...,feature69_3,feature69_4,feature69_5,feature69_6,1,2,3,4,5,6
2276,106.140000,13.750000,2.00,4.0,893.0,0.0,0.000000,0.000,5.770000,148456.000000,...,-17.690000,-0.190000,-1.810000,-9.130000,1.0,0.0,0.0,0.0,0.0,0.0
919,290.150323,20.798710,4.00,0.0,0.0,0.0,529.419355,0.000,8.985806,114935.129032,...,15.508387,13.587742,13.897097,12.763548,1.0,0.0,0.0,0.0,0.0,0.0
2006,251.220000,17.960000,3.00,0.0,0.0,695.0,0.000000,0.000,20.610000,172918.000000,...,9.000000,7.500000,18.190000,4.500000,1.0,0.0,0.0,0.0,0.0,0.0
2089,229.900000,21.710000,3.98,4.0,0.0,0.0,573.000000,0.000,9.880000,140405.000000,...,12.310000,12.000000,19.880000,23.440000,1.0,0.0,0.0,0.0,0.0,0.0
2946,117.655000,14.515000,3.69,35.5,0.0,0.0,346.500000,1.395,8.505000,65163.500000,...,9.625000,11.565000,6.970000,13.565000,1.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2199,106.900000,14.610000,1.98,24.0,1333.0,0.0,0.000000,0.370,19.480000,328802.000000,...,-14.000000,4.060000,-2.380000,-7.690000,0.0,0.0,1.0,0.0,0.0,0.0
2266,283.750000,14.193333,2.00,0.0,1470.0,0.0,0.000000,0.000,9.286667,369158.333333,...,0.400000,2.856667,11.106667,3.273333,0.0,0.0,1.0,0.0,0.0,0.0
1961,262.180000,13.070000,2.00,0.0,1508.0,0.0,0.000000,0.000,9.680000,379889.000000,...,-3.810000,-2.750000,-7.190000,-3.750000,0.0,0.0,1.0,0.0,0.0,0.0
2804,298.490000,16.830000,1.98,23.0,1350.0,0.0,0.000000,0.000,13.720000,340086.000000,...,-12.500000,-14.750000,-6.810000,1.880000,0.0,0.0,1.0,0.0,0.0,0.0


In [4]:
# 切分测试样本单位
test[3:4] # 取第四行

Unnamed: 0,feature0,feature1,feature2,feature3_1,feature3_2,feature3_3,feature3_4,feature11,feature12,feature13,...,feature69_3,feature69_4,feature69_5,feature69_6,1,2,3,4,5,6
2089,229.9,21.71,3.98,4.0,0.0,0.0,573.0,0.0,9.88,140405.0,...,12.31,12.0,19.88,23.44,1.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# # 遍历+抽取每一行的模板
# for i in range(test_data.shape[0]):
#     print(test_data[i:i+1])

---
### 操作矩阵（取列/求和）

In [5]:
A = np.array([[0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
        7.29413619e-176],
       [0.00000000e+000, 0.00000000e+000, 1.00000000e+000,
        3.44257411e-120],
       [0.00000000e+000, 1.00000000e+000, 0.00000000e+000,
        0.00000000e+000],
       [0.00000000e+000, 1.00000000e+000, 1.00000000e+000,
        0.00000000e+000],
       [1.00000000e+000, 0.00000000e+000, 0.00000000e+000,
        7.29413619e-176],
       [1.00000000e+000, 0.00000000e+000, 1.00000000e+000,
        3.44257411e-120],
       [1.00000000e+000, 1.00000000e+000, 0.00000000e+000,
        0.00000000e+000],
       [1.00000000e+000, 1.00000000e+000, 1.00000000e+000,
        0.00000000e+000]])
# 将第三列的所有生成概率进行归一化
A[:,3] = A[:,3]/sum(A[:,3])
A[:,3]

array([1.05940148e-56, 5.00000000e-01, 0.00000000e+00, 0.00000000e+00,
       1.05940148e-56, 5.00000000e-01, 0.00000000e+00, 0.00000000e+00])

In [6]:
# 分别取出R1，R2和R3的生成概率，求和后就是单个的概率
# 以R1为例
# p_R1 = A[]
sum(A[A[:,0]==1][:,3])

0.5