In [178]:
import json
import tqdm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [179]:
keyTestTPath='data/key_test_t.csv'
# keyTestTPath='backup/forTest/keyTestT.csv'

In [180]:
keyTestT = pd.read_csv(keyTestTPath, usecols=['dimension', 'results'])

In [184]:
# 取keyTestT的前1000条数据
keyTestT = keyTestT[:1000]

In [185]:
keyTestT.head()

Unnamed: 0,results,dimension
0,"{""#wrk_qps_avg"": 1547599.39, ""wrk_latency_avg""...","{""cvm_cpu"": ""62"", ""platform"": ""qcloud"", ""test_..."
1,"{""openssl_des_size_16"": 27246690.0, ""openssl_d...","{""cvm_cpu"": ""32"", ""platform"": ""qcloud"", ""test_..."
2,"{""#ls_time"": 18}","{""dd"": ""name_11"", ""cvm_cpu"": ""1"", ""platform"": ..."
3,"{""#ls_time"": 18}","{""dd"": ""name_11"", ""cvm_cpu"": ""1"", ""platform"": ..."
4,"{""#mlc_idle_latency"": 90.6}","{""cvm_cpu"": ""8"", ""platform"": ""default"", ""test_..."


In [186]:
def judgeJson(keyTest):
    # 判断dimension，results是否是合法的json，若不是则将这一行删除
    erroNum=0
    keyTest = keyTest.reset_index(drop=True)
    for i in tqdm.tqdm(range(len(keyTest))):
        try:
            json.loads(keyTest.loc[i, 'dimension'])
            json.loads(keyTest.loc[i, 'results'])
        except:
            keyTest.drop(i, inplace=True)
            erroNum+=1
    # json解析错误个数
    print('json解析错误个数：', erroNum)
    return keyTest

# 打印数组的前三个元素
def showVec(outputRes,num=3):
    for i in range(len(outputRes)):
        if i<num:
            print(outputRes[i])
        else:
            break

def showDict(dict,num=3):
    # 打印字典的前三个元素
    for i, (key, value) in enumerate(dict.items()):
        if i<num:
            print(f"{key}: {value}")
        else:
            break

# 获得所有的key
def getAllKeys(inputJson):
    for key in inputJson:
        if isinstance(inputJson[key], dict):
            getAllKeys(inputJson[key])  # 递归调用
        else:
            return key

In [187]:
keyTestT = judgeJson(keyTestT)

100%|██████████| 1000/1000 [00:00<00:00, 13698.73it/s]

json解析错误个数： 9





In [188]:
keyTestT.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 991 entries, 0 to 999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   results    991 non-null    object
 1   dimension  991 non-null    object
dtypes: object(2)
memory usage: 63.2+ KB


In [189]:
def getInputFeature(keyTestT):
    errornum = 0
    # 指定需要提取的字段
    res=[]
    keys_to_extract = ['cvm_cpu', 'cvm_memory', 'cvm_cpu_qos', 'cvm_os_type']
    for index, row in tqdm.tqdm(keyTestT.iterrows(), total=keyTestT.shape[0]):
        dimension = row['dimension']
        # templist=[]
        templist=['cvm_cpu', 'cvm_memory', 'cvm_cpu_qos', 'cvm_os_type']
        try:
            dimensionJson = json.loads(dimension)
            for key in dimensionJson: 
                if key in keys_to_extract:  # 判断该字段是否需要提取
                    if isinstance(dimensionJson[key], dict):
                        key_value = getAllKeys(dimensionJson[key])
                    else:
                        key_value = dimensionJson[key]
                    if key=='cvm_memory':
                        key_value=float(key_value.split(' ')[0])
                    elif key=='cvm_cpu':
                        key_value=float(key_value)
                    elif key=='cvm_cpu_qos':
                        key_value=1.0
                    templist[keys_to_extract.index(key)]=key_value
            res.append(templist.copy())
        except:
            errornum += 1
    print('json解析错误的数量：', errornum)
    res = np.array(res)
    return res

def getInput(inputDim):
    data = np.array(inputDim)
    encoders = []
    for i in range(data.shape[1]):
        encoder = LabelEncoder()
        encoder.fit(data[:, i])
        data[:, i] = encoder.transform(data[:, i])
        encoders.append(encoder)

    data = data.astype(float)
    return data, encoders

# 通过encoders，将data解码回去
def decodeInput(inputEncoder, encodedInput):
    decodedInput = np.empty_like(encodedInput).astype(str)
    for i in range(encodedInput.shape[1]):
        encoder = inputEncoder[i]
        decodedInput[:, i] = encoder.inverse_transform(encodedInput[:, i].astype(int))
        
    return decodedInput

In [190]:
inputDim = getInputFeature(keyTestT)

100%|██████████| 991/991 [00:00<00:00, 4657.02it/s]

json解析错误的数量： 0





In [191]:
print(inputDim.shape)
showVec(inputDim,5)

(991, 4)
['62.0' '214.0' '1.0' 'CentOS Linux release 8.2.2004 (Core)']
['32.0' '64.0' '1.0' 'CentOS Linux release 8.2.2004 (Core)']
['1.0' '2.0' 'cvm_cpu_qos' 'cvm_os_type']
['1.0' '2.0' 'cvm_cpu_qos' 'cvm_os_type']
['8.0' '16.0' 'cvm_cpu_qos' 'CentOS Linux release 7.2 (Final)']


In [192]:
input, inputEncoder=getInput(inputDim)

In [193]:
print(input.shape)
showVec(input,5)

(991, 4)
[21.  8.  0.  7.]
[16. 30.  0.  7.]
[ 0.  6.  1. 16.]
[ 0.  6.  1. 16.]
[26.  4.  1.  0.]


In [194]:
originInput = decodeInput(inputEncoder,input)

In [195]:
showVec(originInput,5)

['62.0' '214.0' '1.0' 'CentOS Linux release 8.2.2004 (C']
['32.0' '64.0' '1.0' 'CentOS Linux release 8.2.2004 (C']
['1.0' '2.0' 'cvm_cpu_qos' 'cvm_os_type']
['1.0' '2.0' 'cvm_cpu_qos' 'cvm_os_type']
['8.0' '16.0' 'cvm_cpu_qos' 'CentOS Linux release 7.2 (Final)']


In [196]:
def getResultArgList(resultArgList):
    for index, row in tqdm.tqdm(keyTestT.iterrows(), total=keyTestT.shape[0]):
        result = row['results']
        resultJson = json.loads(result)
        for key in resultJson:
            if key.find('#') != -1:
                resultArgList.append(key)
    # 去重
    resultArgList = list(set(resultArgList))
    print('指标个数：', len(resultArgList))
    return resultArgList

def getOutFeature(keyTestT, resultArgList):
    res=[]
    keys_to_extract = resultArgList
    for index, row in tqdm.tqdm(keyTestT.iterrows(), total=keyTestT.shape[0]):
        result = row['results']
        templist=keys_to_extract
        resultJson = json.loads(result)
        for key in resultJson:
            if key in keys_to_extract:
                templist[keys_to_extract.index(key)]=resultJson[key]
                    # templist[templist.index(key)]=resultJson[key]
        # res.append(templist) 很奇怪很奇怪很奇怪很奇怪很奇怪很奇怪很奇怪很奇怪很奇怪很奇怪很奇怪很奇怪很奇怪很奇怪很奇怪很奇怪很奇怪很奇怪
        res.append(templist.copy())

    res = np.array(res)
    return res

def getOutput(outputRes):
    data = np.array(outputRes)
    print(data.shape)
    encoders = []
    for i in range(data.shape[1]):
        encoder = LabelEncoder()
        encoder.fit(data[:, i])
        data[:, i] = encoder.transform(data[:, i])
        encoders.append(encoder)

    data = data.astype(float)
    return data, encoders

# 通过encoders，将data解码回去
def decodeOutput(outputEncoder, encodedOutput):
    decodedOutput = np.empty_like(encodedOutput).astype(str)
    for i in range(encodedOutput.shape[1]):
        encoder = outputEncoder[i]
        decodedOutput[:, i] = encoder.inverse_transform(encodedOutput[:, i].astype(int))
        
    return decodedOutput

In [197]:
resultArgList=[]
resultArgList = getResultArgList(resultArgList)

100%|██████████| 991/991 [00:00<00:00, 5445.07it/s]

指标个数： 240





In [198]:
ouputRes=getOutFeature(keyTestT, resultArgList)

100%|██████████| 991/991 [00:00<00:00, 2926.46it/s]


In [199]:
print(ouputRes.shape)
showVec(ouputRes,5)

(991, 240)
['#redis_client_sadd' '#mlc_2:1_read_write'
 '#iperf3_tcp_down_received_bandwidth_min' '#ls_time'
 '#openssl_sha256_size_256' '#unixbench_context'
 '#mlperf_inference_classification_and_detection_mean' '#fio_write_iops'
 '#mlc_samenuma_latency_min' '#netperf_TCP_RR_32_128'
 '#mlperf_inference_classification_and_detection_90.00 percentile latency'
 '#ipi_special_cpu_self_max_ipi_time' '#openssl_aes_256_size_16'
 '#openssl_md5_size_256' '#netperf_TCP_Throughput_1400' '#spp_lat_p99'
 '#contextswitch_default_ctx' '#redis_client_get'
 '#specjbb2015_metric_critical' '#hrtimer_avg'
 '#mlperf_inference_classification_and_detection_80.00 percentile latency'
 '#super_pi_sys_time' '#openssl_md5_size_16' '#stream_ht1_copy'
 '#mlc_samenuma_bandwidth_min' '#ipi_special_cpu_remote_total_time'
 '#cyclictest_lat_percentiles_95' '#mlc_3:1_read_write'
 '#lmbench_L2_latency' '#specjbb2015_sla_75000'
 '#netperf_UDP_RecvThroughput_1400' '#tcpping_latency_max'
 '#ping_first_packet' '#tensorflowben

In [200]:
output, outputEncoder=getOutput(ouputRes)

(991, 240)


In [204]:
print(output.shape)
showVec(output,5)

(991, 240)
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1.
 0. 0. 0. 0. 1. 0. 0. 1. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.

In [202]:
originOutput = decodeOutput(outputEncoder,output)

In [203]:
showVec(originOutput,5)

['#redis_client_sadd' '#mlc_2:1_read_write'
 '#iperf3_tcp_down_received_bandwi' '#ls_time' '#openssl_sha256_size_256'
 '#unixbench_context' '#mlperf_inference_classification' '#fio_write_iops'
 '#mlc_samenuma_latency_min' '#netperf_TCP_RR_32_128'
 '#mlperf_inference_classification' '#ipi_special_cpu_self_max_ipi_ti'
 '#openssl_aes_256_size_16' '#openssl_md5_size_256'
 '#netperf_TCP_Throughput_1400' '#spp_lat_p99'
 '#contextswitch_default_ctx' '#redis_client_get'
 '#specjbb2015_metric_critical' '#hrtimer_avg'
 '#mlperf_inference_classification' '#super_pi_sys_time'
 '#openssl_md5_size_16' '#stream_ht1_copy' '#mlc_samenuma_bandwidth_min'
 '#ipi_special_cpu_remote_total_ti' '#cyclictest_lat_percentiles_95'
 '#mlc_3:1_read_write' '#lmbench_L2_latency' '#specjbb2015_sla_75000'
 '#netperf_UDP_RecvThroughput_1400' '#tcpping_latency_max'
 '#ping_first_packet' '#tensorflowbenchmarks_TrainingSp'
 '#iperf3_tcp_up_received_bandwidt' '#contextswitch_same_node_ctx'
 '#openssl_aes_256_size_64' '#netp