In [1]:
import pandas as pd
from collections import Counter
import re
import matplotlib.pyplot as plt
import heapq
import json
import concurrent.futures
from tqdm import tqdm

In [2]:
def validate_json(json_str):
    try:
        json.loads(json_str)
        return True
    except:
        return False

def process_row(row):
    if validate_json(row.dimension) and validate_json(row.results):
        return row
    else:
        # 打印出错的行号
        print(f'error json: {row.Index}\n')
        return None

def judgeJson(keyTestT):
    # 使用并行处理加速处理过程
    with concurrent.futures.ThreadPoolExecutor() as executor:
        results = list(tqdm(executor.map(process_row, keyTestT.itertuples()), total=len(keyTestT)))

    # 过滤掉返回 None 的结果并重置索引
    keyTestT = pd.DataFrame([r for r in results if r is not None])
    keyTestT = keyTestT.reset_index(drop=True)
    return keyTestT

In [3]:
# 判断data/文件夹下是否有filterFile.csv文件
# 如果有则读取，如果没有则生成
try:
    df = pd.read_csv('data/filterFile.csv')
    print('filterFile.csv exists')
except FileNotFoundError:
    df = pd.read_csv('data/origin_files/key_test_t.csv')
    df = df[['results', 'dimension','results_key']]
    df = judgeJson(df)
    df.to_csv('data/filterFile.csv', index=False)
    print('filterFile.csv created')

filterFile.csv exists


In [4]:
# 针对results_key这一列，计数每一行的值出现的次数
ResultsKeyCount = df['results_key'].value_counts()
ResultsKeyCount = pd.DataFrame(ResultsKeyCount)
ResultsKeyCount.columns = ['number']

In [None]:
ResultsKeyCount.head(10)

In [None]:
# 判断key_test中的dimension列中是否同时存在tool_name和test_name字段，如果不存在则error+1
error = 0
for i in range(len(df)):
    if 'tool_name' not in df['dimension'][i] or 'test_name' not in df['dimension'][i]:
        error += 1
print(error)

In [30]:
def getToolName(df):
    tool_names = []
    test_name = []
    for dim in df['dimension']:
        if dim:
            dim_dict = json.loads(dim)
            if 'tool_name' in dim_dict:
                tool_names.append(dim_dict['tool_name'])
            if 'test_name' in dim_dict:
                test_name.append(dim_dict['test_name'])

    tool_name_counts = Counter(tool_names)
    test_name_counts = Counter(test_name)
    return tool_name_counts, test_name_counts


In [31]:
# 统计tool_name值出现的次数
toolNameCount, testNameCount = getToolName(df)
# 将Counter对象toolNameCount转换为DataFrame
toolNameCount = pd.DataFrame.from_dict(toolNameCount, orient='index')
toolNameCount.columns = ['number']
# 将Counter对象testNameCount转换为DataFrame
testNameCount = pd.DataFrame.from_dict(testNameCount, orient='index')
testNameCount.columns = ['number']
# 将toolNameCount和testNameCount按照number值降序排列
toolNameCount = toolNameCount.sort_values(by='number', ascending=False)
testNameCount = testNameCount.sort_values(by='number', ascending=False)

In [32]:
toolNameCount.head(10)

Unnamed: 0,number
stream,101301
unixbench,78977
speccpu2017,46576
ls,44148
netperf,32252
vray,32015
mlc,27041
super_pi,21005
ping,20116
wzry,18589


In [33]:
testNameCount.head(10)

Unnamed: 0,number
stream_baseline,100212
unixbench_baseline,76567
demo,67427
speccpu2017_baseline,46285
netperf_baseline,30801
mlc_baseline,26799
super_pi_baseline,20989
ping_baseline,18899
wzry_baseline,18538
perf_bench_baseline,12937


In [34]:
# 获得toolNameCount的前5的index
toolNameCount_index = toolNameCount.index[:10].tolist()

In [35]:
print(toolNameCount_index)

['stream', 'unixbench', 'speccpu2017', 'ls', 'netperf', 'vray', 'mlc', 'super_pi', 'ping', 'wzry']


In [36]:

def getKeyNum(df):
    # 提取 results 列中带有 "#" 号的字段
    pattern = r'#\w+'  
    results = df['results'].str.cat(sep=' ')  # 将所有 results 列的数据合并为一个字符串
    hashtags = set(re.findall(pattern, results))  # 使用正则表达式提取带 "#" 号的字段，并去重

    # 统计每个带 "#" 号的字段在整个文件中出现的次数
    hashtags_dict = {}
    for hashtag in hashtags:
        count = results.count(hashtag)
        hashtags_dict[hashtag] = count
    return hashtags_dict


def getTop10Key(hashtags_dict):
    # 获取字典中数量前十的字段
    top_n = 10  # 自定义获取前几个字段
    top_n_fields = heapq.nlargest(top_n, hashtags_dict, key=hashtags_dict.get)
    
    print("数量前十的字段：")
    for field in top_n_fields:
        print("字段名: {:<30s} 出现次数: {:d}".format(field, hashtags_dict[field]))


def getDivideFile(toolNameCount_index, df):
    key_test = df.copy()
    # 根据toolNameCount_index筛选出tool_name值为toolNameCount_index的行，并重置索引，保存为toolNameCount_index对应名字的csv文件
    for tool_name in toolNameCount_index:
        tool_name_df = key_test[key_test['dimension'].str.contains(tool_name)]
        tool_name_df = tool_name_df.reset_index(drop=True)
        # 只保留results_key,results,dimension列
        tool_name_df = tool_name_df[['results_key', 'results', 'dimension']]
        tool_name_df.to_csv('data/processed_files/' + tool_name + '.csv', index=False)

In [37]:
getDivideFile(toolNameCount_index, df)

In [38]:
streamDF = pd.read_csv('data/processed_files/stream.csv')
streamDFhashtags_dict = getKeyNum(streamDF)
getTop10Key(streamDFhashtags_dict)

数量前十的字段：
字段名: #stream_copy                   出现次数: 94146
字段名: #stream_triad                  出现次数: 94146
字段名: #stream_scale                  出现次数: 8369
字段名: #stream_add                    出现次数: 8369
字段名: #stream_ht2_triad              出现次数: 6038
字段名: #stream_ht1_copy               出现次数: 6038
字段名: #stream_ht1_triad              出现次数: 6038
字段名: #stream_ht2_copy               出现次数: 6038
字段名: #hrtimer_max                   出现次数: 748
字段名: #hrtimer_avg                   出现次数: 748


In [39]:
unixbenchDF = pd.read_csv('data/processed_files/unixbench.csv')
unixbenchDFhashtags_dict = getKeyNum(unixbenchDF)
getTop10Key(unixbenchDFhashtags_dict)

数量前十的字段：
字段名: #unixbench_cpu                 出现次数: 23439
字段名: #unixbench_context             出现次数: 21972
字段名: #unixbench_fork                出现次数: 12387
字段名: #unixbench_api                 出现次数: 9037
字段名: #unixbench_shell               出现次数: 8901
字段名: #unixbench_floating_score      出现次数: 4681
字段名: #unixbench_int_cpu             出现次数: 4130
字段名: #unixbench_total               出现次数: 104
字段名: #unixbench_system_call_overhead 出现次数: 101
字段名: #unixbench_shell_scripts_8_concurrent 出现次数: 100


In [40]:
speccpuDF = pd.read_csv('data/processed_files/speccpu2017.csv')
speccpuDFhashtags_dict = getKeyNum(speccpuDF)
getTop10Key(speccpuDFhashtags_dict)

数量前十的字段：
字段名: #speccpu2017_intspeed          出现次数: 13581
字段名: #speccpu2017_fprate            出现次数: 12197
字段名: #speccpu2017_intrate           出现次数: 11610
字段名: #speccpu2017_fpspeed           出现次数: 8526
字段名: #ping_avg                      出现次数: 32
字段名: #ping_min                      出现次数: 16
字段名: #ping_first_packet             出现次数: 16
字段名: #ping_avg_99                   出现次数: 16
字段名: #ping_max                      出现次数: 16
字段名: #ping_percent_99               出现次数: 16


In [41]:
lsDF = pd.read_csv('data/processed_files/ls.csv')
lsDFhashtags_dict = getKeyNum(lsDF)
getTop10Key(lsDFhashtags_dict)

数量前十的字段：
字段名: #ls_time                       出现次数: 44228
字段名: #stream_triad                  出现次数: 4334
字段名: #stream_copy                   出现次数: 4334
字段名: #unixbench_cpu                 出现次数: 3164
字段名: #unixbench_context             出现次数: 2930
字段名: #stream_add                    出现次数: 2905
字段名: #stream_scale                  出现次数: 2905
字段名: #unixbench_api                 出现次数: 2810
字段名: #unixbench_shell               出现次数: 2667
字段名: #unixbench_fork                出现次数: 2634


In [42]:
netperfDF = pd.read_csv('data/processed_files/netperf.csv')
netperfDFhashtags_dict = getKeyNum(netperfDF)
getTop10Key(netperfDFhashtags_dict)

数量前十的字段：
字段名: #netperf_UDP_PPS_64            出现次数: 10165
字段名: #netperf_TCP_Throughput_1500   出现次数: 8433
字段名: #netperf_UDP_PPS_64_SEND       出现次数: 4805
字段名: #netperf_UDP_RecvThroughput_1400 出现次数: 4622
字段名: #netperf_TCP_Throughput_256    出现次数: 4528
字段名: #netperf_TCP_Throughput_1500_SEND 出现次数: 3812
字段名: #netperf_TCP_CRR_32_128        出现次数: 3555
字段名: #netperf_recv_throughput       出现次数: 3547
字段名: #netperf_TCP_Throughput_1400   出现次数: 3528
字段名: #netperf_TCP_RR_32_128         出现次数: 3182


In [43]:
vrayDF = pd.read_csv('data/processed_files/vray.csv')
vrayDFhashtags_dict = getKeyNum(vrayDF)
getTop10Key(vrayDFhashtags_dict)

数量前十的字段：
字段名: #vray_render_time              出现次数: 32004
字段名: #ls_time                       出现次数: 6
字段名: #wzry_trainning_time           出现次数: 2
字段名: #wzry_training_time            出现次数: 1


In [44]:
mlcDF = pd.read_csv('data/processed_files/mlc.csv')
mlcDFhashtags_dict = getKeyNum(mlcDF)
getTop10Key(mlcDFhashtags_dict)

数量前十的字段：
字段名: #mlc_idle_latency              出现次数: 7284
字段名: #mlc_samenuma_latency_min      出现次数: 6882
字段名: #mlc_samenuma_latency_max      出现次数: 6882
字段名: #mlc_samenuma_bandwidth_max    出现次数: 4523
字段名: #mlc_samenuma_bandwidth_min    出现次数: 4523
字段名: #mlc_3                         出现次数: 4173
字段名: #mlc_all_read                  出现次数: 4173
字段名: #mlc_loaded_latency_max        出现次数: 4170
字段名: #mlc_loaded_latency_min        出现次数: 4170
字段名: #mlc_loaded_bandwidth_min      出现次数: 4167


In [45]:
super_piDF = pd.read_csv('data/processed_files/super_pi.csv')
super_piDFhashtags_dict = getKeyNum(super_piDF)
getTop10Key(super_piDFhashtags_dict)

数量前十的字段：
字段名: #super_pi_user_time            出现次数: 21005
字段名: #super_pi_sys_time             出现次数: 21005
字段名: #super_pi_real_time            出现次数: 21005


In [46]:
pingDF = pd.read_csv('data/processed_files/ping.csv')
pingDFhashtags_dict = getKeyNum(pingDF)
getTop10Key(pingDFhashtags_dict)

数量前十的字段：
字段名: #ping_avg                      出现次数: 32822
字段名: #ping_max                      出现次数: 20416
字段名: #ping_min                      出现次数: 18389
字段名: #ping_percent_99               出现次数: 14809
字段名: #ping_avg_99                   出现次数: 14433
字段名: #ping_percent_90               出现次数: 14164
字段名: #ping_first_packet             出现次数: 13774
字段名: #ping_flood_1ms                出现次数: 5983
字段名: #tcpping_distribution_99       出现次数: 3431
字段名: #tcpping_latency_max           出现次数: 1727


In [47]:
wzryDF = pd.read_csv('data/processed_files/wzry.csv')
wzryDFhashtags_dict = getKeyNum(wzryDF)
getTop10Key(wzryDFhashtags_dict)

数量前十的字段：
字段名: #wzry_training_time            出现次数: 18587
字段名: #hrtimer_max                   出现次数: 150
字段名: #hrtimer_avg                   出现次数: 150
字段名: #hrtimer_overflow              出现次数: 150
字段名: #hrtimer_min                   出现次数: 150
字段名: #cyclictest_lat_percentiles_99 出现次数: 90
字段名: #unixbench_fork                出现次数: 64
字段名: #unixbench_cpu                 出现次数: 64
字段名: #unixbench_api                 出现次数: 64
字段名: #unixbench_floating_score      出现次数: 64
