In [None]:
import os
import re

# Define the path to the data folder
data_folder = 'data_profiling_perf'
LOG_SUFFIX = '.log'

# Nested result: data[program][threads][dataset] -> {counter or derived metric: value}
data = {}

# Captures "<integer count, optionally with ',' separators> <event name>" from a line.
# Presumably these are `perf stat` counter lines -- TODO confirm the log producer.
pattern = re.compile(r'^\s*([\d,]+)\s+([\w\.-]+).*$', re.MULTILINE)

# (derived metric, numerator counter, denominator counter), in insertion order.
# A "ratio" divides a level's misses by that level's own load/request counter;
# a "rate" divides the same misses by all retired load instructions.
derived_metrics = [
    ('L1-miss-ratio', 'L1-dcache-load-misses', 'L1-dcache-loads'),
    ('L2-miss-ratio', 'l2_rqsts.demand_data_rd_miss', 'l2_rqsts.all_demand_data_rd'),
    ('L3-miss-ratio', 'LLC-load-misses', 'LLC-loads'),
    ('L1-miss-rate', 'L1-dcache-load-misses', 'mem_inst_retired.all_loads'),
    ('L2-miss-rate', 'l2_rqsts.demand_data_rd_miss', 'mem_inst_retired.all_loads'),
    ('L3-miss-rate', 'LLC-load-misses', 'mem_inst_retired.all_loads'),
]
required_counters = {name for _, num, den in derived_metrics for name in (num, den)}

# Sorted so the insertion (and therefore printing) order of `data` is reproducible;
# os.listdir returns entries in arbitrary order.
for f in sorted(os.listdir(data_folder)):
    if not f.endswith(LOG_SUFFIX):
        continue

    # File names look like "<program>-<threads>-<dataset>.log". Only the first two
    # hyphens separate fields, so a dataset name may itself contain '-'. Only the
    # trailing extension is stripped, not every occurrence of ".log".
    program, thread_count, dataset = f[:-len(LOG_SUFFIX)].split('-', 2)
    threads = int(thread_count)

    # Parse the file content
    with open(os.path.join(data_folder, f), 'r') as file:
        matches = pattern.findall(file.read())

    # Raw counters as integers (thousands separators removed); if an event name
    # appears more than once in a file, the last occurrence wins.
    _data = {metric: int(value.replace(',', '')) for value, metric in matches}

    # Fail with the offending file name instead of a bare KeyError on one counter.
    missing = sorted(required_counters - _data.keys())
    if missing:
        raise KeyError(f'{f}: missing counters {missing}')

    # A zero denominator still raises ZeroDivisionError, as before.
    for name, numerator, denominator in derived_metrics:
        _data[name] = _data[numerator] / _data[denominator]

    data.setdefault(program, {}).setdefault(threads, {})[dataset] = _data
    

# Dump everything that was parsed, nested as program -> thread count -> dataset.
# Each leaf is the plain dict of raw counters and derived metrics for one log file.
for program in data:
    print(program)
    for threads in data[program]:
        print(f'  {threads} threads')
        for dataset, counters in data[program][threads].items():
            print(f'    {dataset}')
            print(counters)
            print()

In [None]:
import numpy as np
import matplotlib.pyplot as plt

FONT_TITLE = 18
FONT_AXES = 18
FONT_TICKS = 14
FONT_LEGEND = 14

# Datasets to plot; the commented-out entries are excluded from the figures.
datasets = [
    # 'Social_Network_1',
    # 'Web_Graph_1',
    # 'Collaboration_Network_1',
    # 'Synthetic_Dense_1',
    'Road_Network_1',
    'Road_Network_2',
    'kNN_Graph_1',
    'Synthetic_Sparse_1',
]

programs = ['large', 'small', 'classic']

# Layout is the same for every figure, so compute it once.
x = np.arange(len(datasets))  # the label locations
width = 0.2  # the width of the bars
# Bars of one group are centred at x, x + width, x + 2 * width, ...; the middle of
# the group is therefore (len(programs) - 1) / 2 bar widths to the right of x.
group_centers = x + width * (len(programs) - 1) / 2

# One grouped bar chart per (thread count, metric kind, cache level).
for threads in [24, 32]:
    for kind in ['rate', 'ratio']:
        for level in ['L1-miss', 'L2-miss', 'L3-miss']:
            metric = level + '-' + kind
            # Values of this metric for each program, ordered like `datasets`
            metric_values = {
                program: [data[program][threads][dataset][metric] for dataset in datasets]
                for program in programs
            }

            fig, ax = plt.subplots(figsize=(12, 6))

            for i, program in enumerate(programs):
                ax.bar(x + i * width, metric_values[program], width, label=program)

            ax.set_ylabel(metric, fontsize=FONT_AXES)
            ax.set_title(f'{metric} for {threads} threads', fontsize=FONT_TITLE)
            ax.set_xticks(group_centers)
            ax.set_xticklabels(datasets, rotation=45, fontsize=FONT_TICKS-2)
            ax.legend(loc='best', fontsize=FONT_LEGEND)
            ax.grid(True, linestyle=':')

            fig.tight_layout()
            plt.show()
            # Release the figure so the 12 figures do not accumulate in memory.
            plt.close(fig)

In [None]:
# `pd` is not imported by any cell visible above this one; import it here so the
# cell also runs on a freshly restarted kernel.
import pandas as pd

# NOTE(review): this cell reads data[folder][threads][dataset]['L2CACHE'] and expects a
# DataFrame with a 'Metric' column there. The perf-log loading cell visible in this
# notebook stores plain counter dicts instead -- confirm which loader populates `data`.

folders = ['classic', 'small', 'large']
datasets = [
    'Social_Network_1',
    'Web_Graph_1',
    'Collaboration_Network_1',
    'Synthetic_Dense_1',
    'Road_Network_1',
    'Road_Network_2',
    'kNN_Graph_1',
    'Synthetic_Sparse_1',
]
# Row labels with underscores replaced by spaces (used as the table index)
escaped_datasets = [dataset.replace('_', ' ') for dataset in datasets]


def _as_percent(value):
    """Return `value * 100` rounded to the nearest whole number.

    Rounding (not truncating) matters because of floating-point error:
    0.29 * 100 evaluates to 28.999999999999996, which int() alone turns into 28.
    """
    return int(round(value * 100))


for threads in [24, 48, 96]:
    # One row per dataset, one column per folder; each cell holds "min/max/avg"
    average_cache_misses = pd.DataFrame(index=escaped_datasets, columns=folders)

    # Iterate over each folder and dataset to summarise the L2 miss ratio
    for folder in folders:
        for dataset, escaped_dataset in zip(datasets, escaped_datasets):
            df = data[folder][threads][dataset]['L2CACHE']
            # First row whose 'Metric' is 'L2 miss ratio', all columns after the first.
            # Presumably one column per core/thread -- TODO confirm the table layout.
            miss_ratios = df[df['Metric'] == 'L2 miss ratio'].iloc[0, 1:].astype(float)
            min_cache_misses = miss_ratios.min()
            max_cache_misses = miss_ratios.max()
            avg_cache_misses = miss_ratios.mean()
            # Store the result in the dataframe as whole-number percentages
            average_cache_misses.loc[escaped_dataset, folder] = (
                f'{_as_percent(min_cache_misses)}/{_as_percent(max_cache_misses)}/{_as_percent(avg_cache_misses)}'
            )

    # Display the resulting dataframe, then its LaTeX source
    print(f'====== {threads=} ======')
    print(average_cache_misses)
    print(average_cache_misses.to_latex())