In [50]:
import pandas as pd
import argparse
from collections import defaultdict


def parse_args(argv=None):
    """Parse the command-line arguments of the CPU alignment tool.

    Args:
        argv: Optional list of argument strings to parse. When ``None``
            (the default) argparse falls back to ``sys.argv[1:]``, which is
            the behaviour callers relied on before this parameter existed.
            Passing an explicit list makes the parser usable from a notebook
            or a test, where ``sys.argv`` does not hold the tool's arguments.

    Returns:
        argparse.Namespace: carries two string attributes,
        ``cpu_trace_csv`` and ``stage_trace_csv``, in that positional order.

    Raises:
        SystemExit: raised by argparse when the arguments are invalid or
            when ``-h``/``--help`` is requested.
    """
    parser = argparse.ArgumentParser(description="CPU alignment")
    # The help texts below were previously attached to the wrong argument.
    parser.add_argument(
        'cpu_trace_csv', type=str,
        help='The CPU usage CSV log file from the profiler.'
    )
    parser.add_argument(
        'stage_trace_csv', type=str,
        help='The CSV stage log file from the program being profiled.'
    )
    return parser.parse_args(argv)




In [53]:

cpu_trace_csv = "simplemt.cpu_trace.csv"
stage_trace_csv = "simplemt.stage_trace.csv"

# CPU samples: read with pandas' default header handling (the first line of
# the file is consumed as the header), then relabel the three columns.
cpu_trace_df = pd.read_csv(cpu_trace_csv).set_axis(
    ["st", "ed", "cpu"], axis="columns"
)

# Stage events: the file has no header row, so every line is data and the
# four columns are labelled here.
stage_events_df = pd.read_csv(stage_trace_csv, header=None).set_axis(
    ["stage", "status", "lineno", "ts"], axis="columns"
)

# Collapse the events to one row per stage: "st" is the smallest "ts" seen
# for the stage and "ed" the largest. "status" and "lineno" are not used.
stage_trace_df = stage_events_df.groupby("stage")["ts"].agg(st="min", ed="max")


In [54]:
# Display the CPU trace frame; its columns were relabelled to st, ed, cpu.
cpu_trace_df

Unnamed: 0,st,ed,cpu
0,1701127815272151995,1701127815272210038,0.0
1,1701127815772741378,1701127815772827303,3068.5
2,1701127816273352269,1701127816273425349,3094.2
3,1701127816773946704,1701127816774006850,3094.4
4,1701127817274097756,1701127817274176890,3098.9
5,1701127817774703154,1701127817774765597,3102.4
6,1701127818275345907,1701127818275452493,3105.9
7,1701127818775981274,1701127818776058889,3074.2
8,1701127819276581045,1701127819276646970,2990.4
9,1701127819777174727,1701127819777260625,2996.5


In [56]:
# Display the per-stage frame: one row per stage, where st / ed hold the
# minimum / maximum "ts" recorded for that stage.
stage_trace_df

Unnamed: 0_level_0,st,ed
stage,Unnamed: 1_level_1,Unnamed: 2_level_1
1,1701127815274238758,1701127815274354219
2,1701127815274390637,1701127815274424893
3,1701127815274458879,1701127815274493072
4,1701127815274528022,1701127815276059524
5,1701127815276111976,1701127822708767430
6,1701127822708792826,1701127822708803005


In [None]:

# Average the CPU usage of the samples that fall inside each stage.
#
# A CPU sample is attributed to a stage only when its whole [st, ed] interval
# lies inside the stage's [st, ed] interval (both bounds inclusive). Samples
# that straddle a stage boundary, or fall outside every stage, are ignored.
#
# The comparison is done column-wise on purpose. DataFrame.iterrows() returns
# each row as a single-dtype Series, so the integer st/ed values of
# cpu_trace_df would be upcast to float64 next to the float "cpu" column.
# float64 cannot represent these 19-digit timestamps exactly, which could flip
# a comparison for a sample sitting right at a stage boundary.
sample_st = cpu_trace_df["st"]
sample_ed = cpu_trace_df["ed"]

# stage -> mean cpu usage; only stages that contain at least one sample get
# an entry.
cpu_agg_trace = {}
for stage, stage_st, stage_ed in stage_trace_df[["st", "ed"]].itertuples(name=None):
    contained = (stage_st <= sample_st) & (sample_ed <= stage_ed)
    if contained.any():
        cpu_agg_trace[stage] = cpu_trace_df.loc[contained, "cpu"].mean()

# Turn the per-stage averages into a one-column frame indexed by stage. The
# column is labelled "cpu" (instead of the bare integer 0 that from_dict would
# assign) and exists even when no stage matched any sample.
aggregated_stage_df = pd.DataFrame(
    {"cpu": pd.Series(cpu_agg_trace, dtype="float64")}
)

# Left-join on the stage index: stages without any contained sample keep
# their st/ed values and get an empty cpu field in the CSV.
final_df = stage_trace_df.join(aggregated_stage_df)
print(final_df.to_csv())