# Benchmark Data Compilation Notebook

In this Notebook, our objective is to compile benchmark data from the `\bulk_benchmarks` and `\single_benchmarks` directories, generating two distinct `.csv` files with raw data. 
These files will serve as datasets for direct comparison with the benchmarks of the CytoSnake workflows. 


## Imports 

In [1]:
import sys
import json
import pathlib
import pandas as pd
from datetime import datetime

sys.path.append("../../../")
from src.benchmark_utils import get_benchmark_files

# Parameters Used in this Notebook

Follow parameters used in this notebook.

In [2]:
# inputs
working_dir = pathlib.Path().resolve()
single_benchmark_dir = pathlib.Path("./single_benchmarks").resolve(strict=True)
bulk_benchmark_dir = pathlib.Path("./bulk_benchmarks").resolve(strict=True)

# outputs paths
single_benchmark_csv = working_dir / "single_benchmarks.csv"
bulk_benchmark_csv = working_dir / "bulk_benchmarks.csv"

## Loading all JSON files 

Here we are loading all the JSON files. 
The file name structure of the JSON files is `{Plate_name}_{type}_{process}_benchmarks.json`.
Also we are loading the file size information. 

In [3]:
single_json_files = list(get_benchmark_files(single_benchmark_dir, ext="json"))
bulk_json_files = list(get_benchmark_files(bulk_benchmark_dir, ext="json"))

# loading json file that contains file size information
with open("./file_size.json", encoding="utf-8", mode="r") as content:
    plate_size = json.load(content)

In [4]:
# applying time format
tformat = "%Y-%m-%d %H:%M:%S.%f"

# collecting all data
raw_benchmark_data = []

# iterating each json file and extract data
for single_json_file in single_json_files:
    # collecting data from just file name
    plate_name = single_json_file.stem.split("_nf1")[0]
    data_type = "singlecell"
    file_size = plate_size[plate_name]
    process_name = single_json_file.stem.split("singlecell_")[1].split("_benchmark")[0]

    # opening json file to extract benchmark information
    with open(single_json_file, encoding="utf-8", mode="r") as contents:
        benchmark_data = json.load(contents)

        # accessing to all metadata from benchmarks
        meta_data = benchmark_data["metadata"]
        selected_data = {
            "pid": meta_data["pid"],
            "process_name": process_name,
            "input_data_name": plate_name,
            "start_time": datetime.strptime(meta_data["start_time"], tformat),
            "end_time": datetime.strptime(meta_data["end_time"], tformat),
            "time_duration": (
                datetime.strptime(meta_data["end_time"], tformat)
                - datetime.strptime(meta_data["start_time"], tformat)
            ).total_seconds(),
            "total_allocations": int(meta_data["total_allocations"]),
            "peak_memory": round(int(meta_data["peak_memory"]) / 1024**2, 3),
            "file_size": plate_size[plate_name],
        }

    # append to list
    raw_benchmark_data.append(selected_data)

In [5]:
# create to dataframe
benchmark_df = pd.DataFrame(raw_benchmark_data)
benchmark_df.to_csv("nf1_single_cell_complete_benchmark.csv", index=False)
benchmark_df

Unnamed: 0,pid,process_name,input_data_name,start_time,end_time,time_duration,total_allocations,peak_memory,file_size
0,296901,normalize,Plate_4,2023-11-09 12:15:44.326,2023-11-09 12:15:47.675,3.349,2697600,622.491,222.945
1,296901,normalize,Plate_3_prime,2023-11-09 12:15:28.642,2023-11-09 12:15:33.599,4.957,2711400,1150.83,444.938
2,296901,normalize,Plate_3,2023-11-09 12:15:09.348,2023-11-09 12:15:15.394,6.046,2711861,1538.915,551.629
3,296901,aggregate,Plate_3_prime,2023-11-09 12:15:41.071,2023-11-09 12:15:42.108,1.037,668551,250.406,444.938
4,296901,feature_select,Plate_1,2023-11-09 12:14:56.535,2023-11-09 12:14:58.624,2.089,1638491,263.136,4.727
5,296901,annotate,Plate_3_prime,2023-11-09 12:15:25.512,2023-11-09 12:15:25.897,0.385,7667,517.028,444.938
6,296901,aggregate,Plate_4,2023-11-09 12:15:53.123,2023-11-09 12:15:54.005,0.882,688175,133.504,222.945
7,296901,aggregate,Plate_1,2023-11-09 12:14:58.787,2023-11-09 12:14:59.278,0.491,497903,4.767,4.727
8,296901,normalize,Plate_2,2023-11-09 12:15:00.073,2023-11-09 12:15:01.863,1.79,2015427,102.281,30.598
9,296901,annotate,Plate_1,2023-11-09 12:14:54.557,2023-11-09 12:14:54.627,0.07,4206,6.753,4.727


In [6]:
# applying time format
tformat = "%Y-%m-%d %H:%M:%S.%f"

# collecting all data
raw_benchmark_data = []

# iterating each json file and extract data
for bulk_json_file in bulk_json_files:
    # collecting data from just file name
    plate_name = bulk_json_file.stem.split("_nf1")[0]
    data_type = "bulk"
    file_size = plate_size[plate_name]
    process_name = bulk_json_file.stem.split("bulk_")[1].split("_benchmark")[0]

    # opening json file to extract benchmark information
    with open(single_json_file, encoding="utf-8", mode="r") as contents:
        benchmark_data = json.load(contents)

        # accessing to all metadata from benchmarks
        meta_data = benchmark_data["metadata"]
        selected_data = {
            "pid": meta_data["pid"],
            "process_name": process_name,
            "input_data_name": plate_name,
            "start_time": datetime.strptime(meta_data["start_time"], tformat),
            "end_time": datetime.strptime(meta_data["end_time"], tformat),
            "time_duration": (
                datetime.strptime(meta_data["end_time"], tformat)
                - datetime.strptime(meta_data["start_time"], tformat)
            ).total_seconds(),
            "total_allocations": int(meta_data["total_allocations"]),
            "peak_memory": round(int(meta_data["peak_memory"]) / 1024**2, 3),
            "file_size": plate_size[plate_name],
        }

    # append to list
    raw_benchmark_data.append(selected_data)

In [7]:
benchmark_df = pd.DataFrame(raw_benchmark_data)
benchmark_df.to_csv("nf1_bulk_complete_benchmark.csv", index=False)
benchmark_df

Unnamed: 0,pid,process_name,input_data_name,start_time,end_time,time_duration,total_allocations,peak_memory,file_size
0,296901,normalize,Plate_3,2023-11-09 12:15:04.495,2023-11-09 12:15:04.939,0.444,502871,22.485,551.629
1,296901,feature_select,Plate_4,2023-11-09 12:15:04.495,2023-11-09 12:15:04.939,0.444,502871,22.485,222.945
2,296901,annotate,Plate_1,2023-11-09 12:15:04.495,2023-11-09 12:15:04.939,0.444,502871,22.485,4.727
3,296901,aggregate,Plate_4,2023-11-09 12:15:04.495,2023-11-09 12:15:04.939,0.444,502871,22.485,222.945
4,296901,aggregate,Plate_3,2023-11-09 12:15:04.495,2023-11-09 12:15:04.939,0.444,502871,22.485,551.629
5,296901,aggregate,Plate_2,2023-11-09 12:15:04.495,2023-11-09 12:15:04.939,0.444,502871,22.485,30.598
6,296901,annotate,Plate_4,2023-11-09 12:15:04.495,2023-11-09 12:15:04.939,0.444,502871,22.485,222.945
7,296901,aggregate,Plate_1,2023-11-09 12:15:04.495,2023-11-09 12:15:04.939,0.444,502871,22.485,4.727
8,296901,feature_select,Plate_3_prime,2023-11-09 12:15:04.495,2023-11-09 12:15:04.939,0.444,502871,22.485,444.938
9,296901,annotate,Plate_3,2023-11-09 12:15:04.495,2023-11-09 12:15:04.939,0.444,502871,22.485,551.629
