In [1]:
import re
import os
import pathlib
import json
from typing import List, TypedDict

import numpy as np
import pandas as pd

In [2]:
# Root of the project
project_dir = pathlib.Path().resolve().parent

# Directory containing experiment results
results_dir = (pathlib.Path(project_dir) / "results").resolve()

# Directory for tidied data (output directory)
data_dir = (pathlib.Path(project_dir) / "data").resolve()

In [3]:
# Represents a single data point of system utilization
class ResourceResult(TypedDict):
    experiment: str
    mesh: str
    qps: str
    pod: str
    container: str
    time: int
    cpu: float
    mem: float


def parse_resource_result(result_file: pathlib.Path) -> List[ResourceResult]:
    """ Reads a result file and parses the data.

    The returned data is a list of ResourceResults that represent a
    single metric over a time span of 15 minutes.
    """
    rows = []

    # Regex to extra# 0 -> Full match
    # 1 -> mem/cpu results
    # 2 -> Mesh
    # 3 -> Requested QPS
    name_re = re.compile("^(mem|cpu)_([a-z]+)_(\d+|MAX).*json$")

    # Extract metadata from the result
    matches = name_re.match(result_file.name)

    experiment = result_file.parent.name
    metric = matches[1]
    mesh = matches[2].capitalize()
    qps = matches[3]

    with open(result_file) as f:
        data = json.load(f)

    # The resutl is lister per pod/container
    for container in data:

        # Metadata (dimensions)
        meta = container["metric"]

        # Actual values in list[unixtime, value]
        values = container["values"]

        for v in values:
            row: ResourceResult = {
                "experiment": experiment,
                "mesh": mesh,
                "requested_qps": qps,
                "pod": meta["pod"],
                "container": meta["container"],
                "time": v[0],
                metric: v[1],
            }
            rows.append(row)

    return rows


In [4]:
"""
Clean the obtained results from the experiments
- Remove unnecessary columns
- Extract metadata from filenames
- Construct and clean data into a pandas.Series object
- Convert data measures in seconds to ms
"""


# Represents a binned data point of a fortio result file
class FortioObservation(TypedDict):
    # Dimensions
    experiment: str
    mesh: str
    requested_qps: str
    protocol: str
    payload: int

    # Variables
    actual_qps: str
    latency: float


def parse_fortio_result(result_file: pathlib.Path) -> List[FortioObservation] :
    """
    Reads a fortio results JSON file, cleans the data and returns
    it as a pandas.Series
    """
    results = []

    # fortio reports in seconds, convert to miliseconds
    multiplier = 1000

    # Regex to extract metadata
    # 0 -> Full match
    # 1 -> Protocol (http/grpc)
    # 2 -> Mesh
    # 3 -> Requested QPS
    # 4 -> Transfer in bytes
    # 5 -> Repetitionct dimensions from the filename
    name_re = re.compile("^([a-z]+)_([a-z]+)_(\d+|MAX)_(\d+)_(\d+).*json$")


    # Extract metadata from the result
    matches = name_re.match(result_file.name)

    with open(result_file) as f:
        data = json.load(f)

    if "Error" in data:
        raise ValueError(f"{result_file.name} contains an error: {data['Error']}")
    
    sample_arrays = []
    for d in data["DurationHistogram"]["Data"]:
        low = d["Start"] * multiplier
        high = d["End"] * multiplier
        size = d["Count"]
        sample_arrays.append(np.random.uniform(low=low, high=high, size=size))

    samples = np.concatenate(sample_arrays)

    obs: FortioObservation = {
        "experiment": result_file.parent.name,
        "mesh": matches[2].capitalize(),
        "requested_qps": matches[3],
        "protocol": matches[1],
        "payload": matches[4],
        "actual_qps": data["ActualQPS"],
        "latency": samples,
    }

    results.append(obs)
    return results

In [5]:
# Get all experiments directories (filters out old data)
experiment_dir_re = re.compile("^\d{2}_([a-zA-Z_])+$")
experiment_dirs = [x for x in results_dir.iterdir() if x.is_dir() and re.match(experiment_dir_re, x.name)]

result_re = re.compile("^(cpu|mem|http|grpc)_(\w+)_(\d+|MAX).*\.json$")

# Contains the two types of results
results = {
    "fortio": [],
    "resource": [],
}

for d in experiment_dirs:
    print(f"Processing experiment: {d.name}")

    files = d.glob("*.json")
    for f in files:
        matches = result_re.match(f.name)

        if matches is None:
            raise ValueError(f"Invalid file: {f.name}")
        
        result_type = matches[1]

        # Parse result files
        try:
            if result_type == "cpu" or result_type == "mem":
                results["resource"].extend(parse_resource_result(f))
            elif result_type == "http" or result_type == "grpc":
                results["fortio"].extend(parse_fortio_result(f))
        except ValueError as err:
            print(err)
            

Processing experiment: 02_http_constant_throughput
Processing experiment: 01_http_max_throughput
Processing experiment: 04_grpc_max_throughput
grpc_traefik_MAX_0_1_2022-06-27T11:13:12Z.json contains an error: Aborting because of error
Processing experiment: 03_http_payload


In [6]:
""" Create a pandas DataFrame for resource results
- Each row represents a single observation
- Each observation takes a form of type ResourceResult
- Rows are merged based on time/pod/container -> this halves rows as both CPU/mem metrics share unix timestamps
- Convert unix timestamps to pd.DateTime
"""

# Initial dataset
resource_df = pd.DataFrame(data=results["resource"])

# Merged CPU/mem results
resource_df = resource_df.groupby(by=["experiment", "mesh", "requested_qps", "pod", "container", "time"], as_index=False).first()

# Convert time column to datetime
resource_df["date"] = pd.to_datetime(resource_df["time"],unit="s")
resource_df["cpu"] = pd.to_numeric(resource_df["cpu"])
resource_df["mem"] = pd.to_numeric(resource_df["mem"])

resource_df = resource_df.drop("time", axis=1)

resource_df.to_csv(data_dir / "resource_results.feather")
resource_df.head()

Unnamed: 0,experiment,mesh,requested_qps,pod,container,cpu,mem,date
0,01_http_max_throughput,Baseline,MAX,target-fortio-746f85d498-tmrfm,fortio,0.000719,1072.776342,2022-06-26 14:31:58
1,01_http_max_throughput,Baseline,MAX,target-fortio-746f85d498-tmrfm,fortio,0.000719,1072.776342,2022-06-26 14:32:01
2,01_http_max_throughput,Baseline,MAX,target-fortio-746f85d498-tmrfm,fortio,0.049509,18820.872955,2022-06-26 14:32:04
3,01_http_max_throughput,Baseline,MAX,target-fortio-746f85d498-tmrfm,fortio,0.0632,24103.189715,2022-06-26 14:32:07
4,01_http_max_throughput,Baseline,MAX,target-fortio-746f85d498-tmrfm,fortio,0.0632,24103.189715,2022-06-26 14:32:10


In [7]:
""" Create a pandas DataFrame for fortio results
- Each row represents a binned observation
"""

fortio_df = pd.DataFrame(data=results["fortio"])
fortio_df.to_feather(data_dir / "fortio_results.feather")
fortio_df.head()

Unnamed: 0,experiment,mesh,requested_qps,protocol,payload,actual_qps,latency
0,02_http_constant_throughput,Traefik,500,http,0,419.003821,"[0.3885955809167401, 0.35178629357734, 0.39257..."
1,02_http_constant_throughput,Istio,1,http,0,0.966557,"[0.8516197639315487, 0.8741539395657092, 0.871..."
2,02_http_constant_throughput,Linkerd,100,http,0,99.965385,"[0.28963468661425174, 0.28852236048631275, 0.2..."
3,02_http_constant_throughput,Linkerd,1000,http,0,994.641782,"[0.18606631408045254, 0.1841971223829685, 0.19..."
4,02_http_constant_throughput,Cilium,500,http,0,499.000306,"[0.17054266850987035, 0.17150125866605476, 0.1..."
