In [None]:
import json
import matplotlib as mpl
import matplotlib.pyplot as plt
import os
from pprint import pprint

In [None]:
# Load the original and quantized benchmark results of every model logged for
# the third experiment. Each log is parsed as JSON and must provide the keys
# "model_name", "original_model_benchmarks" and "quantized_model_benchmarks";
# the two benchmark entries are stored in `data`, keyed by model name.
folder_path_original_benchmarks = "logs/third_experiment"
data = {}
# os.listdir returns entries in arbitrary order; sorting makes the load order
# (and therefore which file wins if two logs share a model name) deterministic.
for filename in sorted(os.listdir(folder_path_original_benchmarks)):
    file_path = os.path.join(folder_path_original_benchmarks, filename)
    # Skip hidden entries and sub-directories (e.g. .DS_Store,
    # .ipynb_checkpoints/), which would otherwise crash open()/json.load().
    if filename.startswith(".") or not os.path.isfile(file_path):
        continue
    with open(file_path, "r", encoding="utf-8") as f:
        raw_data = json.load(f)
    model_name = raw_data["model_name"]
    data[model_name] = {
        "original_model_benchmarks": raw_data["original_model_benchmarks"],
        "quantized_model_benchmarks": raw_data["quantized_model_benchmarks"],
    }

## Token Generation Throughput: Original vs. Quantized

In [None]:
# Left-to-right order of the subplots (parameter count, as given in the model
# names, increases from left to right).
models_in_correct_order = [
    "HuggingFaceTB/SmolLM-135M-Instruct",
    "meta-llama/Llama-3.2-3B-Instruct",
    "meta-llama/Llama-3.1-8B-Instruct",
]
# All panels share one y-scale so bar heights are comparable across models.
# This is the minimum upper limit; it is raised below if any bar would be cut.
min_y_axis_upper_limit = 1250
output_path = "visualizations/third_experiment_performance_comparison.png"

# Fail early with a readable message instead of a bare KeyError raised from
# inside the dict comprehension below.
missing_models = [name for name in models_in_correct_order if name not in data]
if missing_models:
    raise KeyError(f"No benchmark data loaded for: {missing_models}")

# sort the models in the correct order (models not listed above are dropped)
data = {k: data[k] for k in models_in_correct_order}

# Highest throughput over all models and both variants, used to make sure the
# shared y-limit never clips a bar or its label.
highest_tokens_per_second = max(
    model_data[benchmark_key]["token_generation_stats"]["average_tokens_per_second"]
    for model_data in data.values()
    for benchmark_key in ("original_model_benchmarks", "quantized_model_benchmarks")
)
y_axis_upper_limit = max(min_y_axis_upper_limit, 1.1 * highest_tokens_per_second)

# squeeze=False always returns a 2-D array of Axes, so indexing also works
# when only a single model is listed (plt.subplots would otherwise return a
# bare Axes object for a 1x1 grid).
fig, ax_grid = plt.subplots(
    1, len(models_in_correct_order), figsize=(20, 5), squeeze=False)
ax = ax_grid[0]

for i, (model_name, model_data) in enumerate(data.items()):
    original_model_benchmarks = model_data["original_model_benchmarks"]["token_generation_stats"]
    quantized_model_benchmarks = model_data["quantized_model_benchmarks"]["token_generation_stats"]

    original_model_average_tokens_per_second = original_model_benchmarks[
        "average_tokens_per_second"]
    quantized_model_average_tokens_per_second = quantized_model_benchmarks[
        "average_tokens_per_second"]

    ax[i].bar(
        ["Original", "Quantized"],
        [original_model_average_tokens_per_second,
            quantized_model_average_tokens_per_second],
        color=["tab:blue", "tab:orange"]
    )
    # label the absolute value on top of the "Original" bar
    ax[i].text(
        0,
        original_model_average_tokens_per_second,
        f"{original_model_average_tokens_per_second:.2f}",
        ha="center",
        va="bottom",
        color="black",
    )
    # label the signed difference (quantized - original) on top of the
    # "Quantized" bar; the explicit sign keeps a speedup from being read as an
    # absolute value, and the colour distinguishes slowdown from speedup
    performance_difference = quantized_model_average_tokens_per_second - \
        original_model_average_tokens_per_second
    ax[i].text(
        1,
        quantized_model_average_tokens_per_second,
        f"{performance_difference:+.2f}",
        ha="center",
        va="bottom",
        color="red" if performance_difference < 0 else "tab:green",
    )
    ax[i].set_title(model_name)
    ax[i].set_ylabel("Tokens per second")
    ax[i].set_ylim(0, y_axis_upper_limit)

# savefig does not create missing directories, so make sure the target exists.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
fig.savefig(output_path)
plt.show()