# Cache Impact Benchmark
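This notebook measures how the query cache and the prompt cache affect the latency of the Terminal C runtime pipeline by running the same prompt set under four cache modes: no cache, query cache only, prompt cache only, and all caches.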

In [1]:
import time
from pathlib import Path
import pandas as pd
from IPython.display import display
import os
import sys

# Derive the project root by cutting the current working directory at the
# first occurrence of the project name, then expose it on the import path so
# the project's packages can be imported from this notebook.
PROJECT_NAME = "terminalC"
PROJECT_DIR = os.path.join(
    os.path.abspath(os.curdir).partition(PROJECT_NAME)[0],
    PROJECT_NAME,
)
sys.path.append(PROJECT_DIR)

from terminalc.runtime_core.pipelines.runtime_pipeline import RuntimePipeline
from terminalc.runtime_core.config import load_runtime_config
from terminalc.runtime_core.pipelines.llm_client import LocalTransformersClient


In [4]:
def _scenario(name, focus, description, *prompts):
    """Bundle one benchmark scenario into a dict with its prompts as a list."""
    return {
        "name": name,
        "focus": focus,
        "description": description,
        "prompts": list(prompts),
    }


# Each scenario holds two prompts. The "query_cache" scenarios rephrase the
# same question; the "prompt_cache" scenarios repeat the prompt verbatim.
PROMPT_SCENARIOS = [
    _scenario(
        "btc_close_variants",
        "query_cache",
        "Same BTC close query phrased differently to stress the DuckDB query cache.",
        "What was the closing price of BTC on Oct 15, 2025?",
        "Give me the closing price of BTC on Oct 15, 2025.",
    ),
    _scenario(
        "btc_ohlc_variants",
        "query_cache",
        "Two phrasings that need the same OHLC window so only the query cache should help.",
        "List the open, high, low, and close prices for BTC between Oct 15 and Oct 17, 2025.",
        "Show BTC's OHLC candles between Oct 15 and Oct 17, 2025.",
    ),
    _scenario(
        "sol_ada_high_variants",
        "query_cache",
        "SOL vs ADA high comparison asked twice with different wording.",
        "Which asset had the highest high on Oct 20, 2025: SOL or ADA?",
        "Between SOL and ADA, who posted the higher high on Oct 20, 2025?",
    ),
    _scenario(
        "btc_news_repeat",
        "prompt_cache",
        "Exact duplicate news briefing to highlight prompt cache hits.",
        "Summarize the news for Bitcoin between Nov 1, 2025 and Nov 7, 2025.",
        "Summarize the news for Bitcoin between Nov 1, 2025 and Nov 7, 2025.",
    ),
    _scenario(
        "btc_trend_repeat",
        "prompt_cache",
        "Identical technical analysis request repeated twice.",
        "Analyze BTC's trend on Nov 1, 2025 using both price action and RSI.",
        "Analyze BTC's trend on Nov 1, 2025 using both price action and RSI.",
    ),
    _scenario(
        "allocation_repeat",
        "prompt_cache",
        "Capital allocation instruction repeated word-for-word.",
        "If I have 10,000 USD to deploy right now, how would you distribute it across BTC, ETH, and SOL based on the past month's signals and why?",
        "If I have 10,000 USD to deploy right now, how would you distribute it across BTC, ETH, and SOL based on the past month's signals and why?",
    ),
]

def _overview_row(entry):
    """Summarise one scenario dict as a flat row for the overview table."""
    return {
        "scenario": entry["name"],
        "focus": entry["focus"],
        "description": entry["description"],
        "prompt_count": len(entry["prompts"]),
    }


# One row per scenario: its name, cache focus, description, and prompt count.
SCENARIO_OVERVIEW = pd.DataFrame(list(map(_overview_row, PROMPT_SCENARIOS)))

def _iter_prompt_tests(scenarios):
    """Yield one flat test record per prompt, numbering variants from 1."""
    for entry in scenarios:
        for position, text in enumerate(entry["prompts"], start=1):
            yield {
                "scenario": entry["name"],
                "focus": entry["focus"],
                "variant": position,
                "prompt": text,
            }


# Flattened (scenario, prompt) list, in scenario order then prompt order.
PROMPT_TESTS = list(_iter_prompt_tests(PROMPT_SCENARIOS))

# The four cache configurations, as (name, query cache on?, prompt cache on?).
CACHE_MODES = [
    {
        "name": label,
        "query_cache_enabled": use_query_cache,
        "prompt_cache_enabled": use_prompt_cache,
    }
    for label, use_query_cache, use_prompt_cache in (
        ("no_cache", False, False),
        ("query_cache_only", True, False),
        ("prompt_cache_only", False, True),
        ("all_caches", True, True),
    )
]


def clear_cache_dir(path: Path) -> None:
    """Ensure *path* exists as a directory and delete the files directly in it.

    The directory (and any missing parents) is created when absent. Only
    entries that are files are removed; subdirectories and their contents are
    left untouched.

    Args:
        path: Cache directory, as a ``Path`` or anything ``Path`` accepts.
    """
    cache_root = Path(path)
    cache_root.mkdir(parents=True, exist_ok=True)
    stale_files = [entry for entry in cache_root.iterdir() if entry.is_file()]
    for entry in stale_files:
        entry.unlink()


def clear_prompt_cache(cfg) -> None:
    """Empty the directory configured as ``cfg.cache.prompt_cache_dir``."""
    prompt_dir = cfg.cache.prompt_cache_dir
    clear_cache_dir(prompt_dir)


def clear_query_cache(cfg) -> None:
    """Empty the directory configured as ``cfg.cache.query_cache_dir``."""
    query_dir = cfg.cache.query_cache_dir
    clear_cache_dir(query_dir)


def clear_all_caches(cfg) -> None:
    """Empty the query cache directory, then the prompt cache directory."""
    for wipe in (clear_query_cache, clear_prompt_cache):
        wipe(cfg)


def resolve_small_model_path(cfg) -> Path | None:
    endpoint = cfg.models.small_model_endpoint
    if not endpoint:
        return None
    local_root = cfg.models.local_model_dir
    local_root_path = local_root if isinstance(local_root, Path) else Path(local_root)
    candidate = local_root_path / endpoint.replace("/", os.sep)
    model_path = candidate if candidate.exists() else local_root_path / endpoint
    model_path = model_path.resolve()
    return model_path if model_path.exists() else None


def build_small_client(cfg, use_adapter: bool) -> LocalTransformersClient:
    """Create a ``LocalTransformersClient`` for the locally stored small model.

    Args:
        cfg: Runtime configuration; ``cfg.models`` supplies the model location
            and, when requested, the adapter directory.
        use_adapter: When true, pass ``cfg.models.small_model_adapter_dir`` to
            the client as ``adapter_path``; otherwise pass ``None``.

    Raises:
        FileNotFoundError: If the small model cannot be resolved locally.
    """
    weights = resolve_small_model_path(cfg)
    if weights is None:
        raise FileNotFoundError("Small model weights are not available locally.")

    adapter_dir = None
    if use_adapter:
        adapter_dir = cfg.models.small_model_adapter_dir

    # An unset or empty adapter directory is forwarded as None, not as a string.
    adapter_arg = str(adapter_dir) if adapter_dir else None
    return LocalTransformersClient(str(weights), adapter_path=adapter_arg)


def available_model_variants(cfg) -> list[str]:
    """List the model variants that can be benchmarked in this environment.

    Returns:
        ``["small"]`` or ``["small", "small_lora"]`` when the project path
        contains the marker below and the small model resolves locally;
        ``["large"]`` when the marker is absent and a large-model endpoint is
        configured; otherwise an empty list.
    """
    # NOTE(review): '018219422' is a hard-coded substring of the project path.
    # It presumably identifies one specific machine or account that holds the
    # local small-model weights. Confirm, and consider a config flag instead.
    if '018219422' not in PROJECT_DIR:
        return ["large"] if cfg.models.large_model_endpoint else []

    if not resolve_small_model_path(cfg):
        return []

    found = ["small"]
    if cfg.models.small_model_adapter_dir:
        found.append("small_lora")
    return found

def build_pipeline_for_variant(model_variant: str, cfg) -> RuntimePipeline:
    """Construct a ``RuntimePipeline`` for the named model variant.

    * ``"small"``: an explicit client built without an adapter is injected and
      ``model_type`` is ``None``.
    * ``"small_lora"``: the pipeline is created with ``model_type="small"``
      (presumably the pipeline then attaches the adapter itself; confirm in
      ``RuntimePipeline``).
    * anything else: the variant name is passed through as ``model_type``.
    """
    if model_variant == "small":
        base_client = build_small_client(cfg, use_adapter=False)
        return RuntimePipeline(model_type=None, llm_client=base_client, config=cfg)

    pipeline_model_type = "small" if model_variant == "small_lora" else model_variant
    return RuntimePipeline(model_type=pipeline_model_type, config=cfg)


def measure_latency(pipeline, prompt_tests, cfg, *, query_cache_enabled: bool = True, prompt_cache_enabled: bool = True):
    """Run every prompt through *pipeline* and record its wall-clock latency.

    A cache is treated as "disabled" by emptying its directory immediately
    before each prompt; the clearing itself happens outside the timed region.

    Args:
        pipeline: Object exposing ``run(prompt)`` whose result has a
            ``model_name`` attribute.
        prompt_tests: Iterable of dicts with the keys ``prompt``, ``scenario``,
            ``focus`` and ``variant``.
        cfg: Runtime configuration, used only to locate the cache directories.
        query_cache_enabled: When false, clear the query cache before each run.
        prompt_cache_enabled: When false, clear the prompt cache before each run.

    Returns:
        A ``pandas.DataFrame`` with one row per prompt test and the columns
        ``prompt``, ``scenario``, ``focus``, ``variant``, ``latency_sec`` and
        ``model_name``. The columns are present even when *prompt_tests* is
        empty, so downstream group-bys do not fail on a column-less frame.
    """
    columns = ["prompt", "scenario", "focus", "variant", "latency_sec", "model_name"]
    rows = []
    for test in prompt_tests:
        if not query_cache_enabled:
            clear_query_cache(cfg)
        if not prompt_cache_enabled:
            clear_prompt_cache(cfg)
        # Time only the pipeline call itself.
        start = time.perf_counter()
        result = pipeline.run(test["prompt"])
        elapsed = time.perf_counter() - start
        rows.append(
            {
                "prompt": test["prompt"],
                "scenario": test["scenario"],
                "focus": test["focus"],
                "variant": test["variant"],
                "latency_sec": elapsed,
                "model_name": result.model_name,
            }
        )
    # Explicit columns keep the schema stable for an empty result set.
    return pd.DataFrame(rows, columns=columns)


In [5]:
# Benchmark driver: run every prompt under every (model variant, cache mode)
# pair, then summarise the results and persist the raw rows.
cfg = load_runtime_config()
model_variants = available_model_variants(cfg)
if not model_variants:
    raise RuntimeError("No configured models available for benchmarking.")

print("Benchmarking models:", ", ".join(model_variants))
display(SCENARIO_OVERVIEW)

result_frames = []

for model_variant in model_variants:
    for cache_mode in CACHE_MODES:
        # Start each mode from empty cache directories and a fresh pipeline.
        clear_all_caches(cfg)
        pipeline = build_pipeline_for_variant(model_variant, cfg)
        df = measure_latency(
            pipeline,
            PROMPT_TESTS,
            cfg=cfg,
            query_cache_enabled=cache_mode["query_cache_enabled"],
            prompt_cache_enabled=cache_mode["prompt_cache_enabled"],
        )
        # Tag every row with the configuration it was measured under.
        df = df.assign(
            model_variant=model_variant,
            cache_mode=cache_mode["name"],
            query_cache_enabled=cache_mode["query_cache_enabled"],
            prompt_cache_enabled=cache_mode["prompt_cache_enabled"],
        )
        result_frames.append(df)

results = pd.concat(result_frames, ignore_index=True)
display(results)

# Mean latency per prompt variant within each scenario and cache mode.
scenario_summary = (
    results.groupby(["model_variant", "cache_mode", "focus", "scenario", "variant"])["latency_sec"]
    .mean()
    .reset_index()
)
display(scenario_summary)

# Aggregate latency statistics per model variant and cache mode.
cache_summary = (
    results.groupby(["model_variant", "cache_mode"])["latency_sec"]
    .agg(["mean", "median", "min", "max", "count"])
    .reset_index()
)
display(cache_summary)

# to_csv does not create missing parent directories, so create the output
# directory first instead of failing after the whole benchmark has run.
# NOTE(review): the file name says "large" even when small variants were
# benchmarked; kept unchanged so existing consumers still find the file.
results_dir = Path(PROJECT_DIR) / "results" / "cache_test_result"
results_dir.mkdir(parents=True, exist_ok=True)
results.to_csv(results_dir / "large_cache_results.csv")

Benchmarking models: large


Unnamed: 0,scenario,focus,description,prompt_count
0,btc_close_variants,query_cache,Same BTC close query phrased differently to st...,2
1,btc_ohlc_variants,query_cache,Two phrasings that need the same OHLC window s...,2
2,sol_ada_high_variants,query_cache,SOL vs ADA high comparison asked twice with di...,2
3,btc_news_repeat,prompt_cache,Exact duplicate news briefing to highlight pro...,2
4,btc_trend_repeat,prompt_cache,Identical technical analysis request repeated ...,2
5,allocation_repeat,prompt_cache,Capital allocation instruction repeated word-f...,2


Unnamed: 0,prompt,scenario,focus,variant,latency_sec,model_name,model_variant,cache_mode,query_cache_enabled,prompt_cache_enabled
0,"What was the closing price of BTC on Oct 15, 2...",btc_close_variants,query_cache,1,5.554118,meta-llama/Llama-3.3-70B-Instruct:cheapest,large,no_cache,False,False
1,"Give me the closing price of BTC on Oct 15, 2025.",btc_close_variants,query_cache,2,5.342233,meta-llama/Llama-3.3-70B-Instruct:cheapest,large,no_cache,False,False
2,"List the open, high, low, and close prices for...",btc_ohlc_variants,query_cache,1,7.14204,meta-llama/Llama-3.3-70B-Instruct:cheapest,large,no_cache,False,False
3,Show BTC's OHLC candles between Oct 15 and Oct...,btc_ohlc_variants,query_cache,2,8.213021,meta-llama/Llama-3.3-70B-Instruct:cheapest,large,no_cache,False,False
4,"Which asset had the highest high on Oct 20, 20...",sol_ada_high_variants,query_cache,1,3.460927,meta-llama/Llama-3.3-70B-Instruct:cheapest,large,no_cache,False,False
5,"Between SOL and ADA, who posted the higher hig...",sol_ada_high_variants,query_cache,2,4.476927,meta-llama/Llama-3.3-70B-Instruct:cheapest,large,no_cache,False,False
6,"Summarize the news for Bitcoin between Nov 1, ...",btc_news_repeat,prompt_cache,1,20.564779,meta-llama/Llama-3.3-70B-Instruct:cheapest,large,no_cache,False,False
7,"Summarize the news for Bitcoin between Nov 1, ...",btc_news_repeat,prompt_cache,2,14.548347,meta-llama/Llama-3.3-70B-Instruct:cheapest,large,no_cache,False,False
8,"Analyze BTC's trend on Nov 1, 2025 using both ...",btc_trend_repeat,prompt_cache,1,6.860895,meta-llama/Llama-3.3-70B-Instruct:cheapest,large,no_cache,False,False
9,"Analyze BTC's trend on Nov 1, 2025 using both ...",btc_trend_repeat,prompt_cache,2,7.452122,meta-llama/Llama-3.3-70B-Instruct:cheapest,large,no_cache,False,False


Unnamed: 0,model_variant,cache_mode,focus,scenario,variant,latency_sec
0,large,all_caches,prompt_cache,allocation_repeat,1,13.894902
1,large,all_caches,prompt_cache,allocation_repeat,2,0.014491
2,large,all_caches,prompt_cache,btc_news_repeat,1,13.341009
3,large,all_caches,prompt_cache,btc_news_repeat,2,0.01251
4,large,all_caches,prompt_cache,btc_trend_repeat,1,9.99517
5,large,all_caches,prompt_cache,btc_trend_repeat,2,0.004818
6,large,all_caches,query_cache,btc_close_variants,1,7.219842
7,large,all_caches,query_cache,btc_close_variants,2,4.152852
8,large,all_caches,query_cache,btc_ohlc_variants,1,7.390261
9,large,all_caches,query_cache,btc_ohlc_variants,2,7.718829


Unnamed: 0,model_variant,cache_mode,mean,median,min,max,count
0,large,all_caches,5.929996,5.686347,0.004818,13.894902,12
1,large,no_cache,10.130764,7.297081,3.460927,20.564779,12
2,large,prompt_cache_only,5.557117,3.936927,0.023399,19.630291,12
3,large,query_cache_only,7.722899,7.367455,3.925245,12.642794,12
