In [None]:
# Mount Google Drive at /content/drive; later cells read the FAISS index from,
# and write benchmark results to, a folder under this mount point.
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip -q install --upgrade pip
!pip install -y numpy sentence-transformers transformers faiss-cpu
!pip install "numpy==1.26.4"
!pip install "faiss-cpu==1.8.0.post1" sentence-transformers==2.7.0 transformers==4.41.2 accelerate==0.30.1 bitsandbytes==0.43.1

[0m
[optparse.groups]Usage:[/]   
  pip install \[options] <requirement specifier> \[package-index-options] ...
  pip install \[options] -r <requirements file> \[package-index-options] ...
  pip install \[options] [-e] <vcs project url> ...
  pip install \[options] [-e] <local project path> ...
  pip install \[options] <archive url/path> ...

no such option: -y
Collecting nvidia-cudnn-cu12==9.10.2.21 (from torch>=1.11.0->sentence-transformers==2.7.0)
  Downloading nvidia_cudnn_cu12-9.10.2.21-py3-none-manylinux_2_27_x86_64.whl.metadata (1.8 kB)
Collecting nvidia-nvshmem-cu12==3.3.20 (from torch>=1.11.0->sentence-transformers==2.7.0)
  Downloading nvidia_nvshmem_cu12-3.3.20-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (2.1 kB)
Collecting triton==3.5.0 (from torch>=1.11.0->sentence-transformers==2.7.0)
  Downloading triton-3.5.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (1.7 kB)
Downloading nvidia_cudnn_cu12-9.10.2.21-py3-none-manylinux_2_27

In [None]:
import os, re, json, time, math, statistics
from pathlib import Path

import numpy as np
import torch
import faiss

from sentence_transformers import SentenceTransformer, CrossEncoder
from transformers import AutoTokenizer, AutoModelForCausalLM

# Report library versions and the GPU in use so the latency numbers recorded
# further down can be tied to a concrete environment.
print("NumPy:", np.__version__)
print("Torch:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    # Device 0 is the only GPU queried here.
    print("GPU:", torch.cuda.get_device_name(0))

NumPy: 1.26.4
Torch: 2.9.0+cu128
CUDA available: True
GPU: Tesla T4


In [None]:
# Project folder on the mounted Drive: the prebuilt index is read from
# INDEX_DIR and benchmark results are written to OUT_DIR.
BASE_DIR = Path("/content/drive/MyDrive/PerfWattLab_RAG")
INDEX_DIR = BASE_DIR / "index"
OUT_DIR = BASE_DIR / "outputs"
OUT_DIR.mkdir(parents=True, exist_ok=True)

index_path = INDEX_DIR / "faiss.index"
chunks_path = INDEX_DIR / "chunks.json"

# Fail early with a readable message if the index artefacts are missing.
assert index_path.exists(), f"Missing {index_path}"
assert chunks_path.exists(), f"Missing {chunks_path}"

index = faiss.read_index(str(index_path))
# Read the chunk metadata with an explicit encoding so the result does not
# depend on the platform's default locale.
with open(chunks_path, "r", encoding="utf-8") as f:
    chunks = json.load(f)

# Retrieval looks chunks up by FAISS row id (chunks[i]), so the two artefacts
# must be non-empty and have exactly the same number of entries.
assert chunks, f"No chunks found in {chunks_path}"
assert index.ntotal == len(chunks), (
    f"Index/chunk mismatch: index has {index.ntotal} vectors but "
    f"{chunks_path} has {len(chunks)} chunks"
)

print("Loaded FAISS ntotal:", index.ntotal)
print("Loaded chunks:", len(chunks))
print("Example chunk id:", chunks[0]["chunk_id"])

Loaded FAISS ntotal: 5
Loaded chunks: 5
Example chunk id: doc1.txt::chunk0


In [None]:
# Model used to embed queries for the FAISS search.
# NOTE(review): presumably this must be the same model that built the index — confirm.
embed_model_name = "sentence-transformers/all-MiniLM-L6-v2"
# Cross-encoder used to re-score (query, chunk) pairs after retrieval.
reranker_name = "cross-encoder/ms-marco-MiniLM-L-6-v2"

embedder = SentenceTransformer(embed_model_name)
# Move the embedder to the GPU when one is available.
if torch.cuda.is_available():
    embedder = embedder.to("cuda")

# The reranker is placed on the same kind of device as the embedder.
reranker = CrossEncoder(reranker_name, device="cuda" if torch.cuda.is_available() else "cpu")

print("Embedder:", embed_model_name)
print("Reranker:", reranker_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/794 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

Embedder: sentence-transformers/all-MiniLM-L6-v2
Reranker: cross-encoder/ms-marco-MiniLM-L-6-v2


In [None]:
# Causal language model used to generate the final answer.
gen_model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

tokenizer = AutoTokenizer.from_pretrained(gen_model_name, use_fast=True)
# Load weights in float16 when a GPU is available, float32 otherwise;
# device placement is delegated to `device_map="auto"`.
model = AutoModelForCausalLM.from_pretrained(
    gen_model_name,
    device_map="auto",
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
)
# Inference only: switch the module to evaluation mode.
model.eval()

if torch.cuda.is_available():
    # Opt in to TF32 for float32 matmuls.
    # NOTE(review): the model is loaded in float16 on GPU, so these float32
    # settings may have no effect on generation — confirm they are needed.
    torch.backends.cuda.matmul.allow_tf32 = True
    try:
        torch.set_float32_matmul_precision("high")
    except Exception:
        # Best effort: any failure to set the precision is deliberately ignored.
        pass

print("Generator:", gen_model_name)

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Generator: TinyLlama/TinyLlama-1.1B-Chat-v1.0


  _C._set_float32_matmul_precision(precision)


In [None]:
def retrieve(query: str, top_k: int = 10):
    """Embed ``query`` and return its nearest chunks from the FAISS index.

    Args:
        query: Natural-language question to search for.
        top_k: Maximum number of hits to return. It is capped at
            ``index.ntotal``: when asked for more neighbours than it holds,
            FAISS pads the result with id ``-1``, and ``chunks[-1]`` would
            silently return the last chunk instead of "no result".

    Returns:
        ``(results, elapsed_ms)``. ``results`` is a list of dicts with keys
        ``chunk_id``, ``doc_id``, ``text`` and ``score`` in the order FAISS
        returned them; ``elapsed_ms`` is the wall-clock time of the
        embed + search step in milliseconds.
    """
    k = max(0, min(int(top_k), int(index.ntotal)))
    if k == 0:
        # Nothing to search for (empty index or non-positive top_k).
        return [], 0.0

    t0 = time.perf_counter()
    q_emb = embedder.encode([query], convert_to_numpy=True, normalize_embeddings=True).astype(np.float32)
    scores, idxs = index.search(q_emb, k)
    t1 = time.perf_counter()

    results = []
    for score, i in zip(scores[0], idxs[0]):
        i = int(i)
        # A negative id is FAISS' "no neighbour" marker, not a Python index.
        if i < 0:
            continue
        c = chunks[i]
        results.append({
            "chunk_id": c["chunk_id"],
            "doc_id": c["doc_id"],
            "text": c["text"],
            "score": float(score)
        })
    return results, (t1 - t0) * 1000.0


def rerank(query: str, retrieved, top_k: int = 5):
    """Re-score retrieved chunks with the cross-encoder and keep the best ones.

    Args:
        query: The user question.
        retrieved: Iterable of hit dicts carrying at least ``chunk_id`` and
            ``text``. The dicts are not modified; scored copies are returned.
        top_k: Maximum number of chunks to return.

    Returns:
        ``(reranked, elapsed_ms)``. ``reranked`` holds at most ``top_k``
        copies of the unique hits, each with an added ``rerank_score``,
        sorted by that score in descending order; ``elapsed_ms`` is the
        wall-clock time of this step in milliseconds.
    """
    t0 = time.perf_counter()

    # De-duplicate before scoring: hits sharing a chunk_id would get the same
    # cross-encoder score, so scoring them twice is wasted work. The first
    # occurrence is kept, matching what a stable sort + de-dup would retain.
    first_by_id = {}
    for r in retrieved:
        first_by_id.setdefault(r["chunk_id"], r)
    candidates = [dict(r) for r in first_by_id.values()]

    # Skip the model call entirely when there is nothing to score.
    if candidates:
        pairs = [(query, c["text"]) for c in candidates]
        scores = reranker.predict(pairs)
        for c, s in zip(candidates, scores):
            c["rerank_score"] = float(s)
        candidates.sort(key=lambda c: c["rerank_score"], reverse=True)

    t1 = time.perf_counter()
    return candidates[:top_k], (t1 - t0) * 1000.0


def build_prompt(query: str, context_chunks):
    """Assemble the generation prompt from the question and its context.

    Each chunk's ``text`` is prefixed with a 1-based ``[n]`` marker and the
    chunks are separated by blank lines. The prompt ends with ``Answer:`` so
    the model continues from there.
    """
    numbered = []
    for position, chunk in enumerate(context_chunks, start=1):
        numbered.append(f"[{position}] {chunk['text']}")
    context = "\n\n".join(numbered)

    return (
        "You are a helpful assistant. Use the context to answer the question.\n"
        "If the context is not enough, say you are not sure.\n"
        "\n"
        "Context:\n"
        f"{context}\n"
        "\n"
        "Question:\n"
        f"{query}\n"
        "\n"
        "Answer:"
    )

In [None]:
@torch.inference_mode()
def generate_text(prompt: str, max_new_tokens: int, do_sample: bool, temperature: float, top_p: float):
    """Run one generation call and measure its latency and throughput.

    Args:
        prompt: Full prompt text to feed the model.
        max_new_tokens: Upper bound on the number of generated tokens.
        do_sample: Sample when True; otherwise decode without sampling.
        temperature: Sampling temperature, forwarded only when ``do_sample``.
        top_p: Nucleus-sampling threshold, forwarded only when ``do_sample``.

    Returns:
        ``(text, elapsed_ms, gen_tokens, toks_per_sec)``. ``text`` is the
        decoded full output sequence (prompt tokens included), ``elapsed_ms``
        covers tokenization plus generation but not decoding, ``gen_tokens``
        is the output length minus the prompt length, and ``toks_per_sec`` is
        ``gen_tokens`` divided by the elapsed seconds.
    """
    t0 = time.perf_counter()

    inputs = tokenizer(prompt, return_tensors="pt")
    # Follow the model's actual device instead of assuming "cuda", so the
    # inputs always end up where the weights are.
    target_device = model.device
    inputs = {k: v.to(target_device) for k, v in inputs.items()}

    out = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        do_sample=do_sample,
        # Sampling knobs are passed as None when sampling is off.
        temperature=temperature if do_sample else None,
        top_p=top_p if do_sample else None,
        use_cache=True
    )

    # CUDA work is queued asynchronously; wait for it to finish so the timer
    # covers completed work rather than just kernel launches.
    if torch.cuda.is_available():
        torch.cuda.synchronize()
    t1 = time.perf_counter()

    gen_tokens = int(out.shape[1] - inputs["input_ids"].shape[1])
    # Clamp to avoid a division by zero on an implausibly fast call.
    seconds = max(t1 - t0, 1e-9)
    toks_per_sec = gen_tokens / seconds

    text = tokenizer.decode(out[0], skip_special_tokens=True)
    return text, seconds * 1000.0, gen_tokens, toks_per_sec

In [None]:
def rag_once(query: str, retrieve_k: int, rerank_k: int, gen_cfg: dict):
    """Run retrieve -> rerank -> generate once and report per-stage timings.

    ``gen_cfg`` must provide ``max_new_tokens``, ``do_sample``,
    ``temperature`` and ``top_p``. The returned dict holds the latency of
    each stage in milliseconds, their sum, the generated-token count, the
    throughput, and the last 400 characters of the generated text.
    """
    candidates, retrieval_ms = retrieve(query, top_k=retrieve_k)
    context_chunks, rerank_ms = rerank(query, candidates, top_k=rerank_k)

    generation_keys = ("max_new_tokens", "do_sample", "temperature", "top_p")
    answer, generation_ms, n_tokens, tokens_per_sec = generate_text(
        prompt=build_prompt(query, context_chunks),
        **{key: gen_cfg[key] for key in generation_keys},
    )

    timings = {
        "retrieval_ms": retrieval_ms,
        "rerank_ms": rerank_ms,
        "generation_ms": generation_ms,
        "total_ms": retrieval_ms + rerank_ms + generation_ms,
    }
    return {
        **timings,
        "gen_tokens": n_tokens,
        "toks_per_sec": tokens_per_sec,
        # Only the tail of the text is kept as a preview.
        "answer_preview": answer[-400:],
    }

In [None]:
# Benchmark workload: five seed questions, each asked six times (30 queries),
# so the percentile statistics are computed over repeated runs.
queries = 6 * [
    "What is CUDA and why is it useful?",
    "What is Triton Inference Server used for?",
    "Why do people use FAISS in RAG systems?",
    "What are Prometheus and Grafana used for?",
    "Explain dynamic batching in simple terms.",
]
print("Total benchmark queries:", len(queries))

Total benchmark queries: 30


In [None]:
# Generation settings to compare: three non-sampling runs with growing token
# budgets and one sampled run.
sweep = [
    dict(name="det_96", max_new_tokens=96, do_sample=False, temperature=0.0, top_p=1.0),
    dict(name="det_160", max_new_tokens=160, do_sample=False, temperature=0.0, top_p=1.0),
    dict(name="det_256", max_new_tokens=256, do_sample=False, temperature=0.0, top_p=1.0),
    dict(name="samp_160", max_new_tokens=160, do_sample=True, temperature=0.7, top_p=0.9),
]

In [None]:
def percentile(xs, p):
    """Return the ``p``-th percentile of ``xs`` using linear interpolation.

    Args:
        xs: Iterable of numbers; it is sorted internally and not modified.
        p: Percentile in the closed range [0, 100].

    Returns:
        The interpolated percentile, or ``None`` when ``xs`` is empty.

    Raises:
        ValueError: If ``p`` lies outside [0, 100]. Without this check a
            negative ``p`` would wrap around to elements at the end of the
            list and a ``p`` above 100 would raise a bare ``IndexError``.
    """
    ordered = sorted(xs)
    if not ordered:
        return None
    if not 0 <= p <= 100:
        raise ValueError(f"percentile must be within [0, 100], got {p!r}")

    # Fractional rank of the requested percentile in the sorted data.
    rank = (len(ordered) - 1) * (p / 100.0)
    lower = math.floor(rank)
    upper = math.ceil(rank)
    if lower == upper:
        # The rank falls exactly on an element; no interpolation needed.
        return ordered[lower]
    return ordered[lower] * (upper - rank) + ordered[upper] * (rank - lower)

all_rows = []

# Fail early with a clear message instead of crashing later while rounding
# a percentile of an empty run.
assert queries, "No benchmark queries defined"
assert sweep, "No generation configs defined"

# Untimed warm-up: the first end-to-end call typically includes one-time
# initialisation cost, which would otherwise be charged to whichever config
# happens to run first and inflate its tail latency.
rag_once(queries[0], retrieve_k=10, rerank_k=5, gen_cfg=sweep[0])

for cfg in sweep:
    print("\nRunning config:", cfg["name"])
    # Re-seed per config so the sampled run is repeatable and does not depend
    # on how many random draws earlier configs consumed.
    torch.manual_seed(0)
    run_rows = []

    for q in queries:
        r = rag_once(q, retrieve_k=10, rerank_k=5, gen_cfg=cfg)
        run_rows.append(r)

    # Collect each metric across all queries of this config.
    retrieval = [x["retrieval_ms"] for x in run_rows]
    rerank_t = [x["rerank_ms"] for x in run_rows]
    gen_t = [x["generation_ms"] for x in run_rows]
    total = [x["total_ms"] for x in run_rows]
    tps = [x["toks_per_sec"] for x in run_rows]

    summary = {
        "config": cfg["name"],
        "max_new_tokens": cfg["max_new_tokens"],
        "do_sample": cfg["do_sample"],
        "p50_total_ms": percentile(total, 50),
        "p95_total_ms": percentile(total, 95),
        "p50_gen_ms": percentile(gen_t, 50),
        "p95_gen_ms": percentile(gen_t, 95),
        "p50_toks_per_sec": percentile(tps, 50),
        "p95_toks_per_sec": percentile(tps, 95),
        "mean_retrieval_ms": float(np.mean(retrieval)),
        "mean_rerank_ms": float(np.mean(rerank_t)),
    }
    all_rows.append(summary)

    print("p50 total ms:", round(summary["p50_total_ms"], 2), "p95 total ms:", round(summary["p95_total_ms"], 2))
    print("p50 gen ms:", round(summary["p50_gen_ms"], 2), "p95 gen ms:", round(summary["p95_gen_ms"], 2))
    print("p50 toks per sec:", round(summary["p50_toks_per_sec"], 2))

# Persist one summary row per config.
out_json = OUT_DIR / "day2_sweep_results.json"
with open(out_json, "w", encoding="utf-8") as f:
    json.dump(all_rows, f, indent=2)

print("\nSaved:", out_json)


Running config: det_96
p50 total ms: 1685.97 p95 total ms: 3248.77
p50 gen ms: 1668.84 p95 gen ms: 3229.24
p50 toks per sec: 36.13

Running config: det_160
p50 total ms: 1649.45 p95 total ms: 5258.66
p50 gen ms: 1633.63 p95 gen ms: 5242.15
p50 toks per sec: 36.22

Running config: det_256
p50 total ms: 1776.53 p95 total ms: 7651.09
p50 gen ms: 1753.9 p95 gen ms: 7631.67
p50 toks per sec: 35.7

Running config: samp_160
p50 total ms: 3239.31 p95 total ms: 5375.68
p50 gen ms: 3219.44 p95 gen ms: 5359.8
p50 toks per sec: 34.93

Saved: /content/drive/MyDrive/PerfWattLab_RAG/outputs/day2_sweep_results.json


In [None]:
import csv

# Write the per-config summaries as CSV; the header is taken from the keys
# of the first summary row.
out_csv = OUT_DIR / "day2_sweep_results.csv"
with open(out_csv, "w", newline="") as csv_file:
    column_names = list(all_rows[0])
    writer = csv.DictWriter(csv_file, fieldnames=column_names)
    writer.writeheader()
    writer.writerows(all_rows)

print("Saved:", out_csv)

Saved: /content/drive/MyDrive/PerfWattLab_RAG/outputs/day2_sweep_results.csv


In [None]:
# Pick the config with the lowest p95 end-to-end latency; ties are broken in
# favour of the higher median throughput (hence the negated second key).
best = sorted(
    all_rows,
    key=lambda row: (row["p95_total_ms"], -row["p50_toks_per_sec"]),
)[0]
print("Best config:", best["config"])
print(json.dumps(best, indent=2))

Best config: det_96
{
  "config": "det_96",
  "max_new_tokens": 96,
  "do_sample": false,
  "p50_total_ms": 1685.9708845000796,
  "p95_total_ms": 3248.7748534001066,
  "p50_gen_ms": 1668.836912500069,
  "p95_gen_ms": 3229.244048949965,
  "p50_toks_per_sec": 36.13182342997985,
  "p95_toks_per_sec": 37.45599231621591,
  "mean_retrieval_ms": 8.285625600008947,
  "mean_rerank_ms": 12.292607633344232
}
