In [1]:
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip -q install --upgrade pip
!pip install -y numpy sentence-transformers transformers faiss-cpu
!pip install "numpy==1.26.4"
!pip install "faiss-cpu==1.8.0.post1" sentence-transformers==2.7.0 transformers==4.41.2 accelerate==0.30.1 bitsandbytes==0.43.1

[0m
[optparse.groups]Usage:[/]   
  pip install \[options] <requirement specifier> \[package-index-options] ...
  pip install \[options] -r <requirements file> \[package-index-options] ...
  pip install \[options] [-e] <vcs project url> ...
  pip install \[options] [-e] <local project path> ...
  pip install \[options] <archive url/path> ...

no such option: -y
Collecting sentence-transformers==2.7.0
  Using cached sentence_transformers-2.7.0-py3-none-any.whl.metadata (11 kB)
Collecting accelerate==0.30.1
  Using cached accelerate-0.30.1-py3-none-any.whl.metadata (18 kB)
Collecting bitsandbytes==0.43.1
  Using cached bitsandbytes-0.43.1-py3-none-manylinux_2_24_x86_64.whl.metadata (2.2 kB)
Using cached sentence_transformers-2.7.0-py3-none-any.whl (171 kB)
Using cached accelerate-0.30.1-py3-none-any.whl (302 kB)
Using cached bitsandbytes-0.43.1-py3-none-manylinux_2_24_x86_64.whl (119.8 MB)
[0mInstalling collected packages: bitsandbytes, accelerate, sentence-transformers
[2K  Attemptin

In [3]:
import os, re, json, time, math, statistics
from pathlib import Path

import numpy as np
import torch
import faiss

from sentence_transformers import SentenceTransformer, CrossEncoder
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

print("NumPy:", np.__version__)
print("Torch:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("GPU:", torch.cuda.get_device_name(0))

BASE_DIR = Path("/content/drive/MyDrive/PerfWattLab_RAG")
INDEX_DIR = BASE_DIR / "index"
OUT_DIR = BASE_DIR / "outputs"
OUT_DIR.mkdir(parents=True, exist_ok=True)

TRACE_DIR = OUT_DIR / "day3_traces"
TRACE_DIR.mkdir(parents=True, exist_ok=True)

index_path = INDEX_DIR / "faiss.index"
chunks_path = INDEX_DIR / "chunks.json"

assert index_path.exists(), f"Missing {index_path}"
assert chunks_path.exists(), f"Missing {chunks_path}"

index = faiss.read_index(str(index_path))
with open(chunks_path, "r") as f:
    chunks = json.load(f)

print("Loaded FAISS ntotal:", index.ntotal)
print("Loaded chunks:", len(chunks))
print("Example chunk id:", chunks[0]["chunk_id"])

NumPy: 1.26.4
Torch: 2.9.0+cu128
CUDA available: True
GPU: Tesla T4
Loaded FAISS ntotal: 5
Loaded chunks: 5
Example chunk id: doc1.txt::chunk0


In [4]:
embed_model_name = "sentence-transformers/all-MiniLM-L6-v2"
reranker_name = "cross-encoder/ms-marco-MiniLM-L-6-v2"

embedder = SentenceTransformer(embed_model_name)
if torch.cuda.is_available():
    embedder = embedder.to("cuda")

reranker = CrossEncoder(reranker_name, device="cuda" if torch.cuda.is_available() else "cpu")

print("Embedder:", embed_model_name)
print("Reranker:", reranker_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/794 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

Embedder: sentence-transformers/all-MiniLM-L6-v2
Reranker: cross-encoder/ms-marco-MiniLM-L-6-v2


In [5]:
# Cell 5  Load generator baseline using pipeline (we will profile this first)
gen_model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

tokenizer = AutoTokenizer.from_pretrained(gen_model_name, use_fast=True)
model = AutoModelForCausalLM.from_pretrained(
    gen_model_name,
    device_map="auto",
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
)
model.eval()

gen_pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)

print("Generator loaded:", gen_model_name)

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Generator loaded: TinyLlama/TinyLlama-1.1B-Chat-v1.0


In [6]:
# Cell 6  RAG functions (retrieve, rerank, prompt)
def retrieve(query: str, top_k: int = 10):
    t0 = time.perf_counter()
    q_emb = embedder.encode([query], convert_to_numpy=True, normalize_embeddings=True).astype(np.float32)
    scores, idxs = index.search(q_emb, top_k)
    t1 = time.perf_counter()

    results = []
    for score, i in zip(scores[0], idxs[0]):
        c = chunks[int(i)]
        results.append({
            "chunk_id": c["chunk_id"],
            "doc_id": c["doc_id"],
            "text": c["text"],
            "score": float(score),
        })
    return results, (t1 - t0) * 1000.0

def rerank(query: str, retrieved, top_k: int = 5):
    t0 = time.perf_counter()
    pairs = [(query, r["text"]) for r in retrieved]
    scores = reranker.predict(pairs)

    for r, s in zip(retrieved, scores):
        r["rerank_score"] = float(s)

    reranked = sorted(retrieved, key=lambda x: x["rerank_score"], reverse=True)[:top_k]
    t1 = time.perf_counter()
    return reranked, (t1 - t0) * 1000.0

def build_prompt(query: str, context_chunks):
    context = "\n\n".join([f"[{i+1}] {c['text']}" for i, c in enumerate(context_chunks)])
    prompt = f"""You are a helpful assistant. Use the context to answer the question.
If the context is not enough, say you are not sure.

Context:
{context}

Question:
{query}

Answer:"""
    return prompt

In [7]:
# Cell 7  Benchmark helper (latency and throughput)
def percentile(xs, p):
    xs = sorted(xs)
    if not xs:
        return None
    k = (len(xs) - 1) * (p / 100.0)
    f = math.floor(k)
    c = math.ceil(k)
    if f == c:
        return xs[int(k)]
    return xs[f] * (c - k) + xs[c] * (k - f)

def run_benchmark(generate_fn, queries, retrieve_k=10, rerank_k=5):
    total_ms = []
    gen_ms = []
    tok_per_sec = []
    retr_ms = []
    rer_ms = []

    for q in queries:
        retrieved, t_retr = retrieve(q, top_k=retrieve_k)
        reranked, t_rer = rerank(q, retrieved, top_k=rerank_k)

        prompt = build_prompt(q, reranked)
        text, t_gen, gen_tokens, tps = generate_fn(prompt)

        retr_ms.append(t_retr)
        rer_ms.append(t_rer)
        gen_ms.append(t_gen)
        total_ms.append(t_retr + t_rer + t_gen)
        tok_per_sec.append(tps)

    summary = {
        "p50_total_ms": percentile(total_ms, 50),
        "p95_total_ms": percentile(total_ms, 95),
        "p50_gen_ms": percentile(gen_ms, 50),
        "p95_gen_ms": percentile(gen_ms, 95),
        "p50_toks_per_sec": percentile(tok_per_sec, 50),
        "mean_retrieval_ms": float(np.mean(retr_ms)),
        "mean_rerank_ms": float(np.mean(rer_ms)),
    }
    return summary

In [8]:
# Cell 8  Define baseline generator (pipeline) and quick test
def generate_pipeline(prompt: str, max_new_tokens: int = 96):
    t0 = time.perf_counter()
    out = gen_pipe(
        prompt,
        max_new_tokens=max_new_tokens,
        do_sample=False,
        temperature=0.0,
        return_full_text=False
    )[0]["generated_text"]
    t1 = time.perf_counter()

    gen_tokens = max(1, len(tokenizer.encode(out)))
    seconds = max(t1 - t0, 1e-9)
    tps = gen_tokens / seconds
    return out, (t1 - t0) * 1000.0, gen_tokens, tps

test_prompt = "Say hello in one sentence."
print(generate_pipeline(test_prompt, max_new_tokens=32)[0][:120])





1. "I'm a writer, and I love to write."

2. "I'm a writer who loves to write


In [11]:
TB_LOGDIR = str(TRACE_DIR / "tb_logs")
print("TensorBoard logdir:", TB_LOGDIR)

TensorBoard logdir: /content/drive/MyDrive/PerfWattLab_RAG/outputs/day3_traces/tb_logs


In [13]:
from torch.profiler import profile, ProfilerActivity, schedule
from torch.profiler import tensorboard_trace_handler

# warmup once so compilation, cache fill, etc. does not pollute the trace
_ = generate_pipeline("Warmup.", max_new_tokens=32)

# This schedule records 1 active step after warmup
prof_schedule = schedule(wait=1, warmup=1, active=1, repeat=1)

with profile(
    activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
    schedule=prof_schedule,
    on_trace_ready=tensorboard_trace_handler(TB_LOGDIR, worker_name="before_pipeline"),
    record_shapes=True,
    profile_memory=True,
    with_stack=False
) as prof:
    # You must call prof.step() each iteration so the schedule triggers
    for step in range(3):
        _ = generate_pipeline("Write one short sentence about GPUs.", max_new_tokens=96)
        if torch.cuda.is_available():
            torch.cuda.synchronize()
        prof.step()

print("Wrote TensorBoard profiler trace under:", TB_LOGDIR)



Wrote TensorBoard profiler trace under: /content/drive/MyDrive/PerfWattLab_RAG/outputs/day3_traces/tb_logs


In [15]:
!ls -R "/content/drive/MyDrive/PerfWattLab_RAG/outputs/day3_traces/tb_logs" | head -200

/content/drive/MyDrive/PerfWattLab_RAG/outputs/day3_traces/tb_logs:
before_pipeline.1770835748214739668.pt.trace.json


In [16]:
!find "/content/drive/MyDrive/PerfWattLab_RAG/outputs/day3_traces/tb_logs" -type f | head -50

/content/drive/MyDrive/PerfWattLab_RAG/outputs/day3_traces/tb_logs/before_pipeline.1770835748214739668.pt.trace.json


In [17]:
from torch.profiler import profile, ProfilerActivity, schedule, tensorboard_trace_handler

TB_LOGDIR = "/content/drive/MyDrive/PerfWattLab_RAG/outputs/day3_traces/tb_profile"
print("TB_LOGDIR:", TB_LOGDIR)

!rm -rf "{TB_LOGDIR}"
!mkdir -p "{TB_LOGDIR}"

# warmup
_ = generate_pipeline("Warmup.", max_new_tokens=16)
if torch.cuda.is_available():
    torch.cuda.synchronize()

prof_schedule = schedule(wait=0, warmup=1, active=2, repeat=1)

with profile(
    activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
    schedule=prof_schedule,
    on_trace_ready=tensorboard_trace_handler(TB_LOGDIR),
    record_shapes=True,
    profile_memory=True,
    with_stack=False,
) as prof:
    for _ in range(3):  # 1 warmup + 2 recorded
        _ = generate_pipeline("Write one short sentence about GPUs.", max_new_tokens=64)
        if torch.cuda.is_available():
            torch.cuda.synchronize()
        prof.step()

print("Wrote TensorBoard profiler files. Sample:")
!find "{TB_LOGDIR}" -maxdepth 5 -type f | head -30

TB_LOGDIR: /content/drive/MyDrive/PerfWattLab_RAG/outputs/day3_traces/tb_profile




Wrote TensorBoard profiler files. Sample:
/content/drive/MyDrive/PerfWattLab_RAG/outputs/day3_traces/tb_profile/d2b348a4118b_1653.1770836121714493033.pt.trace.json


In [22]:
from torch.profiler import profile, ProfilerActivity, schedule
from torch.profiler import tensorboard_trace_handler

TB_LOGDIR = "/content/tb_profile"

!rm -rf "{TB_LOGDIR}"
!mkdir -p "{TB_LOGDIR}"

prof_schedule = schedule(wait=0, warmup=1, active=2)

with profile(
    activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
    schedule=prof_schedule,
    on_trace_ready=tensorboard_trace_handler(TB_LOGDIR),
    record_shapes=True,
    profile_memory=True,
) as prof:
    for _ in range(3):
        _ = generate_pipeline("Test run", max_new_tokens=64)
        if torch.cuda.is_available():
            torch.cuda.synchronize()
        prof.step()

print("Done.")

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Done.


In [24]:
@torch.inference_mode()
def generate_direct(prompt: str, max_new_tokens: int = 96):
    t0 = time.perf_counter()

    inputs = tokenizer(prompt, return_tensors="pt")
    if torch.cuda.is_available():
        inputs = {k: v.to("cuda") for k, v in inputs.items()}

    out_ids = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        do_sample=False,
        use_cache=True,
    )

    if torch.cuda.is_available():
        torch.cuda.synchronize()

    t1 = time.perf_counter()

    gen_tokens = int(out_ids.shape[1] - inputs["input_ids"].shape[1])
    gen_tokens = max(gen_tokens, 1)

    seconds = max(t1 - t0, 1e-9)
    tps = gen_tokens / seconds

    text = tokenizer.decode(out_ids[0], skip_special_tokens=True)
    return text, (t1 - t0) * 1000.0, gen_tokens, tps

print(generate_direct("Say hello in one sentence.", max_new_tokens=32)[0][:120])

Say hello in one sentence.

1. Say hello in one sentence.

Example:

- Say hello in one sentence.
- Say hello to my frie


In [25]:
# Cell 12  Torch profiler after fix (direct generation)
after_trace = str(TRACE_DIR / "after_direct")

prompt = "Write one short sentence about GPUs."
with profile(
    activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
    record_shapes=True,
    profile_memory=True,
    with_stack=False
) as prof2:
    _ = generate_direct(prompt, max_new_tokens=96)

prof2.export_chrome_trace(after_trace + ".json")

print("Saved after trace to:", after_trace + ".json")
print(prof2.key_averages().table(sort_by="cuda_time_total", row_limit=15))

Saved after trace to: /content/drive/MyDrive/PerfWattLab_RAG/outputs/day3_traces/after_direct.json
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg       CPU Mem  Self CPU Mem      CUDA Mem  Self CUDA Mem    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                           aten::matmul         4.13%       1.668ms        27.08%      10.934ms      61.775us       0.000

In [26]:
_ = generate_pipeline("warmup", max_new_tokens=32)
_ = generate_direct("warmup", max_new_tokens=32)



In [27]:
queries = [
    "What is CUDA and why is it useful?",
    "What is Triton Inference Server used for?",
    "Why do people use FAISS in RAG systems?",
    "What are Prometheus and Grafana used for?",
    "Explain dynamic batching in simple terms.",
]
queries = queries * 6  # 30 runs

before = run_benchmark(lambda p: generate_pipeline(p, max_new_tokens=96), queries)
after = run_benchmark(lambda p: generate_direct(p, max_new_tokens=96), queries)

rows = [
    {"config": "before_pipeline", **before},
    {"config": "after_direct", **after},
]

out_json = OUT_DIR / "day3_before_after.json"
with open(out_json, "w") as f:
    json.dump(rows, f, indent=2)

out_csv = OUT_DIR / "day3_before_after.csv"
import csv
with open(out_csv, "w", newline="") as f:
    w = csv.DictWriter(f, fieldnames=list(rows[0].keys()))
    w.writeheader()
    w.writerows(rows)

print("Saved:", out_json)
print("Saved:", out_csv)
print(json.dumps(rows, indent=2))

Saved: /content/drive/MyDrive/PerfWattLab_RAG/outputs/day3_before_after.json
Saved: /content/drive/MyDrive/PerfWattLab_RAG/outputs/day3_before_after.csv
[
  {
    "config": "before_pipeline",
    "p50_total_ms": 3257.21807050013,
    "p95_total_ms": 3648.393341600149,
    "p50_gen_ms": 3240.6787879999683,
    "p95_gen_ms": 3631.903733750187,
    "p50_toks_per_sec": 29.043203951546747,
    "mean_retrieval_ms": 19.86548256666841,
    "mean_rerank_ms": 15.811112166678262
  },
  {
    "config": "after_direct",
    "p50_total_ms": 2987.5910765003937,
    "p95_total_ms": 3780.7848060498145,
    "p50_gen_ms": 2969.7956125005476,
    "p95_gen_ms": 3752.981744700355,
    "p50_toks_per_sec": 31.859409145453576,
    "mean_retrieval_ms": 7.84783370002818,
    "mean_rerank_ms": 10.488414999963425
  }
]


In [28]:
note = f"""
# Day 3 Profiling Note

## Baseline
I profiled generation using the transformers pipeline API.

## Bottleneck found
Pipeline based generation adds extra overhead around the model call, including CPU side work and extra tensor handling, which increases end to end latency. In the profiler, this appears as higher CPU time and more overhead around GPU execution.

## Fix applied
I switched from pipeline generation to calling model.generate directly under torch.inference_mode, and I explicitly moved tokenized inputs onto the GPU.

## Why it worked
This reduces Python and pipeline overhead, reduces unnecessary CPU GPU transfers, and makes the execution path closer to a production inference call. The after profile shows lower overhead and improved end to end latency and throughput.

## Results
Before:
{json.dumps(before, indent=2)}

After:
{json.dumps(after, indent=2)}
"""

note_path = OUT_DIR / "day3_note.md"
with open(note_path, "w") as f:
    f.write(note)

print("Saved note:", note_path)
print(note[:600])


Saved note: /content/drive/MyDrive/PerfWattLab_RAG/outputs/day3_note.md

# Day 3 Profiling Note

## Baseline
I profiled generation using the transformers pipeline API.

## Bottleneck found
Pipeline based generation adds extra overhead around the model call, including CPU side work and extra tensor handling, which increases end to end latency. In the profiler, this appears as higher CPU time and more overhead around GPU execution.

## Fix applied
I switched from pipeline generation to calling model.generate directly under torch.inference_mode, and I explicitly moved tokenized inputs onto the GPU.

## Why it worked
This reduces Python and pipeline overhead, reduces 
