In [2]:
from google.colab import drive

# Mount Google Drive; the project directories used by later cells live under this path.
DRIVE_MOUNT_POINT = "/content/drive"
drive.mount(DRIVE_MOUNT_POINT)


Mounted at /content/drive


In [1]:
!pip -q install --upgrade pip
!pip install -y numpy sentence-transformers transformers faiss-cpu
!pip install "numpy==1.26.4"
!pip install "faiss-cpu==1.8.0.post1" sentence-transformers==2.7.0 transformers==4.41.2 accelerate==0.30.1 bitsandbytes==0.43.1


[optparse.groups]Usage:[/]   
  pip install \[options] <requirement specifier> \[package-index-options] ...
  pip install \[options] -r <requirements file> \[package-index-options] ...
  pip install \[options] [-e] <vcs project url> ...
  pip install \[options] [-e] <local project path> ...
  pip install \[options] <archive url/path> ...

no such option: -y
Collecting faiss-cpu==1.8.0.post1
  Using cached faiss_cpu-1.8.0.post1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.7 kB)
Collecting sentence-transformers==2.7.0
  Using cached sentence_transformers-2.7.0-py3-none-any.whl.metadata (11 kB)
Collecting transformers==4.41.2
  Using cached transformers-4.41.2-py3-none-any.whl.metadata (43 kB)
Using cached faiss_cpu-1.8.0.post1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (27.0 MB)
Using cached sentence_transformers-2.7.0-py3-none-any.whl (171 kB)
Using cached transformers-4.41.2-py3-none-any.whl (9.1 MB)
Installing collected packages: faiss-cpu, t

In [2]:
import numpy as np
import torch
import faiss

# Sanity-check the installed stack: library versions and GPU visibility.
cuda_ok = torch.cuda.is_available()

print("NumPy:", np.__version__)
print("Torch:", torch.__version__, "CUDA:", cuda_ok)
print("FAISS OK")  # reaching this line means `import faiss` succeeded

print("CUDA available:", cuda_ok)
if cuda_ok:
    print("GPU:", torch.cuda.get_device_name(0))

NumPy: 1.26.4
Torch: 2.9.0+cu128 CUDA: True
FAISS OK
CUDA available: True
GPU: Tesla T4


In [3]:
import os, re, json, time, math
from pathlib import Path
import numpy as np
import faiss

from sentence_transformers import SentenceTransformer, CrossEncoder
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

In [4]:
# Project layout on the mounted Drive: raw documents, the persisted index, and run outputs.
BASE_DIR = Path("/content/drive/MyDrive/PerfWattLab_RAG")
DATA_DIR, INDEX_DIR, OUT_DIR = (BASE_DIR / sub for sub in ("data", "index", "outputs"))

# Create the directories up front (idempotent) so later cells can write without checks.
for directory in (DATA_DIR, INDEX_DIR, OUT_DIR):
    directory.mkdir(parents=True, exist_ok=True)

print("BASE_DIR:", BASE_DIR)

BASE_DIR: /content/drive/MyDrive/PerfWattLab_RAG


In [5]:
# Tiny demo corpus: file name -> document text. Each entry is written as one .txt file in DATA_DIR.
sample_docs = {
    "doc1.txt": "CUDA is a parallel computing platform and programming model developed by NVIDIA. It enables dramatic increases in computing performance by harnessing the power of the GPU.",
    "doc2.txt": "Triton Inference Server is an open source inference serving software that simplifies deployment of AI models at scale. It supports multiple frameworks and backends.",
    "doc3.txt": "FAISS is a library for efficient similarity search and clustering of dense vectors. It is commonly used for vector search in retrieval augmented generation pipelines.",
    "doc4.txt": "Prometheus is a monitoring system and time series database. Grafana is used to visualize metrics and build dashboards for observability.",
    "doc5.txt": "Dynamic batching combines multiple inference requests into a single batch to improve GPU throughput while maintaining latency constraints."
}

for name, text in sample_docs.items():
    # Explicit encoding so the files do not depend on the platform's locale default.
    (DATA_DIR / name).write_text(text, encoding="utf-8")

print("Wrote sample docs to:", DATA_DIR)
# glob() order is filesystem-dependent; sort so the listing is deterministic.
print("Files:", sorted(p.name for p in DATA_DIR.glob("*.txt")))

Wrote sample docs to: /content/drive/MyDrive/PerfWattLab_RAG/data
Files: ['doc1.txt', 'doc2.txt', 'doc3.txt', 'doc4.txt', 'doc5.txt']


In [6]:
def clean_text(s: str) -> str:
    """Normalize whitespace in ``s``.

    Non-breaking spaces (U+00A0) become regular spaces, every run of
    whitespace is collapsed to a single space, and leading/trailing
    whitespace is removed.
    """
    without_nbsp = s.replace("\u00a0", " ")
    collapsed = re.sub(r"\s+", " ", without_nbsp)
    return collapsed.strip()

# Load every .txt file in DATA_DIR (sorted by path for a stable order), normalize
# its whitespace, and keep only documents that still contain text.
docs = []
for p in sorted(DATA_DIR.glob("*.txt")):
    # Explicit UTF-8 instead of the locale default; undecodable bytes are dropped.
    text = clean_text(p.read_text(encoding="utf-8", errors="ignore"))
    if text:
        docs.append({"doc_id": p.name, "text": text})

print("Loaded docs:", len(docs))
if not docs:
    # Fail with a clear message instead of an IndexError on docs[0] below.
    raise FileNotFoundError(f"No non-empty .txt documents found in {DATA_DIR}")
print("Example:", docs[0]["doc_id"], docs[0]["text"][:120], "...")

Loaded docs: 5
Example: doc1.txt CUDA is a parallel computing platform and programming model developed by NVIDIA. It enables dramatic increases in comput ...


In [7]:
def chunk_text(text: str, chunk_size: int = 450, overlap: int = 80):
    """Split ``text`` into overlapping character windows.

    Each window spans up to ``chunk_size`` characters and begins
    ``chunk_size - overlap`` characters after the previous one, so that
    consecutive windows share ``overlap`` characters. Windows are stripped
    of surrounding whitespace and dropped if nothing remains.

    Returns:
        list[str]: the chunks in document order (empty for empty ``text``).

    Raises:
        ValueError: if ``chunk_size`` is not strictly greater than ``overlap``.
    """
    if not chunk_size > overlap:
        raise ValueError("chunk_size must be > overlap")

    total = len(text)
    stride = chunk_size - overlap  # strictly positive thanks to the check above
    pieces = []
    for begin in range(0, total, stride):
        stop = min(total, begin + chunk_size)
        piece = text[begin:stop].strip()
        if piece:
            pieces.append(piece)
        if stop == total:
            # This window reached the end of the text; any further window
            # would only repeat the overlap region.
            break
    return pieces

# Flatten every document into chunk records. The chunk id combines the source
# file name with the chunk's position inside that document.
chunks = [
    {
        "chunk_id": f"{doc['doc_id']}::chunk{pos}",
        "doc_id": doc["doc_id"],
        "text": piece,
    }
    for doc in docs
    for pos, piece in enumerate(chunk_text(doc["text"], chunk_size=450, overlap=80))
]

print("Total chunks:", len(chunks))
print("Example chunk:", chunks[0]["chunk_id"], chunks[0]["text"])

Total chunks: 5
Example chunk: doc1.txt::chunk0 CUDA is a parallel computing platform and programming model developed by NVIDIA. It enables dramatic increases in computing performance by harnessing the power of the GPU.


In [8]:
# Load the sentence-embedding model and move it to the GPU when one is available.
embed_model_name = "sentence-transformers/all-MiniLM-L6-v2"
embedder = SentenceTransformer(embed_model_name)
if torch.cuda.is_available():
    embedder = embedder.to("cuda")

texts = [chunk["text"] for chunk in chunks]

# Embed all chunk texts in one call and time it. Vectors are L2-normalized
# (normalize_embeddings=True) and returned as a NumPy array.
embed_start = time.perf_counter()
emb = embedder.encode(
    texts,
    batch_size=64,
    show_progress_bar=True,
    convert_to_numpy=True,
    normalize_embeddings=True,
)
embed_seconds = time.perf_counter() - embed_start

print("Embeddings shape:", emb.shape)
print("Embedding time seconds:", round(embed_seconds, 3))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Embeddings shape: (5, 384)
Embedding time seconds: 4.732


In [9]:
# Build an exact inner-product index over the chunk embeddings.
dim = emb.shape[1]
index = faiss.IndexFlatIP(dim)
index.add(emb.astype(np.float32))  # cast to float32 before adding to the index
print("FAISS ntotal:", index.ntotal)

# Persist the index and the chunk metadata side by side so they stay aligned.
index_path = INDEX_DIR / "faiss.index"
chunks_path = INDEX_DIR / "chunks.json"
faiss.write_index(index, str(index_path))
chunks_path.write_text(json.dumps(chunks, indent=2))

print("Saved index to:", INDEX_DIR)

FAISS ntotal: 5
Saved index to: /content/drive/MyDrive/PerfWattLab_RAG/index


In [10]:
# Cross-encoder used to re-score (query, passage) pairs; runs on the GPU when available.
reranker_name = "cross-encoder/ms-marco-MiniLM-L-6-v2"
reranker_device = "cuda" if torch.cuda.is_available() else "cpu"
reranker = CrossEncoder(reranker_name, device=reranker_device)
print("Reranker loaded:", reranker_name)



config.json:   0%|          | 0.00/794 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

Reranker loaded: cross-encoder/ms-marco-MiniLM-L-6-v2


In [11]:
# Causal LM used for answer generation, wrapped in a text-generation pipeline.
gen_model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

# Half precision when CUDA is available, full precision otherwise.
gen_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

tokenizer = AutoTokenizer.from_pretrained(gen_model_name, use_fast=True)
model = AutoModelForCausalLM.from_pretrained(
    gen_model_name,
    device_map="auto",
    torch_dtype=gen_dtype,
)
gen_pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

print("Generator loaded:", gen_model_name)

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Generator loaded: TinyLlama/TinyLlama-1.1B-Chat-v1.0


In [12]:
def retrieve(query: str, top_k: int = 10):
    """Return the chunks most similar to ``query`` from the FAISS index.

    The query is embedded with the module-level ``embedder`` (normalized,
    float32) and searched against the module-level ``index``; hits are mapped
    back to entries of the module-level ``chunks`` list by position.

    Args:
        query: Natural-language query text.
        top_k: Maximum number of chunks to return. At most ``index.ntotal``
            results are returned.

    Returns:
        tuple[list[dict], float]: the hits in the order returned by the index
        (each with ``chunk_id``, ``doc_id``, ``text`` and ``score``), and the
        elapsed time for embedding plus search in milliseconds.
    """
    t0 = time.perf_counter()

    # FAISS pads missing results with label -1 when more neighbours are
    # requested than the index holds. Clamp k so no padding is produced.
    k = min(int(top_k), int(index.ntotal))
    if k > 0:
        q_emb = embedder.encode(
            [query], convert_to_numpy=True, normalize_embeddings=True
        ).astype(np.float32)
        scores, idxs = index.search(q_emb, k)
        hits = list(zip(scores[0], idxs[0]))
    else:
        hits = []
    t1 = time.perf_counter()

    results = []
    for score, i in hits:
        i = int(i)
        if i < 0:
            # -1 means "no result"; chunks[-1] would silently return the last chunk.
            continue
        c = chunks[i]
        results.append({
            "chunk_id": c["chunk_id"],
            "doc_id": c["doc_id"],
            "text": c["text"],
            "score": float(score)
        })

    return results, (t1 - t0) * 1000.0

In [13]:
def rerank(query: str, retrieved, top_k: int = 5):
    """Re-score retrieved chunks with the cross-encoder and keep the best ones.

    Args:
        query: Natural-language query text.
        retrieved: Candidate chunk dicts; each must have ``text`` and
            ``chunk_id`` keys. The input dicts are not modified.
        top_k: Maximum number of chunks to return.

    Returns:
        tuple[list[dict], float]: copies of the candidates with an added
        ``rerank_score`` key, sorted by that score in descending order and
        de-duplicated by ``chunk_id`` (highest-scoring copy wins), truncated
        to ``top_k``; and the elapsed time in milliseconds.
    """
    t0 = time.perf_counter()

    candidates = list(retrieved)
    if not candidates:
        # Nothing to score; skip the model call, which is not meaningful for an empty batch.
        return [], (time.perf_counter() - t0) * 1000.0

    pairs = [(query, r["text"]) for r in candidates]
    scores = reranker.predict(pairs)

    # Attach scores to shallow copies so the caller's dicts stay untouched.
    scored = [{**r, "rerank_score": float(s)} for r, s in zip(candidates, scores)]
    reranked = sorted(scored, key=lambda x: x["rerank_score"], reverse=True)

    seen = set()
    deduped = []
    for r in reranked:
        if r["chunk_id"] in seen:
            continue
        seen.add(r["chunk_id"])
        deduped.append(r)

    t1 = time.perf_counter()
    return deduped[:top_k], (t1 - t0) * 1000.0

In [14]:
def generate_answer(query: str, context_chunks, max_new_tokens: int = 160):
    """Generate an answer to ``query`` grounded in ``context_chunks``.

    The chunks' ``text`` fields are numbered ``[1]``, ``[2]``, ... and placed
    in a plain-text prompt together with the question; the module-level
    ``gen_pipe`` then produces the continuation with greedy decoding.

    Args:
        query: The user question.
        context_chunks: Sequence of dicts with a ``text`` key.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        tuple[str, float]: the generated text (prompt excluded, surrounding
        whitespace stripped) and the elapsed time in milliseconds.
    """
    t0 = time.perf_counter()

    context = "\n\n".join(f"[{i+1}] {c['text']}" for i, c in enumerate(context_chunks))

    # NOTE(review): the prompt is sent as raw text; if the loaded generator is a
    # chat-tuned model, formatting it with the tokenizer's chat template may be
    # preferable — confirm before changing, as it alters the model output.
    prompt = f"""You are a helpful assistant. Use the context to answer the question.
If the context is not enough, say you are not sure.

Context:
{context}

Question:
{query}

Answer:"""

    # Greedy decoding: do_sample=False already makes generation deterministic.
    # No temperature is passed because it only applies when sampling and
    # a non-default value alongside do_sample=False triggers a config warning.
    out = gen_pipe(
        prompt,
        max_new_tokens=max_new_tokens,
        do_sample=False,
        return_full_text=False
    )[0]["generated_text"]

    t1 = time.perf_counter()
    return out.strip(), (t1 - t0) * 1000.0

In [15]:
def rag_query(query: str, retrieve_k: int = 10, rerank_k: int = 5):
    """Run the full pipeline for one query: retrieve, rerank, then generate.

    Args:
        query: The user question.
        retrieve_k: Number of candidates requested from ``retrieve``.
        rerank_k: Number of candidates kept by ``rerank`` and passed to
            ``generate_answer``.

    Returns:
        dict: ``query``, ``timings_ms`` (per-stage and total latency, rounded
        to two decimals), ``contexts`` (the reranked chunks) and ``answer``.
    """
    candidates, retrieval_ms = retrieve(query, top_k=retrieve_k)
    top_contexts, rerank_ms = rerank(query, candidates, top_k=rerank_k)
    answer_text, generation_ms = generate_answer(query, top_contexts)

    timings = {
        "retrieval_ms": round(retrieval_ms, 2),
        "rerank_ms": round(rerank_ms, 2),
        "generation_ms": round(generation_ms, 2),
        # The total is rounded from the raw stage times, not from the rounded ones.
        "total_ms": round(retrieval_ms + rerank_ms + generation_ms, 2),
    }

    return {
        "query": query,
        "timings_ms": timings,
        "contexts": top_contexts,
        "answer": answer_text,
    }

In [16]:
# End-to-end smoke test: run one query through the pipeline, show the result, and save it.
query = "What is Triton Inference Server and why is dynamic batching useful?"
res = rag_query(query)

print("Query:", res["query"])
print("Timings ms:", res["timings_ms"])
print("\nAnswer:\n", res["answer"])

print("\nTop contexts:")
for ctx in res["contexts"]:
    print("-", ctx["chunk_id"], "rerank_score:", round(ctx["rerank_score"], 4))

# Persist the full result (query, timings, contexts, answer) as JSON.
out_path = OUT_DIR / "gpu_example_result.json"
out_path.write_text(json.dumps(res, indent=2))

print("Saved GPU example result to:", out_path)



Query: What is Triton Inference Server and why is dynamic batching useful?
Timings ms: {'retrieval_ms': 91.75, 'rerank_ms': 86.12, 'generation_ms': 4751.83, 'total_ms': 4929.7}

Answer:
 Triton Inference Server is an open source inference serving software that simplifies deployment of AI models at scale. Dynamic batching combines multiple inference requests into a single batch to improve GPU throughput while maintaining latency constraints. This feature is useful for vector search in retrieval augmented generation pipelines, where the number of requests can be large and the latency constraints are critical.

Top contexts:
- doc2.txt::chunk0 rerank_score: 5.9291
- doc5.txt::chunk0 rerank_score: 3.9343
- doc3.txt::chunk0 rerank_score: -9.9402
- doc1.txt::chunk0 rerank_score: -10.4486
- doc4.txt::chunk0 rerank_score: -10.5862
Saved GPU example result to: /content/drive/MyDrive/PerfWattLab_RAG/outputs/gpu_example_result.json


In [18]:
# Confirm that the artifacts this notebook writes are present on disk.
expected_artifacts = [
    ("Index file exists:", INDEX_DIR / "faiss.index"),
    ("Chunks file exists:", INDEX_DIR / "chunks.json"),
    ("GPU example output exists:", OUT_DIR / "gpu_example_result.json"),
]
for label, artifact_path in expected_artifacts:
    print(label, artifact_path.exists())

Index file exists: True
Chunks file exists: True
GPU example output exists: True
