In [1]:
import os
import numpy as np
import faiss  # The FAISS library
import requests
from openai import OpenAI
import dotenv

dotenv.load_dotenv()


True

In [2]:
# The key is read from the process environment (dotenv.load_dotenv() runs in
# the import cell); os.getenv yields None when the variable is not set.
NVIDIA_API_KEY = os.getenv("NVIDIA_API_KEY")

# Model identifiers for the embedding, reranking and answer-generation steps.
EMBED_MODEL = "nvidia/llama-3.2-nv-embedqa-1b-v2"
RERANK_MODEL = "nvidia/llama-3.2-nv-rerankqa-1b-v2"
LLM_MODEL = "meta/llama-3.1-70b-instruct"

# Two OpenAI-compatible clients configured with the same key and base URL:
# one is used for embeddings, the other for chat completions.
embedding_client = OpenAI(base_url="https://integrate.api.nvidia.com/v1", api_key=NVIDIA_API_KEY)
llm_client = OpenAI(base_url="https://integrate.api.nvidia.com/v1", api_key=NVIDIA_API_KEY)


In [8]:
def chunk_text(text, chunk_size=500, overlap=50):
    """
    Split text into overlapping chunks of whitespace-separated words.

    Args:
        text: The document text; it is tokenised with str.split().
        chunk_size: Maximum number of words per chunk (must be positive).
        overlap: Number of words shared by consecutive chunks
            (must satisfy 0 <= overlap < chunk_size).

    Returns:
        A list of chunk strings, each holding at most chunk_size words joined
        by single spaces. Empty or whitespace-only text yields an empty list.

    Raises:
        ValueError: If chunk_size is not positive, or overlap is negative or
            not smaller than chunk_size (the window would never advance).
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    words = text.split()
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(words), step):
        end = start + chunk_size
        chunks.append(" ".join(words[start:end]))
        # Once a window reaches the end of the text, any further window would
        # only repeat words already contained in this chunk.
        if end >= len(words):
            break
    return chunks


def embed_text(text: str, input_type: str = "passage") -> np.ndarray:
    """
    Embed a single piece of text with the NVIDIA embedding model.

    Args:
        text: The text to embed.
        input_type: Either "passage" (default, for documents being indexed)
            or "query" (for search queries). The value is forwarded to the
            embedding endpoint in the request body.

    Returns:
        A one-dimensional float32 NumPy array holding the embedding.

    Raises:
        ValueError: If input_type is neither "passage" nor "query".
    """
    if input_type not in ("passage", "query"):
        raise ValueError('input_type must be "passage" or "query"')

    response = embedding_client.embeddings.create(
        input=[text],
        model=EMBED_MODEL,
        encoding_format="float",
        # "truncate": "NONE" asks the service not to truncate over-long input.
        extra_body={"input_type": input_type, "truncate": "NONE"},
    )
    return np.array(response.data[0].embedding, dtype=np.float32)


In [9]:
# Module-level retrieval state shared by the cells below.
doc_map = dict()         # chunk id -> chunk text, filled by add_document_to_index
all_embeddings = list()  # not referenced by the cells visible in this notebook
all_ids = list()         # not referenced by the cells visible in this notebook

# Stays None until add_document_to_index builds the index on its first call.
faiss_index = None

In [10]:
def add_document_to_index(doc_text: str, start_id=0):
    """
    Chunk the document, embed each chunk, and add the vectors to the FAISS index.

    The global index is created on the first successful call, sized to the
    dimensionality of the returned embeddings.

    Args:
        doc_text: Raw document text. Empty or whitespace-only text is a no-op.
        start_id: ID given to the first chunk; later chunks receive consecutive
            IDs. Pass the value returned by the previous call so that IDs
            already present in the index and in doc_map are not reused.

    Returns:
        The next unused ID (start_id plus the number of chunks added), so it
        can be fed straight into the next call as start_id.
    """
    global faiss_index

    # Whitespace-only text splits into zero words, so there is nothing to
    # embed; np.vstack would raise on an empty list.
    if not doc_text or not doc_text.strip():
        return start_id

    chunks = chunk_text(doc_text)
    if not chunks:
        return start_id

    next_id = start_id + len(chunks)

    # Embed everything up front: if any embedding call fails, neither doc_map
    # nor the index has been modified yet.
    chunk_vectors_np = np.vstack([embed_text(chunk) for chunk in chunks])
    chunk_ids_np = np.arange(start_id, next_id, dtype=np.int64)

    if faiss_index is None:
        embedding_dim = chunk_vectors_np.shape[1]
        # Flat L2 index wrapped in an ID map so custom chunk IDs can be used.
        # (faiss.IndexFlatIP would be the inner-product alternative.)
        faiss_index = faiss.IndexIDMap(faiss.IndexFlatL2(embedding_dim))

    faiss_index.add_with_ids(chunk_vectors_np, chunk_ids_np)

    # Record the chunk texts only after the vectors are safely in the index.
    for chunk_id, chunk in zip(range(start_id, next_id), chunks):
        doc_map[chunk_id] = chunk

    return next_id


In [11]:
# A tiny demo document; it is short enough to end up as a single chunk.
sample_doc = """
NVIDIA H100 GPUs deliver unprecedented acceleration, 
featuring 3 TB/s of memory bandwidth. In combination 
with Grace CPU, it can achieve 900GB/s of chip-to-chip 
bandwidth, enabling blazing-fast HPC and AI workloads.
"""

# Index the document starting at ID 0 and keep the returned ID for later use.
last_id = add_document_to_index(sample_doc, start_id=0)
print(f"Index size: {faiss_index.ntotal}")


Index size: 1


In [12]:
def _embed_query(query: str) -> np.ndarray:
    """Embed a search query (input_type="query") as a float32 vector."""
    response = embedding_client.embeddings.create(
        input=[query],
        model=EMBED_MODEL,
        encoding_format="float",
        extra_body={"input_type": "query", "truncate": "NONE"},
    )
    return np.array(response.data[0].embedding, dtype=np.float32)


def retrieve_top_k_faiss(query: str, k=3):
    """
    Return up to k indexed passages closest to the query.

    The query is embedded with input_type="query" (the indexed chunks are
    embedded as passages), then searched against the global FAISS index.

    Args:
        query: The user's search query.
        k: Maximum number of neighbours to request from the index.

    Returns:
        A list of (passage_text, distance) tuples in the order returned by
        the index search. The list is empty when no index has been built yet
        or the index contains no vectors.
    """
    # Nothing to search: avoid an AttributeError on None and a wasted API call.
    if faiss_index is None or faiss_index.ntotal == 0:
        return []

    # FAISS expects a 2-D (n_queries, dim) array.
    query_vec_2d = _embed_query(query).reshape(1, -1)
    distances, ids = faiss_index.search(query_vec_2d, k)

    results = []
    for dist, doc_id in zip(distances[0], ids[0]):
        # -1 marks an empty result slot (fewer than k vectors in the index).
        if doc_id == -1:
            continue
        results.append((doc_map[int(doc_id)], dist))

    return results


In [19]:
RERANK_URL = "https://ai.api.nvidia.com/v1/retrieval/nvidia/llama-3_2-nv-rerankqa-1b-v2/reranking"

def rerank_passages(query: str, passages: list, timeout: float = 30.0):
    """
    Rerank passages against a query using the NVIDIA reranking endpoint.

    Args:
        query: The user's query.
        passages: Candidate passage texts to score.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        A list of {"text": ..., "score": ...} dicts sorted by score (the
        returned logit) in descending order. An empty passages list returns
        an empty list without contacting the service.

    Raises:
        requests.HTTPError: If the service answers with an error status.
        requests.Timeout: If the request exceeds timeout.
    """
    # Nothing to rank, so skip the network round trip entirely.
    if not passages:
        return []

    headers = {
        "Authorization": f"Bearer {NVIDIA_API_KEY}",
        "Accept": "application/json",
    }
    payload = {
        "model": RERANK_MODEL,
        "query": {"text": query},
        "passages": [{"text": p} for p in passages]
    }

    # Without a timeout, requests can block indefinitely on a stalled server.
    response = requests.post(RERANK_URL, headers=headers, json=payload, timeout=timeout)
    response.raise_for_status()
    rankings = response.json()["rankings"]

    # Each ranking refers back to the submitted passage by its list index.
    results = [
        {"text": passages[r["index"]], "score": r["logit"]}
        for r in rankings
    ]
    results.sort(key=lambda item: item["score"], reverse=True)
    return results


In [14]:
def generate_final_answer(query: str, passages: list, max_tokens=512):
    """
    Ask the chat model (LLM_MODEL) to answer the query from the given passages.

    The passages are joined with blank lines and embedded in the system
    prompt; the query is sent as the user message. Returns the text of the
    first completion choice with surrounding whitespace stripped.
    """
    snippets = "\n\n".join(passages)

    chat = [
        {
            "role": "system",
            "content": (
                "You are an AI assistant specialized in GPU technology. "
                "You have access to the following text snippets:\n\n"
                f"{snippets}\n\n"
                "Use these snippets to answer the user query accurately. "
                "If you are unsure, just say you don't know."
            ),
        },
        {"role": "user", "content": f"User Query: {query}"},
    ]

    response = llm_client.chat.completions.create(
        messages=chat,
        model=LLM_MODEL,
        max_tokens=max_tokens,
        temperature=0.2,
        top_p=0.7,
        stream=False,
    )

    reply = response.choices[0].message
    return reply.content.strip()


In [15]:
def answer_query(query: str, k=5, final_context_count=3):
    """
    Answer a query end to end: retrieve, rerank, then generate.

    Fetches up to k candidates from the FAISS index, reranks them, keeps the
    first final_context_count reranked passages as context, and returns the
    generated answer string.
    """
    candidates = [text for text, _distance in retrieve_top_k_faiss(query, k=k)]
    ranked = rerank_passages(query, candidates)
    context = [entry["text"] for entry in ranked[:final_context_count]]
    return generate_final_answer(query, context)

In [20]:
# End-to-end demo: run the full pipeline on one question and show the answer.
user_question = "What is the memory bandwidth of the H100 GPU?"
final_answer = answer_query(user_question)
print(f"Final Answer:\n {final_answer}")

{'rankings': [{'index': 0, 'logit': 22.75}], 'usage': {'prompt_tokens': 64, 'total_tokens': 64}}
Final Answer:
 The memory bandwidth of the NVIDIA H100 GPU is 3 TB/s.
