In [None]:
#################### DOCX TO JSONL , simple conversion ##################### 

# token : 
!pip uninstall -y docx
!pip install python-docx

from docx import Document
import json

# Paths for full docx
docx_path = "/content/ug1023-sdaccel-user-guide.docx"
jsonl_path = "/content/ug1023-sdaccel-user-guide.jsonl"

def convert_docx_to_jsonl(docx_path, jsonl_path):
    """Write every non-empty paragraph of a .docx file as one JSON line.

    Each line has the form {"id": "para_<n>", "text": <stripped paragraph text>},
    where <n> is the paragraph's position in the document (empty paragraphs are
    skipped but still consume an index).

    Args:
        docx_path: Path of the .docx file to read.
        jsonl_path: Path of the JSONL file to create or overwrite (UTF-8).

    Returns:
        jsonl_path, unchanged.
    """
    document = Document(docx_path)
    numbered = ((pos, p.text.strip()) for pos, p in enumerate(document.paragraphs))
    with open(jsonl_path, "w", encoding="utf-8") as out:
        out.writelines(
            # ensure_ascii=False keeps non-ASCII characters readable in the output
            json.dumps({"id": f"para_{pos}", "text": body}, ensure_ascii=False) + "\n"
            for pos, body in numbered
            if body  # skip empty paragraphs
        )
    return jsonl_path

# Run the conversion on the configured paths; the function returns jsonl_path.
converted_path = convert_docx_to_jsonl(docx_path, jsonl_path)
converted_path  # bare last expression so the notebook displays the output path


In [None]:
#################### ORIGINAL RAG #####################

# 1. Install dependencies
!pip install -q transformers datasets sentence-transformers faiss-cpu scikit-learn

# 2. Load & chunk your JSONL into passages
from datasets import load_dataset
from transformers import AutoTokenizer

# Load the raw JSONL (each line: {"id": ..., "text": ...})
ds = load_dataset("json", data_files="/content/ug1023-sdaccel-user-guide.jsonl", split="train")
tokenizer = AutoTokenizer.from_pretrained("facebook/bart-base")

# Chunk long texts into <=512-token passages
def chunk_docs(example, max_len=512):
    """Split record text into passages of at most `max_len` tokens.

    Accepts either a single record ({"id": str, "text": str}) or a batch
    ({"id": [str, ...], "text": [str, ...]}) as passed by `Dataset.map(batched=True)`.

    Returns:
        {"id": [...], "text": [...]} with one entry per chunk. Chunk ids are
        "<record id>_c<n>" where n is the chunk's ordinal (0, 1, 2, ...).
    """
    if isinstance(example["text"], str):
        doc_ids, doc_texts = [example["id"]], [example["text"]]
    else:
        doc_ids, doc_texts = example["id"], example["text"]

    ids, texts = [], []
    for doc_id, doc_text in zip(doc_ids, doc_texts):
        # add_special_tokens=False: BOS/EOS must not count against the chunk budget,
        # otherwise a lone trailing EOS can become its own (empty) chunk.
        tok = tokenizer(
            doc_text, add_special_tokens=False, return_attention_mask=False
        )["input_ids"]
        for n, start in enumerate(range(0, len(tok), max_len)):
            ids.append(f"{doc_id}_c{n}")
            texts.append(tokenizer.decode(tok[start : start + max_len], skip_special_tokens=True))
    return {"id": ids, "text": texts}

# batched=True lets map() emit a different number of rows than it received, so every
# chunk becomes its own row. With batched=False the lists stay nested in a single row,
# and Dataset.flatten() only un-nests struct columns, not lists.
ds_p = ds.map(chunk_docs, batched=True, remove_columns=ds.column_names)

# 3. Build embeddings + FAISS index
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

embedder = SentenceTransformer("all-MiniLM-L6-v2")
# A ds_p["text"] entry may be a single string or a list of chunk strings. Keep every
# chunk as its own passage; joining the list would glue the chunks back together and
# undo the chunking step.
passages = [
    chunk
    for txt in ds_p["text"]
    for chunk in ([txt] if isinstance(txt, str) else txt)
]
# Unit-normalised embeddings make inner product equal to cosine similarity.
embs = embedder.encode(passages, convert_to_numpy=True, normalize_embeddings=True)

dim = embs.shape[1]
index = faiss.IndexFlatIP(dim)  # exact inner-product search
index.add(embs)
faiss.write_index(index, "cli_index.faiss")

# 4. Retrieval function
def retrieve(question: str, k: int = 5) -> list[str]:
    """Return the text of up to `k` indexed passages most similar to `question`.

    Fewer than `k` passages are returned when the index holds fewer than `k`
    vectors; an empty list is returned for an empty index or non-positive `k`.
    """
    k = min(k, index.ntotal)
    if k <= 0:
        return []
    q_emb = embedder.encode([question], normalize_embeddings=True)
    _, idxs = index.search(q_emb, k)
    # FAISS fills missing neighbours with -1; without the filter, passages[-1]
    # would silently return the last passage.
    return [passages[i] for i in idxs[0] if i >= 0]

# 5. Load the BART generator
from transformers import AutoModelForSeq2SeqLM

model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-base")

# 6. Retrieval-augmented generation example
query = "What is Local Memory?"
top_k = retrieve(query, k=5)

context = "\n\n".join(top_k)
question = f"Question: {query}"
prompt = f"{context}\n\n{question}"  # full, untruncated prompt; kept for inspection

# Encode context and question as a pair and truncate only the context
# ("only_first"). Truncating the concatenated prompt cuts from the end, which
# drops the question whenever the retrieved passages exceed the 1024-token limit.
inputs = tokenizer(
    context,
    question,
    return_tensors="pt",
    truncation="only_first",
    max_length=1024  # explicit truncation cutoff
)
outputs = model.generate(**inputs, max_new_tokens=60, num_beams=4)
answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("Answer:\n", answer)

# 7. Retrieval metrics: Precision@k and Recall@k
import numpy as np

def precision_recall_at_k(retriever_fn, eval_queries, ground_truth, k=5):
    """Compute mean Precision@k and Recall@k over a set of queries.

    Args:
        retriever_fn: Callable `(query, k) -> iterable of retrieved items`. The
            items must be comparable with the entries of `ground_truth`
            (e.g. both passage ids).
        eval_queries: Iterable of query strings to evaluate.
        ground_truth: Mapping from query to the list of relevant items. A query
            missing from the mapping has no relevant items and scores recall 0.0.
        k: Number of results requested per query; also the precision denominator.

    Returns:
        (mean_precision, mean_recall) as floats; (0.0, 0.0) when there are no queries.

    Raises:
        ValueError: If `k` is smaller than 1.
    """
    if k < 1:
        raise ValueError(f"k must be a positive integer, got {k!r}")

    precisions, recalls = [], []
    for q in eval_queries:
        preds = set(retriever_fn(q, k))
        trues = set(ground_truth.get(q, []))
        tp = len(preds & trues)
        precisions.append(tp / k)
        recalls.append(tp / len(trues) if trues else 0.0)

    # np.mean([]) would yield nan plus a RuntimeWarning
    if not precisions:
        return 0.0, 0.0
    return float(np.mean(precisions)), float(np.mean(recalls))

# Example evaluation data (replace with real labels)
eval_queries = ["What is Local Memory?", "Explain OpenCL Memory Model"]
ground_truth = {
    "What is Local Memory?": ["para_10_c0", "para_10_c1"],
    "Explain OpenCL Memory Model": ["para_12_c0", "para_12_c1"]
}

# ground_truth holds passage ids, whereas `retrieve` returns passage text, so scoring
# `retrieve` directly can never register a hit. Score an id-returning retriever instead.
def _as_id_list(value):
    """Normalise a ds_p["id"] entry (one id or a list of chunk ids) to a list."""
    return [value] if isinstance(value, str) else list(value)

_row_ids = [_as_id_list(v) for v in ds_p["id"]]
if len(_row_ids) == len(passages):
    passage_ids = _row_ids  # passages[i] was built from dataset row i
else:
    passage_ids = [[cid] for row in _row_ids for cid in row]  # one passage per chunk
assert len(passage_ids) == len(passages), "passage ids are not aligned with passages"

def retrieve_ids(question, k=5):
    """Return the ids of the passages behind the top-k index hits for `question`."""
    q_emb = embedder.encode([question], normalize_embeddings=True)
    _, idxs = index.search(q_emb, k)
    # FAISS pads missing neighbours with -1; skip those slots
    return [cid for i in idxs[0] if i >= 0 for cid in passage_ids[i]]

prec5, rec5 = precision_recall_at_k(retrieve_ids, eval_queries, ground_truth, k=5)
print(f"Precision@5: {prec5:.3f}, Recall@5: {rec5:.3f}")

# 8. Embedding cluster–quality metrics: intra vs. inter cosine similarity
from sklearn.metrics.pairwise import cosine_similarity
import itertools

def cluster_similarity(embeddings, clusters):
    """Compare cosine similarity inside clusters with similarity across clusters.

    Args:
        embeddings: Array indexed by the row indices stored in `clusters`.
        clusters: Mapping from cluster label to a list of row indices.

    Returns:
        (mean_intra, mean_inter): the mean similarity over all ordered pairs of
        distinct members within each cluster (clusters with fewer than two
        members contribute nothing), and the mean similarity over all member
        pairs drawn from two different clusters.
    """
    within = []
    for members in clusters.values():
        size = len(members)
        if size >= 2:
            block = cosine_similarity(embeddings[members], embeddings[members])
            # every off-diagonal entry, i.e. each unordered pair counted twice
            within.extend(
                block[row, col]
                for row in range(size)
                for col in range(size)
                if row != col
            )

    across = []
    names = list(clusters)
    for pos, first in enumerate(names):
        for second in names[pos + 1:]:
            cross = cosine_similarity(embeddings[clusters[first]], embeddings[clusters[second]])
            across.extend(cross.flatten().tolist())

    return np.mean(within), np.mean(across)

# Example clusters (replace with real indices)
# Each value is a list of row indices into `embs`; the labels and indices below are placeholders.
clusters = {
    "LocalMemory": [10, 11, 12],
    "GlobalMemory": [20, 21, 22]
}

# Returns (mean within-cluster similarity, mean between-cluster similarity).
intra, inter = cluster_similarity(embs, clusters)
print(f"Intra-cluster sim: {intra:.3f}, Inter-cluster sim: {inter:.3f}")


Answer:
 Local Memory is memory inside of the FPGA. This memory is typically implemented using block RAM elements in the CPU fabric. The block RAM element is typically used to store and transfer data that must be shared by multiple work items within the same compute unit.Local memory is defined as the region
Precision@5: 0.000, Recall@5: 0.000
Intra-cluster sim: 0.178, Inter-cluster sim: 0.189

In [None]:
#################### EFFICIENT RAG (HNSW index + T5 generator) #####################

# 1. Install dependencies
!pip install -q transformers datasets sentence-transformers faiss-cpu

# 2. Load & chunk JSONL into passages (same as before)
from datasets import load_dataset
from transformers import AutoTokenizer

ds = load_dataset("json", data_files="/content/ug1023-sdaccel-user-guide.jsonl", split="train")
tokenizer = AutoTokenizer.from_pretrained("t5-base")

def chunk_docs(example, max_len=512):
    """Split record text into passages of at most `max_len` tokens.

    Accepts either a single record ({"id": str, "text": str}) or a batch
    ({"id": [str, ...], "text": [str, ...]}) as passed by `Dataset.map(batched=True)`.

    Returns:
        {"id": [...], "text": [...]} with one entry per chunk. Chunk ids are
        "<record id>_c<n>" where n is the chunk's ordinal (0, 1, 2, ...).
    """
    if isinstance(example["text"], str):
        doc_ids, doc_texts = [example["id"]], [example["text"]]
    else:
        doc_ids, doc_texts = example["id"], example["text"]

    ids, texts = [], []
    for doc_id, doc_text in zip(doc_ids, doc_texts):
        # add_special_tokens=False: the EOS token must not count against the chunk
        # budget, otherwise a lone trailing EOS can become its own (empty) chunk.
        tok = tokenizer(
            doc_text, add_special_tokens=False, return_attention_mask=False
        )["input_ids"]
        for n, start in enumerate(range(0, len(tok), max_len)):
            ids.append(f"{doc_id}_c{n}")
            texts.append(tokenizer.decode(tok[start : start + max_len], skip_special_tokens=True))
    return {"id": ids, "text": texts}

# batched=True lets map() emit a different number of rows than it received, so every
# chunk becomes its own row. With batched=False the lists stay nested in a single row,
# and Dataset.flatten() only un-nests struct columns, not lists.
ds_p = ds.map(chunk_docs, batched=True, remove_columns=ds.column_names)

# 3. Sentence embeddings + FAISS HNSW index
from sentence_transformers import SentenceTransformer
import faiss, numpy as np

embedder = SentenceTransformer("all-MiniLM-L6-v2")
# A ds_p["text"] entry may be a single string or a list of chunk strings. Keep every
# chunk as its own passage; joining the list would glue the chunks back together and
# undo the chunking step.
passages = [chunk for p in ds_p["text"] for chunk in ([p] if isinstance(p, str) else p)]
embs = embedder.encode(passages, convert_to_numpy=True, normalize_embeddings=True)

# Build HNSW index for sublinear lookup
# IndexHNSWFlat uses L2 distance unless a metric is passed; on unit-normalised
# vectors the L2 ordering is the same as the cosine-similarity ordering.
dim = embs.shape[1]
index = faiss.IndexHNSWFlat(dim, 32)  # 32 neighbors per node
index.hnsw.efConstruction = 200
index.hnsw.efSearch = 50
index.add(embs)
faiss.write_index(index, "cli_hnsw_index.faiss")

# 4. Efficient retrieve
def efficient_retrieve(question: str, k: int = 5):
    """Return the text of up to `k` passages nearest to `question` in the HNSW index.

    Fewer than `k` passages are returned when the index holds fewer than `k`
    vectors or the graph search finds fewer neighbours; an empty list is
    returned for an empty index or non-positive `k`.
    """
    k = min(k, index.ntotal)
    if k <= 0:
        return []
    q_emb = embedder.encode([question], normalize_embeddings=True)
    _, hits = index.search(q_emb, k)
    # FAISS fills missing neighbours with -1; without the filter, passages[-1]
    # would silently return the last passage.
    return [passages[i] for i in hits[0] if i >= 0]

# 5. Prepare FiD model inputs
# NOTE(review): this loads the stock t5-base checkpoint, and the cell feeds it one
# concatenated string, so nothing here performs Fusion-in-Decoder's per-passage
# encoding — confirm whether the "FiD" label is intended.
from transformers import T5Tokenizer, T5ForConditionalGeneration

fid_tokenizer = T5Tokenizer.from_pretrained("t5-base")
fid_model     = T5ForConditionalGeneration.from_pretrained("t5-base")

def build_fid_input(query, contexts):
    """Format a question and its retrieved passages as a single T5 input string.

    The result follows T5's "question: ... context: ..." text-to-text layout.
    Passages are joined with plain spaces: "</s>" is T5's end-of-sequence token
    and "<s>" is not a T5 token, so neither may be used as an in-text separator.

    Args:
        query: The question text.
        contexts: Iterable of passage strings; blank entries are skipped.

    Returns:
        "question: <query> context: <passages joined by single spaces>"
    """
    joined = " ".join(c.strip() for c in contexts if c and c.strip())
    return f"question: {query} context: {joined}"

# 6. Run Efficient RAG
query = "What is Local Memory?"
ctxs  = efficient_retrieve(query, k=8)  # retrieve more for FiD

# All retrieved contexts are packed into one input string, question first.
fid_input = build_fid_input(query, ctxs)
# truncation=True caps the encoded input at max_length=1024 tokens.
# NOTE(review): t5-base is usually run with 512-token inputs — confirm 1024 is intended.
inputs    = fid_tokenizer(fid_input, return_tensors="pt", truncation=True, max_length=1024)
outputs   = fid_model.generate(**inputs, max_new_tokens=60, num_beams=4)  # beam search, at most 60 new tokens

answer = fid_tokenizer.decode(outputs[0], skip_special_tokens=True)
print("Answer:\n", answer)


Answer:
 the region of system memory that is only accessible to the OpenCLTM device

Summary - 

# This notebook uses three open-source models:

# SentenceTransformer “all-MiniLM-L6-v2” for retrieval embeddings.

# A compact, 82 MB model optimized for sentence and passage embeddings with high semantic accuracy.

# Facebook BART-base (“facebook/bart-base”) as the generation model in the first RAG cell.

# A 139M-parameter sequence-to-sequence Transformer pretrained on large text corpora, well-suited for summarization, question answering, and in-context generation.

# T5-base (“t5-base”) as the generation model in the HNSW-based RAG cell.