In [None]:
# Cell 1: install all deps (and force-upgrade openai to >=1.0)
!pip install --quiet --upgrade \
    datasets \
    sentence-transformers \
    faiss-cpu \
    "openai>=1.0.0" \
    gradio


In [None]:
from IPython import get_ipython
get_ipython().kernel.do_shutdown(True)

{'status': 'ok', 'restart': True}

In [None]:
# Cell 1: Prompt you for your OpenAI key at runtime
import os
from getpass import getpass

# If it’s not already set, ask for it
if not os.getenv("OPENAI_API_KEY"):
    key = getpass("🔑 Enter your OpenAI API key (it will be hidden): ")
    os.environ["OPENAI_API_KEY"] = key
else:
    print("✅ OPENAI_API_KEY already set in env.")


🔑 Enter your OpenAI API key (it will be hidden): ··········


In [21]:
# Cell 2: Verify & instantiate OpenAI client
import os, openai
from openai import OpenAI

# Confirm the key is now present
assert os.getenv("OPENAI_API_KEY"), "❌ OPENAI_API_KEY still missing!"

# Check SDK version
print("openai SDK version:", openai.__version__)
assert openai.__version__.startswith("1."), "Please upgrade openai to v1.x"

# Instantiate client
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# (Optional) sanity check
models = client.models.list()
print("✅ Authenticated. Models available:", len(models.data))


openai SDK version: 1.75.0
✅ Authenticated. Models available: 63


In [22]:
# Cell 3: Load 20k abstracts from arXiv + 20k from PubMed (≈40k total)
from datasets import load_dataset, concatenate_datasets

# 1. Pull first 20k arXiv abstracts
ds_arxiv = load_dataset("scientific_papers", "arxiv", split="train[:20000]")

# 2. Pull first 20k PubMed abstracts
ds_pubmed = load_dataset("scientific_papers", "pubmed", split="train[:20000]")

# 3. Keep *only* the 'abstract' field in each
for d in (ds_arxiv, ds_pubmed):
    d = d.remove_columns([c for c in d.column_names if c != "abstract"])

# 4. Combine them
ds = concatenate_datasets([ds_arxiv, ds_pubmed])

print(f" Loaded {len(ds)} abstracts across arXiv & PubMed")


Downloading data:   0%|          | 0.00/3.62G [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/880M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/119924 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/6633 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/6658 [00:00<?, ? examples/s]

✅ Loaded 40000 abstracts across arXiv & PubMed


In [23]:
# —— New Diagnostic Cell ——
print("Columns:", ds.column_names)
print("Example row:", ds[0])


Columns: ['article', 'abstract', 'section_names']
Example row: {'article': 'additive models @xcite provide an important family of models for semiparametric regression or classification . some reasons for the success of additive models are their increased flexibility when compared to linear or generalized linear models and their increased interpretability when compared to fully nonparametric models .\nit is well - known that good estimators in additive models are in general less prone to the curse of high dimensionality than good estimators in fully nonparametric models .\nmany examples of such estimators belong to the large class of regularized kernel based methods over a reproducing kernel hilbert space @xmath0 , see e.g. @xcite . in the last years\nmany interesting results on learning rates of regularized kernel based models for additive models have been published when the focus is on sparsity and when the classical least squares loss function is used , see e.g. @xcite , @xcite , @xc

**APPROXIMATE RUN TIME FOR CELL4 IN v2-8 TPU IS MORE THAN 12 HOURS.**

In [None]:
# Cell 4: Embed and Index all ~40k abstracts

from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

# Load the same embedding model
embedder = SentenceTransformer("all-MiniLM-L6-v2")

# Extract abstracts
abstracts = ds["abstract"]

# Compute embeddings in batches (this may take a few minutes)
batch_size = 512
emb_list = []
for i in range(0, len(abstracts), batch_size):
    batch = abstracts[i:i+batch_size]
    embs = embedder.encode(batch, convert_to_numpy=True, show_progress_bar=False)
    emb_list.append(embs)
embeddings = np.vstack(emb_list)

# Normalize for cosine similarity
faiss.normalize_L2(embeddings)

# Build FAISS index
dimension = embeddings.shape[1]
index = faiss.IndexFlatIP(dimension)
index.add(embeddings)

print(f" Indexed {index.ntotal} vectors (dim={dimension})")


In [None]:
# —— Updated Cell 5 ——

# 1) Determine if there is an 'id' or 'title' column in ds
id_col = next((c for c in ds.column_names if "id" in c), None)
title_col = next((c for c in ds.column_names if "title" in c), None)

def retrieve(query, k=5):
    """
    Returns top-k dicts with keys:
      - paper_id (str)
      - title    (str or None)
      - abstract (str)
      - score    (float)
    """
    # Embed & normalize the query
    q_emb = embedder.encode([query], convert_to_numpy=True)
    faiss.normalize_L2(q_emb)

    # Search in FAISS
    scores, idxs = index.search(q_emb, k)
    results = []
    for score, idx in zip(scores[0], idxs[0]):
        i = int(idx)  # convert numpy.int64 → Python int

        # Build metadata dict
        meta = {
            # if there’s an actual id column, use it; otherwise, a synthetic one
            "paper_id": ds[i][id_col] if id_col else f"Paper_{i}",
            # if there’s a real title column, use it; else None
            "title":    ds[i][title_col] if title_col else None,
            "abstract": ds[i]["abstract"],
            "score":    float(score)
        }
        results.append(meta)
    return results

# Quick sanity check
for r in retrieve("quantum encryption", k=3):
    print(f"[{r['score']:.3f}] {r['paper_id']} — {r['abstract'][:100]}…")


[0.577] Paper_1309 —  we consider the security of practical continuous - variable quantum key distribution implementation…
[0.568] Paper_919 —  a novel protocol , measurement - device - independent quantum key distribution ( mdi - qkd ) , remo…
[0.565] Paper_28 —  we investigate an efficient quantum error correction of a fully correlated noise . 
 suppose the no…


In [None]:
def generate_answer(query, retrieved, model="gpt-4o-mini"):
    # Build the context as before...
    context = []
    for i, doc in enumerate(retrieved, start=1):
        header = f"[{i}]"
        if doc["title"]:
            header += f" Title: {doc['title']}\n"
        header += f"Abstract: {doc['abstract']}"
        context.append(header)
    context_str = "\n\n".join(context)

    system = (
        "You are a scholarly assistant. Answer the question using ONLY the provided abstracts. "
        "Cite each fact with the bracketed source number (e.g., [1]). "
        "If information is not present, say “I don’t know.”"
    )
    user_msg = f"Context:\n{context_str}\n\nQuestion: {query}"

        # In your generate_answer (Cell 6), change:
    resp = client.chat.completions.create(
        model="gpt-3.5-turbo",  # was "gpt-4o-mini"
        messages=[
            {"role": "system", "content": system},
            {"role": "user",   "content": user_msg}
        ],
        temperature=0.0,
    )

    return resp.choices[0].message.content


In [None]:
models = client.models.list()
print("✅ Models available:", len(models.data))

✅ Models available: 63


In [None]:
""""#  Cell 6: generate_answer using client.chat.completions ——

def generate_answer(query, retrieved, model="gpt-4o-mini"):
    """
    Construct prompt with numbered sources and call OpenAI chat via the v1 SDK.
    Returns the assistant’s “content” string.
    """
    # 1. Build the context block
    context = []
    for i, doc in enumerate(retrieved, start=1):
        header = f"[{i}]"
        if doc.get("title"):
            header += f" Title: {doc['title']}\n"
        header += f"Abstract: {doc['abstract']}"
        context.append(header)
    context_str = "\n\n".join(context)

    # 2. Prompt instructions
    system = (
        "You are a scholarly assistant. Answer the question using ONLY the provided abstracts. "
        "Cite each fact with the bracketed source number (e.g., [1]). "
        "If information is not present, say “I don’t know.”"
    )
    user_msg = f"Context:\n{context_str}\n\nQuestion: {query}"

    # 3. Call via your `client` (not ChatCompletion.create)
    resp = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": system},
            {"role": "user",   "content": user_msg}
        ],
        temperature=0.0,
    )
    return resp.choices[0].message.content

# Quick test
query = "How do quantum dots emit light?"
docs = retrieve(query, k=3)
print(generate_answer(query, docs))
""""

I don’t know.


In [None]:
# —— Updated Cell 6: generate_answer using client.chat.completions ——

def generate_answer(query, retrieved, model="gpt-4o-mini"):
    """
    Construct prompt with numbered sources and call OpenAI chat via the v1 SDK.
    Returns the assistant’s “content” string.
    """
    # 1. Build the context block
    context = []
    for i, doc in enumerate(retrieved, start=1):
        header = f"[{i}]"
        if doc.get("title"):
            header += f" Title: {doc['title']}\n"
        header += f"Abstract: {doc['abstract']}"
        context.append(header)
    context_str = "\n\n".join(context)

    # 2. Prompt instructions
    system = (
        "You are a scholarly assistant. Answer the question using ONLY the provided abstracts. "
        "Cite each fact with the bracketed source number (e.g., [1]). "
        "If information is not present, say “I don’t know.”"
    )
    user_msg = f"Context:\n{context_str}\n\nQuestion: {query}"

    # 3. Call via your `client` (not ChatCompletion.create)
    resp = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": system},
            {"role": "user",   "content": user_msg}
        ],
        temperature=0.0,
    )
    return resp.choices[0].message.content

# Quick test
query = "How do quantum dots emit light?"
docs = retrieve(query, k=3)
print(generate_answer(query, docs))


In [2]:
#def answer_query(query, k=5):
#    docs = retrieve(query, k)
#    answer = generate_answer(query, docs)
#    # Build a reference list
#    refs = "\n".join([f"[{i+1}] {d['title']} (score: {d['score']:.3f})"
#                      for i,d in enumerate(docs)])
#    return answer, refs

# Quick check
#ans, refs = answer_query("What is the key idea of federated learning?", k=4)
#print(ans, "\n\nReferences:\n", refs)

In [None]:
def answer_query(query, k=5):
    docs = retrieve(query, k)
    answer = generate_answer(query, docs)
    # Build a reference list
    refs = "\n".join([f"[{i+1}] {d['title']} (score: {d['score']:.3f})"
                      for i,d in enumerate(docs)])
    return answer, refs

# Quick check
ans, refs = answer_query("What is the key idea of federated learning?", k=4)
print(ans, "\n\nReferences:\n", refs)


In [None]:
# ── Debug Cell ──
query = "What is the key idea of federated learning?"
docs = retrieve(query, k=5)

for i, d in enumerate(docs, start=1):
    print(f"--- Doc {i} (score {d['score']:.3f}) ---")
    print(d["abstract"][:500].replace("\n"," "), "…\n")


--- Doc 1 (score 0.294) ---
 in this paper , we introduce a new machine learning theory based on multi - channel parallel adaptation for rule discovery .   this theory is distinguished from the familiar parallel - distributed adaptation theory of neural networks in terms of channel - based convergence to the target rules .   we show how to realize this theory in a learning system named cfrule .   cfrule is a parallel weight - based model , but it departs from traditional neural computing in that its internal knowledge is c …

--- Doc 2 (score 0.233) ---
 we study the predictability of emergent phenomena in complex systems . using nearest neighbor , one - dimensional cellular automata ( ca ) as an example ,   we show how to construct local coarse - grained descriptions of ca in all classes of wolfram s classification .   the resulting coarse - grained ca that we construct are capable of emulating the large - scale behavior of the original systems without accounting for small - scale det

In [None]:
import gradio as gr

def qa_gradio(query):
    ans, refs = answer_query(query, k=5)
    return ans, refs

demo = gr.Interface(
    fn=qa_gradio,
    inputs=gr.Textbox(label="Ask a research question"),
    outputs=[
      gr.Markdown(label="Answer"),
      gr.Markdown(label="References")
    ],
    title="🔬 RAG Q&A on arXiv",
    description="Enter a question and get citation-backed answers from arXiv abstracts."
)

demo.launch(share=True)


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://f5cf757dc55c9a108e.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


