# CS 5588 — RAG with LangChain, Chroma, and Gemini Free API


### 1) Install

In [1]:

!pip -q install -U langchain langchain-community chromadb pypdf             sentence-transformers transformers tiktoken             langchain-google-genai google-genai
print("If upgraded core libs, consider restarting runtime.")


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m33.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.8/19.8 MB[0m [31m65.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.5/310.5 kB[0m [31m19.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.7/50.7 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m245.6/245.6 kB[0m [31m15.0 MB/s[0m eta [36

### 2) Keys & Imports

In [2]:

import os, getpass, json, sys, platform, pathlib, datetime, importlib
if not os.getenv("GEMINI_API_KEY"):
    os.environ["GEMINI_API_KEY"] = getpass.getpass("Enter your GEMINI_API_KEY: AIzaSyBiSuTuP7OdmHI_yBJ_Vnle_x779kHO4Z0")
os.environ["GOOGLE_API_KEY"] = os.environ.get("GOOGLE_API_KEY", os.environ["GEMINI_API_KEY"])

from google import genai
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader, TextLoader
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import SentenceTransformerEmbeddings
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
from langchain.chains import RetrievalQA

pathlib.Path("data").mkdir(exist_ok=True)
pathlib.Path("artifacts").mkdir(exist_ok=True)
print("Env ready.")


Enter your GEMINI_API_KEY: AIzaSyBiSuTuP7OdmHI_yBJ_Vnle_x779kHO4Z0··········
Env ready.


### 3) Log environment → env_rag.json

In [3]:

def pv(m):
    try:
        mod = importlib.import_module(m)
        return getattr(mod, "__version__", "unknown")
    except: return "not installed"
env = {
  "timestamp": datetime.datetime.now().isoformat(),
  "python": sys.version, "platform": platform.platform(),
  "packages": {m: pv(m) for m in [
    "langchain","langchain_community","chromadb","tiktoken","transformers",
    "sentence_transformers","langchain_google_genai","google.genai"
  ]}
}
with open("env_rag.json","w") as f: json.dump(env, f, indent=2)
print(json.dumps(env, indent=2))


{
  "timestamp": "2025-09-19T03:10:36.978243",
  "python": "3.12.11 (main, Jun  4 2025, 08:56:18) [GCC 11.4.0]",
  "platform": "Linux-6.1.123+-x86_64-with-glibc2.35",
  "packages": {
    "langchain": "0.3.27",
    "langchain_community": "0.3.29",
    "chromadb": "1.1.0",
    "tiktoken": "0.11.0",
    "transformers": "4.56.1",
    "sentence_transformers": "5.1.0",
    "langchain_google_genai": "unknown",
    "google.genai": "1.38.0"
  }
}


### 4) Upload documents

In [4]:

try:
    from google.colab import files
    up = files.upload()
    import os
    os.makedirs("data", exist_ok=True)
    for n,c in up.items():
        open(os.path.join("data", n), "wb").write(c)
    print("Uploaded:", list(up.keys()))
except Exception as e:
    print("Colab upload UI not available.", e)


Saving NeurIPS-2024-can-large-language-model-agents-simulate-human-trust-behavior-Paper-Conference.pdf to NeurIPS-2024-can-large-language-model-agents-simulate-human-trust-behavior-Paper-Conference.pdf
Saving NeurIPS-2024-richelieu-self-evolving-llm-based-agents-for-ai-diplomacy-Paper-Conference.pdf to NeurIPS-2024-richelieu-self-evolving-llm-based-agents-for-ai-diplomacy-Paper-Conference.pdf
Saving NeurIPS-2024-mdagents-an-adaptive-collaboration-of-llms-for-medical-decision-making-Paper-Conference.pdf to NeurIPS-2024-mdagents-an-adaptive-collaboration-of-llms-for-medical-decision-making-Paper-Conference.pdf
Uploaded: ['NeurIPS-2024-can-large-language-model-agents-simulate-human-trust-behavior-Paper-Conference.pdf', 'NeurIPS-2024-richelieu-self-evolving-llm-based-agents-for-ai-diplomacy-Paper-Conference.pdf', 'NeurIPS-2024-mdagents-an-adaptive-collaboration-of-llms-for-medical-decision-making-Paper-Conference.pdf']


### 5) Load & chunk

In [5]:

import os
def load_docs(folder="data"):
    docs=[]
    for fname in os.listdir(folder):
        p=os.path.join(folder,fname)
        if not os.path.isfile(p): continue
        ext=fname.lower().split(".")[-1]
        try:
            if ext=="pdf": loader=PyPDFLoader(p)
            elif ext in ["txt","md","markdown"]: loader=TextLoader(p, encoding="utf-8")
            else:
                print("Skip", fname); continue
            docs += loader.load()
        except Exception as e:
            print("Fail", fname, e)
    return docs
raw_docs=load_docs("data")
print("Loaded", len(raw_docs))
splitter=RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
splits=splitter.split_documents(raw_docs)
print("Chunks:", len(splits))
if splits: print(splits[0].page_content[:400])
rag_run_config={"chunk_size":500,"chunk_overlap":100,"embedding_models_tested":[],"llm":None,"retriever_k":4}
import json
json.dump(rag_run_config, open("rag_run_config.json","w"), indent=2)


Loaded 126
Chunks: 1066
Richelieu: Self-Evolving LLM-Based Agents for AI
Diplomacy
Zhenyu Guan ♢, Xiangyu Kong♣†B, Fangwei Zhong♠†B, Yizhou Wang♡♢
♢ Institute for Artificial Intelligence, Peking University
♣ College of Computer Science, Beijing Information Science and Technology University
♠ School of Artificial Intelligence, Beijing Normal University
♡ Center on Frontiers of Computing Studies, School of Computer Science


### 6) Vector DB (Chroma) + baseline embeddings

In [6]:

from langchain_community.embeddings import SentenceTransformerEmbeddings
from langchain_community.vectorstores import Chroma
emb = SentenceTransformerEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
vs = Chroma.from_documents(splits, embedding=emb, persist_directory="./chroma_minilm")
vs.persist()
retriever = vs.as_retriever(search_kwargs={"k":4})
print("Vector store ready.")
cfg=json.load(open("rag_run_config.json"))
cfg["embedding_models_tested"].append("sentence-transformers/all-MiniLM-L6-v2")
json.dump(cfg, open("rag_run_config.json","w"), indent=2)


  emb = SentenceTransformerEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Vector store ready.


  vs.persist()


### 7) RetrievalQA with Gemini

In [7]:

llm = ChatGoogleGenerativeAI(model="gemini-2.5-flash", temperature=0.2)
qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever, return_source_documents=True)
def ask(q):
    r=qa({"query":q})
    print("\nQ:", q); print("A:", r.get("result",""))
    print("\nSources:")
    for i,d in enumerate(r.get("source_documents",[])[:3]):
        print(f"[{i+1}] {d.metadata.get('source','?')} ::", d.page_content[:160].replace("\n"," ")+"...")
ask("What are the main findings relevant to our project domain?")


  r=qa({"query":q})



Q: What are the main findings relevant to our project domain?
A: Based on the provided text, there are no specific "main findings" of a research project discussed. The text primarily outlines guidelines and expectations for authors regarding the "Broader Impacts" section of their papers, focusing on:

*   **Examples of negative societal impacts:** malicious/unintended uses (disinformation, fake profiles, surveillance), fairness, privacy, and security considerations.
*   **Expectation for authors:** To discuss potential negative applications, even for foundational research, if there's a direct path to them.
*   **A sample "Broader Impacts" question:** Whether the paper discusses both positive and negative societal impacts.
*   **Guidelines:** Including preserving anonymity and using "NA" if there is no societal impact.

Sources:
[1] data/NeurIPS-2024-can-large-language-model-agents-simulate-human-trust-behavior-Paper-Conference.pdf :: • Examples of negative societal impacts include pot

### 8) Mini-experiments (embedding swap & chunk sensitivity) — optional

In [8]:
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_community.vectorstores import Chroma
import shutil, json

g_emb = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004")

shutil.rmtree("./chroma_gemini", ignore_errors=True)
vs_gem = Chroma.from_documents(splits, embedding=g_emb, persist_directory="./chroma_gemini")
vs_gem.persist()
retriever_gem = vs_gem.as_retriever(search_kwargs={"k": 4})

from langchain.chains import RetrievalQA
qa_gem = RetrievalQA.from_chain_type(
    llm=llm, chain_type="stuff", retriever=retriever_gem, return_source_documents=True
)

def compare_embeddings(query: str):
    print("\n=== Embedding Swap: MiniLM vs Gemini ===")
    r1 = qa({"query": query})
    r2 = qa_gem({"query": query})

    print("\n[MiniLM] Q:", query, "\nA:", r1.get("result",""))
    for i,d in enumerate(r1.get("source_documents",[])[:3]):
        print(f"  MiniLM src {i+1}:", d.metadata.get("source","?"))

    print("\n[Gem-emb] Q:", query, "\nA:", r2.get("result",""))
    for i,d in enumerate(r2.get("source_documents",[])[:3]):
        print(f"  Gem-emb src {i+1}:", d.metadata.get("source","?"))

    return r1, r2

_ = compare_embeddings("Define Valid Response Rate (VRR) in the Trust Game paper in one sentence.")



=== Embedding Swap: MiniLM vs Gemini ===

[MiniLM] Q: Define Valid Response Rate (VRR) in the Trust Game paper in one sentence. 
A: Valid Response Rate (VRR) is defined as the percentage of personas whose amount sent falls within the initial money ($10) in the Trust Game.
  MiniLM src 1: data/NeurIPS-2024-can-large-language-model-agents-simulate-human-trust-behavior-Paper-Conference.pdf
  MiniLM src 2: data/NeurIPS-2024-can-large-language-model-agents-simulate-human-trust-behavior-Paper-Conference.pdf
  MiniLM src 3: data/NeurIPS-2024-can-large-language-model-agents-simulate-human-trust-behavior-Paper-Conference.pdf

[Gem-emb] Q: Define Valid Response Rate (VRR) in the Trust Game paper in one sentence. 
A: Valid Response Rate (VRR) indicates an LLM's understanding of the limits on the amount it can send in the Trust Game.
  Gem-emb src 1: data/NeurIPS-2024-can-large-language-model-agents-simulate-human-trust-behavior-Paper-Conference.pdf
  Gem-emb src 2: data/NeurIPS-2024-can-large-la

In [9]:
cfg = json.load(open("rag_run_config.json"))
tested = set(cfg.get("embedding_models_tested", []))
tested.update(["sentence-transformers/all-MiniLM-L6-v2", "models/text-embedding-004"])
cfg["embedding_models_tested"] = sorted(tested)
cfg["llm"] = "gemini-2.5-flash"
json.dump(cfg, open("rag_run_config.json","w"), indent=2)
print("Updated rag_run_config.json")

Updated rag_run_config.json


In [10]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
import shutil

splitter_small = RecursiveCharacterTextSplitter(chunk_size=300, chunk_overlap=50)
splits_small = splitter_small.split_documents(raw_docs)
print("Small-chunk count:", len(splits_small))

shutil.rmtree("./chroma_gemini_small", ignore_errors=True)
vs_gem_small = Chroma.from_documents(splits_small, embedding=g_emb, persist_directory="./chroma_gemini_small")
vs_gem_small.persist()
retriever_gem_small = vs_gem_small.as_retriever(search_kwargs={"k": 4})

qa_gem_small = RetrievalQA.from_chain_type(
    llm=llm, chain_type="stuff", retriever=retriever_gem_small, return_source_documents=True
)

def compare_chunks(query: str):
    print("\n=== Chunk Size: 500/100 vs 300/50 (Gemini embeddings) ===")
    r_def = qa_gem({"query": query})
    r_small = qa_gem_small({"query": query})

    print("\n[500/100] Q:", query, "\nA:", r_def.get("result",""))
    for i,d in enumerate(r_def.get("source_documents",[])[:3]):
        print(f"  Src {i+1}:", d.metadata.get("source","?"))

    print("\n[300/50] Q:", query, "\nA:", r_small.get("result",""))
    for i,d in enumerate(r_small.get("source_documents",[])[:3]):
        print(f"  Src {i+1}:", d.metadata.get("source","?"))

    return r_def, r_small

_ = compare_chunks("List the four stages of MDAgents in order.")

Small-chunk count: 1667

=== Chunk Size: 500/100 vs 300/50 (Gemini embeddings) ===

[500/100] Q: List the four stages of MDAgents in order. 
A: The four stages of MDAgents are:
1.  Medical complexity check
2.  Recruitment based on medical complexity
3.  Analysis and synthesis
4.  Final decision-making to return the answer
  Src 1: data/NeurIPS-2024-mdagents-an-adaptive-collaboration-of-llms-for-medical-decision-making-Paper-Conference.pdf
  Src 2: data/NeurIPS-2024-mdagents-an-adaptive-collaboration-of-llms-for-medical-decision-making-Paper-Conference.pdf
  Src 3: data/NeurIPS-2024-mdagents-an-adaptive-collaboration-of-llms-for-medical-decision-making-Paper-Conference.pdf

[300/50] Q: List the four stages of MDAgents in order. 
A: The provided text states that "The design of MDAgents (Figures 1 and 2) incorporates four stages: 1) Medical Complexity Check". However, it only lists the first stage and does not provide the remaining three stages.
  Src 1: data/NeurIPS-2024-mdagents-an-adap

In [None]:
import json
cfg = json.load(open("rag_run_config.json"))
cfg["chunk_size_alt"] = 300
cfg["chunk_overlap_alt"] = 50
json.dump(cfg, open("rag_run_config.json","w"), indent=2)
print("Updated rag_run_config.json with alternate chunk size.")

I ran Track-B with Gemini, indexed my three NeurIPS PDFs in Chroma, and asked questions through a RAG chain. For the embedding swap, MiniLM gave a cleaner one line VRR definition, while Gemini embeddings were a bit more interpretive but still on the right paper. When I changed chunking from 500/100 to 300/50, the smaller chunks sometimes clipped context on MDAgents it even returned only the first stage while the default chunks pulled all four stages properly. So my understanding is: in RAG, the embedding choice and chunk size strongly control what gets retrieved, and that directly shapes the answer. Because of that, I always looked at the source snippets to be sure the response was grounded.