# CS 5588 — RAG with LangChain, Chroma, and Gemini Free API
_Generated: 2025-09-14T13:53:05_

### 1) Install

In [None]:

!pip -q install -U langchain langchain-community chromadb pypdf             sentence-transformers transformers tiktoken             langchain-google-genai google-genai
print("If upgraded core libs, consider restarting runtime.")


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m56.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.8/19.8 MB[0m [31m112.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.5/310.5 kB[0m [31m28.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.7/50.7 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m245.6/245.6 kB[0m [31m24.9 MB/s[0m eta [3

### 2) Keys & Imports

In [None]:

import os, getpass, json, sys, platform, pathlib, datetime, importlib
# Resolve the Gemini API key: reuse the one already in the environment,
# otherwise prompt for it without echoing it into the notebook output.
if not os.getenv("GEMINI_API_KEY"):
    # Pasted keys often carry stray whitespace/newlines; strip them.
    entered_key = getpass.getpass("Enter your GEMINI_API_KEY: ").strip()
    if not entered_key:
        # Fail here rather than with an obscure authentication error at the first API call.
        raise ValueError("GEMINI_API_KEY is empty; re-run this cell and enter a valid key.")
    os.environ["GEMINI_API_KEY"] = entered_key
# Mirror the key under GOOGLE_API_KEY (presumably the name the LangChain Google
# integration looks up -- confirm). `or` also covers a GOOGLE_API_KEY that is set but empty.
os.environ["GOOGLE_API_KEY"] = os.environ.get("GOOGLE_API_KEY") or os.environ["GEMINI_API_KEY"]

from google import genai
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader, TextLoader
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import SentenceTransformerEmbeddings
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
from langchain.chains import RetrievalQA

# Create the working folders next to the notebook; existing folders are left alone.
for folder_name in ("data", "artifacts"):
    pathlib.Path(folder_name).mkdir(exist_ok=True)
print("Env ready.")


Enter your GEMINI_API_KEY: ··········
Env ready.


### 3) Log environment → env_rag.json

In [None]:

def pv(m):
    """Return the version of the importable module ``m`` for the environment log.

    Args:
        m: Dotted module name, e.g. ``"chromadb"`` or ``"google.genai"``.

    Returns:
        The module's ``__version__`` when it defines one; otherwise the version
        recorded in the installed distribution's metadata; ``"unknown"`` when
        neither is available; ``"not installed"`` when the import itself fails.
    """
    try:
        mod = importlib.import_module(m)
    except Exception:
        # Best-effort probe: any import-time failure counts as "not installed",
        # but KeyboardInterrupt/SystemExit still propagate (unlike a bare except).
        return "not installed"

    version = getattr(mod, "__version__", None)
    if version is not None:
        return version

    # Some packages expose no __version__ attribute; fall back to the metadata
    # of the installed distribution. Local import so the submodule is loaded
    # even though the file only does `import importlib`.
    from importlib import metadata

    # Heuristic module-name -> distribution-name mapping ("google.genai" -> "google-genai").
    dist_name = m.replace("_", "-").replace(".", "-")
    try:
        return metadata.version(dist_name) or "unknown"
    except metadata.PackageNotFoundError:
        return "unknown"
# Snapshot of the interpreter, the OS and the versions of the key packages,
# written to env_rag.json and echoed so it is visible in the notebook output.
tracked_modules = [
    "langchain", "langchain_community", "chromadb", "tiktoken", "transformers",
    "sentence_transformers", "langchain_google_genai", "google.genai",
]
env = {
    "timestamp": datetime.datetime.now().isoformat(),
    "python": sys.version,
    "platform": platform.platform(),
    "packages": dict((module_name, pv(module_name)) for module_name in tracked_modules),
}
env_json = json.dumps(env, indent=2)
with open("env_rag.json", "w") as f:
    f.write(env_json)
print(env_json)


{
  "timestamp": "2025-09-19T04:04:53.324542",
  "python": "3.12.11 (main, Jun  4 2025, 08:56:18) [GCC 11.4.0]",
  "platform": "Linux-6.1.123+-x86_64-with-glibc2.35",
  "packages": {
    "langchain": "0.3.27",
    "langchain_community": "0.3.29",
    "chromadb": "1.1.0",
    "tiktoken": "0.11.0",
    "transformers": "4.56.1",
    "sentence_transformers": "5.1.0",
    "langchain_google_genai": "unknown",
    "google.genai": "1.38.0"
  }
}


### 4) Upload documents

In [None]:

# Optional Colab-only step: open the browser upload dialog and copy every
# uploaded file into ./data. Outside Colab (or on any failure) the error is
# reported and the notebook continues with whatever is already in ./data.
try:
    from google.colab import files
    import os
    up = files.upload()  # mapping of uploaded file name -> raw bytes (see the loop below)
    os.makedirs("data", exist_ok=True)
    for n, c in up.items():
        # Context manager so each file is flushed and closed right after writing.
        with open(os.path.join("data", n), "wb") as out_file:
            out_file.write(c)
    print("Uploaded:", list(up.keys()))
except Exception as e:
    print("Colab upload UI not available.", e)


Saving paper1.pdf to paper1.pdf
Saving paper2.pdf to paper2.pdf
Saving paper3.pdf to paper3.pdf
Uploaded: ['paper1.pdf', 'paper2.pdf', 'paper3.pdf']


### 5) Load & chunk

In [None]:

import os
def load_docs(folder="data"):
    """Load every supported document found directly inside ``folder``.

    PDF files are read with ``PyPDFLoader``; ``.txt``/``.md``/``.markdown``
    files with ``TextLoader`` (UTF-8). Sub-directories are ignored, other file
    types are reported with "Skip", and a file that fails to load is reported
    with "Fail" without aborting the remaining files.

    Args:
        folder: Directory to scan (not recursive).

    Returns:
        A list with the documents returned by each loader's ``load()``,
        in file-name order.
    """
    docs = []
    # os.listdir order is arbitrary; sorting keeps document (and later chunk)
    # order stable from run to run.
    for fname in sorted(os.listdir(folder)):
        p = os.path.join(folder, fname)
        if not os.path.isfile(p):
            continue
        # splitext yields "" for names without a real extension, so a file
        # literally named "pdf" (or a dotfile ".pdf") is not treated as a PDF.
        ext = os.path.splitext(fname)[1].lower().lstrip(".")
        try:
            if ext == "pdf":
                loader = PyPDFLoader(p)
            elif ext in ("txt", "md", "markdown"):
                loader = TextLoader(p, encoding="utf-8")
            else:
                print("Skip", fname)
                continue
            docs.extend(loader.load())
        except Exception as e:
            # Best-effort ingestion: one unreadable file must not stop the others.
            print("Fail", fname, e)
    return docs
import json

# Chunking parameters defined once, so the splitter and the saved run config
# cannot drift apart.
CHUNK_SIZE = 500
CHUNK_OVERLAP = 100

raw_docs = load_docs("data")
print("Loaded", len(raw_docs))

splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
splits = splitter.split_documents(raw_docs)
print("Chunks:", len(splits))
if splits:
    # Preview of the first chunk as a quick sanity check of the extraction.
    print(splits[0].page_content[:400])

# Seed the run config; later cells fill in the embedding models that were tested.
rag_run_config = {
    "chunk_size": CHUNK_SIZE,
    "chunk_overlap": CHUNK_OVERLAP,
    "embedding_models_tested": [],
    "llm": None,
    "retriever_k": 4,
}
# Context manager so the file is flushed and closed before a later cell reads it back.
with open("rag_run_config.json", "w") as cfg_file:
    json.dump(rag_run_config, cfg_file, indent=2)




Loaded 97
Chunks: 667
When AI Meets Finance (StockAgent): Large Language
Model-based Stock Trading in Simulated Real-world
Environments
CHONG ZHANG∗, University of Liverpool, UK
XINYI LIU∗, Peking University, China
ZHONGMOU ZHANG∗, Shanghai University of Finance and Economics, China
MINGYU JIN, Rutgers University, USA
LINGYAO LI,University of Michigan, USA
ZHENTING WANG, Rutgers University, USA
WENYUE HUA, Rutgers Univ


### 6) Vector DB (Chroma) + baseline embeddings

In [None]:

from langchain_community.embeddings import SentenceTransformerEmbeddings
from langchain_community.vectorstores import Chroma
# Baseline index: embed the chunks with MiniLM and store them in a Chroma
# collection persisted under ./chroma_minilm.
BASELINE_EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
emb = SentenceTransformerEmbeddings(model_name=BASELINE_EMBEDDING_MODEL)
vs = Chroma.from_documents(splits, embedding=emb, persist_directory="./chroma_minilm")
# No explicit vs.persist(): the call is deprecated (it only produced the warning
# shown in this cell's output); with a persist_directory, Chroma >= 0.4 writes
# to disk automatically.
retriever = vs.as_retriever(search_kwargs={"k":4})
print("Vector store ready.")

# Record the embedding model in the run config; files are closed explicitly and
# the name is added only once so re-running the cell does not create duplicates.
with open("rag_run_config.json") as cfg_file:
    cfg = json.load(cfg_file)
if BASELINE_EMBEDDING_MODEL not in cfg["embedding_models_tested"]:
    cfg["embedding_models_tested"].append(BASELINE_EMBEDDING_MODEL)
with open("rag_run_config.json", "w") as cfg_file:
    json.dump(cfg, cfg_file, indent=2)


  emb = SentenceTransformerEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Vector store ready.


  vs.persist()


### 7) RetrievalQA with Gemini

In [None]:

# Gemini chat model plus a "stuff" RetrievalQA chain over the baseline retriever;
# the chain also returns the retrieved documents so answers can be traced to sources.
llm = ChatGoogleGenerativeAI(
    model="gemini-2.5-flash",
    temperature=0.2,
)
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True,
)
def ask(q, max_sources=3):
    """Run question ``q`` through the global ``qa`` chain and print the result.

    Prints the question, the chain's ``"result"`` and a one-line preview
    (first 160 characters, newlines flattened) of the leading retrieved
    source documents.

    Args:
        q: The natural-language question to send to the chain.
        max_sources: Maximum number of source documents to list (default 3,
            the value that was previously hard-coded).
    """
    # .invoke() replaces the deprecated qa({...}) call style and matches the
    # other cells of this notebook.
    r = qa.invoke({"query": q})
    print("\nQ:", q)
    print("A:", r.get("result", ""))
    print("\nSources:")
    for i, d in enumerate(r.get("source_documents", [])[:max_sources], start=1):
        snippet = d.page_content[:160].replace("\n", " ") + "..."
        print(f"[{i}] {d.metadata.get('source','?')} ::", snippet)
# Smoke test of the chain with one broad question.
ask(q="What are the main findings relevant to our project domain?")


  r=qa({"query":q})



Q: What are the main findings relevant to our project domain?
A: Based on the provided text, the main findings relevant to a project domain likely involving market analysis, trading, or AI systems are:

1.  **Market Sentiment & Activity:** There are mixed reactions in the business community, with tech executives optimistic about potential deregulation and increased innovation leading to more spending and dealmaking.
2.  **Structured Market Analysis Process:** A two-tiered team approach is used for market analysis:
    *   An **Analyst Team** synthesizes data from multiple sources for holistic market analysis.
    *   A **Researcher Team** critically evaluates this information, comprising agents with both bullish and bearish perspectives.
3.  **Explainable AI Systems:** There is a focus on creating explainable AI systems where decisions are supported by evidence and are transparent.
4.  **Opportunities for Technical Analysis:** Future work includes performing more technical analysis by

### 8) Mini-experiments (embedding swap & chunk sensitivity) — optional

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_community.embeddings import SentenceTransformerEmbeddings
from langchain_community.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain_google_genai import ChatGoogleGenerativeAI

# Split papers (same chunking for both stores, so only the embedding model differs)
splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
chunks = splitter.split_documents(raw_docs)

# Embeddings
g_emb = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004")   # Gemini
mini_emb = SentenceTransformerEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")  # free

# Chroma DBs
db_g = Chroma.from_documents(chunks, g_emb, persist_directory="chroma_gemini")
# The MiniLM run gets its own directory: "chroma_minilm" already holds the
# baseline store built in section 6, and writing into it again would add every
# chunk a second time, so the retriever would return duplicate passages.
db_m = Chroma.from_documents(chunks, mini_emb, persist_directory="chroma_minilm_swap")

# Retriever + QA
llm = ChatGoogleGenerativeAI(model="gemini-2.5-flash", temperature=0.2)

qa_g = RetrievalQA.from_chain_type(llm=llm, retriever=db_g.as_retriever(), chain_type="stuff")
qa_m = RetrievalQA.from_chain_type(llm=llm, retriever=db_m.as_retriever(), chain_type="stuff")

# One shared question so both chains are guaranteed to be compared on the same input.
swap_question = "What is the main contribution of paper1?"
print("Gemini Embeddings:", qa_g.invoke({"query": swap_question}))
print("MiniLM Embeddings:", qa_m.invoke({"query": swap_question}))


Gemini Embeddings: {'query': 'What is the main contribution of paper1?', 'result': "I'm sorry, but the provided text does not contain information about the main contribution of the paper. It includes licensing details, publication status, author affiliations, and a disclaimer, but no summary of the paper's content or contributions."}
MiniLM Embeddings: {'query': 'What is the main contribution of paper1?', 'result': 'I\'m sorry, but the provided context does not contain any information about "paper1" or its main contribution. The text describes types of documents from an Analyst Team and Traders, and lists a series of names.'}


In [None]:
# Chunk-size sensitivity: rebuild the index with smaller chunks (300/50) and
# compare against the 500/100 chunks, keeping the embedding model fixed.
small_splitter = RecursiveCharacterTextSplitter(chunk_size=300, chunk_overlap=50)
small = small_splitter.split_documents(raw_docs)

# MiniLM for both stores, which saves Gemini API quota.
db_500 = Chroma.from_documents(chunks, embedding=mini_emb, persist_directory="chroma_minilm_500")
db_300 = Chroma.from_documents(small, embedding=mini_emb, persist_directory="chroma_minilm_300")

qa_500 = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=db_500.as_retriever(),
    chain_type="stuff",
)
qa_300 = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=db_300.as_retriever(),
    chain_type="stuff",
)

# Same question for both chains, asked in the original order (500 first, then 300).
for label, chain in (("500-chunks:", qa_500), ("300-chunks:", qa_300)):
    print(label, chain.invoke({"query": "Summarize paper2 in one sentence."}))


500-chunks: {'query': 'Summarize paper2 in one sentence.', 'result': 'I\'m sorry, but the provided text does not label the papers as "paper1," "paper2," etc., nor does it provide summaries for them. Therefore, I cannot summarize "paper2."'}
300-chunks: {'query': 'Summarize paper2 in one sentence.', 'result': 'Paper 2, authored by Andres Alonso-Robisco and José Manuel Carbó in 2023, analyzes the narrative surrounding Central Bank Digital Currencies (CBDC) by central banks using large language models.'}
