# Install Packages

In [17]:
# ! pip install python-dotenv
# ! pip -q install --upgrade langchain-openai python-docx PyPDF2 pypdf
# ! pip install chromadb langchain openai langchain-chroma chroma-migrate

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
embedchain 0.1.128 requires langchain-openai<0.3.0,>=0.2.1, but you have langchain-openai 0.3.19 which is incompatible.
embedchain 0.1.128 requires rich<14.0.0,>=13.7.0, but you have rich 14.0.0 which is incompatible.
unstructured-client 0.27.0 requires pydantic<2.10.0,>=2.9.2, but you have pydantic 2.11.4 which is incompatible.
unstructured-client 0.27.0 requires python-dateutil==2.8.2, but you have python-dateutil 2.9.0.post0 which is incompatible.


Collecting chroma-migrate
  Using cached chroma_migrate-0.0.7-py3-none-any.whl.metadata (3.4 kB)
Collecting starlette<0.46.0,>=0.40.0 (from fastapi>=0.95.2->chromadb)
  Using cached starlette-0.45.3-py3-none-any.whl.metadata (6.3 kB)
Collecting clickhouse-connect==0.6.6 (from chroma-migrate)
  Using cached clickhouse_connect-0.6.6-py3-none-any.whl
Collecting duckdb==0.7.1 (from chroma-migrate)
  Using cached duckdb-0.7.1.tar.gz (10.5 MB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting chroma-bullet (from chroma-migrate)
  Using cached chroma_bullet-2.2.0-py3-none-any.whl.metadata (425 bytes)
Using cached starlette-0.45.3-py3-none-any.whl (71 kB)
Using cached chroma_migrate-0.0.7-py3-none-any.whl (12 kB)
Using cached chroma_bullet-2.2.0-py3-none-any.whl (11 kB)
Building wheels for collected packages: duckdb
  Building wheel for duckdb (setup.py): started
  Building wheel for duckdb (setup.py): finished with status 'error'


  DEPRECATION: Building 'duckdb' using the legacy setup.py bdist_wheel mechanism, which will be removed in a future version. pip 25.3 will enforce this behaviour change. A possible replacement is to use the standardized build interface by setting the `--use-pep517` option, (possibly combined with `--no-build-isolation`), or adding a `pyproject.toml` file to the source tree of 'duckdb'. Discussion can be found at https://github.com/pypa/pip/issues/6334
  error: subprocess-exited-with-error
  
  python setup.py bdist_wheel did not run successfully.
  exit code: 1
  
  [142 lines of output]
  C:\Users\lenovo\anaconda3\Lib\site-packages\setuptools\__init__.py:94: _DeprecatedInstaller: setuptools.installer and fetch_build_eggs are deprecated.
  !!
  
          ********************************************************************************
          Requirements should be satisfied by a PEP 517 installer.
          If you are using pip, you can try `pip install --use-pep517`.
          ****

In [18]:
import chromadb
print("ChromaDB SDK version:", chromadb.__version__)

ChromaDB SDK version: 0.5.23


# Load Secrets

In [15]:
import os
from dotenv import load_dotenv
from pathlib import Path

# Move up one directory to find .env
env_path = Path.cwd().parent / '.env'

# Load the .env file
load_dotenv(dotenv_path=env_path)

# Access your keys
groq_api_key = os.getenv('GROQ_API_KEY')
openai_api_key = os.getenv('OPENAI_API_KEY')

print(f"GROQ_API_KEY: {bool(groq_api_key)}")
print(f"OPENAI_API_KEY: {bool(openai_api_key)}")


GROQ_API_KEY: True
OPENAI_API_KEY: True


# Initialize ChromaDB & LangChain

In [20]:
from langchain_openai import OpenAIEmbeddings

embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

In [21]:
from langchain_chroma import Chroma

vector_store = Chroma(
    collection_name="example_collection",
    embedding_function=embeddings,
    persist_directory="./chroma_db",  # Where to save data locally, remove if not necessary
)


print("✅ LangChain + Chroma vector store ready (persistent, local)!")

✅ LangChain + Chroma vector store ready (persistent, local)!


# Load your files, split into chunks, and upsert into ChromaDB

In [29]:
import os
from langchain.document_loaders import PyPDFLoader, Docx2txtLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

DATA_DIR = "./data"

# 1. Load & tag raw docs
raw_docs = []
for fname in os.listdir(DATA_DIR):
    path = os.path.join(DATA_DIR, fname)
    if fname.lower().endswith(".pdf"):
        loader = PyPDFLoader(path)
    elif fname.lower().endswith(".docx"):
        loader = Docx2txtLoader(path)
    elif fname.lower().endswith(".txt"):
        loader = TextLoader(path, encoding="utf-8")
    else:
        continue

    docs = loader.load()
    # attach filename so we can delete/update later
    for d in docs:
        d.metadata["source"] = fname
    raw_docs.extend(docs)

print(f"Loaded {len(raw_docs)} document chunks from {len(os.listdir(DATA_DIR))} files.")

# 2. Split into manageable pieces
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
chunks = splitter.split_documents(raw_docs)
print(f"Split into {len(chunks)} total chunks.")

Loaded 53 document chunks from 2 files.
Split into 179 total chunks.


In [31]:
print(chunks[0])

page_content='Addendum: Fitting the DESI BAO Data with Dark
Energy Driven by the Cohen–Kaplan–Nelson Bound
Patrick Adolf1∗, Martin Hirsch2†, Sara Krieg1‡, Heinrich P¨ as1§, Mustafa Tabet1¶
1Fakult¨ at f¨ ur Physik, Technische Universit¨ at Dortmund, D-44221 Dortmund, Germany
2Instituto de F` ısica Corpuscular (IFIC), Universidad de Valencia-CSIC,
E-46980 Valencia, Spain
April 23, 2025
Abstract
Motivated by the recent Year-2 data release of the DESI collaboration, we update our results on time-
varying dark energy models driven by the Cohen–Kaplan–Nelson bound. The previously found preference of
time-dependent dark energy models compared to ΛCDM is further strengthend by the new data release. For
our particular models, we find that this preference increases up to ≈ 2.6 σ depending on the used supernova
dataset.
1 Introduction
In this addendum, we update the results of our previous work [1] in the light of the recent Year-2 data release' metadata={'producer': 'pdfTeX-1.40.25', 'creator':

In [35]:
# For a single document
print(chunks[0].metadata['source'])

# # To print the file names of all documents
# for doc in chunks:
#     print(doc.metadata['source'])

astro_physics_1.pdf


# Insert Embeddings into ChromaDB

In [41]:
# 3. Upsert your document chunks
vector_store.add_documents(chunks)

# 4. Write everything immediately to disk
# vector_store.persist()

print(f"✔️ Ingested {len(chunks)} chunks into 'file_embeddings' collection.")

✔️ Ingested 179 chunks into 'file_embeddings' collection.


# Filter and Query Using Filename

In [43]:
# # Replace with your actual filename
file_name = "CSE_1.pdf"

col = vector_store._collection

#  Use a metadata “where” filter to get only that file’s vectors
results = col.get(
    where={"source": file_name}
    # include=["metadatas"]    # <-- you can include metadatas/documents if you want, but not "ids"
)
vector_ids = results["ids"]

print(f"🔍 Found {len(vector_ids)} embeddings for '{file_name}':\n{vector_ids}")

🔍 Found 163 embeddings for 'CSE_1.pdf':
['72c7c99c-2353-4cb0-b062-9d164da1811d', '85965409-3ee3-4e67-aa33-837b6987b3ca', '10b10807-24cb-4c24-adba-8043ece05a44', '4d9146e9-d15e-4f13-98ff-68c2d70e9fe4', 'da4cec38-1d39-48dd-a0b5-9c7794d8bff2', '3d1a1341-d95f-42a3-974e-4dfe5e89157d', '9fe62d75-36fa-4f37-b6f8-d93bf1717c4a', '34a38db5-e823-4dbc-aea9-00a7b316f2a5', 'cb40398f-5a0f-4a06-88fb-31d89ef3013a', 'd7626970-b48c-49f4-8f78-010bb9c79a57', 'dc2aac5e-bbf3-4d86-a343-5056327d3b98', '5e42eed3-9fef-490a-9b6c-ee3dc34273eb', '7349472c-13aa-4098-8dcd-10a62adc792f', '69633824-8d2a-4941-bb8f-b05c66585af9', 'a9ef68ea-213a-4e01-86ad-1776e3565d57', '310b471d-1481-454b-9beb-43200549e07d', 'b4d67271-386d-4ef2-8d33-6ebc02598b39', '132a34d0-80c5-41d1-a3f7-e7b5aab5dfd3', '483a14a0-4da7-46cc-8dad-7cb92b0136c9', 'a0feb1b1-496b-45af-93a2-124aafce6b0b', 'b4f2bb00-5554-409d-bfc8-08d3842f567e', '81d8b252-6642-4cc0-9bcf-a6f0fabd5854', '423406f6-d11c-4e56-ac88-0b561138cbcb', '464f7d80-146e-4a2c-8cb0-c33c98742f3b',

# Create a Retriever (For RAG)

In [46]:
# Step 1: build your retriever (no extra imports needed)
retriever =  vector_store.as_retriever(
    search_type="mmr", search_kwargs={"k": 1, "fetch_k": 5}
)
print("Retriever initialized:", retriever)

Retriever initialized: tags=['Chroma', 'OpenAIEmbeddings'] vectorstore=<langchain_chroma.vectorstores.Chroma object at 0x0000012B1970FC20> search_type='mmr' search_kwargs={'k': 1, 'fetch_k': 5}


# Create the RetrievalQA chain

In [49]:
from langchain_openai import ChatOpenAI
from langchain.chains import RetrievalQA

# 1. Instantiate an LLM
llm = ChatOpenAI(temperature=0)

# 2. Build the RetrievalQA chain
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True
)

print("✅ RetrievalQA chain is ready")


✅ RetrievalQA chain is ready


# Test your RetrievalQA chain

In [52]:
# 1. Define your query
# query = "What are the main topics covered in CSE_1.pdf?"
query = "Tell me about Learning in Dynamic Bayesian Stackelberg Games"

# 2. Run the RetrievalQA chain
result =  qa_chain.invoke({"query": query})

# 3. Print the answer
print("📝 Answer:\n", result["result"])

# 4. Inspect source documents
print("\n🔍 Source chunks:")
for doc in result["source_documents"]:
    print(f"- {doc.metadata['source']} (chunk snippet: {doc.page_content[:150]}...)")

📝 Answer:
 In Learning in Dynamic Bayesian Stackelberg Games, the leader's focus is on maximizing their utility over the rounds of the game rather than identifying the follower's type, which is common in traditional learning paradigms. The leader plays a Dynamic Bayesian Stackelberg Policy (DSP) that specifies their strategy at each round, committing to it before the game starts. The follower observes this policy in advance. The optimal DSP is called the Dynamic Bayesian Stackelberg Equilibrium (DSE). This approach is used in various fields such as dynamic pricing problems, dynamic mechanism design, and Stackelberg security games.

🔍 Source chunks:
- CSE_1.pdf (chunk snippet: drawn from µ. We denote a specific dynamic Bayesian Stackelberg game as {R, Θ, {Cθ}θ∈Θ, µ, T}.
The leader plays a Dynamic Bayesian Stackelberg Policy ...)


# Initiate Agno Agent

In [65]:
from agno.agent import Agent
from agno.tools.duckduckgo import DuckDuckGoTools
from agno.knowledge.langchain import LangChainKnowledgeBase
from langchain_openai import ChatOpenAI
from langchain.chains import RetrievalQA
from pydantic import BaseModel, Field
from typing import List

In [69]:
# Knowledge Base
knowledge_base = LangChainKnowledgeBase(retriever=retriever)


In [71]:
# Define Pydantic model for structured output
class AnswerOutput(BaseModel):
    query: str = Field(..., description="The user's original query.")
    answer: str = Field(..., description="The answer to the user's query.")
    type: str = Field(..., description="Source of the answer: 'Knowledge/RAG' or 'Web Search'.")
    sources: List[str] = Field(..., description="List of sources used (URLs or document names).")


In [77]:
# Agent with structured_output using Pydantic model
agent = Agent(
    tools=[DuckDuckGoTools()],
    knowledge=knowledge_base,
    instructions=[
        "Try to answer the query using the knowledge base (RAG) first.",
        "If the knowledge base does not contain the information or the user asks for a web search, use DuckDuckGo.",
        "Return your response following the provided structured format."
    ],
    response_model=AnswerOutput,  # <-- Pydantic model here
    show_tool_calls=True,
    markdown=False,
)

# Test the agent

In [115]:
from agno.agent import RunResponse
from rich.pretty import pprint
from datetime import datetime

start_time = datetime.now()
print(f"Start Time: {start_time.strftime('%Y-%m-%d %H:%M:%S')}")

# Test the agent
# response: RunResponse = agent.run("What are the latest developments in AI, and how do they relate to the concepts in our knowledge base?")
response: RunResponse = agent.run("Tell me about Learning in Dynamic Bayesian Stackelberg Games")

end_time = datetime.now()
elapsed = end_time - start_time

print(f"End Time: {end_time.strftime('%Y-%m-%d %H:%M:%S')}")
print(f"Elapsed Time: {elapsed.total_seconds():.2f} seconds")  # <-- FIXED


Start Time: 2025-06-04 16:26:14
End Time: 2025-06-04 16:26:23
Elapsed Time: 8.72 seconds


In [116]:
# Pretty print the structured content (your AnswerOutput model)
pprint(response.content)

In [117]:
# Or access individual fields
print(f"Query: {response.content.query}")
print(f"Answer: {response.content.answer}")
print(f"Type: {response.content.type}")
print(f"Sources: {response.content.sources}")

Query: Learning in Dynamic Bayesian Stackelberg Games
Answer: In a Dynamic Bayesian Stackelberg Game, the leader's main objective is to maximize her utility over multiple rounds rather than just identifying the follower type. The leader commits to a Dynamic Bayesian Stackelberg Policy (DSP) before the game starts, which specifies the leader's strategy at each round. This policy is observed by the follower in advance. This type of commitment is common in literature related to dynamic pricing problems, dynamic mechanism design, and Stackelberg security games. The optimal DSP is referred to as the Dynamic Bayesian Stackelberg Equilibrium (DSE).
Type: Knowledge/RAG
Sources: ['CSE_1.pdf']


# Multi-Lingual Agent

In [122]:
# Agent with structured_output using Pydantic model
agent = Agent(
    tools=[DuckDuckGoTools()],
    knowledge=knowledge_base,
    instructions=[
        "Understand user's query and translate into English to use the knowledge-base and web search"
        "Try to answer the query using the knowledge base (RAG) first.",
        "If the knowledge base does not contain the information or the user asks for a web search, use DuckDuckGo.",
        "Return your response following the provided structured format."
        "Answer in the same language as of user's query"
    ],
    response_model=AnswerOutput,  # <-- Pydantic model here
    show_tool_calls=True,
    markdown=False,
)

In [124]:
from agno.agent import RunResponse
from rich.pretty import pprint
from datetime import datetime

start_time = datetime.now()
print(f"Start Time: {start_time.strftime('%Y-%m-%d %H:%M:%S')}")

# Test the agent
# response: RunResponse = agent.run("What are the latest developments in AI, and how do they relate to the concepts in our knowledge base?")
response: RunResponse = agent.run("ডায়নামিক বেয়েসিয়ান স্ট্যাকেলবার্গ গেমসে শেখা সম্পর্কে আমাকে বলুন")

end_time = datetime.now()
elapsed = end_time - start_time

print(f"End Time: {end_time.strftime('%Y-%m-%d %H:%M:%S')}")
print(f"Elapsed Time: {elapsed.total_seconds():.2f} seconds")  # <-- FIXED


Start Time: 2025-06-04 16:39:01
End Time: 2025-06-04 16:39:14
Elapsed Time: 12.39 seconds


In [125]:
# Pretty print the structured content (your AnswerOutput model)
pprint(response.content)