# Install Packages

In [None]:
# ! pip -q install --upgrade langchain-openai langchain-pinecone python-docx PyPDF2 pypdf

# ! pip -q install langchain-core langchain-openai langchain-community langchain-experimental langgraph langsmith

In [2]:
import pinecone
print("Pinecone SDK version:", pinecone.__version__)


Pinecone SDK version: 6.0.2


  from .autonotebook import tqdm as notebook_tqdm


# Load Secrets

In [3]:
# at the top of pinecone.ipynb
from dotenv import load_dotenv, find_dotenv
import os

# will walk up parent dirs until it finds a “.env”
load_dotenv(find_dotenv())  


# (2) Read the variables
openai_api_key   = os.getenv("OPENAI_API_KEY")
groq_api_key     = os.getenv("Groq_API_KEY")
pinecone_api_key = os.getenv("Pinecone_API_KEY")

# (3) (Optionally) verify they loaded correctly
if not all([
    openai_api_key,
    groq_api_key,
    pinecone_api_key,
    ]):
    raise RuntimeError("One or more API keys are missing in your .env file!")


# Replace with your actual key & env
os.environ["PINECONE_API_KEY"] = pinecone_api_key
os.environ["PINECONE_ENVIRONMENT"] = "us-west1-gcp"

# Initialize Pinecone & LangChain

In [4]:
import os
from pinecone import Pinecone, ServerlessSpec

# Pinecone client & index setup (same as before)
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
spec = ServerlessSpec(cloud="aws", region="us-east-1")
INDEX_NAME = "poc-23-04-25"
if INDEX_NAME not in pc.list_indexes().names():
    pc.create_index(
        name=INDEX_NAME,
        dimension=1536,
        metric="cosine",
        spec=spec
    )
index = pc.Index(INDEX_NAME)



print(f"✅ Connected to Pinecone index '{INDEX_NAME}' with new LangChain bindings")


✅ Connected to Pinecone index 'poc-23-04-25' with new LangChain bindings


In [5]:
from langchain_pinecone import PineconeVectorStore
from langchain_openai import OpenAIEmbeddings

# 1. Create your embeddings object
embeddings = OpenAIEmbeddings()

# 2. Initialize the vector store from your existing Pinecone index
vectorstore = PineconeVectorStore.from_existing_index(
    index_name=INDEX_NAME,   # the name you created above
    embedding=embeddings,    # pass the Embeddings instance, not a function
    text_key="text",         # field in metadata that holds your chunk text
    namespace=""             # optional: use per-file namespaces if you like
)

print("✅ LangChain PineconeVectorStore ready!")


✅ LangChain PineconeVectorStore ready!


# Load your files, split into chunks, and upsert into Pinecone

In [None]:
import os
from langchain.document_loaders import PyPDFLoader, Docx2txtLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

DATA_DIR = "./data"

# 1. Load & tag raw docs
raw_docs = []
for fname in os.listdir(DATA_DIR):
    path = os.path.join(DATA_DIR, fname)
    if fname.lower().endswith(".pdf"):
        loader = PyPDFLoader(path)
    elif fname.lower().endswith(".docx"):
        loader = Docx2txtLoader(path)
    elif fname.lower().endswith(".txt"):
        loader = TextLoader(path, encoding="utf-8")
    else:
        continue

    docs = loader.load()
    # attach filename so we can delete/update later
    for d in docs:
        d.metadata["source"] = fname
    raw_docs.extend(docs)

print(f"Loaded {len(raw_docs)} document chunks from {len(os.listdir(DATA_DIR))} files.")

# 2. Split into manageable pieces
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
chunks = splitter.split_documents(raw_docs)
print(f"Split into {len(chunks)} total chunks.")

Loaded 66 document chunks from 3 files.
Split into 266 total chunks.


In [16]:
print(chunks)

[Document(metadata={'producer': 'pdfTeX-1.40.25', 'creator': 'LaTeX with hyperref', 'creationdate': '2025-04-23T00:02:31+00:00', 'author': '', 'keywords': '', 'moddate': '2025-04-23T00:02:31+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) kpathsea version 6.3.5', 'subject': '', 'title': '', 'trapped': '/False', 'source': 'astro_physics_1.pdf', 'total_pages': 8, 'page': 0, 'page_label': '1'}, page_content='Addendum: Fitting the DESI BAO Data with Dark\nEnergy Driven by the Cohen–Kaplan–Nelson Bound\nPatrick Adolf1∗, Martin Hirsch2†, Sara Krieg1‡, Heinrich P¨ as1§, Mustafa Tabet1¶\n1Fakult¨ at f¨ ur Physik, Technische Universit¨ at Dortmund, D-44221 Dortmund, Germany\n2Instituto de F` ısica Corpuscular (IFIC), Universidad de Valencia-CSIC,\nE-46980 Valencia, Spain\nApril 23, 2025\nAbstract\nMotivated by the recent Year-2 data release of the DESI collaboration, we update our results on time-\nvarying dark energy models driven by the Cohen–Kaplan

In [17]:
print(chunks[0])

page_content='Addendum: Fitting the DESI BAO Data with Dark
Energy Driven by the Cohen–Kaplan–Nelson Bound
Patrick Adolf1∗, Martin Hirsch2†, Sara Krieg1‡, Heinrich P¨ as1§, Mustafa Tabet1¶
1Fakult¨ at f¨ ur Physik, Technische Universit¨ at Dortmund, D-44221 Dortmund, Germany
2Instituto de F` ısica Corpuscular (IFIC), Universidad de Valencia-CSIC,
E-46980 Valencia, Spain
April 23, 2025
Abstract
Motivated by the recent Year-2 data release of the DESI collaboration, we update our results on time-
varying dark energy models driven by the Cohen–Kaplan–Nelson bound. The previously found preference of
time-dependent dark energy models compared to ΛCDM is further strengthend by the new data release. For
our particular models, we find that this preference increases up to ≈ 2.6 σ depending on the used supernova
dataset.
1 Introduction
In this addendum, we update the results of our previous work [1] in the light of the recent Year-2 data release' metadata={'producer': 'pdfTeX-1.40.25', 'creator':

In [18]:
# For a single document
print(chunks[0].metadata['source'])

# To print the file names of all documents
for doc in chunks:
    print(doc.metadata['source'])


astro_physics_1.pdf
astro_physics_1.pdf
astro_physics_1.pdf
astro_physics_1.pdf
astro_physics_1.pdf
astro_physics_1.pdf
astro_physics_1.pdf
astro_physics_1.pdf
astro_physics_1.pdf
astro_physics_1.pdf
astro_physics_1.pdf
astro_physics_1.pdf
astro_physics_1.pdf
astro_physics_1.pdf
astro_physics_1.pdf
astro_physics_1.pdf
astro_physics_1.pdf
CSE_1.pdf
CSE_1.pdf
CSE_1.pdf
CSE_1.pdf
CSE_1.pdf
CSE_1.pdf
CSE_1.pdf
CSE_1.pdf
CSE_1.pdf
CSE_1.pdf
CSE_1.pdf
CSE_1.pdf
CSE_1.pdf
CSE_1.pdf
CSE_1.pdf
CSE_1.pdf
CSE_1.pdf
CSE_1.pdf
CSE_1.pdf
CSE_1.pdf
CSE_1.pdf
CSE_1.pdf
CSE_1.pdf
CSE_1.pdf
CSE_1.pdf
CSE_1.pdf
CSE_1.pdf
CSE_1.pdf
CSE_1.pdf
CSE_1.pdf
CSE_1.pdf
CSE_1.pdf
CSE_1.pdf
CSE_1.pdf
CSE_1.pdf
CSE_1.pdf
CSE_1.pdf
CSE_1.pdf
CSE_1.pdf
CSE_1.pdf
CSE_1.pdf
CSE_1.pdf
CSE_1.pdf
CSE_1.pdf
CSE_1.pdf
CSE_1.pdf
CSE_1.pdf
CSE_1.pdf
CSE_1.pdf
CSE_1.pdf
CSE_1.pdf
CSE_1.pdf
CSE_1.pdf
CSE_1.pdf
CSE_1.pdf
CSE_1.pdf
CSE_1.pdf
CSE_1.pdf
CSE_1.pdf
CSE_1.pdf
CSE_1.pdf
CSE_1.pdf
CSE_1.pdf
CSE_1.pdf
CSE_1.pdf
CSE_1.pdf


In [19]:
# 3. Upsert into your Pinecone index
vectorstore.add_documents(chunks)
print("✅ Upsert complete!")

✅ Upsert complete!


# Filter and Query Using Filename

In [28]:
# Replace with your actual filename
file_name = "CSE_1.pdf"

# 1. Query (filter) to see which vectors belong to that file
#    We pass a dummy vector since we only care about filtering here,
#    and set top_k to a high number to catch all matches.
dummy_vector = [0.0] * 1536
result = index.query(
    vector=dummy_vector,
    top_k=1000,
    include_metadata=True,
    include_values=False,
    filter={"source": file_name}
)

# Print out the matched IDs and metadata
for match in result.matches:
    print(match.id, match.metadata)


65b883b5-7bc5-4070-8b90-53ecfcb7c478 {'author': '', 'creationdate': '2025-04-23T00:23:21+00:00', 'creator': 'LaTeX with hyperref', 'keywords': '', 'moddate': '2025-04-23T00:23:21+00:00', 'page': 20.0, 'page_label': '21', 'producer': 'pdfTeX-1.40.25', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) kpathsea version 6.3.5', 'source': 'CSE_1.pdf', 'subject': '', 'text': 'example, we show that learning can be arbitrarily more powerful than communication. However, we\ndo not have an equivalent result to Corollary 1 for the relative performance of the DSE versus the\nRME. Is it also true that for random games the DSE achieves strictly higher leader utility compared\nto the RME? Based on our experimental results, we believe this is very likely, but proving it remains\nan open question.\nReferences\n[1] Gagan Aggarwal, Ashish Goel, and Rajeev Motwani. 2006. Truthful auctions for pricing search\nkeywords. In Proceedings of the 7th ACM Conference on Electronic

In [29]:
len(result.matches)

163

# 2. Delete by filter: remove all vectors whose metadata.source == file_name

In [21]:
vectorstore.delete(filter={"source": file_name})
print(f"Requested deletion of all vectors for '{file_name}'")


Requested deletion of all vectors for 'CSE_1.pdf'


# Implement “ingest a single file” and “update a file” routines

In [24]:
from typing import List
from langchain.document_loaders import PyPDFLoader, Docx2txtLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)

def load_and_split(path: str) -> List:
    """Load one file and split into chunks with metadata['source'] set."""
    fname = os.path.basename(path)
    # Choose loader by extension
    if path.lower().endswith(".pdf"):
        docs = PyPDFLoader(path).load()
    elif path.lower().endswith(".docx"):
        docs = Docx2txtLoader(path).load()
    elif path.lower().endswith(".txt"):
        docs = TextLoader(path, encoding="utf-8").load()
    else:
        raise ValueError(f"Unsupported file type: {path}")
    # Tag and split
    for d in docs:
        d.metadata["source"] = fname
    return splitter.split_documents(docs)

def ingest_file(file_name: str):
    """Upsert vectors for a single file."""
    path = os.path.join(DATA_DIR, file_name)
    chunks = load_and_split(path)
    vectorstore.add_documents(chunks)
    print(f"✅ Ingested {len(chunks)} chunks for '{file_name}'")

def update_file(file_name: str):
    """Delete old vectors then re-ingest updated content."""
    delete_file_vectors(file_name)
    ingest_file(file_name)
    print(f"✅ Updated vectors for '{file_name}'")


In [25]:
ingest_file("CSE_1.pdf")
# update_file("CSE_2.pdf")


✅ Ingested 163 chunks for 'CSE_1.pdf'


## Verify ingestion

In [31]:
# Build a zero vector matching your dimension
dummy = [0.0] * 1536

result = index.query(
    vector=dummy,
    top_k=100,
    include_metadata=True,
    include_values=False,
    filter={"source": "CSE_1.pdf"}
)
print(f"Found {len(result.matches)} chunks for 'CSE_1.pdf'")
# Optionally inspect a couple:
for m in result.matches[:3]:
    print(m.id, m.metadata["source"])


Found 100 chunks for 'CSE_1.pdf'
17eb908d-0a98-48c1-affa-075b60cadaa9 CSE_1.pdf
46e86b24-75f3-4b9a-9b0c-84359824b69e CSE_1.pdf
e7bd490f-15a2-46e9-8bf6-5816483de404 CSE_1.pdf


# Langchain

## Create a Retriever (For RAG)

In [33]:
# Step 1: build your retriever (no extra imports needed)
retriever = vectorstore.as_retriever(search_kwargs={"k": 3})
print("Retriever initialized:", retriever)


Retriever initialized: tags=['PineconeVectorStore', 'OpenAIEmbeddings'] vectorstore=<langchain_pinecone.vectorstores.PineconeVectorStore object at 0x000001C3F44C96D0> search_kwargs={'k': 3}


## Create the RetrievalQA chain

In [35]:
from langchain_openai import ChatOpenAI
from langchain.chains import RetrievalQA

# 1. Instantiate an LLM
llm = ChatOpenAI(temperature=0)

# 2. Build the RetrievalQA chain
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True
)

print("✅ RetrievalQA chain is ready")


✅ RetrievalQA chain is ready


## Test your RetrievalQA chain

In [42]:
# 1. Define your query
# query = "What are the main topics covered in CSE_1.pdf?"
query = "Tell me about Learning in Dynamic Bayesian Stackelberg Games"

# 2. Run the RetrievalQA chain
result =  qa_chain.invoke({"query": query})

# 3. Print the answer
print("📝 Answer:\n", result["result"])

# 4. Inspect source documents
print("\n🔍 Source chunks:")
for doc in result["source_documents"]:
    print(f"- {doc.metadata['source']} (chunk snippet: {doc.page_content[:150]}...)")


📝 Answer:
 Learning in Dynamic Bayesian Stackelberg Games refers to the process where a leader and a fully strategic follower interact repeatedly, with the follower's type being unknown. Contrary to some existing results, it has been shown that the leader can improve their utility through learning in repeated play in these settings. The effectiveness of learning in these games has been demonstrated without needing to limit the follower's strategic space. The improvement achieved through learning is not solely due to the leader's ability to commit, nor does learning simply substitute for communication between the leader and the follower. There are algorithms, including a mixed-integer linear program, that can be used to compute the optimal leader policy in these games, as well as heuristic algorithms to approximate the optimal dynamic policy more efficiently.

🔍 Source chunks:
- CSE_1.pdf (chunk snippet: to a large literature on learning in strategic settings that relies on limiting the