## Install Dependencies

In [None]:
!pip install \
  langchain \
  langchain-community \
  langchain-chroma \
  langchain-google-genai \
  chromadb \
  pypdf \
  python-dotenv \
  tqdm


## Get Gemini API Working

In [2]:
from dotenv import load_dotenv
# Read key/value pairs from a local .env file into the process environment.
# The call is left as the cell's last expression so its return value is shown
# as the cell output.
# NOTE(review): presumably this is how the Gemini API key reaches the
# langchain-google-genai client used later -- confirm the variable name in .env.
load_dotenv()

True

## Load PDFs

In [18]:
from langchain_community.document_loaders import PyPDFLoader
import os

# Directory scanned for PDF files; a relative path, so it is resolved against
# the notebook's current working directory.
PDF_DIR = "data/event_pdfs"

def load_pdfs(pdf_dir):
    """Load every PDF in ``pdf_dir`` and return the results as one flat list.

    Each file whose name ends in ``.pdf`` is read with ``PyPDFLoader``; every
    item returned by ``loader.load()`` gets a ``paper_id`` metadata entry
    holding the file name without its trailing ``.pdf`` extension.

    Args:
        pdf_dir: Directory to scan. Only its direct entries are considered
            (no recursion into sub-directories).

    Returns:
        list: Loaded items from all PDFs, grouped by file in sorted
        file-name order. Empty when the directory holds no ``.pdf`` files.

    Raises:
        OSError: Propagated from ``os.listdir`` if ``pdf_dir`` cannot be read.
    """
    suffix = ".pdf"
    documents = []

    # os.listdir() returns entries in arbitrary order; sorting keeps the
    # document order (and anything derived from it) reproducible across runs.
    for file in sorted(os.listdir(pdf_dir)):
        if not file.endswith(suffix):
            continue

        path = os.path.join(pdf_dir, file)
        docs = PyPDFLoader(path).load()

        # Strip only the trailing extension. str.replace(".pdf", "") would also
        # remove ".pdf" occurring earlier in the name (e.g. "a.pdf.v2.pdf").
        paper_id = file[: -len(suffix)]
        for d in docs:
            d.metadata["paper_id"] = paper_id
        documents.extend(docs)

    return documents


# Load every PDF found under PDF_DIR and report how many items came back.
# NOTE(review): the following cell loads the same directory again into
# `documents`, which is what the chunking cell consumes; `raw_docs` is not
# referenced anywhere else in the visible notebook -- consider keeping only one.
raw_docs = load_pdfs(PDF_DIR)
print(f"Loaded {len(raw_docs)} pages")


Ignoring wrong pointing object 34 0 (offset 0)
Ignoring wrong pointing object 781 0 (offset 0)
Ignoring wrong pointing object 785 0 (offset 0)
Ignoring wrong pointing object 1369 0 (offset 0)


Loaded 467 pages


In [None]:
## load pdfs

from langchain_community.document_loaders import PyPDFLoader
import os
from tqdm import tqdm

data_dir = "data/event_pdfs"

# Pick the PDFs up front, in sorted order: os.listdir() order is arbitrary, and
# filtering before the loop makes the progress bar count PDF files only.
pdf_files = sorted(name for name in os.listdir(data_dir) if name.endswith(".pdf"))

documents = []
for file in tqdm(pdf_files):
    path = os.path.join(data_dir, file)

    # loader: receives the path of a single file; load() returns a list of Document objects
    # Each Document object represents a page from the PDF
    loader = PyPDFLoader(path)
    docs = loader.load()

    # Attach paper ID to metadata. Only the trailing extension is stripped;
    # str.replace(".pdf", "") would also remove ".pdf" inside the name.
    paper_id = file[: -len(".pdf")]
    for d in docs:
        d.metadata["paper_id"] = paper_id

    # use extend to add multiple items to a list, not append to create nested list
    documents.extend(docs)

print(f"Loaded {len(documents)} pages")

 23%|██▎       | 7/31 [00:02<00:07,  3.05it/s]Ignoring wrong pointing object 34 0 (offset 0)
Ignoring wrong pointing object 781 0 (offset 0)
Ignoring wrong pointing object 785 0 (offset 0)
Ignoring wrong pointing object 1369 0 (offset 0)
100%|██████████| 31/31 [00:48<00:00,  1.56s/it]

Loaded 467 pages





In [None]:
# Split into Chunks  

In [None]:
# ---------------------------------------------------------------------------
# Split the loaded documents into overlapping chunks
# ---------------------------------------------------------------------------

from langchain_text_splitters import RecursiveCharacterTextSplitter

# Splitter settings, named so they are easy to find and tune.
CHUNK_SIZE = 500
CHUNK_OVERLAP = 200

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)

# `documents` is the list built by the PDF-loading cell above.
chunks = text_splitter.split_documents(documents)

print(f"\nnum. chunks {len(chunks)}")


num. chunks 6025


### Explanation (don't run)

In [None]:
# Illustrative example only. Everything here lives in its own `example_*`
# names, so executing this cell (for instance via "Run All") cannot overwrite
# the real `docs` / `documents` objects loaded earlier in the notebook.

# What page_content might look like for page 1 of a research paper:
example_page_content = """
Event Cameras: A New Paradigm for Computer Vision
John Doe, Jane Smith
University of Technology

Abstract—Event cameras are bio-inspired sensors...

I. INTRODUCTION
Traditional frame-based cameras...

Figure 1: Comparison of event camera vs conventional camera.
"""
# ===========================================================================

# Metadata before the paper_id modification:
example_metadata_before = {
    "source": "data/event_pdfs/event_camera_survey.pdf",  # File path
    "page": 0  # Page number (0-indexed, so page 0 = actual page 1)
}

# Metadata after the paper_id modification:
example_metadata_after = {
    "source": "data/event_pdfs/event_camera_survey.pdf",
    "page": 0,
    "paper_id": "event_camera_survey"  # <- the added field
}

# ===========================================================================

# Layout of the flat `documents` list for three hypothetical PDFs with
# 5, 3 and 7 pages. Each entry stands in for one Document as
# (paper_id, 1-based page number).
example_page_counts = {
    "event_camera_basics": 5,
    "davis_sensors": 3,
    "applications_robotics": 7,
}
example_documents = [
    (paper_id, page_number)
    for paper_id, page_count in example_page_counts.items()
    for page_number in range(1, page_count + 1)
]

# The list would contain 15 entries (5+3+7)
assert len(example_documents) == 15

# Accessing specific entries:
example_documents[0]   # Page 1 of event_camera_basics.pdf
example_documents[4]   # Page 5 of event_camera_basics.pdf (last page of this PDF)
example_documents[5]   # Page 1 of davis_sensors.pdf
example_documents[7]   # Page 3 of davis_sensors.pdf (last page)
example_documents[8]   # Page 1 of applications_robotics.pdf
example_documents[14]  # Page 7 of applications_robotics.pdf (last page)

## Create Embeddings + ChromaDB

In [None]:
# embedding model
from langchain_google_genai import GoogleGenerativeAIEmbeddings

# Model identifier kept in one named place so it is easy to swap.
EMBEDDING_MODEL = "models/embedding-001"

# NOTE(review): presumably consumed by the ChromaDB step of this section,
# which is not visible here -- confirm.
embeddings = GoogleGenerativeAIEmbeddings(model=EMBEDDING_MODEL)

