- input: PDF file
- output: Vector DB representation

Steps:
1. Load & Parse PDF
2. Text splitting
3. Embedding
4. Vector DB

## 0. Install Modules

In [None]:
%pip install arxiv
%pip install langchain
%pip install pypdf
%pip install langchain_community
%pip install cohere
%pip install faiss-cpu

## 1. PDF Loader

In [11]:
# PDF File sample
import arxiv

def download_arxiv_doi(doi, save_path="paper.pdf"):
    '''
    download only arxiv papers
    '''
    if not doi.startswith("10.48550/arXiv."):
        raise Exception("Not an arXiv DOI.")
    
    arxiv_id = doi.split("arXiv.")[-1]
    search = arxiv.Search(id_list=[arxiv_id])
    paper = next(search.results())
    paper.download_pdf(filename=save_path)
    print(f"Downloaded {arxiv_id} to {save_path}")

# Example:
download_arxiv_doi("10.48550/arXiv.2510.18234")

  paper = next(search.results())


Downloaded 2510.18234 to paper.pdf


In [5]:
# Simple Parser
from langchain_community.document_loaders import PyPDFLoader
llm_loader = PyPDFLoader("paper.pdf")
pages = llm_loader.load_and_split()
len(pages)

22

In [7]:
pages[3].metadata

{'producer': 'pikepdf 8.15.1',
 'creator': 'arXiv GenPDF (tex2pdf:e76afa9)',
 'creationdate': '',
 'author': 'Haoran Wei; Yaofeng Sun; Yukun Li',
 'doi': 'https://doi.org/10.48550/arXiv.2510.18234',
 'license': 'http://creativecommons.org/licenses/by/4.0/',
 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.28 (TeX Live 2025) kpathsea version 6.4.1',
 'title': 'DeepSeek-OCR: Contexts Optical Compression',
 'trapped': '/False',
 'arxivid': 'https://arxiv.org/abs/2510.18234v1',
 'source': 'paper.pdf',
 'total_pages': 22,
 'page': 3,
 'page_label': '4'}

## 2. Text Splitter

In [14]:
from langchain_text_splitters import CharacterTextSplitter

text_splitter = CharacterTextSplitter(
    separator="",
    chunk_size=300,  # configurable variable
    length_function = len,
    chunk_overlap=50, # configurable variable
)

In [15]:
documents = []
metadatas = []
for page in pages:
    documents.append(page.page_content)
    metadatas.append(page.metadata)

In [16]:
chunks = text_splitter.create_documents(documents, metadatas)

In [17]:
len(chunks)

221

## 3. Embedding

In [19]:
from dotenv import dotenv_values
env_values = dotenv_values('app.env')
cohere_api_key = env_values['COHERE_API_KEY']

In [None]:
from langchain_community.embeddings.cohere import CohereEmbeddings
embedding_llm = CohereEmbeddings(cohere_api_key=cohere_api_key, user_agent="langchain")

## 4. Vector DB

In [None]:
from langchain_community.vectorstores import FAISS
vector_db = FAISS.from_documents(chunks, embedding_llm)

In [54]:
query = "what is Context Optical Compression?"
similar_docs = vector_db.similarity_search(query)

In [57]:
print(similar_docs[1].page_content)

components,
data engineering, and training skills.
3.2. DeepEncoder
To explore the feasibility of contexts optical compression, we need a vision encoder with the
following features: 1.Capable of processing high resolutions; 2.Low activation at high resolutions;
3.Few vision tokens; 4.Support for mul


In [65]:
save_to_dir = "faiss_vector_data"
vector_db.save_local(save_to_dir)