## Document Loading

In [None]:
from langchain_community.document_loaders import PyPDFLoader

# Location of the PDF to ingest, relative to the current working directory.
file_path = "./sfp_modules_uk.pdf"

# Build the loader first, then read the file into a list of Document objects.
loader = PyPDFLoader(file_path=file_path)
docs = loader.load()

# Sanity-check the load: PyPDFLoader creates 1 Document per PDF page.
print(len(docs))
# Preview the first 200 characters of the first Document, then its metadata.
print(f"{docs[0].page_content[:200]}\n")
print(docs[0].metadata)

## Document Splitting

In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Splitter configuration: a chunk size of 1000 with an overlap of 200 between
# neighbouring chunks; add_start_index=True asks the splitter to record where
# each chunk starts within its source Document.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    add_start_index=True,
)

# Split every loaded Document; the result is reused by the later cells.
all_splits = text_splitter.split_documents(documents=docs)

# Last expression of the cell, so the notebook displays the chunk count.
len(all_splits)

## Embeddings

In [None]:
from langchain_openai import OpenAIEmbeddings

# Embedding model; `embeddings` is reused when the vector store is built.
embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

# Embed the first two chunks, in order, to inspect what the model returns.
vector_1, vector_2 = (
    embeddings.embed_query(all_splits[idx].page_content) for idx in (0, 1)
)

# Both vectors are expected to have the same length.
assert len(vector_1) == len(vector_2)
print(f"Generated vectors of length {len(vector_1)}\n")
print(vector_1[:10])  # only the first ten components, to keep the output short

## Vector Stores

In [None]:
from langchain_core.vectorstores import InMemoryVectorStore

vector_store = InMemoryVectorStore(embeddings)
ids = vector_store.add_documents(documents=all_splits)

results = await vector_store.asimilarity_search(
    "What speed 10-Gigabit Ethernet operate at ?"
)
print(results[0])

## Retrievers 

In [None]:
# Wrap the vector store as a retriever using similarity search; search_kwargs
# is handed on to the search, here with k=1 (presumably one hit per query).
retriever = vector_store.as_retriever(
    search_type="similarity", search_kwargs=dict(k=1)
)

# Run both questions in a single batch call. This stays the last expression
# of the cell so the notebook displays the returned results.
retriever.batch([
    "What speed 10-Gigabit Ethernet operate at ?",
    "What speed does STM-4 operate at ?",
])
