In [2]:
import os
from langchain_google_genai import GoogleGenerativeAIEmbeddings, GoogleGenerativeAI
from langchain_community.document_loaders import PyPDFLoader
from dotenv import load_dotenv

load_dotenv()

os.environ['GOOGLE_API_KEY'] = os.getenv('GOOGLE_API_KEY')

In [3]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [27]:
loader = PyPDFLoader('docs/Chenula IPD.pdf')
docs = loader.load()
docs

[Document(metadata={'producer': 'Microsoft® Word LTSC', 'creator': 'Microsoft® Word LTSC', 'creationdate': '2025-02-14T00:05:05+05:30', 'author': 'User', 'moddate': '2025-02-14T00:05:05+05:30', 'source': 'docs/Chenula IPD.pdf', 'total_pages': 65, 'page': 0, 'page_label': '1'}, page_content='INFORMATICS INSTITUTE OF TECHNOLOGY \nIn Collaboration with \nUNIVERSITY OF WESTMINSTER \n \nHybrid Prompt Compression for RAG Pipelines \n \nA Project Proposal by \nChenula Senkith Jayasinghe \n \nSupervised by \nMr Vinula Uthsara Buthgamumudalige \n \n \n \n \n \n \n02/2025'),
 Document(metadata={'producer': 'Microsoft® Word LTSC', 'creator': 'Microsoft® Word LTSC', 'creationdate': '2025-02-14T00:05:05+05:30', 'author': 'User', 'moddate': '2025-02-14T00:05:05+05:30', 'source': 'docs/Chenula IPD.pdf', 'total_pages': 65, 'page': 1, 'page_label': '2'}, page_content='Abstract  \n \nWith the huge growth of unstructured textual data in the healthcare domain, retrieval and \nsummarization of relevant inf

In [42]:
dir = './docs/'

In [43]:
for file in os.listdir(dir):
    if file.endswith(".pdf"):
        loader = PyPDFLoader(dir + file)
        docs = loader.load()
        
        docs
        # break

In [7]:
# text_splitter = RecursiveCharacterTextSplitter(
#     chunk_size=100,
#     chunk_overlap=20,
#     length_function=len,
#     is_separator_regex=False,
# )

In [44]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
all_splits = text_splitter.split_documents(docs)
all_splits

[Document(metadata={'producer': 'Microsoft® Word LTSC', 'creator': 'Microsoft® Word LTSC', 'creationdate': '2025-02-14T00:05:05+05:30', 'author': 'User', 'moddate': '2025-02-14T00:05:05+05:30', 'source': './docs/Chenula IPD.pdf', 'total_pages': 65, 'page': 0, 'page_label': '1'}, page_content='INFORMATICS INSTITUTE OF TECHNOLOGY \nIn Collaboration with \nUNIVERSITY OF WESTMINSTER \n \nHybrid Prompt Compression for RAG Pipelines \n \nA Project Proposal by \nChenula Senkith Jayasinghe \n \nSupervised by \nMr Vinula Uthsara Buthgamumudalige \n \n \n \n \n \n \n02/2025'),
 Document(metadata={'producer': 'Microsoft® Word LTSC', 'creator': 'Microsoft® Word LTSC', 'creationdate': '2025-02-14T00:05:05+05:30', 'author': 'User', 'moddate': '2025-02-14T00:05:05+05:30', 'source': './docs/Chenula IPD.pdf', 'total_pages': 65, 'page': 1, 'page_label': '2'}, page_content='Abstract  \n \nWith the huge growth of unstructured textual data in the healthcare domain, retrieval and \nsummarization of relevant

In [45]:
embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001", task_type="retrieval_document")

In [46]:
import faiss
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.vectorstores import FAISS

index = faiss.IndexFlatL2(768)

vector_store = FAISS(
    embedding_function=embeddings,
    index=index,
    docstore=InMemoryDocstore(),
    index_to_docstore_id={},
)

In [57]:
dir = './docs'
for file in os.listdir(dir):
    if file.endswith(".pdf"):
        # print("Processing file:", file, end="...\n")
        all_split_texts = []
        loader = PyPDFLoader(os.path.join(dir, file))
        print(dir + file)
        docs = loader.load()
        split_texts = text_splitter.split_documents(docs)
        all_split_texts.extend(split_texts)
        # print(all_split_texts)
        # print(split_texts)

        uuids = [str(uuid4()) for _ in range(len(all_split_texts))]
        # print(uuids)
        vector_store.add_documents(documents=all_split_texts, ids=uuids)
        # print("done")
all_split_texts[0]

./docsChenula IPD.pdf


Document(metadata={'producer': 'Microsoft® Word LTSC', 'creator': 'Microsoft® Word LTSC', 'creationdate': '2025-02-14T00:05:05+05:30', 'author': 'User', 'moddate': '2025-02-14T00:05:05+05:30', 'source': './docs\\Chenula IPD.pdf', 'total_pages': 65, 'page': 0, 'page_label': '1'}, page_content='INFORMATICS INSTITUTE OF TECHNOLOGY \nIn Collaboration with \nUNIVERSITY OF WESTMINSTER \n \nHybrid Prompt Compression for RAG Pipelines \n \nA Project Proposal by \nChenula Senkith Jayasinghe \n \nSupervised by \nMr Vinula Uthsara Buthgamumudalige \n \n \n \n \n \n \n02/2025')

In [55]:
from uuid import uuid4
uuids = [str(uuid4()) for _ in range(len(all_splits))]
uuids

['089309db-509f-4c97-a8a5-ff435698df1a',
 'a68d1def-f487-4b59-8a42-bbf648389b9c',
 '85b4d364-4dfe-40b6-a248-00f5c7dedd64',
 '0ca3f3f7-8aec-4011-ba3a-5d59db9563c6',
 'b4baefc2-d0ed-4ef6-b2ee-cd1e11a7645c',
 '3b6d6e46-f60d-4952-8aa2-a659305bdf54',
 '4a89abf6-6138-4f05-9133-978b57be248f',
 'a6f08c0a-8bb8-4f1f-b45c-855287bf5107',
 '45f64788-4cd4-44bc-98b0-cf03b05b0e04',
 'f5d3bdbb-0c7d-4728-a9eb-0263071904c1',
 'cf95a61d-3c4f-4917-a20e-cdbbf66f5cc0',
 '8ef0f3e4-c35a-4cf0-966d-467ab1ce6618',
 'ccedea6c-45f6-4239-8566-f11ca77fff55',
 '8e916611-8929-4e66-9e62-9a0da2d2bc45',
 '987636a8-5922-44dc-82d2-2a04353a42a7',
 '0497fe2b-c9a3-4395-b661-03ab0e8949d3',
 '0fda5af4-141d-43f8-b1ce-a4104f69c8bc',
 'c8d81bf5-96c2-4a08-9e1b-fb6cc63aecb3',
 '89de2833-33b8-4eb9-9f05-aaee02a3834a',
 '001649a5-610c-4673-aaf5-60827e94f0c5',
 'c1ef8e58-235e-4df2-bb4c-5da3c2ac1230',
 'c285d8cd-4cda-428e-ad26-68f13191dfec',
 '35bad6f0-bfc0-4994-9c97-5df0fdd4ff90',
 'a2fcbb20-81e4-4f0d-a465-387b18a5f3b6',
 '5b5e800d-5852-

In [48]:
len(uuids)

132

In [50]:
all_splits[0]

Document(metadata={'producer': 'Microsoft® Word LTSC', 'creator': 'Microsoft® Word LTSC', 'creationdate': '2025-02-14T00:05:05+05:30', 'author': 'User', 'moddate': '2025-02-14T00:05:05+05:30', 'source': './docs/Chenula IPD.pdf', 'total_pages': 65, 'page': 0, 'page_label': '1'}, page_content='INFORMATICS INSTITUTE OF TECHNOLOGY \nIn Collaboration with \nUNIVERSITY OF WESTMINSTER \n \nHybrid Prompt Compression for RAG Pipelines \n \nA Project Proposal by \nChenula Senkith Jayasinghe \n \nSupervised by \nMr Vinula Uthsara Buthgamumudalige \n \n \n \n \n \n \n02/2025')

In [51]:
all_split_texts[0]

Document(metadata={'producer': 'Microsoft® Word LTSC', 'creator': 'Microsoft® Word LTSC', 'creationdate': '2025-02-14T00:05:05+05:30', 'author': 'User', 'moddate': '2025-02-14T00:05:05+05:30', 'source': './docs\\Chenula IPD.pdf', 'total_pages': 65, 'page': 0, 'page_label': '1'}, page_content='INFORMATICS INSTITUTE OF TECHNOLOGY \nIn Collaboration with \nUNIVERSITY OF WESTMINSTER \n \nHybrid Prompt Compression for RAG Pipelines \n \nA Project Proposal by \nChenula Senkith Jayasinghe \n \nSupervised by \nMr Vinula Uthsara Buthgamumudalige \n \n \n \n \n \n \n02/2025')

In [53]:
all_splits[0]

Document(metadata={'producer': 'Microsoft® Word LTSC', 'creator': 'Microsoft® Word LTSC', 'creationdate': '2025-02-14T00:05:05+05:30', 'author': 'User', 'moddate': '2025-02-14T00:05:05+05:30', 'source': './docs/Chenula IPD.pdf', 'total_pages': 65, 'page': 0, 'page_label': '1'}, page_content='INFORMATICS INSTITUTE OF TECHNOLOGY \nIn Collaboration with \nUNIVERSITY OF WESTMINSTER \n \nHybrid Prompt Compression for RAG Pipelines \n \nA Project Proposal by \nChenula Senkith Jayasinghe \n \nSupervised by \nMr Vinula Uthsara Buthgamumudalige \n \n \n \n \n \n \n02/2025')

In [56]:
import numpy as np
vector_store.add_documents(documents=all_splits, ids=uuids)

# # Ensure that all_splits and uuids are not None and have the same length
# if all_splits is not None and uuids is not None and len(all_splits) == len(uuids):
#     try:
#         vector_store.add_documents(documents=all_splits, ids=uuids)
#     except AssertionError as e:
#         # Print the dimensions of the embeddings and the expected dimension
#         embeddings_np = vector_store._embed_documents([doc.page_content for doc in all_splits])
#         embeddings_np = np.array(embeddings_np)  # Convert to NumPy array
#         print(f"Embedding dimension: {embeddings_np.shape[1]}")
#         print(f"Expected dimension: {vector_store.d}")
#         raise e
# else:
#     raise ValueError("Documents and IDs must be non-None and have the same length.")

['089309db-509f-4c97-a8a5-ff435698df1a',
 'a68d1def-f487-4b59-8a42-bbf648389b9c',
 '85b4d364-4dfe-40b6-a248-00f5c7dedd64',
 '0ca3f3f7-8aec-4011-ba3a-5d59db9563c6',
 'b4baefc2-d0ed-4ef6-b2ee-cd1e11a7645c',
 '3b6d6e46-f60d-4952-8aa2-a659305bdf54',
 '4a89abf6-6138-4f05-9133-978b57be248f',
 'a6f08c0a-8bb8-4f1f-b45c-855287bf5107',
 '45f64788-4cd4-44bc-98b0-cf03b05b0e04',
 'f5d3bdbb-0c7d-4728-a9eb-0263071904c1',
 'cf95a61d-3c4f-4917-a20e-cdbbf66f5cc0',
 '8ef0f3e4-c35a-4cf0-966d-467ab1ce6618',
 'ccedea6c-45f6-4239-8566-f11ca77fff55',
 '8e916611-8929-4e66-9e62-9a0da2d2bc45',
 '987636a8-5922-44dc-82d2-2a04353a42a7',
 '0497fe2b-c9a3-4395-b661-03ab0e8949d3',
 '0fda5af4-141d-43f8-b1ce-a4104f69c8bc',
 'c8d81bf5-96c2-4a08-9e1b-fb6cc63aecb3',
 '89de2833-33b8-4eb9-9f05-aaee02a3834a',
 '001649a5-610c-4673-aaf5-60827e94f0c5',
 'c1ef8e58-235e-4df2-bb4c-5da3c2ac1230',
 'c285d8cd-4cda-428e-ad26-68f13191dfec',
 '35bad6f0-bfc0-4994-9c97-5df0fdd4ff90',
 'a2fcbb20-81e4-4f0d-a465-387b18a5f3b6',
 '5b5e800d-5852-

In [63]:
results = vector_store.similarity_search(
    "Tell me about query retreival",
    # k=8,
)
print(len(results))
results[0].page_content

4


'2. Embedding Generation \n \n \n \n \n \n3. FAISS Index Initialization \n \n \n \n \n4. Query Vectorization and Retrieval'

In [22]:
%pip install -qU langchain langchain_community

# Local vector store via Chroma
%pip install -qU langchain_chroma

# Local inference and embeddings via Ollama
%pip install -qU langchain_ollama

# Web Loader
%pip install -qU beautifulsoup4

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.1.2 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.1.2 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.1.2 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.1.2 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [23]:
from langchain_community.document_loaders import WebBaseLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

loader = WebBaseLoader("https://lilianweng.github.io/posts/2023-06-23-agent/")
data = loader.load()
data

# text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=0)
# all_splits = text_splitter.split_documents(data)

[Document(metadata={'source': 'https://lilianweng.github.io/posts/2023-06-23-agent/', 'title': "LLM Powered Autonomous Agents | Lil'Log", 'description': 'Building agents with LLM (large language model) as its core controller is a cool concept. Several proof-of-concepts demos, such as AutoGPT, GPT-Engineer and BabyAGI, serve as inspiring examples. The potentiality of LLM extends beyond generating well-written copies, stories, essays and programs; it can be framed as a powerful general problem solver.\nAgent System Overview\nIn a LLM-powered autonomous agent system, LLM functions as the agent’s brain, complemented by several key components:\n\nPlanning\n\nSubgoal and decomposition: The agent breaks down large tasks into smaller, manageable subgoals, enabling efficient handling of complex tasks.\nReflection and refinement: The agent can do self-criticism and self-reflection over past actions, learn from mistakes and refine them for future steps, thereby improving the quality of final resu

In [64]:
vector_store.save_local("vector_store")

In [68]:
new_vector_store = FAISS.load_local(
    "vector_store", embeddings, allow_dangerous_deserialization=True
)

docs = new_vector_store.similarity_search("qux")
docs

[Document(id='fca60273-b5ca-4ced-bb22-3b27d10e991c', metadata={'producer': 'Microsoft® Word LTSC', 'creator': 'Microsoft® Word LTSC', 'creationdate': '2025-02-14T00:05:05+05:30', 'author': 'User', 'moddate': '2025-02-14T00:05:05+05:30', 'source': './docs/Chenula IPD.pdf', 'total_pages': 65, 'page': 48, 'page_label': '49'}, page_content='Figure 8 User Interface'),
 Document(id='690f6e21-e312-437e-b787-24231af46af5', metadata={'producer': 'Microsoft® Word LTSC', 'creator': 'Microsoft® Word LTSC', 'creationdate': '2025-02-14T00:05:05+05:30', 'author': 'User', 'moddate': '2025-02-14T00:05:05+05:30', 'source': './docs/Chenula IPD.pdf', 'total_pages': 65, 'page': 47, 'page_label': '48'}, page_content='User Interface \n \n \n \n>> Next Page'),
 Document(id='8758dbc2-7ae7-4bd7-9da8-2fe8f2b47d14', metadata={'producer': 'Microsoft® Word LTSC', 'creator': 'Microsoft® Word LTSC', 'creationdate': '2025-02-14T00:05:05+05:30', 'author': 'User', 'moddate': '2025-02-14T00:05:05+05:30', 'source': './doc