In [10]:
## Data Ingestion
# Read the vLLM paper from disk. The loader returns a list of Document
# objects, which is echoed as this cell's output.
from langchain_community.document_loaders import PyPDFLoader

pdf_path = "vllm.pdf"
loader = PyPDFLoader(pdf_path)
docs = loader.load()
docs



[Document(page_content='Efficient Memory Management for Large Language\nModel Serving with PagedAttention\nWoosuk Kwon1,∗ Zhuohan Li1,∗ Siyuan Zhuang1 Ying Sheng1,2 Lianmin Zheng1 Cody Hao Yu3\nJoseph E. Gonzalez1 Hao Zhang4 Ion Stoica1\n1UC Berkeley 2Stanford University 3Independent Researcher 4UC San Diego\nAbstract\nHigh throughput serving of large language models (LLMs)\nrequires batching sufficiently many requests at a time. How-\never, existing systems struggle because the key-value cache\n(KV cache) memory for each request is huge and grows\nand shrinks dynamically. When managed inefficiently, this\nmemory can be significantly wasted by fragmentation and\nredundant duplication, limiting the batch size. To address\nthis problem, we propose PagedAttention, an attention al-\ngorithm inspired by the classical virtual memory and pag-\ning techniques in operating systems. On top of it, we build\nvLLM, an LLM serving system that achieves (1) near-zero\nwaste in KV cache memory and (2) 

In [11]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Chunking parameters handed to the splitter below.
splitter_options = {"chunk_size": 500, "chunk_overlap": 50}

# Break the loaded PDF documents into smaller, overlapping chunks.
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)
documents = text_splitter.split_documents(docs)


In [12]:
# Fetch the sentence-embedding model from Hugging Face and keep a copy on disk
from sentence_transformers import SentenceTransformer

# Hub identifier of the model, and the directory the copy is written to
hf_model_name = "sentence-transformers/all-MiniLM-L6-v2"
local_path = "./local_models/all-MiniLM-L6-v2"

# Instantiate the model by its hub name, then persist it under local_path
model = SentenceTransformer(hf_model_name)
model.save(local_path)

print(f"Model saved at: {local_path}")

Model saved at: ./local_models/all-MiniLM-L6-v2


In [13]:
# Generate embeddings and create a FAISS index
# Each chunk of the PDF is converted into a vector with a sentence-embedding
# model, and the vectors are stored in a FAISS vector store for fast
# similarity search. The embedding model is loaded from the local directory
# it was saved to earlier in the notebook.

# Import from langchain_community (as the PDF loader already does) rather than
# the deprecated `langchain.embeddings` / `langchain.vectorstores` shims.
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

# Step 1: Load the local model for embeddings
embedding_model = HuggingFaceEmbeddings(model_name="./local_models/all-MiniLM-L6-v2")

# Step 2: Create the FAISS vector store from the document chunks
vectorstore = FAISS.from_documents(documents, embedding_model)

# Step 3: Save the FAISS index to disk
vectorstore.save_local("quantization_faiss_index")

print("FAISS index created and saved.")




FAISS index created and saved.


In [14]:
# Connect LangChain to the running Ollama Mistral model
# Ollama is imported from langchain_community (consistent with the PDF loader)
# instead of the deprecated `langchain.llms` shim.
from langchain_community.llms import Ollama
from langchain.chains import RetrievalQA

# Connect to the locally running Mistral model
llm = Ollama(model="mistral")  # Default port is http://localhost:11434


In [15]:
# Build the retrieval-augmented QA chain on top of the FAISS index.
# The chain is asked to return the source documents alongside its answer.
retriever = vectorstore.as_retriever()
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    return_source_documents=True,
)

In [23]:
# Ask your question
query = "What is main benefit of PageAttention in vLLM?"

# Use .invoke() rather than calling the chain object directly: the direct-call
# form is deprecated, and the assistant cell later in the notebook already
# uses .invoke().
result = qa_chain.invoke(query)

# The generated answer is stored under the "result" key of the returned dict.
print("Answer:", result["result"])

Answer:  The main benefit of PagedAttention in vLLM is that it allows for more efficient management of memory usage, enabling the processing of a larger number of requests at the same time compared to systems like Orca (Max) and FasterTransformer. This is achieved by storing continuous keys and values in non-contiguous memory space, which allows for batching more requests than these other systems.


In [28]:
print("👋 Welcome to the vLLM Q&A Assistant!")
print("💡 Ask one question related to vLLM, PagedAttention, or quantization.\n")

# Your predefined question
query = "What is main benefit of PageAttention in vLLM?"
print(f"❓ Question: {query}")

# Retrieve relevant docs.
# Stored under a dedicated name so the `docs` list produced by the PDF loader
# is not overwritten; re-running the splitter cell afterwards would otherwise
# split these two hits instead of the full paper.
retrieved_docs = vectorstore.similarity_search(query, k=2)

# Fallback if nothing usable is found: no hits at all, or every hit has fewer
# than 20 characters of non-whitespace-trimmed text.
# NOTE(review): this checks chunk length only, not similarity to the query —
# confirm whether a score threshold is wanted here.
if not retrieved_docs or all(len(doc.page_content.strip()) < 20 for doc in retrieved_docs):
    print("🤖 Hi! I'm here to help with questions about vLLM and related topics.")
else:
    # Run the RAG pipeline
    result = qa_chain.invoke(query)

    # Keep only the first line of the answer (everything before the first
    # newline), which may contain more than one sentence.
    answer = result["result"].strip().split('\n')[0]
    print(f"\n📘 Answer: {answer}")


👋 Welcome to the vLLM Q&A Assistant!
💡 Ask one question related to vLLM, PagedAttention, or quantization.

❓ Question: What is main benefit of PageAttention in vLLM?

📘 Answer: The main benefit of PagedAttention in vLLM is its ability to manage memory usage efficiently, allowing it to process more requests at a time compared to other models such as Orca and FasterTransformer. This is made possible by storing continuous keys and values in non-contiguous memory space, which enables batching more requests, resulting in higher request rates.
