In [15]:
# Install necessary libraries, including the new langchain-community package
!pip install -q langchain langchain-community huggingface_hub transformers accelerate bitsandbytes sentence-transformers faiss-cpu pypdf

print("✅ Libraries installed successfully!")

✅ Libraries installed successfully!


In [16]:
pip install pymupdf



In [17]:
import os
import time
from langchain.document_loaders import PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceBgeEmbeddings
from langchain.vectorstores import FAISS
from langchain.llms import HuggingFacePipeline
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

print("✅ Libraries imported successfully!")

✅ Libraries imported successfully!


In [18]:
!mkdir -p product_data

# List the files to ensure they are uploaded correctly
!ls product_data

# Define the path to your data
DATA_PATH = "product_data/"

'2.1 Introduction to RAGs.pdf'	'2.4 Indexation and Vector search.pdf'
'2.2 Chunking.pdf'		'2.5 Summary of RAGs.pdf'
'2.3 Retrieval Methods.pdf'	'2.6 Evaluation metrics for RAG systems.pdf'


In [25]:
# Load documents from the specified directory
documents = []
for filename in os.listdir(DATA_PATH):
    if filename.endswith('.pdf'):
        file_path = os.path.join(DATA_PATH, filename)
        try:
            loader = PyMuPDFLoader(file_path)
            loaded_docs = loader.load()
            for doc in loaded_docs:
                doc.metadata['source'] = filename
            documents.extend(loaded_docs)
            print(f"Loaded {len(loaded_docs)} pages from {filename}")
        except Exception as e:
            print(f"Error loading {filename}: {e}")

print(f"\n✅ Total documents loaded: {len(documents)}")

Loaded 1 pages from 2.2 Chunking.pdf
Loaded 2 pages from 2.6 Evaluation metrics for RAG systems.pdf
Loaded 2 pages from 2.4 Indexation and Vector search.pdf
Loaded 2 pages from 2.3 Retrieval Methods.pdf
Loaded 2 pages from 2.5 Summary of RAGs.pdf
Loaded 2 pages from 2.1 Introduction to RAGs.pdf

✅ Total documents loaded: 11


In [26]:
# --- Parameters ---
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 150
EMBEDDING_MODEL_NAME = "BAAI/bge-large-en-v1.5"
LLM_MODEL_ID = "google/flan-t5-large"
RETRIEVER_K = 3

# Initialize the text splitter with our parameters
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP
)

# Split the documents into chunks
chunked_docs = text_splitter.split_documents(documents)

print(f"Total documents chunked into {len(chunked_docs)} pieces.")

Total documents chunked into 17 pieces.


In [28]:
# Define model and encoding kwargs
model_kwargs = {"device": "cuda"}
encode_kwargs = {"normalize_embeddings": True}

# Load embedding model and capture timing
print("Loading embedding model...")
start_time = time.time()
embeddings = HuggingFaceBgeEmbeddings(
    model_name=EMBEDDING_MODEL_NAME,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)
embedding_load_time = time.time() - start_time
print(f"✅ Embedding model loaded in {embedding_load_time:.2f} seconds.")

# Create the FAISS vector store and capture timing
print("\nCreating vector store...")
start_time = time.time()
db = FAISS.from_documents(chunked_docs, embeddings)
db_creation_time = time.time() - start_time
print(f"✅ Vector store created in {db_creation_time:.2f} seconds.")

Loading embedding model...
✅ Embedding model loaded in 8.82 seconds.

Creating vector store...
✅ Vector store created in 1.17 seconds.


In [29]:
# Load the LLM and capture timing
print("Loading LLM...")
start_time = time.time()

tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL_ID)
model = AutoModelForSeq2SeqLM.from_pretrained(LLM_MODEL_ID, load_in_8bit=True, device_map="auto")

# Create a text-generation pipeline
pipe = pipeline(
    "text2text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=512,
    temperature=0.1,
    top_p=0.95,
    repetition_penalty=1.15
)

llm = HuggingFacePipeline(pipeline=pipe)

llm_load_time = time.time() - start_time
print(f"✅ LLM loaded in {llm_load_time:.2f} seconds.")

Loading LLM...


The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
Device set to use cuda:0
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


✅ LLM loaded in 10.80 seconds.


In [30]:
# Define a custom prompt template
prompt_template = """
Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

Context: {context}

Question: {question}

Helpful Answer:
"""
PROMPT = PromptTemplate(
    template=prompt_template, input_variables=["context", "question"]
)

# Create the RetrievalQA chain
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=db.as_retriever(search_kwargs={"k": RETRIEVER_K}),
    return_source_documents=True,
    chain_type_kwargs={"prompt": PROMPT}
)

print("✅ RAG chain created successfully.")

✅ RAG chain created successfully.


In [31]:
# Define our list of questions
queries = [
    "In the context of RAG systems, what does chunking mean?",
    "Why is RAG needed instead of just retraining a model with new data?",
    "What is re-ranking and how does it improve retrieval?",
    "What is FAISS and what is it used for?",
    "What is the difference between retrieval metrics and answer evaluation metrics?"
]

results = []
query_times = []

for query in queries:
    print(f"--- Running Query: {query} ---")
    start_time = time.time()
    result = qa_chain({"query": query})
    query_time = time.time() - start_time

    results.append(result)
    query_times.append(query_time)

    print(f"Answer: {result['result']}")
    print(f"Time Taken: {query_time:.2f} seconds\n")

print("✅ All queries processed.")

Token indices sequence length is longer than the specified maximum sequence length for this model (672 > 512). Running this sequence through the model will result in indexing errors
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


--- Running Query: In the context of RAG systems, what does chunking mean? ---


The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Answer: splitting the text into different segments and we store these different segments
Time Taken: 6.35 seconds

--- Running Query: Why is RAG needed instead of just retraining a model with new data? ---


The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Answer: Catastrophic forgetting of neural networks
Time Taken: 9.89 seconds

--- Running Query: What is re-ranking and how does it improve retrieval? ---


The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Answer: Re-ranking is a post retrieval technique which can be used to improve the relevancy of the top k chunks being passed to the generation model
Time Taken: 33.53 seconds

--- Running Query: What is FAISS and what is it used for? ---


The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Answer: I don’t know
Time Taken: 2.92 seconds

--- Running Query: What is the difference between retrieval metrics and answer evaluation metrics? ---
Answer: The answer generated by the RAG system should also be evaluated because sometimes we may not choose the correct answer from the retrieved context.
Time Taken: 7.46 seconds

✅ All queries processed.


In [32]:
# --- Automated Report Generation ---

# Calculate average query time
avg_query_time = sum(query_times) / len(query_times) if query_times else 0

# Start building the report string
report = f"""
RAG System Design and Experiment Report
=======================================

Section 1: Metadata about Document Chunks
-----------------------------------------
1.  **Chunking Method Used**: RecursiveCharacterTextSplitter from LangChain.
2.  **Splitting Technique Details**: This splitter uses a hierarchy of separators (e.g., "\\n\\n", "\\n", " ") to keep semantically related text together.
3.  **Chunk Size Measurement Criteria**: Number of characters.
4.  **Chunk Size Used**: {CHUNK_SIZE} characters.
5.  **Chunk Overlapping**: {CHUNK_OVERLAP} characters.
6.  **Chunk Preprocessing**: No special preprocessing was applied beyond the text extraction handled by PyMuPDFLoader.

---

Section 2: DB and Model Details
-------------------------------
1.  **Database Used**: FAISS (Facebook AI Similarity Search) in-memory vector store.
2.  **Embedding Model**: {EMBEDDING_MODEL_NAME}
3.  **LLM Used**: {LLM_MODEL_ID}
4.  **Processing Latencies**:
    * **Embedding Model Load Time**: {embedding_load_time:.2f} seconds.
    * **LLM Load Time**: {llm_load_time:.2f} seconds.
    * **Database Creation Time (Vectorization & Indexing)**: {db_creation_time:.2f} seconds.
    * **Average Query Time (End-to-End)**: {avg_query_time:.2f} seconds.
5.  **Hardware Usage**: Models were configured to run on GPU if available ('{model_kwargs['device']}').

---

Section 3: LLM / Model Evaluation Parameters
---------------------------------------------
1.  **LLM Framework**: HuggingFacePipeline in LangChain.
2.  **Tokens per Chunk (Approximate)**: {CHUNK_SIZE // 4} tokens. With k={RETRIEVER_K}, context is ~{RETRIEVER_K * (CHUNK_SIZE // 4)} tokens.
3.  **Decoding Strategy**:
    * **max_length**: 512
    * **temperature**: 0.1
    * **top_p**: 0.95
    * **repetition_penalty**: 1.15

---

Section 4: Experiment Results on the Best Parameters
--------------------------------------------------
1.  **Prompt Used**:
    ```
    {prompt_template}
    ```
2.  **Chain Type Used**: `stuff`
3.  **Best Chunk Parameters**: `chunk_size`: {CHUNK_SIZE}, `chunk_overlap`: {CHUNK_OVERLAP}.
4.  **Retriever Parameters**: FAISS retriever with `k={RETRIEVER_K}`.

5.  **Five Example Results**:
"""

# Append the query results to the report
for i, result in enumerate(results):
    report += f"""
    -----------------------------------
    **Example {i+1}**:
    * **Query**: {result['query']}
    * **Answer**: {result['result']}
    * **Source Docs**: {[doc.metadata.get('source', 'N/A') for doc in result['source_documents']]}
    * **Time Taken**: {query_times[i]:.2f} seconds
"""

# Append the final section on metrics
report += """
6.  **Retrieval and Answer Level Metrics (Manual Evaluation)**:
    * **Instructions**: For each example above, manually judge the results.
    * **Context Precision**: (Relevant Retrieved Docs) / (Total Retrieved Docs). Did the sources contain the right info?
    * **Context Recall**: (Relevant Retrieved Docs) / (Total Relevant Docs). Did we find all the possible sources?
    * **Answer Faithfulness**: Does the answer stick strictly to the provided context? (Score 1 for Yes, 0 for No).
    * **Answer Relevancy**: Is the answer relevant to the question? (Score 1 for Yes, 0 for No).
"""

# Print the final report
print(report)

# Optionally, save the report to a file
with open("rag_report.txt", "w") as f:
    f.write(report)

print("\n\n✅ Report generated and saved to rag_report.txt")


RAG System Design and Experiment Report

Section 1: Metadata about Document Chunks
-----------------------------------------
1.  **Chunking Method Used**: RecursiveCharacterTextSplitter from LangChain.
2.  **Splitting Technique Details**: This splitter uses a hierarchy of separators (e.g., "\n\n", "\n", " ") to keep semantically related text together.
3.  **Chunk Size Measurement Criteria**: Number of characters.
4.  **Chunk Size Used**: 1000 characters.
5.  **Chunk Overlapping**: 150 characters.
6.  **Chunk Preprocessing**: No special preprocessing was applied beyond the text extraction handled by PyMuPDFLoader.

---

Section 2: DB and Model Details
-------------------------------
1.  **Database Used**: FAISS (Facebook AI Similarity Search) in-memory vector store.
2.  **Embedding Model**: BAAI/bge-large-en-v1.5
3.  **LLM Used**: google/flan-t5-large
4.  **Processing Latencies**:
    * **Embedding Model Load Time**: 8.82 seconds.
    * **LLM Load Time**: 10.80 seconds.
    * **Databas