In [1]:
import os

# Switch to the parent directory so later relative paths and imports resolve.
# The starting directory is recorded only on the first run, which makes the
# cell idempotent: re-running it lands in the same place instead of climbing
# one more level each time (which a bare os.chdir("../") would do).
if "NOTEBOOK_DIR" not in globals():
    NOTEBOOK_DIR = os.getcwd()
os.chdir(os.path.join(NOTEBOOK_DIR, os.pardir))

### Model

In [2]:
from rag_pipeline.components.models import ModelConfig

In [3]:
# Build the model configuration object (constructed without arguments).
model = ModelConfig()

In [4]:
# Look up the entry configured for the "question_rewriter" agent and
# display its "name" field.
rewriter_model = model.get_agent_model(agent_name="question_rewriter")
rewriter_model.get("name")

'gpt-4o-mini'

### Prompts

In [5]:
from rag_pipeline.components.prompts import render_prompt

In [7]:
# Render the "question_rewriter" prompt for a sample question together with
# a short two-turn chat history, and display the rendered result.
sample_history = "User: What is AI?\nAgent: AI stands for Artificial Intelligence"
render_prompt(
    prompt_name="question_rewriter",
    current_question="What is RAG?",
    conversation=sample_history,
)

{'system': 'ROLE\n- You are a question rewriter for a Retrieval-Augmented Generation system.\n\nINPUTS\n- "current_question": The user\'s most recent query.\n- "conversation": A short history of recent interactions (if available).\n\nTASK\n- Rephrase the query into a clear, standalone form that captures the full intent of the user\'s question.\n\nRULES\n- Use conversation only to clarify vague references like "this file" or "the previous report".\n- Keep the question concise and factual (under 40 tokens).\n- Do not guess missing details; preserve ambiguity if uncertain.\n- Avoid multi-sentence questions.',
 'user': 'USER QUERY: What is RAG?\nCHAT HISTORY: User: What is AI?\nAgent: AI stands for Artificial Intelligence',
 'output_schema': '{\n  "rephrased_question": "string"\n}\n'}

### Graph

In [2]:
from graph import run_graph

In [3]:
# run_graph() returns the object that is invoked with .ainvoke() further down.
graph = run_graph()

In [4]:
# Input payload for the graph: only the raw user question is supplied.
user_input = "What is RAG?"
input_data = dict(question=user_input)

In [5]:
await graph.ainvoke(
    input = input_data,
    config = {
        "configurable": {
            "retriever": None
        }
    }
)

{'question': 'What is RAG?',
 'rephrased_question': 'What does RAG stand for and what does it mean?'}

### Data Extract

In [2]:
from rag_pipeline.utils.extract_doc import extract_from_pdf, extract_from_text_files

In [3]:
# Extract content from the source documents: texts, tables and per-file
# metadata from the PDFs, then chunks and per-file metadata from the text files.
pdf_texts, pdf_tables, pdf_metadata = extract_from_pdf()
txt_texts, txt_metadata = extract_from_text_files()

# Backward-compatible alias: this variable used to be misspelled "pfd_tables";
# keep the old name bound so any cell still referring to it keeps working.
pfd_tables = pdf_tables

📄 Reading Assignment.pdf ...
📄 Reading Networking Activity.pdf ...
✅ Extracted 4 pages and 0 tables from PDFs.
📄 RAG.txt: 2 chunks
✅ Extracted 2 text chunks from 'data'.


In [4]:
# Inspect the texts extracted from the PDFs.
pdf_texts

[{'content': 'LLM Specialist Assignment\nOverview\nCreate a Retrieval-Augmented Generation (RAG) pipeline that allows users to upload documents and\nask questions based on their content. The system should leverage vector databases for efficient retrieval\nand an LLM API (e.g., OpenAI, Gemini, or another REST-based model) for generating responses. The entire\napplication should be containerized using Docker and deployable on cloud or local environments.\nRequirements:\n1. Document Ingestion & Processing:\no Support uploading up to 20 documents, each with a maximum of 1000 pages.\no Chunk documents into manageable sizes for efficient retrieval.\no Use text embeddings to store document chunks in a vector database (e.g., FAISS, Pinecone,\nWeaviate, or ChromaDB).\n2. Retrieval-Augmented Generation (RAG) Pipeline:\no Accept user queries and retrieve relevant document chunks.\no Pass the retrieved chunks to the LLM API for contextual response generation.\no Ensure responses are accurate, conc

In [6]:
# Inspect the per-file metadata collected for the PDFs.
pdf_metadata

[{'file_name': 'Assignment.pdf',
  'file_path': 'C:\\Users\\rahul\\Desktop\\Coding Assignment\\Retrieval-Augmented-Generation\\data\\Assignment.pdf',
  'type': 'pdf',
  'size_kb': 84.35,
  'total_pages': 2,
  'total_tables': 0},
 {'file_name': 'Networking Activity.pdf',
  'file_path': 'C:\\Users\\rahul\\Desktop\\Coding Assignment\\Retrieval-Augmented-Generation\\data\\Networking Activity.pdf',
  'type': 'pdf',
  'size_kb': 215.3,
  'total_pages': 2,
  'total_tables': 0}]

In [7]:
# Inspect the per-file metadata collected for the text files.
txt_metadata

[{'file_name': 'RAG.txt',
  'file_path': 'C:\\Users\\rahul\\Desktop\\Coding Assignment\\Retrieval-Augmented-Generation\\data\\RAG.txt',
  'type': 'text',
  'size_kb': 0.66,
  'total_chunks': 2}]

In [6]:
# Append the text-file chunks to the PDF texts.
# NOTE(review): this mutates pdf_texts in place, so the cell is not idempotent;
# re-running it without re-running the extraction cell appends txt_texts again
# and produces duplicate entries.
pdf_texts.extend(txt_texts)

In [7]:
# Inspect the list again now that the text-file chunks have been appended to it.
pdf_texts

[{'content': 'LLM Specialist Assignment\nOverview\nCreate a Retrieval-Augmented Generation (RAG) pipeline that allows users to upload documents and\nask questions based on their content. The system should leverage vector databases for efficient retrieval\nand an LLM API (e.g., OpenAI, Gemini, or another REST-based model) for generating responses. The entire\napplication should be containerized using Docker and deployable on cloud or local environments.\nRequirements:\n1. Document Ingestion & Processing:\no Support uploading up to 20 documents, each with a maximum of 1000 pages.\no Chunk documents into manageable sizes for efficient retrieval.\no Use text embeddings to store document chunks in a vector database (e.g., FAISS, Pinecone,\nWeaviate, or ChromaDB).\n2. Retrieval-Augmented Generation (RAG) Pipeline:\no Accept user queries and retrieve relevant document chunks.\no Pass the retrieved chunks to the LLM API for contextual response generation.\no Ensure responses are accurate, conc

### Data Extract Pipeline

In [2]:
from rag_pipeline.pipeline.data_extract import extract_data_pipeline

In [3]:
# Run the extraction pipeline in a single call; it returns the texts, the
# tables and the metadata as a 3-tuple.
texts, tables, metadata = extract_data_pipeline()

📄 Reading Assignment.pdf ...
📄 Reading Networking Activity.pdf ...
✅ Extracted 4 pages and 0 tables from PDFs.
📄 RAG.txt: 2 chunks
✅ Extracted 2 text chunks from 'data'.
✅ Combined 6 text chunks from 3 files.


### Retriever

In [4]:
from rag_pipeline.components.retriever import create_retriever

In [5]:
# Build the retriever from the extracted texts and tables using the given
# embedding model name and save location. The call returns two values:
# the store queried below and a path.
retriever_args = {
    "texts": texts,
    "tables": tables,
    "model_name": "text-embedding-3-small",
    "save_path": "./models/faiss_index",
}
db, path = create_retriever(**retriever_args)

📚 Preparing 6 documents for indexing...
✅ Retriever (FAISS index) created and saved at: ./models/faiss_index


In [9]:
# Sanity-check retrieval: request the 4 most similar documents for a sample question.
sample_query = "What is RAG?"
db.similarity_search(sample_query, k=4)

[Document(id='aab8287d-356c-4fc7-9828-b7105f40ad94', metadata={'source': 'RAG.txt', 'type': 'text', 'chunk_id': 1}, page_content='Retrieval-Augmented Generation (RAG) combines retrieval and generation to improve the factual accuracy of language models. It works by fetching relevant information from external sources such as documents or databases, then using that information to generate more accurate and context-aware responses.'),
 Document(id='f4b985ec-21e4-4591-9891-247d37690cb4', metadata={'source': 'RAG.txt', 'type': 'text', 'chunk_id': 2}, page_content='A typical RAG system includes three main components:\n1. A retriever that searches and fetches relevant chunks.\n2. A generator (usually an LLM) that uses the retrieved data to answer questions.\n3. A vector database like FAISS that stores embeddings for quick semantic search. This approach reduces hallucinations and enhances the reliability of LLM-based systems.'),
 Document(id='b452ef9f-954b-4efc-8779-fd47b6d124ed', metadata={'so