## *Importing The Libraries*

In [1]:
import os      # Import OS module for file handling
import fitz    # PyMuPDF: A library for extracting text from PDFs
from langchain.text_splitter import RecursiveCharacterTextSplitter
import random
import chromadb
import ollama  # Import Ollama for chat model interaction
from sentence_transformers import SentenceTransformer

## PymyPDF

### *Text Extraction Function*

In [2]:

def extract_text_from_pdfs(project_dataset):
    """Extracts text from all PDFs in a given folder."""
    all_text_chunks = []  # List to store extracted text chunks

    for pdf_file in os.listdir(project_dataset):  # Loop through all files in the folder
        if pdf_file.endswith(".pdf"):  # Process only PDF files
            pdf_path = os.path.join(project_dataset, pdf_file)  # Get full file path
            doc = fitz.open(pdf_path)  # Open the PDF file
            
            for page_num, page in enumerate(doc):  # Loop through each page
                text = page.get_text("text")  # Extract text from the page
                if text.strip():  # Ignore empty pages
                    all_text_chunks.append({  # Store extracted text with metadata
                        "content": text,  
                        "metadata": {"source": pdf_file, "page": page_num + 1}  
                    })
    
    return all_text_chunks  # Return extracted text and metadata

### *Calling The Function*

In [3]:
# Call the function and store the result
project_dataset = r"C:\Users\ronit\project_dataset"  # Replace with the actual folder path
all_text_chunks = extract_text_from_pdfs(project_dataset)  # Call the function

MuPDF error: syntax error: cannot find ExtGState resource 'GS0'



In [4]:

def extract_text_from_pdf(file_path):
    doc = fitz.open(file_path)
    extracted = []

    for i, page in enumerate(doc):
        try:
            text = page.get_text()
            extracted.append((i + 1, text))
        except Exception as e:
            print(f"Error on page {i + 1}: {e}")
            continue

    return extracted

    print(extracted)


### *Checking the Funtion*

In [18]:
# all_text_chunks is defined, and we can loop through it
target_pdf = "auction_two_wheeler.pdf"  # Change this to the specific file we want to check

for chunk in all_text_chunks:
    if chunk["metadata"]["page"] == 1  and chunk["metadata"]["source"] == target_pdf:
        print(chunk["content"])
        print(chunk["metadata"])

GOVERNMENT OF INDIA 
MINISTRY OF RAILWAYS 
(RAILWAY BOARD) 
The General Managers(Comml), 
All Indian Railways.  
Sub:- Auctioning of two-wheelers. 
         Chief Claims Officer, Southern Railway has brought to the notice of Ministry of Railways that at the 
time of booking of two-wheeler consignments no documentary evidence/proof is insisted upon in the 
forwarding note to establish genuineness of the vehicle. As and when unclaimed/undelivered two-wheelers 
put to public auction these either fetch very low bid or no bid as the intending bidders are not ready to buy 
them as it is difficult to get fresh registration for want of documents.  
         Ministry of Railways have examined the matter and it has been decided that following procedure 
should be adopted by the railways for booking of two-wheelers:-  
1. The booking clerk should ensure that the consignor fills his complete home address, telephone 
number etc. in the Forwarding Note.  
2. Party should be asked to furnish a copy o

## RecursiveCharacterTextSplitter

### *Chunking*

In [6]:
def process_pdf(pdf_path):
    """Extracts text from a PDF and splits it into smaller chunks with metadata."""
    pdf_texts = extract_text_from_pdfs(pdf_path)
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    
    all_chunks = []
    for text_entry in pdf_texts:
        chunks = text_splitter.split_text(text_entry["content"])
        for chunk in chunks:
            all_chunks.append({"content": chunk, "metadata": text_entry["metadata"]})

    return all_chunks

### *Checking The Function*

In [7]:
processed_chunks = process_pdf(project_dataset)
print(random.choice(processed_chunks))

MuPDF error: syntax error: cannot find ExtGState resource 'GS0'

{'content': 'General services and also S.&T. Department, since\nall the S. & T. and Electrical lines are cabled on\naccount of Electrical Induction.\nIn all A. C. and D. C. traction areas, cable\nmarkers showing location of cables are provided by\nthe Traction Department. In addition, the cables are\nprotected by tiles and bricks, and during excavation\nif workmen come across such tiles or bricks in an\narranged manner, they should at once report the\nmatter to the higher officials. Any further excavation', 'metadata': {'source': 'IRPWMUPTOACS148.pdf', 'page': 137}}



## Embeddings

### *Sentence Transformer for Embeddings*

In [8]:
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
print("✅ Model loaded successfully.")


✅ Model loaded successfully.


### *Testing The Model*

In [9]:
# Test the model with a sample sentence
sample = "Indian Railways is one of the largest rail networks in the world."
embedding = embedding_model.encode(sample)
print("✅ Sample embedding generated. Length:", len(embedding))

✅ Sample embedding generated. Length: 384


### *Database Deletion(if only a data base already exists)(only when running the whole code)*

In [10]:
import shutil

if os.path.exists("./chroma_db"):
    shutil.rmtree("./chroma_db")
    print("🗑️ Old ChromaDB removed.")


🗑️ Old ChromaDB removed.


### *Creating Vector Database Using ChromaDB*

In [None]:
def store_embeddings(chunks):
    """Encodes text chunks and stores them in ChromaDB with metadata."""
    chroma_client = chromadb.PersistentClient(path="./chroma_db")
    collection = chroma_client.get_or_create_collection(name="railway_docs")

    # Log 1: Extract contents using explicit loop
    contents = []
    for c in chunks:
        contents.append(c["content"])

    # ✅ Encode contents directly (no instruction needed)
    embeddings = embedding_model.encode(contents)

    # Log 2: Extract ids using explicit loop
    ids = []
    for i in range(len(chunks)):
        ids.append(str(i))

    # Log 3: Extract metadatas using explicit loop
    metadatas = []
    for c in chunks:
        metadatas.append(c["metadata"])

    # Log 4: Convert embeddings to lists using explicit loop
    converted_embeddings = []
    for e in embeddings:
        converted_embeddings.append(e.tolist())

    # Add all at once to ChromaDB
    collection.add(
        ids=ids,
        documents=contents,
        embeddings=converted_embeddings,
        metadatas=metadatas
    )

    return collection

collection = store_embeddings(processed_chunks)

ValueError: Batch size 10317 exceeds maximum batch size 5461

In [11]:
def store_embeddings(chunks):
    """Encodes text chunks and stores them in ChromaDB with metadata in batches."""
    chroma_client = chromadb.PersistentClient(path="./chroma_db")
    collection = chroma_client.get_or_create_collection(name="railway_docs")

    contents = []
    for c in chunks:
        contents.append(c["content"])

    embeddings = embedding_model.encode(contents)

    ids = []
    for i in range(len(chunks)):
        ids.append(str(i))

    metadatas = []
    for c in chunks:
        metadatas.append(c["metadata"])

    converted_embeddings = []
    for e in embeddings:
        converted_embeddings.append(e.tolist())

    # ✅ Split into safe batches
    BATCH_SIZE = 5000
    total = len(ids)
    
    for i in range(0, total, BATCH_SIZE):
        print(f"Adding batch {i} to {min(i + BATCH_SIZE, total)}...")
        collection.add(
            ids=ids[i:i + BATCH_SIZE],
            documents=contents[i:i + BATCH_SIZE],
            embeddings=converted_embeddings[i:i + BATCH_SIZE],
            metadatas=metadatas[i:i + BATCH_SIZE]
        )

    return collection


In [12]:
collection = store_embeddings(processed_chunks)

Adding batch 0 to 5000...
Adding batch 5000 to 10000...
Adding batch 10000 to 15000...
Adding batch 15000 to 15128...


## Retriever

### *Retrieve with Confidence Filter*

In [13]:
def retrieve_chunks(query, collection, threshold=0.6):
    """Retrieves top chunks with confidence check."""
    query_emb = embedding_model.encode(query, convert_to_tensor=True)
    results = collection.query(query_embeddings=[query_emb.tolist()], n_results=5, include=["distances", "metadatas", "documents"])

    top_distances = results.get("distances", [[]])[0]
    top_docs = results.get("documents", [[]])[0]
    top_meta = results.get("metadatas", [[]])[0]

    retrieved = []
    for doc, meta, dist in zip(top_docs, top_meta, top_distances):
        similarity = 1 - dist  # cosine distance to similarity
        if similarity >= threshold:
            retrieved.append({"content": doc, "metadata": meta})

    return retrieved

### *Answering With Ollama*

In [19]:
def answer_query(query, collection):
    """Generates response using retrieved context and Ollama."""
    LLM = "llama3.1"
    chunks = retrieve_chunks(query, collection)

    if not chunks:
        return "❌ No relevant info found for your query.", []

    context = "\n\n".join([c["content"] for c in chunks])
    metadata = [c["metadata"] for c in chunks]
    
    prompt = f"Use the following context to answer:\n\n{context}\n\nQuestion: {query}\nAnswer:"

    response = ollama.chat(model=LLM, messages=[{"role": "user", "content": prompt}])
    return response["message"]["content"], metadata

## Querying

### *Query and Retrieve*

In [20]:
# === Step 3: Query and Retrieve ===
query = "Auctioning of two-wheelers?"
retrieved_chunks = retrieve_chunks(query, collection, threshold=0.3)


### *Previewing Retrieved Chunks*

In [21]:
# === Step 4:  ===
print("\n🔍 Retrieved Chunks:")
for i, chunk in enumerate(retrieved_chunks):
    print(f"\n[{i+1}]")
    print("📄 Content Preview:", chunk["content"][:300].strip().replace("\n", " "))
    print("📎 Metadata:", chunk["metadata"])


🔍 Retrieved Chunks:

[1]
📄 Content Preview: put to public auction these either fetch very low bid or no bid as the intending bidders are not ready to buy  them as it is difficult to get fresh registration for want of documents.            Ministry of Railways have examined the matter and it has been decided that following procedure  should be
📎 Metadata: {'page': 1, 'source': 'auction_two_wheeler.pdf'}


## Generation

In [22]:
# === Step 5: Build Prompt ===
def build_prompt(context, question):
    return f"""
You are a helpful assistant. Use the following extracted context to answer the question. 
If the answer is not found in the context, say "Answer not found in the provided documents." 

Context:
{context}

Question: {question}
Answer:"""

context = "\n\n".join([chunk["content"] for chunk in retrieved_chunks])
prompt = build_prompt(context, query)

# === Step 6: Get LLM Response ===
response = ollama.chat(model="llama3.1", messages=[{"role": "user", "content": prompt}])
print("\n🧠 Answer:\n", response["message"]["content"])

# === Step 7: Show Metadata Summary ===
if retrieved_chunks:
    metadata_info = [
        f"📎 Source: {chunk['metadata'].get('source', 'Unknown')}, Page: {chunk['metadata'].get('page', 'Unknown')}"
        for chunk in retrieved_chunks
    ]
    metadata_str = "\n".join(metadata_info)
    print("\n✅ Relevant Sources:\n" + metadata_str)
else:
    print("\n❌ No relevant info found for your query.")


🧠 Answer:
 According to the context, it is mentioned that "put to public auction these either fetch very low bid or no bid as the intending bidders are not ready to buy them..." which implies that the intention is to put the two-wheelers up for public auction.

✅ Relevant Sources:
📎 Source: auction_two_wheeler.pdf, Page: 1
