<a href="https://colab.research.google.com/github/Sindu512/-Chat-pdf-using-RAG_bot/blob/main/Chat_with_PDF_Using_RAG_Pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Upgrade pip itself before the notebook's dependencies are installed.
!pip install --upgrade pip



In [None]:
# Install the packages the import cell actually needs: pdfplumber (PDF text
# extraction), sentence-transformers (embeddings) and faiss-cpu (vector index).
# The previous list installed pypdf2, which this notebook never imports, and
# omitted pdfplumber, so `import pdfplumber` failed on a fresh runtime.
!pip install pdfplumber sentence-transformers faiss-cpu --quiet

In [None]:
import os
import faiss
from sentence_transformers import SentenceTransformer
import pdfplumber
import numpy as np


In [None]:
# Step 1: Extract text from a PDF
def extract_text_from_pdf(pdf_path):
    """Return the text of every page in a PDF, one page per newline-terminated block.

    Args:
        pdf_path: Path to the PDF file, passed straight to ``pdfplumber.open``.

    Returns:
        A single string made of each page's extracted text followed by "\n".
        Pages that yield no text contribute just the "\n" separator.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # extract_text() may return None for a page with no extractable text
        # (e.g. a scanned image); treat that as an empty page instead of
        # crashing on `None + "\n"`.
        page_texts = [(page.extract_text() or "") + "\n" for page in pdf.pages]
    # Join once rather than growing a string with += inside the loop.
    return "".join(page_texts)

In [None]:
# Step 2: Split text into chunks
def split_into_chunks(text, chunk_size=100):
    """Group the sentences of ``text`` into chunks of at most ``chunk_size`` words.

    Sentences are delimited by ". ". Sentences are packed greedily: a new chunk
    is started whenever adding the next sentence would push the current chunk
    over ``chunk_size`` words. A single sentence longer than ``chunk_size`` is
    kept whole in its own chunk rather than being cut.

    Args:
        text: The full document text.
        chunk_size: Maximum number of whitespace-separated words per chunk.

    Returns:
        A list of non-empty chunk strings. Empty or whitespace-only ``text``
        yields an empty list.
    """
    pieces = text.split(". ")
    final_position = len(pieces) - 1

    sentences = []
    for position, piece in enumerate(pieces):
        if not piece.strip():
            # Skip empty fragments so blank input never becomes a "." chunk.
            continue
        # Only pieces that were followed by the ". " delimiter lost their
        # period in the split; the final piece keeps whatever ending it had,
        # which avoids producing ".." when the text already ends with ".".
        sentences.append(piece + "." if position < final_position else piece)

    chunks = []
    current_sentences = []
    current_word_count = 0
    for sentence in sentences:
        sentence_word_count = len(sentence.split())
        # Flush only when there is something to flush; otherwise an oversized
        # first sentence would emit an empty chunk.
        if current_sentences and current_word_count + sentence_word_count > chunk_size:
            chunks.append(" ".join(current_sentences).strip())
            current_sentences = []
            current_word_count = 0
        current_sentences.append(sentence)
        # Running total instead of re-splitting the whole chunk every time.
        current_word_count += sentence_word_count

    if current_sentences:
        chunks.append(" ".join(current_sentences).strip())

    return chunks


In [None]:
# Step 3: Create embeddings for the chunks
def create_embeddings(chunks, model_name="all-MiniLM-L6-v2"):
    """Encode text chunks with a SentenceTransformer model.

    Args:
        chunks: The text chunks to embed, passed as-is to ``encode``.
        model_name: Name of the SentenceTransformer model to instantiate.

    Returns:
        A ``(embeddings, model)`` tuple: the result of ``model.encode(chunks)``
        and the model instance, so callers can embed queries with the same model.
    """
    encoder = SentenceTransformer(model_name)
    return encoder.encode(chunks), encoder


In [None]:
# Step 4: Store embeddings in FAISS

def store_embeddings(embeddings):
    """Build a FAISS flat L2 index containing the given embedding vectors.

    Args:
        embeddings: 2-D array-like of shape (n_vectors, dimension). Nested
            lists and arrays of other numeric dtypes are accepted and
            converted.

    Returns:
        A ``faiss.IndexFlatL2`` with all vectors added, in input order.

    Raises:
        ValueError: If ``embeddings`` is not 2-D (for example, an empty list).
    """
    # FAISS operates on float32, C-contiguous matrices; normalise here so the
    # caller does not have to care about dtype or memory layout.
    vectors = np.ascontiguousarray(embeddings, dtype=np.float32)
    if vectors.ndim != 2:
        raise ValueError(
            "embeddings must be a 2-D array of shape (n_vectors, dimension); "
            f"got shape {vectors.shape}"
        )

    index = faiss.IndexFlatL2(vectors.shape[1])
    index.add(vectors)
    return index

In [None]:
# Step 5: Search the most relevant chunks
def search_query(query, index, chunks, model, top_k=1):
    """Return the chunks closest to ``query`` according to the vector index.

    Args:
        query: The user's question as plain text.
        index: Index object exposing ``search(embedding, k=...)`` and returning
            ``(distances, indices)``.
        chunks: The chunk texts, in the same order they were added to ``index``.
        model: Embedding model exposing ``encode(list_of_texts)``.
        top_k: How many chunks to retrieve. Defaults to 1.

    Returns:
        A list of up to ``top_k`` chunk strings, in the order the index
        returned them. Empty when ``chunks`` is empty.

    Raises:
        ValueError: If ``top_k`` is less than 1.
    """
    if top_k < 1:
        raise ValueError("top_k must be at least 1")
    if len(chunks) == 0:
        return []

    query_embedding = model.encode([query])
    # Never ask for more neighbours than there are chunks.
    neighbour_count = min(top_k, len(chunks))
    _distances, indices = index.search(query_embedding, k=neighbour_count)

    # FAISS pads missing neighbours with -1; indexing a list with -1 would
    # silently return the last chunk, so keep only valid positions.
    return [chunks[i] for i in indices[0] if 0 <= i < len(chunks)]

In [None]:
# Step 6: Format the retrieved chunks as a response (stand-in for a real LLM call)
def generate_response(query, results):
    """Format retrieved chunks as a bulleted plain-text answer.

    Args:
        query: The user's question. Accepted for interface symmetry; it is not
            used when building the text.
        results: Iterable of retrieved chunk strings.

    Returns:
        A fixed header line followed by one "- <chunk>" bullet per result,
        each bullet terminated by a blank line.
    """
    header = "Here are the relevant results for your query: \n\n"
    bullets = [f"- {item}\n\n" for item in results]
    return header + "".join(bullets)

In [None]:
# Main Script
def main(
    pdf_path="/content/drive/MyDrive/Task1.pdf",
    query="can you give U.S. Bureau of Labor Statistics?",
):
    """Run the PDF retrieval pipeline end to end and print the answer.

    Steps: extract text from the PDF, split it into chunks, embed the chunks,
    index the embeddings, look up the chunks closest to ``query`` and print
    the formatted response.

    Args:
        pdf_path: Location of the PDF to query. The default points at a file
            in Google Drive, which must be mounted before this runs.
        query: The question to ask of the document.

    Raises:
        FileNotFoundError: If ``pdf_path`` does not point to an existing file.
        ValueError: If no text could be extracted from the PDF.
    """
    # Fail early with a clear message instead of an error from deep inside
    # the PDF library when the file is missing or Drive is not mounted.
    if not os.path.isfile(pdf_path):
        raise FileNotFoundError(
            f"PDF not found: {pdf_path}. "
            "Upload the file or mount Google Drive before running."
        )

    # 1. Extract and preprocess text
    text = extract_text_from_pdf(pdf_path)
    if not text.strip():
        # Nothing to embed or search (e.g. a scanned PDF without a text layer).
        raise ValueError(f"No extractable text found in {pdf_path}")
    chunks = split_into_chunks(text)

    # 2. Create embeddings
    embeddings, model = create_embeddings(chunks)

    # 3. Store embeddings in FAISS index (asarray avoids a needless copy)
    index = store_embeddings(np.asarray(embeddings))

    # 4. Query the system
    results = search_query(query, index, chunks, model)

    # 5. Generate and print response
    print(generate_response(query, results))

# Run the pipeline when this cell/script is executed directly; skipped when
# the code is imported as a module.
if __name__ == "__main__":
    main()

Here are the relevant results for your query: 

- Bureau of Labor Statistics
Year 2010 2011 2012 2013 2014 2015
All Industries 26093515 27535971 28663246 29601191 30895407 31397023
Manufacturing 4992521 5581942 5841608 5953299 6047477 5829554
Finance,
Insurance, Real
4522451 4618678 4797313 5031881 5339678 5597018
Estate, Rental,
Leasing
Arts,
Entertainment,
Recreation, 964032 1015238 1076249 1120496 1189646 1283813
Accommodation,
and Food Service
Other 15614511 16320113 16948076 17495515 18318606 18686638
• The chart below is called a pie chart.


