<a href="https://colab.research.google.com/github/RitamPatra76/RAG/blob/main/pdf_rag.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [9]:
!pip install langchain pypdf pdfplumber transformers faiss-cpu pydantic python-dotenv




In [10]:
!pip install -U langchain-community



In [13]:
import json
import pdfplumber
from langchain.document_loaders import PyPDFLoader, UnstructuredHTMLLoader
from transformers import pipeline
import os
from dotenv import load_dotenv

load_dotenv()

def extract_text_from_pdf(pdf_path):
    with pdfplumber.open(pdf_path) as pdf:
        text = ""
        for page in pdf.pages:
            text += page.extract_text()
    return text

def extract_text_from_html(html_path):
    loader = UnstructuredHTMLLoader(html_path)
    data = loader.load()
    return " ".join([d.page_content for d in data])

qa_pipeline = pipeline("question-answering")

def extract_structured_data_from_document(document_text):
    queries = {
        "Bid Number": "What is the bid number?",
        "Title": "What is the full title of the RFP document?",
        "Due Date": "When is the bid due?",
        "Bid Submission Type": "How should the bid be submitted?",
        "Term of Bid": "What are the terms of the bid?",
        "Pre Bid Meeting": "Is there a pre-bid meeting?",
        "Installation": "What is the installation requirement?",
        "Bid Bond Requirement": "What is the bid bond requirement?",
        "Delivery Date": "When should the products be delivered?",
        "Payment Terms": "What are the payment terms?",
        "Any Additional Documentation Required": "Is any additional documentation required?",
        "MFG for Registration": "What manufacturer or registration is required?",
        "Contract or Cooperative to use": "What contract or cooperative should be used?",
        "Model_no": "What is the model number for the products?",
        "Part_no": "What is the part number for the products?",
        "Product": "What products are included in the RFP?",
        "contact_info": "What is the contact information?",
        "company_name": "What is the name of the company?",
        "Bid Summary": "Can you provide a summary of the bid?",
        "Product Specification": "What are the product specifications?",
    }

    structured_data = {}
    for field, query in queries.items():
        answer = qa_pipeline(question=query, context=document_text)
        structured_data[field] = answer['answer']

    return structured_data

def process_rfp_file(file_path, file_type, output_file="output.json"):
    if file_type.lower() == "pdf":
        document_text = extract_text_from_pdf(file_path)
    elif file_type.lower() == "html":
        document_text = extract_text_from_html(file_path)
    else:
        raise ValueError("Unsupported file type. Only 'pdf' and 'html' are supported.")

    structured_data = extract_structured_data_from_document(document_text)

    json_output = json.dumps(structured_data, indent=4)

    with open(output_file, "w") as file:
        file.write(json_output)

    return json_output

file_path = "/content/Addendum 1 RFP JA-207652 Student and Staff Computing Devices.pdf"
file_type = "pdf"
output_file_path = "structured_data_output.json"

output_json = process_rfp_file(file_path, file_type, output_file=output_file_path)
print("Final JSON Output saved to:", output_file_path)


No model was supplied, defaulted to distilbert/distilbert-base-cased-distilled-squad and revision 564e9b5 (https://huggingface.co/distilbert/distilbert-base-cased-distilled-squad).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use cpu


Final JSON Output saved to: structured_data_output.json
