In [3]:

# 1. Install Dependencies

!pip install PyPDF2 docx2txt tika


# 2. Import Libraries

import os
import json
import docx2txt
import PyPDF2
from tika import parser

# 3. PDF Extraction (PyPDF2)

def extract_pdf_pypdf2(file_path):
    """Extract the text of every page of a PDF using PyPDF2.

    Args:
        file_path: Path to the PDF file; it is opened in binary mode.

    Returns:
        The text of all pages in document order, each page followed by a
        newline. A page that yields no text still contributes its newline,
        so the result for a PDF without extractable text is whitespace
        rather than an empty string.
    """
    with open(file_path, "rb") as f:
        reader = PyPDF2.PdfReader(f)
        # `or ""` keeps a page that yields no text from breaking the
        # concatenation; join builds the result in a single pass instead of
        # re-allocating the string for every page.
        return "".join(
            (page.extract_text() or "") + "\n" for page in reader.pages
        )

# 4. DOCX Extraction (docx2txt)

def extract_docx(file_path):
    """Return the text that docx2txt extracts from the DOCX at *file_path*."""
    docx_text = docx2txt.process(file_path)
    return docx_text

# 5. Multi-format Extraction (Apache Tika)

def extract_tika(file_path):
    """Parse a file with Apache Tika and return its text and metadata.

    Args:
        file_path: Path of the file passed to ``parser.from_file``.

    Returns:
        A ``(text, metadata)`` tuple. ``text`` is always a string and
        ``metadata`` is always a dict, even when the parse result holds
        ``None`` (or nothing) for either entry.
    """
    parsed = parser.from_file(file_path)
    # dict.get(key, default) only applies the default when the key is absent;
    # a key that is present with the value None (e.g. when no text could be
    # extracted) would otherwise leak None to callers that expect a str/dict.
    text = parsed.get("content") or ""
    metadata = parsed.get("metadata") or {}
    return text, metadata


# 6. Unified Proposal Extraction Function

def _extract_fields(raw_text):
    """Pick the title, abstract and budget lines out of *raw_text* by keyword."""
    fields = {"title": None, "abstract": None, "budget": None}
    for line in raw_text.split("\n"):
        stripped = line.strip()
        if not stripped:
            # Blank lines can never match a keyword and must not become a title.
            continue
        lowered = line.lower()
        # Title: the first line that mentions "title" or has fewer than 10 words.
        looks_like_title = "title" in lowered or len(line.split()) < 10
        if not fields["title"] and looks_like_title:
            fields["title"] = stripped
        # Abstract and budget: every match overwrites, so the last one wins.
        if "abstract" in lowered:
            fields["abstract"] = stripped
        if "budget" in lowered or "$" in line or "INR" in line:
            fields["budget"] = stripped
    return fields


def extract_proposal(file_path):
    """Extract raw text, metadata and a few heuristic fields from a proposal.

    The format-specific extractor (PyPDF2 for PDF, docx2txt for DOCX) is tried
    first; Tika always supplies the metadata and also supplies the text when
    the primary extractor produced nothing but whitespace.

    Args:
        file_path: Path to a ``.pdf`` or ``.docx`` file (extension is matched
            case-insensitively).

    Returns:
        A dict with the keys ``title``, ``abstract``, ``budget`` (each a
        stripped line of text or ``None``), ``raw_text`` (str) and
        ``metadata`` (dict).

    Raises:
        ValueError: If the file extension is neither PDF nor DOCX.
    """
    ext = os.path.splitext(file_path)[1].lower()
    if ext == ".pdf":
        primary_extractor = extract_pdf_pypdf2
    elif ext == ".docx":
        primary_extractor = extract_docx
    else:
        raise ValueError("Unsupported file format: use PDF or DOCX")

    text = primary_extractor(file_path)
    tika_text, metadata = extract_tika(file_path)

    # The PDF extractor emits one newline per page even when no text was
    # found, so test for real content rather than plain truthiness; also make
    # sure raw_text ends up a string when Tika has no content either.
    if not (text and text.strip()):
        text = tika_text or ""

    data = {
        "title": None,
        "abstract": None,
        "budget": None,
        "raw_text": text,
        "metadata": metadata or {},
    }
    # Basic keyword heuristics for MVP
    data.update(_extract_fields(text))
    return data


# 7. Example Usage (using uploaded file)

# Location of the proposal to process; replace with your uploaded file path in Colab.
file_path = "/content/Business Proposals.pdf"

result = extract_proposal(file_path)

# Serialise once and reuse the same text for both the file and the console.
result_json = json.dumps(result, indent=4)

# Save to JSON
with open("proposal_output.json", "w") as out_file:
    out_file.write(result_json)

# Print extracted JSON
print(result_json)






{
    "title": "Business Proposals, Spring 2022.   1 of 6  Business Proposals A business proposal is a document you send to potential customers to persuade them to do business with you. Business proposals are a common and effective way to win business. Research your potential customer before writing a business proposal; customize your proposal to address their needs. Your tone should appeal to your potential customer while aligning with your brand\u2019s personality. In general, you should strive to be clear and courteous. Your proposal should be long enough to convey why your potential customer should do business with you, but it should not be unnecessarily long or contain irrelevant details.   Parts of a Business Proposal Though every business proposal should be customized to appeal to a unique potential customer, most business proposals will include the following sections.  \u2022 Title Page \u2022 Table of Contents \u2022 Executive Summary  \u2022 Problem Statement  \u2022 Proposed