In [7]:
import os
import json
import datetime
from utils.parse_pdf import extract_sections
from utils.embedder import load_model, get_embedding
from utils.ranker import rank_sections


# Paths (all relative to the notebook's working directory)
input_json_path = "./challenge1b_input.json"
pdf_dir = "./PDFs"
output_json_path = "./challenge1b_output.json"

# Load the model once; it is reused for the query embedding and for ranking.
model = load_model()

# Load input JSON (provides the task, the document list and the persona).
with open(input_json_path, "r", encoding="utf-8") as f:
    data = json.load(f)

query = data["job_to_be_done"]["task"]
documents = data["documents"]
query_emb = get_embedding(model, query)

# Result containers: one entry per successfully processed document.
extracted_sections = []
subsection_analysis = []
# Every requested filename is reported in the metadata, including skipped ones.
input_filenames = [d["filename"] for d in documents]

for doc in documents:
    fname = doc["filename"]
    pdf_path = os.path.join(pdf_dir, fname)

    if not os.path.exists(pdf_path):
        print(f"Missing file: {pdf_path}")
        continue

    sections, doc_title = extract_sections(pdf_path)

    if doc_title:
        # NOTE(review): when a document title is found, relevance ranking is
        # skipped and page 1 is always selected — confirm this is intended.
        section_title = doc_title
        selected_page = 1
        selected_text = "\n".join(s["text"] for s in sections if s["page"] == 1)
    else:
        ranked = rank_sections(query_emb, sections, model)
        if not ranked:
            continue
        top_sec = ranked[0]
        # Fall back to the first 40 characters of the text when the title is
        # missing *or* empty, so section_title is never blank by accident.
        section_title = top_sec.get("title") or top_sec["text"][:40]
        selected_page = top_sec["page"]
        selected_text = top_sec["text"]

    # Rank by the number of sections actually emitted so that skipped
    # documents (missing file / nothing ranked) do not leave gaps such as
    # 1, 3, 4 in the output.
    extracted_sections.append({
        "document": fname,
        "section_title": section_title,
        "importance_rank": len(extracted_sections) + 1,
        "page_number": selected_page
    })

    subsection_analysis.append({
        "document": fname,
        "refined_text": selected_text,
        "page_number": selected_page
    })

# Compose final output
final_output = {
    "metadata": {
        "input_documents": input_filenames,
        "persona": data.get("persona", "Unknown"),
        "job_to_be_done": query,
        # Naive local time (no timezone offset in the ISO string).
        "processing_timestamp": datetime.datetime.now().isoformat()
    },
    "extracted_sections": extracted_sections,
    "subsection_analysis": subsection_analysis
}

# Write output JSON as UTF-8, keeping non-ASCII characters unescaped.
with open(output_json_path, "w", encoding="utf-8") as f:
    json.dump(final_output, f, indent=2, ensure_ascii=False)

print(f"✓ Done — Output written to: {output_json_path}")


✓ Done — Output written to: ./challenge1b_output.json


In [6]:
import json

path = "./challenge1b_output.json"  # change path as needed

# The output file is written as UTF-8 with ensure_ascii=False, so it must be
# read back with the same encoding; the platform default (e.g. cp1252 on
# Windows) can fail or mis-decode non-ASCII characters.
with open(path, "r", encoding="utf-8") as f:
    data = json.load(f)


# Pretty-print the result; ensure_ascii=False shows non-ASCII text as stored
# in the file instead of as \uXXXX escapes.
print(json.dumps(data, indent=2, ensure_ascii=False))


{
  "metadata": {
    "input_documents": [
      "Learn Acrobat - Create and Convert_1.pdf",
      "Learn Acrobat - Create and Convert_2.pdf",
      "Learn Acrobat - Edit_1.pdf",
      "Learn Acrobat - Edit_2.pdf",
      "Learn Acrobat - Export_1.pdf",
      "Learn Acrobat - Export_2.pdf",
      "Learn Acrobat - Fill and Sign.pdf",
      "Learn Acrobat - Generative AI_1.pdf",
      "Learn Acrobat - Generative AI_2.pdf",
      "Learn Acrobat - Request e-signatures_1.pdf",
      "Learn Acrobat - Request e-signatures_2.pdf",
      "Learn Acrobat - Share_1.pdf",
      "Learn Acrobat - Share_2.pdf",
      "Test Your Acrobat Exporting Skills.pdf",
      "The Ultimate PDF Sharing Checklist.pdf"
    ],
    "persona": {
      "role": "HR professional"
    },
    "job_to_be_done": "Create and manage fillable forms for onboarding and compliance.",
    "processing_timestamp": "2025-07-27T18:31:32.449624"
  },
  "extracted_sections": [
    {
      "document": "Learn Acrobat - Create and Convert_1.p