In [9]:
import os
import json
import datetime
from utils.parse_pdf import extract_sections
from utils.embedder import load_model, get_embedding
from utils.ranker import rank_sections



# Where to look for collections: every sub-directory of BASE_DIR whose name
# starts with COLLECTION_PREFIX is treated as one collection to process.
COLLECTION_PREFIX = "collection"
BASE_DIR = "."

# Layout expected inside each collection directory: the input JSON and the
# folder of PDFs are read from it; the output JSON is written into it.
PDF_DIR_NAME = "PDFs"
INPUT_FILENAME = "challenge1b_input.json"
OUTPUT_FILENAME = "challenge1b_output.json"


# Load the embedding model once at start-up; the same instance is reused for
# the query embedding and for section ranking in every collection below.
model = load_model()


def _choose_section(sections, doc_title, query_emb, model):
    """Pick the section that represents one document in the output.

    Args:
        sections: Section dicts from ``extract_sections``; each is read here
            through its ``"text"`` and ``"page"`` keys (and optional ``"title"``).
        doc_title: Document title from ``extract_sections``; may be falsy.
        query_emb: Embedding of the job-to-be-done query.
        model: Embedding model forwarded to ``rank_sections``.

    Returns:
        A ``(section_title, page_number, text)`` tuple, or ``None`` when the
        document has no title and ranking produced no candidates.
    """
    if doc_title:
        # A detected title takes precedence: use it together with all text
        # found on page 1, without ranking.
        first_page_text = "\n".join(s["text"] for s in sections if s["page"] == 1)
        return doc_title, 1, first_page_text

    ranked = rank_sections(query_emb, sections, model)
    if not ranked:
        return None

    # NOTE(review): presumably rank_sections returns best match first — confirm.
    top_sec = ranked[0]
    # Fall back to a text snippet when the title is missing *or* empty/None,
    # so the output never carries a blank section title.
    title = top_sec.get("title") or top_sec["text"][:40]
    return title, top_sec["page"], top_sec["text"]


# os.listdir() returns entries in arbitrary order; sort for reproducible runs.
for entry in sorted(os.listdir(BASE_DIR)):
    collection_path = os.path.join(BASE_DIR, entry)
    if not os.path.isdir(collection_path) or not entry.startswith(COLLECTION_PREFIX):
        continue

    input_json_path = os.path.join(collection_path, INPUT_FILENAME)
    pdf_dir = os.path.join(collection_path, PDF_DIR_NAME)
    output_json_path = os.path.join(collection_path, OUTPUT_FILENAME)

    if not os.path.exists(input_json_path) or not os.path.isdir(pdf_dir):
        print(f"⚠️ Skipping {entry}: Missing input JSON or PDFs folder.")
        continue

    print(f"🔍 Processing: {entry}")

    # A malformed input file should only cost us this collection, not abort
    # the whole batch (same policy as the missing-folder check above).
    # json.JSONDecodeError and UnicodeDecodeError are both ValueError subclasses.
    try:
        with open(input_json_path, "r", encoding="utf-8") as f:
            data = json.load(f)
        query = data["job_to_be_done"]["task"]
        documents = data["documents"]
        input_filenames = [d["filename"] for d in documents]
    except (OSError, ValueError, KeyError, TypeError) as exc:
        print(f"⚠️ Skipping {entry}: Invalid input JSON ({exc!r}).")
        continue

    query_emb = get_embedding(model, query)

    extracted_sections = []
    subsection_analysis = []

    for fname in input_filenames:
        pdf_path = os.path.join(pdf_dir, fname)

        if not os.path.exists(pdf_path):
            print(f"🚫 Missing file: {pdf_path}")
            continue

        sections, doc_title = extract_sections(pdf_path)

        choice = _choose_section(sections, doc_title, query_emb, model)
        if choice is None:
            continue
        section_title, selected_page, selected_text = choice

        extracted_sections.append({
            "document": fname,
            "section_title": section_title,
            # Number only the documents that actually made it into the output,
            # so ranks stay contiguous (1..N) even when some were skipped.
            "importance_rank": len(extracted_sections) + 1,
            "page_number": selected_page,
        })

        subsection_analysis.append({
            "document": fname,
            "refined_text": selected_text,
            "page_number": selected_page,
        })

    final_output = {
        "metadata": {
            "input_documents": input_filenames,
            "persona": data.get("persona", "Unknown"),
            "job_to_be_done": query,
            "processing_timestamp": datetime.datetime.now().isoformat(),
        },
        "extracted_sections": extracted_sections,
        "subsection_analysis": subsection_analysis,
    }

    with open(output_json_path, "w", encoding="utf-8") as f:
        json.dump(final_output, f, indent=2, ensure_ascii=False)

    print(f"✅ Done: {output_json_path}")


📂 Current Working Directory: C:\Users\surya\Desktop\sample_project_1\challenge_1b
🔍 Processing: collection 1
✅ Done: .\collection 1\challenge1b_output.json
🔍 Processing: collection 2
✅ Done: .\collection 2\challenge1b_output.json
🔍 Processing: collection 3
✅ Done: .\collection 3\challenge1b_output.json
