# In [46]:
from bs4 import BeautifulSoup
import json
from data import html_code_list  # Import the array from data.py

# Zero-width code points that str.strip() does not treat as whitespace:
#   U+FEFF  byte-order mark / zero-width no-break space
#   U+200B  zero-width space
#   U+2060  word joiner
# U+200C/U+200D (ZWNJ/ZWJ) are deliberately NOT listed: they change how
# Indic and other complex scripts render, so dropping them would corrupt text.
_INVISIBLE_CHARS = str.maketrans("", "", "\ufeff\u200b\u2060")


def clean_text(text):
    """Return *text* without invisible filler characters or outer whitespace.

    Args:
        text: The string to normalise.

    Returns:
        ``text`` with every BOM (U+FEFF), zero-width space (U+200B) and word
        joiner (U+2060) removed wherever it occurs, then stripped of leading
        and trailing whitespace.
    """
    # Delete first, strip second: whitespace that sat behind an invisible
    # character at either end then gets trimmed as well.
    return text.translate(_INVISIBLE_CHARS).strip()

def _first_text(soup, tag_name, css_class):
    """Return the cleaned text of the first matching tag, or "" if none matches."""
    tag = soup.find(tag_name, class_=css_class)
    return clean_text(tag.get_text()) if tag is not None else ""


def _section_texts(section, tag_names, unique=False):
    """Collect cleaned, non-empty texts of *tag_names* found inside *section*.

    Args:
        section: A parsed element, or None when the section was not found.
        tag_names: Tag name (or list of names) handed to ``find_all``.
        unique: When true, a text that was already collected is skipped.

    Returns:
        A list of strings in document order; empty when *section* is None.
    """
    if section is None:
        return []

    texts = []
    seen = set()  # O(1) membership test instead of rescanning the list
    for tag in section.find_all(tag_names):
        text = clean_text(tag.get_text())
        if not text:
            continue
        if unique:
            if text in seen:
                continue
            seen.add(text)
        texts.append(text)
    return texts


def _application_process(soup):
    """Map each application tab label to the list of its step texts."""
    section = soup.find("div", id="application-process")
    if section is None:
        return {}

    process = {}
    for tab in section.find_all("div", class_="rounded"):
        # The label is taken from the closest preceding
        # <div class="overflow-x-auto"> in document order.
        label_tag = tab.find_previous("div", class_="overflow-x-auto")
        label = clean_text(label_tag.get_text()) if label_tag is not None else "Unknown"

        steps = []
        markdown = tab.find("div", class_="markdown-options")
        if markdown is not None:
            for block in markdown.find_all("div", class_="mb-2"):
                text = clean_text(block.get_text(separator=" "))
                if text:
                    steps.append(text)

        # Tabs that resolve to the same label overwrite earlier entries.
        if label and steps:
            process[label] = steps
    return process


def _faqs(soup):
    """Map each FAQ question to its answer text."""
    section = soup.find("div", id="faqs")
    if section is None:
        return {}

    faqs = {}
    for item in section.find_all("div", class_="py-4"):
        question_tag = item.find("p", class_="font-bold")
        question = clean_text(question_tag.get_text()) if question_tag is not None else ""

        answer_tag = item.find("div", class_="rounded-b")
        answer = clean_text(answer_tag.get_text(separator=" ")) if answer_tag is not None else ""

        # Entries missing either half are dropped; a repeated question
        # replaces the earlier answer.
        if question and answer:
            faqs[question] = answer
    return faqs


def _sources(soup):
    """Return ``{"text", "url"}`` dicts for every link in the sources section."""
    section = soup.find("div", id="sources")
    if section is None:
        return []

    sources = []
    for link in section.find_all("a", href=True):
        # Prefer the text of a nested <p>; otherwise use the whole link text.
        text_tag = link.find("p")
        caption_tag = text_tag if text_tag is not None else link
        link_text = clean_text(caption_tag.get_text())
        url = link['href']
        if link_text and url:
            sources.append({"text": link_text, "url": url})
    return sources


def extract_info(html):
    """Extract the fields of one scheme page into a dictionary.

    Args:
        html: Markup of a single page, passed to ``BeautifulSoup`` with the
            ``'html.parser'`` backend.

    Returns:
        A dict with these keys, in this order:

        * ``'place'`` / ``'Scheme Name'`` - text of the first matching
          ``<h3>`` / ``<h1>``, or ``""``.
        * ``'Details'``, ``'Benefits'``, ``'Eligibility'`` - de-duplicated
          texts of the ``<li>`` and ``<span>`` tags inside the section.
        * ``'Application Process'`` - dict of tab label to list of step texts.
        * ``'Documents Required'`` - texts of the ``<li>`` tags.
        * ``'Exclusions'`` - texts of the ``<blockquote>`` tags.
        * ``'FAQs'`` - dict of question to answer.
        * ``'Sources and References'`` - list of ``{"text", "url"}`` dicts.

        A section that is absent from the page yields an empty string, list
        or dict for its key; no exception is raised for a missing section.
    """
    soup = BeautifulSoup(html, 'html.parser')
    data = {}

    # NOTE: these selectors are the page's full class strings, so they depend
    # on the exact markup of the source site.
    data['place'] = _first_text(
        soup, "h3",
        "text-raven dark:text-indigo-100 text-base mt-2 md:mt-0 hover:underline cursor-pointer",
    )
    data['Scheme Name'] = _first_text(
        soup, "h1",
        "font-bold text-xl sm:text-2xl text-[#24262B] dark:text-white mt-1",
    )

    # find_all descends into nested tags, so an <li> and a <span> inside it
    # are both visited; only texts that are exactly equal are collapsed.
    for key, section_id in (
        ('Details', "details"),
        ('Benefits', "benefits"),
        ('Eligibility', "eligibility"),
    ):
        section = soup.find("div", id=section_id)
        data[key] = _section_texts(section, ['li', 'span'], unique=True)

    data['Application Process'] = _application_process(soup)

    # Unlike the sections above, repeated entries are kept here.
    data['Documents Required'] = _section_texts(
        soup.find("div", id="documents-required"), "li"
    )
    data['Exclusions'] = _section_texts(
        soup.find("div", id="exclusions"), "blockquote"
    )

    data['FAQs'] = _faqs(soup)
    data['Sources and References'] = _sources(soup)

    return data


# ==== Script body: parse every page, then persist the results ====

# One result dict per HTML document, in the same order as html_code_list.
all_data = list(map(extract_info, html_code_list))

# Serialise first, then write in a single call. ensure_ascii=False keeps
# non-ASCII text readable in the file instead of \uXXXX escapes.
serialized = json.dumps(all_data, indent=2, ensure_ascii=False)
with open("extracted_schemes.json", "w", encoding='utf-8') as f:
    f.write(serialized)

print("✅ Extraction complete. Data saved to extracted_schemes.json.")



# Output: ✅ Extraction complete. Data saved to extracted_schemes.json.
