# This notebook is for the fiscal generation. 

## The following code is the pipeline for pulling data from the bill overview page


In [1]:
import os
import google.generativeai as genai
os.environ["GOOGLE_API_KEY"] = "AIzaSyAdmHIhoIPCg9gdWCfjTBMVetVy4xgAGLw"


genai.configure(api_key=os.environ["GOOGLE_API_KEY"])





  from .autonotebook import tqdm as notebook_tqdm


This code will pull data from the bill overview page and parse it

In [4]:
import os
import json
import time
import fitz  # PyMuPDF
import shutil
import tempfile
from urllib.parse import urljoin, urlparse, parse_qs
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
import undetected_chromedriver as uc

from bs4 import BeautifulSoup

def table_html_to_numbered_list(html):
    soup = BeautifulSoup(html, "html.parser")
    table = soup.find("table", id="MainContent_GridViewStatus")
    if not table:
        return []

    numbered_list = []
    rows = table.find_all("tr")[1:]  # skip header row

    for i, row in enumerate(rows, start=1):
        cells = row.find_all("td")
        if len(cells) >= 3:
            date = cells[0].get_text(strip=True)
            chamber = cells[1].get_text(strip=True)
            status = cells[2].get_text(strip=True)
            numbered_list.append(f"{i}. {date} | {chamber} | {status}")

    return numbered_list

from bs4 import BeautifulSoup
from urllib.parse import urljoin

def extract_measure_links(html, base_url):
    soup = BeautifulSoup(html, "html.parser")
    links = []

    # 1️⃣ Links inside div.noprint
    noprint_div = soup.find("div", class_="noprint")
    if noprint_div:
        for a in noprint_div.find_all("a", href=True):
            links.append(urljoin(base_url, a["href"]))

    # 2️⃣ Links inside subsequent divs with class starting with 'measure-status card shadow'
    measure_divs = soup.find_all("div", class_="measure-status card shadow")
    # Include possible variation with "text-center"
    measure_divs += soup.find_all("div", class_="measure-status card shadow text-center")

    # Find the "Committee Reports" card
    committee_reports_div = soup.find("h2", string="Committee Reports").find_parent("div", class_="measure-status")

    # Extract all <a> links that are NOT the PDF ones (only the text links)
    links = committee_reports_div.select("a[id^='MainContent_RepeaterCommRpt_CategoryLink']")

    names = [link.get_text(strip=True) for link in links]


    for div in measure_divs:
        for a in div.find_all("a", href=True):
            links.append(urljoin(base_url, a["href"]))

    # remove duplicates while preserving order
    seen = set()
    unique_links = []
    for link in links:
        if link not in seen:
            unique_links.append(link)
            seen.add(link)

    return unique_links, names

def extract_measure_documents_with_links(html, base_url):
    """
    Extract documents along with their URLs.
    Returns a list of dicts: [{"name": "HB400_HD1", "url": "..."}]
    """
    soup = BeautifulSoup(html, "html.parser")
    documents = []

    # 1️⃣ Documents inside div.noprint
    noprint_div = soup.find("div", class_="noprint")
    if noprint_div:
        for a in noprint_div.find_all("a", href=True):
            name = a.get_text(strip=True)
            url = urljoin(base_url, a["href"])
            if name:
                documents.append({"name": name, "url": url})

    # 2️⃣ Documents inside divs with class starting with 'measure-status card shadow'
    measure_divs = soup.find_all("div", class_="measure-status card shadow")
    # measure_divs += soup.find_all("div", class_="measure-status card shadow text-center")
    for div in measure_divs:
        for a in div.find_all("a", href=True):
            name = a.get_text(strip=True)
            url = urljoin(base_url, a["href"])
            if name:
                # Avoid duplicates
                if not any(d["name"] == name for d in documents):
                    documents.append({"name": name, "url": url})

    return documents


# Setup output filename
# measure_url = "https://www.capitol.hawaii.gov/session/measure_indiv.aspx?billtype=HB&billnumber=727&year=2025"
measure_url = "https://www.capitol.hawaii.gov/session/measure_indiv.aspx?billtype=HB&billnumber=400&year=2025"
parsed = urlparse(measure_url)
params = parse_qs(parsed.query)
billtype = params.get("billtype", ["UNKNOWN"])[0]
billnumber = params.get("billnumber", ["UNKNOWN"])[0]
year = params.get("year", ["UNKNOWN"])[0]
output_filename = f"{billtype}_{billnumber}_{year}.json"

# Setup download directory
download_dir = tempfile.mkdtemp()

options = uc.ChromeOptions()
options.add_experimental_option("prefs", {
    "download.default_directory": download_dir,
    "download.prompt_for_download": False,
    "plugins.always_open_pdf_externally": True
})
driver = uc.Chrome(options=options)

def clean_html_text(html):
    soup = BeautifulSoup(html, "html.parser")
    for tag in soup(["script", "style", "noscript"]):
        tag.decompose()
    return soup.get_text(separator="\n", strip=True)

def extract_pdf_text_from_file(file_path):
    try:
        doc = fitz.open(file_path)
        text = "\n".join(page.get_text() for page in doc)
        doc.close()
        return text
    except Exception as e:
        return f"[ERROR extracting PDF text: {e}]"

try:
    driver.get(measure_url)
    time.sleep(0.1)

    # Collect all links inside main-content
    main = driver.find_element(By.ID, "main-content")
    a_tags = main.find_elements(By.XPATH, ".//a[@href]")
    base_url = measure_url
    raw_links = [urljoin(base_url, a.get_attribute("href")) for a in a_tags]
    filtered = [u for u in raw_links if u.lower().endswith((".htm", ".pdf"))]

    # Prefer .htm if both .htm and .pdf exist for same base
    unique_docs = {}
    # for link in filtered:
    #     path = urlparse(link).path
    #     base = os.path.splitext(os.path.basename(path))[0]
    #     key = os.path.dirname(path) + "/" + base
    #     ext = os.path.splitext(path)[1].lower()
    #     if ext == ".htm":
    #         unique_docs[key] = link
    #     elif ext == ".pdf" and key not in unique_docs:
    #         unique_docs[key] = link

    results = []
    for doc_url in unique_docs.values():
        print(f"Processing: {doc_url}")
        if doc_url.lower().endswith(".htm"):
            driver.get(doc_url)
            time.sleep(3)
            html = driver.page_source
            text = clean_html_text(html)
            results.append({"url": doc_url, "text": text})
        elif doc_url.lower().endswith(".pdf"):
            # Remove old files first
            for f in os.listdir(download_dir):
                os.remove(os.path.join(download_dir, f))
            # Click the PDF link
            driver.get(measure_url)  # Reload base page to stay consistent
            time.sleep(3)
            link_el = driver.find_element(By.XPATH, f'//a[@href="{urlparse(doc_url).path}"]')
            link_el.click()
            time.sleep(5)  # Wait for download

            # Find the downloaded PDF file
            downloaded_pdf = next((os.path.join(download_dir, f)
                                   for f in os.listdir(download_dir)
                                   if f.lower().endswith(".pdf")), None)
            if downloaded_pdf:
                text = extract_pdf_text_from_file(downloaded_pdf)
                results.append({"url": doc_url, "text": text})
            else:
                results.append({"url": doc_url, "text": "[ERROR: PDF not downloaded]"})
    # Process starting page first
    driver.get(measure_url)
    time.sleep(0.5)
    html = driver.page_source
    text = table_html_to_numbered_list(html)
    links, names = extract_measure_links(html, measure_url)
    documents = extract_measure_documents_with_links(html, measure_url)

    results.append({"url": measure_url, "text": text, "links": links, "documents": documents, "comittee_reports": names})
    for item in results:
        cleaned_links = []
        for link in item["links"]:
            if hasattr(link, "get"):  # It's a BeautifulSoup Tag
                cleaned_links.append({
                    "name": link.get_text(strip=True),
                    "url": link["href"]
                })
            else:
                # Already a plain string (like a full URL), keep as-is
                cleaned_links.append(link)
        item["links"] = cleaned_links
    print(results)
    # Save results
    with open(output_filename, "w", encoding="utf-8") as f:
        json.dump(results, f, ensure_ascii=False, indent=2)

    print(f"✅ Saved to {output_filename}")

finally:
    driver.quit()
    shutil.rmtree(download_dir)


[{'url': 'https://www.capitol.hawaii.gov/session/measure_indiv.aspx?billtype=HB&billnumber=400&year=2025', 'text': ['1. 6/26/2025 | H | Act 227, on 06/26/2025 (Gov. Msg. No. 1329).', '2. 6/26/2025 | S | Act 227, 06/26/2025 (Gov. Msg. No. 1329).', '3. 5/1/2025 | H | Transmitted to Governor.', '4. 5/2/2025 | S | Received notice of passage on Final Reading in House (Hse. Com. No. 821).', '5. 5/1/2025 | H | Received notice of Final Reading (Sen. Com. No. 888).', '6. 4/30/2025 | H | Passed Final Reading as amended in CD 1 with none voting aye with reservations; none voting no (0) and Representative(s) Cochran, Pierick excused (2).', '7. 4/30/2025 | S | Passed Final Reading, as amended (CD 1). Ayes, 25; Aye(s) with reservations: none . 0 No(es): none.  0 Excused: none.', '8. 4/25/2025 | S | 48 Hrs. Notice (as amended CD 1) 04-30-25', '9. 4/25/2025 | S | Reported from Conference Committee as amended CD 1 (Conf. Com. Rep. No. 157).', '10. 4/25/2025 | H | Forty-eight (48) hours notice Wednesday

In [5]:
%config PlainTextFormatter.max_width = 300 # Adjust the number as desired


Specifically, there is a timeline in the following property that we will give to the LLM for producing the links that occur in order


In [6]:
print(results[0]['text'])
for row in results[0]['text']:
    print(row)
    print("-"*100)

['1. 6/26/2025 | H | Act 227, on 06/26/2025 (Gov. Msg. No. 1329).', '2. 6/26/2025 | S | Act 227, 06/26/2025 (Gov. Msg. No. 1329).', '3. 5/1/2025 | H | Transmitted to Governor.', '4. 5/2/2025 | S | Received notice of passage on Final Reading in House (Hse. Com. No. 821).', '5. 5/1/2025 | H | Received notice of Final Reading (Sen. Com. No. 888).', '6. 4/30/2025 | H | Passed Final Reading as amended in CD 1 with none voting aye with reservations; none voting no (0) and Representative(s) Cochran, Pierick excused (2).', '7. 4/30/2025 | S | Passed Final Reading, as amended (CD 1). Ayes, 25; Aye(s) with reservations: none . 0 No(es): none.  0 Excused: none.', '8. 4/25/2025 | S | 48 Hrs. Notice (as amended CD 1) 04-30-25', '9. 4/25/2025 | S | Reported from Conference Committee as amended CD 1 (Conf. Com. Rep. No. 157).', '10. 4/25/2025 | H | Forty-eight (48) hours notice Wednesday, 04-30-25.', '11. 4/25/2025 | H | Reported from Conference Committee (Conf Com. Rep. No. 157) as amended in (CD 1)

Here are the documents that were retrieved from the overview

In [7]:
for row in results[0]['documents']:
    print(row)

{'name': 'HB400_SD2', 'url': 'https://www.capitol.hawaii.gov/sessions/session2025/bills/HB400_SD2_.HTM'}
{'name': 'HB400_CD1', 'url': 'https://www.capitol.hawaii.gov/sessions/session2025/bills/HB400_CD1_.HTM'}
{'name': 'HB400_HD1', 'url': 'https://www.capitol.hawaii.gov/sessions/session2025/bills/HB400_HD1_.HTM'}
{'name': 'HB400_SD1', 'url': 'https://www.capitol.hawaii.gov/sessions/session2025/bills/HB400_SD1_.HTM'}
{'name': 'HB400', 'url': 'https://www.capitol.hawaii.gov/sessions/session2025/bills/HB400_.HTM'}
{'name': 'HB400_HSCR286_', 'url': 'https://www.capitol.hawaii.gov/sessions/session2025/CommReports/HB400_HSCR286_.htm'}
{'name': 'HB400_HD1_HSCR1171_', 'url': 'https://www.capitol.hawaii.gov/sessions/session2025/CommReports/HB400_HD1_HSCR1171_.htm'}
{'name': 'HB400_SD1_SSCR1253_', 'url': 'https://www.capitol.hawaii.gov/sessions/session2025/CommReports/HB400_SD1_SSCR1253_.htm'}
{'name': 'HB400_SD2_SSCR1841_', 'url': 'https://www.capitol.hawaii.gov/sessions/session2025/CommReports

Asking the LLM, or Gemini to rewrite the documents

In [8]:
import json
import re
from pydantic import BaseModel
from google import genai
class Document(BaseModel):
    date: str
    text: str
    documents: list[str]

# Updated Gemini query function
def query_gemini(prompt: str):
    client = genai.Client(api_key=os.environ["GOOGLE_API_KEY"])
    response = client.models.generate_content(
        model="gemini-2.5-pro",
        contents=prompt,
        config={"response_mime_type": "application/json", "response_schema": list[Document]}
    )
    return response.text, response.parsed

def generate_order_prompt_json(status_rows, document_names):
    """
    Generates a prompt asking Gemini 2.5 Pro to return a chronological JSON timeline of document creation.
    Each event should be an object with:
      - date: string (single date or date range)
      - text: string (description of the event)
      - documents: array of document names (can be empty)
    Include all documents. Place testimony documents under the relevant hearing event.
    """
    status_text = "\n".join(status_rows)
    documents_text = "\n".join(document_names)
    
    prompt = (
        "You are given legislative status updates of a bill and a list of document names.\n\n"
        "Some documents are testimonies related to hearings. Place testimony documents under the event where the hearing occurred.\n\n"
        "Status updates (chronological):\n"
        f"{status_text}\n\n"
        "Document names:\n"
        f"{documents_text}\n\n"
        "Return a chronological timeline as a JSON array of objects. Each object must have:\n"
        "{\n"
        '  "date": "event date or date range",\n'
        '  "text": "description of the event",\n'
        '  "documents": ["doc1", "doc2"]  # list of documents associated with this event, can be empty\n'
        "}\n\n"
        "Use all documents from the list. Do NOT include explanations, reasoning, or extra text. Just return valid JSON."
    )
    
    return prompt



documents = [document['name'] for document in results[0]['documents']]

# Generate the prompt
query = generate_order_prompt_json(results[0]['text'], documents)
print(query)

# Send prompt to Gemini
text, parsed = query_gemini(query)
print("******************\n\n")
print(text)
print("******************\n\n")
print(parsed)

timeline = parsed

doc_map = {doc['name']: doc for doc in results[0]['documents']}

documents_chronological = []
seen = set()
for event in timeline:
    for doc_name in event.documents:
        if doc_name in doc_map and doc_name not in seen:
            documents_chronological.append(doc_map[doc_name])
            seen.add(doc_name)

print(documents_chronological)



You are given legislative status updates of a bill and a list of document names.

Some documents are testimonies related to hearings. Place testimony documents under the event where the hearing occurred.

Status updates (chronological):
1. 6/26/2025 | H | Act 227, on 06/26/2025 (Gov. Msg. No. 1329).
2. 6/26/2025 | S | Act 227, 06/26/2025 (Gov. Msg. No. 1329).
3. 5/1/2025 | H | Transmitted to Governor.
4. 5/2/2025 | S | Received notice of passage on Final Reading in House (Hse. Com. No. 821).
5. 5/1/2025 | H | Received notice of Final Reading (Sen. Com. No. 888).
6. 4/30/2025 | H | Passed Final Reading as amended in CD 1 with none voting aye with reservations; none voting no (0) and Representative(s) Cochran, Pierick excused (2).
7. 4/30/2025 | S | Passed Final Reading, as amended (CD 1). Ayes, 25; Aye(s) with reservations: none . 0 No(es): none.  0 Excused: none.
8. 4/25/2025 | S | 48 Hrs. Notice (as amended CD 1) 04-30-25
9. 4/25/2025 | S | Reported from Conference Committee as amende

The code above also produced the documents that were generated in chronological order

In [9]:
for document in documents_chronological:
    print(document)


{'name': 'HB400', 'url': 'https://www.capitol.hawaii.gov/sessions/session2025/bills/HB400_.HTM'}
{'name': 'HB400_TESTIMONY_JHA_01-30-25_', 'url': 'https://www.capitol.hawaii.gov/sessions/session2025/Testimony/HB400_TESTIMONY_JHA_01-30-25_.PDF'}
{'name': 'HB400_HSCR286_', 'url': 'https://www.capitol.hawaii.gov/sessions/session2025/CommReports/HB400_HSCR286_.htm'}
{'name': 'HB400_TESTIMONY_FIN_03-05-25_', 'url': 'https://www.capitol.hawaii.gov/sessions/session2025/Testimony/HB400_TESTIMONY_FIN_03-05-25_.PDF'}
{'name': 'HB400_HD1_HSCR1171_', 'url': 'https://www.capitol.hawaii.gov/sessions/session2025/CommReports/HB400_HD1_HSCR1171_.htm'}
{'name': 'HB400_HD1', 'url': 'https://www.capitol.hawaii.gov/sessions/session2025/bills/HB400_HD1_.HTM'}
{'name': 'HB400_HD1_TESTIMONY_JDC_03-19-25_', 'url': 'https://www.capitol.hawaii.gov/sessions/session2025/Testimony/HB400_HD1_TESTIMONY_JDC_03-19-25_.PDF'}
{'name': 'HB400_SD1_SSCR1253_', 'url': 'https://www.capitol.hawaii.gov/sessions/session2025/Comm

Now that we have the documents in order, we can retrieve the actual content from every link


In [21]:
import os
import time
import fitz  # PyMuPDF
import shutil
import tempfile
from urllib.parse import urlparse
from bs4 import BeautifulSoup
import undetected_chromedriver as uc
from selenium.webdriver.common.by import By

def parse_web_document_selenium(url, output_dir="parsed_docs"):
    """
    Fetch a single HTML or PDF page using Selenium (undetected) to bypass Cloudflare,
    extract text, and save it as a .txt file.
    """
    os.makedirs(output_dir, exist_ok=True)
    if url.lower().endswith((".htm", ".html")):
        #skip
        return f"Skipping HTML file: {url}"
    
    # Derive a safe filename from URL
    path = urlparse(url).path
    filename_base = os.path.basename(path) or "document"
    txt_filename = os.path.join(output_dir, f"{filename_base}.txt")

    # Setup temp download directory for PDFs
    download_dir = tempfile.mkdtemp()

    options = uc.ChromeOptions()
    options.add_experimental_option("prefs", {
        "download.default_directory": download_dir,
        "download.prompt_for_download": False,
        "plugins.always_open_pdf_externally": True
    })

    driver = uc.Chrome(options=options)

    def clean_html_text(html):
        soup = BeautifulSoup(html, "html.parser")
        for tag in soup(["script", "style", "noscript"]):
            tag.decompose()
        return soup.get_text(separator="\n", strip=True)

    def extract_pdf_text(file_path):
        try:
            doc = fitz.open(file_path)
            text = "\n".join(page.get_text() for page in doc)
            doc.close()
            return text
        except Exception as e:
            return f"[ERROR extracting PDF text: {e}]"

    try:
        driver.get(url)
        time.sleep(0.5)  # wait for Cloudflare / JS

        # if url.lower().endswith((".htm", ".html")):
        #     html = driver.page_source
        #     text = clean_html_text(html)

        if url.lower().endswith(".pdf"):
            # Navigate directly to the PDF URL
            driver.get(url)
            time.sleep(5)  # give time for PDF to download

            # Find the downloaded PDF
            downloaded_pdf = next((os.path.join(download_dir, f)
                                for f in os.listdir(download_dir)
                                if f.lower().endswith(".pdf")), None)
            if downloaded_pdf:
                text = extract_pdf_text(downloaded_pdf)
            else:
                return f"❌ PDF not downloaded: {url}"


        else:
            return f"Unsupported file type: {url}"

        # Save text
        with open(txt_filename, "w", encoding="utf-8") as f:
            f.write(text)

        return f"✅ Saved text: {txt_filename}"

    except Exception as e:
        return f"❌ Failed to parse {url}: {e}"

    finally:
        driver.quit()
        shutil.rmtree(download_dir)


for doc in documents_chronological:
    parse_web_document_selenium(doc['url'])


then the following code will produce a fiscal note for each document, but focusing on the "changes" between each generated document

In [53]:
import json
import os
from pydantic import BaseModel, create_model
from tenacity import retry, stop_after_attempt, wait_exponential
import time

class FiscalNoteModel(BaseModel):
    overview: str
    appropriations:str
    assumptions_and_methodology:str
    agency_impact:str
    economic_impact:str 
    policy_impact: str
    revenue_sources: str 
    six_year_fiscal_implications: str
    operating_revenue_impact: str
    capital_expenditure_impact: str
    fiscal_implications_after_6_years: str
    updates_from_previous_fiscal_note: str


# Example LLM query function (replace with your actual model)
def query_gemini(prompt: str):
    client = genai.Client(api_key=os.environ["GOOGLE_API_KEY"])
    response = client.models.generate_content(
        model="gemini-2.5-pro",
        contents=prompt,
        config={"response_mime_type": "application/json", "response_schema": FiscalNoteModel}
    )
    return response.text, response.parsed

# Property prompts (from your input)
PROPERTY_PROMPTS = { 
    "overview": { "prompt": "Using the provided legislative documents, statutes, and testimonies, write a clear summary describing the purpose, scope, and key components of the proposed measure or bill, including any pilot or permanent programs, reporting requirements, and sunset clauses. This should be around 3 sentences.", "description": "General overview and summary of the measure" }, 
    "appropriations": { "prompt": "Based on budgetary data and legislative appropriations, detail the funding allocated for the program or measure, including fiscal years, amounts, intended uses such as staffing, training, contracts, technology, etc... This should be around 3 sentences.", "description": "Funding allocation and appropriations details" }, 
    "assumptions_and_methodology": { "prompt": "Explain the assumptions, cost estimation methods, and data sources used to calculate the financial projections for this program or measure, referencing comparable programs or historical budgets where applicable. This should be around 3 sentences.", "description": "Cost estimation methodology and assumptions" }, 
    "agency_impact": { "prompt": "Describe the anticipated operational, administrative, and budgetary impact of the program or measure on the relevant government agency or department, including supervision, staffing, and resource allocation. This should be around 3 sentences.", "description": "Impact on government agencies and departments" }, 
    "economic_impact": { "prompt": "Summarize the expected economic effects of the program or measure, such as cost savings, potential reductions in related expenditures, benefits to the community, and any relevant performance or participation statistics. This should be around 3 sentences.", "description": "Economic effects and community benefits" }, 
    "policy_impact": { "prompt": "Analyze the policy implications of the measure, including how it modifies existing laws or programs, its role within broader legislative strategies, and its potential effects on state or local governance. This should be around 3 sentences.", "description": "Policy implications and legislative analysis" }, 
    "revenue_sources": { "prompt": "Identify and describe the funding sources that will support the program or measure, such as general funds, grants, fees, or other revenue streams, based on the provided fiscal documents. This should be around 3 sentences.", "description": "Funding sources and revenue streams" }, 
    "six_year_fiscal_implications": { "prompt": "Provide a multi-year fiscal outlook (e.g., six years) for the program or measure, projecting costs, staffing changes, recurring expenses, and assumptions about program expansion or permanence using available budget and workload data. This should be around 10 sentences.", "description": "Six-year fiscal projections and outlook" }, 
    "operating_revenue_impact": { "prompt": "Describe any anticipated impacts on operating revenues resulting from the program or measure, including increases, decreases, or changes in revenue streams. This should be around 3 sentences.", "description": "Operating revenue impacts" }, 
    "capital_expenditure_impact": { "prompt": "Outline any expected capital expenditures related to the program or measure, such as investments in facilities, equipment, or technology infrastructure, based on capital budgets or agency plans. This should be around 3 sentences.", "description": "Capital expenditure requirements" }, 
    "fiscal_implications_after_6_years": { "prompt": "Summarize the ongoing fiscal obligations after the initial multi-year period for the program or measure, including annual operating costs, expected number of program sites or units, and the sustainability of funding. This should be around 3 sentences.", "description": "Long-term fiscal obligations beyond six years" },
    "updates_from_previous_fiscal_note" : {"prompt": "If you are given a previous fisacl not. Please summarize the MAIN POINTS that are different from the previous fiscal note and the new fisacl note."}
    }

def generate_fiscal_note_for_context(context_text, previous_note=None):
    """
    Generate a full fiscal note (all properties at once) using PROPERTY_PROMPTS.
    If previous_note is provided, instruct the LLM to avoid repeating information.
    """
    # Build a combined instruction
    combined_prompt = "You are tasked with generating a fiscal note based on the context that you are given on a set of documents.\n"
    combined_prompt += "Extract the following information:\n\n"
    for key, prop in PROPERTY_PROMPTS.items():
        combined_prompt += f"- {key}: {prop['prompt']}\n"

    combined_prompt += f"\nContext:\n{context_text}\n"

    if previous_note:
        combined_prompt += (
            f"""
            You are generating a **new fiscal note** based on updated documents. 
Compare it to the previous fiscal note (shown below). Only include information that is **new or has changed**. 
If a section has no changes, leave it **blank**. 
Do **not repeat content** from the previous fiscal note.
Previous fiscal note:
            """
            f"{json.dumps(previous_note, ensure_ascii=False, indent=2)}"
            f"\nAccording to the previous fiscal note, focus on what has been discussed and the main points that have changed. Do not repeat the same content. The previous fiscal note should be very different in the new fiscal note. If no new information is needed, leave the section blank\n"
            
        )
    
    text, parsed = query_gemini(combined_prompt)

    # Convert to dict
    fiscal_note = {}
    if parsed:
        try:
            fiscal_note = parsed.dict()  # Pydantic v1
        except AttributeError:
            fiscal_note = parsed.model_dump()  # Pydantic v2

    return fiscal_note, combined_prompt

def generate_fiscal_notes_chronologically(documents, comittee_reports, output_dir):
    """
    Generate fiscal notes sequentially for a list of chronologically ordered documents.
    Each document adds to the cumulative context, but previous fiscal note is given to reduce redundancy.
    
    documents: list of dicts with {"name": ..., "text": ...}
    """
    os.makedirs(output_dir, exist_ok=True)
    cumulative_context = ""
    previous_fiscal_note = None
    
    for i, doc in enumerate(documents, start=1):
        print(f"Processing document {i}/{len(documents)}: {doc['name']}, text: {doc['text'][:100]}")
        
        # Append the new document to the cumulative context
        cumulative_context += f"\n\n=== Document: {doc['name']} ===\n{doc['text']}"
        
        if doc['name'] in comittee_reports:
            # Generate fiscal note for the current cumulative context
            fiscal_note, combined_prompt = generate_fiscal_note_for_context(cumulative_context, previous_fiscal_note)
            # Save to a JSON file (filename = new document name)
            out_path = os.path.join(output_dir, f"{doc['name']}.json")
            with open(out_path, "w", encoding="utf-8") as f:
                json.dump(fiscal_note, f, ensure_ascii=False, indent=2)
            
            print(f"✅ Fiscal note saved: {out_path}")
            # Update previous fiscal note to avoid redundancy in the next iteration
            previous_fiscal_note = fiscal_note
            cumulative_context = ""



Generate fiscal notes for each document

In [54]:
import os
import glob

parsed_dir = "parsed_docs"
output_dir = "fiscal_notesV2"
os.makedirs(output_dir, exist_ok=True)
committee_reports = results[0]['comittee_reports']
committee_reports.append("HB400")

documents_with_text = []

for doc in documents_chronological:
    name = doc['name']
    # Look for any file that starts with the document name
    matches = glob.glob(os.path.join(parsed_dir, f"{name}*"))
    if matches:
        # Take the first match
        txt_path = matches[0]
        with open(txt_path, "r", encoding="utf-8") as f:
            text = f.read()
        documents_with_text.append({"name": name, "text": text})
    else:
        print(f"⚠️ File not found for {name}")

# Generate fiscal notes in order
generate_fiscal_notes_chronologically(documents_with_text, committee_reports, output_dir=output_dir)


Processing document 1/14: HB400, text:  
 
 
 
The Judiciary, State of Hawai‘i  
Testimony to the Thirty-Third State Legislature 
2025 Regu


/var/folders/_0/nkxt9wx55g97v82kt5fp6lgw0000gn/T/ipykernel_98602/1289967412.py:81: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  fiscal_note = parsed.dict()  # Pydantic v1


✅ Fiscal note saved: fiscal_notesV2/HB400.json
Processing document 2/14: HB400_TESTIMONY_JHA_01-30-25_, text:  
 
 
 
The Judiciary, State of Hawai‘i  
Testimony to the Thirty-Third State Legislature 
2025 Regu
Processing document 3/14: HB400_HSCR286_, text: HSCR286
STAND.
COM. REP. NO.
286
Honolulu, Hawaii
,
2025
RE:
H.B. No. 400
Honorable Nadine K. Nakamu
✅ Fiscal note saved: fiscal_notesV2/HB400_HSCR286_.json
Processing document 4/14: HB400_TESTIMONY_FIN_03-05-25_, text:  
 
 
 
The Judiciary, State of Hawai‘i  
Testimony to the Thirty-Third State Legislature 
2025 Regu
Processing document 5/14: HB400_HD1_HSCR1171_, text: HSCR1171
STAND.
COM. REP. NO.
1171
Honolulu, Hawaii
,
2025
RE:
H.B. No. 400
H.D.
1
Honorable Nadine 
✅ Fiscal note saved: fiscal_notesV2/HB400_HD1_HSCR1171_.json
Processing document 6/14: HB400_HD1, text:  
 
 
 
The Judiciary, State of Hawai‘i  
Testimony to the Thirty-Third State Legislature 
2025 Regu
Processing document 7/14: HB400_HD1_TESTIMONY_JDC_03-19-25_, 

In [49]:
timeline

[Document(date='1/16/2025', text='Pending introduction.', documents=[]),
 Document(date='1/17/2025', text='Introduced and Pass First Reading.', documents=['HB400']),
 Document(date='1/21/2025', text='Referred to JHA, FIN, referral sheet 1', documents=[]),
 Document(date='1/24/2025', text='Bill scheduled to be heard by JHA on Thursday, 01-30-25 2:00PM in House conference room 325 VIA VIDEOCONFERENCE.', documents=['HB400_TESTIMONY_JHA_01-30-25_']),
 Document(date='1/30/2025', text='The committee on JHA recommend that the measure be PASSED, UNAMENDED. The votes were as follows: 8 Ayes: Representative(s) Tarnas, Poepoe, Belatti, Kahaloa, Perruso, Takayama, Garcia, Shimizu; Ayes with reservations: none;  Noes: none; and 3 Excused: Representative(s) Cochran, Hashem, Todd.', documents=[]),
 Document(date='2/10/2025', text='Reported from JHA (Stand. Com. Rep. No. 286), recommending passage on Second Reading and referral to FIN.', documents=['HB400_HSCR286_']),
 Document(date='2/10/2025', text=

# Tried using Llama but limited by context window

In [None]:
import json
import os
from pydantic import BaseModel, create_model
from tenacity import retry, stop_after_attempt, wait_exponential
import time
import requests

class FiscalNoteModel(BaseModel):
    overview: str
    appropriations:str
    assumptions_and_methodology:str
    agency_impact:str
    economic_impact:str 
    policy_impact: str
    revenue_sources: str 
    six_year_fiscal_implications: str
    operating_revenue_impact: str
    capital_expenditure_impact: str
    fiscal_implications_after_6_years: str


# Example LLM query function (replace with your actual model)
def query_llama(prompt: str) -> str:
    """
    Send a prompt to a LLaMA chat endpoint and return the response.
    
    Uses the environment variable LLAMA_API_KEY for authentication.
    """
    llama_url = "https://tejas.tacc.utexas.edu/v1/c31853e6-0a58-4483-9e92-e7c32b021d44/chat/completions"
    llama_model = "Meta-Llama-3.1-405B-Instruct"
    api_key = "bfa9578e-8707-4f32-a07c-4de1ffcd1a77"
    
    if not api_key:
        raise ValueError("Please set the LLAMA_API_KEY environment variable.")
    
    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json"
    }

    structured_prompt = prompt + """\n\n 
    Extract the following information from the text below in JSON format,
    matching exactly these keys:

    overview
    appropriations
    assumptions_and_methodology
    agency_impact
    economic_impact
    policy_impact
    revenue_sources
    six_year_fiscal_implications
    operating_revenue_impact
    capital_expenditure_impact
    fiscal_implications_after_6_years

    Return ONLY valid JSON.
    """
    
    payload = {
        "stream": False,
        "model": llama_model,
        "messages": [{"role": "user", "content": structured_prompt}]
    }

    try:
        response = requests.post(llama_url, headers=headers, data=json.dumps(payload), timeout=60)
        response.raise_for_status()
        data = response.json()

        if "choices" in data and len(data["choices"]) > 0:
            message = data["choices"][0].get("message", {})
            raw_text = message.get("content", "").strip()

            # Clean code fences if present
            if raw_text.startswith("```"):
                raw_text = raw_text.strip("`")
                if raw_text.lower().startswith("json"):
                    raw_text = raw_text[4:].strip()

            # Parse JSON
            try:
                return json.loads(raw_text)
            except json.JSONDecodeError:
                # Attempt to extract the JSON object manually
                start = raw_text.find("{")
                end = raw_text.rfind("}") + 1
                if start != -1 and end != -1:
                    return json.loads(raw_text[start:end])
                raise

        else:
            raise ValueError("Unexpected response format from LLaMA API")

    except requests.RequestException as e:
        raise RuntimeError(f"Failed to call LLaMA API - {str(e)}")
    except json.JSONDecodeError as e:
        raise ValueError(f"Failed to parse LLaMA API response - {str(e)}")

# Property prompts (from your input)
PROPERTY_PROMPTS = { 
    "overview": { "prompt": "Using the provided legislative documents, statutes, and testimonies, write a clear summary describing the purpose, scope, and key components of the proposed measure or bill, including any pilot or permanent programs, reporting requirements, and sunset clauses. This should be around 3 sentences.", "description": "General overview and summary of the measure" }, 
    "appropriations": { "prompt": "Based on budgetary data and legislative appropriations, detail the funding allocated for the program or measure, including fiscal years, amounts, intended uses such as staffing, training, contracts, technology, etc... This should be around 3 sentences.", "description": "Funding allocation and appropriations details" }, 
    "assumptions_and_methodology": { "prompt": "Explain the assumptions, cost estimation methods, and data sources used to calculate the financial projections for this program or measure, referencing comparable programs or historical budgets where applicable. This should be around 3 sentences.", "description": "Cost estimation methodology and assumptions" }, 
    "agency_impact": { "prompt": "Describe the anticipated operational, administrative, and budgetary impact of the program or measure on the relevant government agency or department, including supervision, staffing, and resource allocation. This should be around 3 sentences.", "description": "Impact on government agencies and departments" }, 
    "economic_impact": { "prompt": "Summarize the expected economic effects of the program or measure, such as cost savings, potential reductions in related expenditures, benefits to the community, and any relevant performance or participation statistics. This should be around 3 sentences.", "description": "Economic effects and community benefits" }, 
    "policy_impact": { "prompt": "Analyze the policy implications of the measure, including how it modifies existing laws or programs, its role within broader legislative strategies, and its potential effects on state or local governance. This should be around 3 sentences.", "description": "Policy implications and legislative analysis" }, 
    "revenue_sources": { "prompt": "Identify and describe the funding sources that will support the program or measure, such as general funds, grants, fees, or other revenue streams, based on the provided fiscal documents. This should be around 3 sentences.", "description": "Funding sources and revenue streams" }, 
    "six_year_fiscal_implications": { "prompt": "Provide a multi-year fiscal outlook (e.g., six years) for the program or measure, projecting costs, staffing changes, recurring expenses, and assumptions about program expansion or permanence using available budget and workload data. This should be around 10 sentences.", "description": "Six-year fiscal projections and outlook" }, 
    "operating_revenue_impact": { "prompt": "Describe any anticipated impacts on operating revenues resulting from the program or measure, including increases, decreases, or changes in revenue streams. This should be around 3 sentences.", "description": "Operating revenue impacts" }, 
    "capital_expenditure_impact": { "prompt": "Outline any expected capital expenditures related to the program or measure, such as investments in facilities, equipment, or technology infrastructure, based on capital budgets or agency plans. This should be around 3 sentences.", "description": "Capital expenditure requirements" }, 
    "fiscal_implications_after_6_years": { "prompt": "Summarize the ongoing fiscal obligations after the initial multi-year period for the program or measure, including annual operating costs, expected number of program sites or units, and the sustainability of funding. This should be around 3 sentences.", "description": "Long-term fiscal obligations beyond six years" } 
    }

def generate_fiscal_note_for_context(context_text, previous_note=None):
    """
    Generate a full fiscal note (all properties at once) using PROPERTY_PROMPTS.
    If previous_note is provided, instruct the LLM to avoid repeating information.
    """
    # Build a combined instruction
    combined_prompt = "You are tasked with generating a fiscal note.\n"
    combined_prompt += "Extract the following information:\n\n"
    for key, prop in PROPERTY_PROMPTS.items():
        combined_prompt += f"- {key}: {prop['prompt']}\n"

    combined_prompt += f"\nContext:\n{context_text}\n"

    if previous_note:
        combined_prompt += (
            f"\nAccording to the previous fiscal note, focus on what has been discussed and the main points that have changed. Do not repeat the same content.\n"
            f"{json.dumps(previous_note, ensure_ascii=False, indent=2)}"
        )

    parsed = query_llama(combined_prompt)

    # Convert to dict
    fiscal_note = parsed
    # if parsed:
    #     try:
    #         fiscal_note = parsed.dict()  # Pydantic v1
    #     except AttributeError:
    #         fiscal_note = parsed.model_dump()  # Pydantic v2

    return fiscal_note

def generate_fiscal_notes_chronologically(documents, comittee_reports, output_dir):
    """
    Generate fiscal notes sequentially for a list of chronologically ordered documents.
    Each document adds to the cumulative context, but previous fiscal note is given to reduce redundancy.
    
    documents: list of dicts with {"name": ..., "text": ...}
    """
    os.makedirs(output_dir, exist_ok=True)
    cumulative_context = ""
    previous_fiscal_note = None
    
    for i, doc in enumerate(documents, start=1):
        print(f"Processing document {i}/{len(documents)}: {doc['name']}")
        
        # Append the new document to the cumulative context
        cumulative_context += f"\n\n=== Document: {doc['name']} ===\n{doc['text']}"
        
        if doc['name'] in committee_reports:
            # Generate fiscal note for the current cumulative context
            fiscal_note = generate_fiscal_note_for_context(cumulative_context, previous_fiscal_note)
            
            # Save to a JSON file (filename = new document name)
            out_path = os.path.join(output_dir, f"{doc['name']}.json")
            with open(out_path, "w", encoding="utf-8") as f:
                json.dump(fiscal_note, f, ensure_ascii=False, indent=2)
            
            print(f"✅ Fiscal note saved: {out_path}")
            time.sleep(5)
            # Update previous fiscal note to avoid redundancy in the next iteration
            previous_fiscal_note = fiscal_note



# The following code will pull all of the bills from the https://www.capitol.hawaii.gov/sessions/session2025/bills/.
## But only focusing on the .html documents

In [None]:
import time
import json
import re
import os
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import logging
from urllib.parse import urljoin, urlparse
import requests
import pdfplumber
import fitz
import io
import tempfile
import shutil # Import shutil for cleaning up temporary directories

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

class GenericWebScraper:
    def __init__(self, use_delays=False, max_depth=3, progress_filepath=None, output_directory=None):
        self.driver = None
        self.download_dir = tempfile.mkdtemp()
        self.setup_driver()
        self.processed_urls = set()
        self.max_depth = max_depth
        self.use_delays = use_delays
        self.progress_filepath = progress_filepath
        self.output_directory = output_directory # Store output directory
        self.extracted_documents = [] # This will now be the main list for extracted documents

        # Load from both progress file and existing scraped data files
        self.extracted_documents = self._load_progress()
        for doc in self.extracted_documents:
            self.processed_urls.add(doc['url'])
        logger.info(f"💾 Loaded {len(self.extracted_documents)} documents from previous sessions.")
        
    def setup_driver(self):
        """Setup Chrome driver with anti-detection measures"""
        options = Options()
        
        # Anti-detection settings
        options.add_argument('--disable-blink-features=AutomationControlled')
        options.add_experimental_option("excludeSwitches", ["enable-automation"])
        options.add_experimental_option('useAutomationExtension', False)
        options.add_argument('--disable-web-security')
        options.add_argument('--allow-running-insecure-content')
        options.add_argument('--no-sandbox')
        options.add_argument('--disable-dev-shm-usage')
        
        # PDF handling
        options.add_experimental_option('prefs', {
            "download.default_directory": self.download_dir,
            "download.prompt_for_download": False,
            "download.directory_upgrade": True,
            "plugins.always_open_pdf_externally": True
        })
        
        # Make it look like a regular user
        options.add_argument('--user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36')
        
        # Uncomment to run headless
        # options.add_argument('--headless')
        
        try:
            self.driver = webdriver.Chrome(options=options)
            
            # Additional anti-detection
            self.driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
            self.driver.execute_cdp_cmd('Network.setUserAgentOverride', {
                "userAgent": 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
            })
            
            logger.info("✅ Chrome driver initialized successfully")
            
        except Exception as e:
            logger.error(f"❌ Error setting up Chrome driver: {e}")
            raise
            
    def maybe_delay(self, min_sec=0.5, max_sec=1):
        """Add optional delay if enabled"""
        if self.use_delays:
            import random
            delay = random.uniform(min_sec, max_sec)
            time.sleep(delay)
        
    def load_page(self, url, timeout=30):
        """Load a page and return BeautifulSoup object"""
        try:
            logger.info(f"🌐 Loading: {url}")
            self.driver.get(url)
            
            # Wait for page to load
            WebDriverWait(self.driver, timeout).until(
                EC.presence_of_element_located((By.TAG_NAME, "body"))
            )
            
            # Small wait for dynamic content
            time.sleep(1)  # Increased slightly for more reliable loading
            
            # Get page source
            page_source = self.driver.page_source
            soup = BeautifulSoup(page_source, 'html.parser')
            logger.debug(f"🔍 Page source length: {len(page_source)}")
            
            # Check if blocked
            if "403" in self.driver.title or "forbidden" in self.driver.title.lower():
                logger.warning("⚠️ Page might be blocked")
                return None
                
            logger.info("✅ Page loaded successfully")
            return soup
            
        except Exception as e:
            logger.error(f"❌ Error loading {url}: {e}")
            return None
    
    def extract_clean_text(self, soup):
        """Extract clean text from BeautifulSoup object"""
        if not soup:
            return ""
            
        # Remove unwanted elements
        for element in soup(['script', 'style', 'nav', 'header', 'footer', 'aside', 'noscript']):
            element.decompose()
            
        # Get text content
        text = soup.get_text(separator=' ', strip=True)
        logger.debug(f"Raw extracted text length: {len(text)}")
        
        # Clean up text
        text = re.sub(r'\s+', ' ', text)
        text = re.sub(r'\n\s*\n', '\n\n', text)
        
        return text.strip()
    
    def _load_progress(self):
        """Loads previously extracted data from the progress file and existing scraped data files."""
        all_loaded_docs = []
        loaded_urls = set()

        # 1. Load from the temporary progress file (if exists and valid)
        if self.progress_filepath and os.path.exists(self.progress_filepath):
            try:
                with open(self.progress_filepath, 'r', encoding='utf-8') as f:
                    progress_data = json.load(f)
                    for doc in progress_data:
                        if doc['url'] not in loaded_urls:
                            all_loaded_docs.append(doc)
                            loaded_urls.add(doc['url'])
                logger.info(f"Loaded {len(progress_data)} documents from progress file {self.progress_filepath}")
            except json.JSONDecodeError as e:
                logger.error(f"❌ Error decoding JSON from progress file {self.progress_filepath}: {e}. Ignoring this file.")
            except Exception as e:
                logger.error(f"❌ Error loading progress file {self.progress_filepath}: {e}. Ignoring this file.")

        # 2. Load from existing scraped_data_*.json files in the output directory
        if self.output_directory and os.path.exists(self.output_directory):
            for filename in os.listdir(self.output_directory):
                if filename.startswith("scraped_data_") and filename.endswith(".json"):
                    filepath = os.path.join(self.output_directory, filename)
                    try:
                        with open(filepath, 'r', encoding='utf-8') as f:
                            scraped_data = json.load(f)
                            for doc in scraped_data:
                                if doc['url'] not in loaded_urls:
                                    all_loaded_docs.append(doc)
                                    loaded_urls.add(doc['url'])
                        logger.info(f"Loaded {len(scraped_data)} documents from existing file {filepath}")
                    except json.JSONDecodeError as e:
                        logger.warning(f"⚠️ Error decoding JSON from {filepath}: {e}. Skipping file.")
                    except Exception as e:
                        logger.warning(f"⚠️ Error reading existing data file {filepath}: {e}. Skipping file.")

        return all_loaded_docs

    def _save_progress(self):
        """Saves the current state of extracted documents to the progress file."""
        if not self.progress_filepath:
            return
        try:
            with open(self.progress_filepath, 'w', encoding='utf-8') as f:
                json.dump(self.extracted_documents, f, indent=2, ensure_ascii=False)
            logger.info(f"💾 Progress saved to {self.progress_filepath} ({len(self.extracted_documents)} documents).")
        except Exception as e:
            logger.error(f"❌ Error saving progress to {self.progress_filepath}: {e}")

    def extract_text_from_pdf(self, pdf_url):
        """Download PDF using the browser and extract text."""
        logger.info(f"📄 Processing PDF: {pdf_url}")
        
        try:
            # Clear download directory to ensure we get the right file
            for f in os.listdir(self.download_dir):
                os.remove(os.path.join(self.download_dir, f))

            # Navigate to the URL, which will trigger the download
            self.driver.get(pdf_url)

            # Wait for the download to complete
            filepath = None
            time_waited = 0
            timeout = 60
            while time_waited < timeout:
                # Look for a file with a .pdf extension, which indicates the download is complete.
                pdf_files = [f for f in os.listdir(self.download_dir) if f.lower().endswith('.pdf')]
                if pdf_files:
                    filepath = os.path.join(self.download_dir, pdf_files[0])
                    # Give it a moment to ensure the file handle is released
                    time.sleep(1)
                    logger.info(f"✅ PDF downloaded to: {filepath}")
                    break
                time.sleep(1)
                time_waited += 1

            if not filepath:
                logger.error(f"❌ PDF download timed out for {pdf_url}")
                return None
            
            with open(filepath, 'rb') as f:
                pdf_data = f.read()

            if not pdf_data or len(pdf_data) < 1000:
                logger.warning(f"⚠️ PDF data from {filepath} seems too small ({len(pdf_data)} bytes)")
                return None

            # Try pdfplumber first
            try:
                logger.debug("📄 Trying pdfplumber extraction...")
                with pdfplumber.open(io.BytesIO(pdf_data)) as pdf:
                    text = ""
                    for page_num, page in enumerate(pdf.pages):
                        page_text = page.extract_text()
                        if page_text:
                            text += page_text + "\n"
                    
                    if text.strip():
                        logger.info(f"✅ Extracted {len(text)} characters from PDF using pdfplumber")
                        return text.strip()
            except Exception as e:
                logger.warning(f"⚠️ pdfplumber failed: {e}")

            # Fallback to PyMuPDF (fitz)
            try:
                logger.debug("📄 Trying PyMuPDF (fitz) extraction...")
                doc = fitz.open(stream=pdf_data, filetype="pdf")
                text = ""
                for page_num in range(len(doc)):
                    page = doc.load_page(page_num)
                    page_text = page.get_text()
                    if page_text:
                        text += page_text + "\n"
                doc.close()
                
                if text.strip():
                    logger.info(f"✅ Extracted {len(text)} characters from PDF using PyMuPDF")
                    return text.strip()
            except Exception as e:
                logger.warning(f"⚠️ PyMuPDF failed: {e}")

            logger.error(f"❌ Both PDF extraction methods failed for {pdf_url}")
            return None

        except Exception as e:
            logger.error(f"❌ Error processing PDF {pdf_url}: {e}")
            return None

    def is_directory(self, url):
        """Check if this is a directory based on the URL structure."""
        return url.endswith('/')
    
    def is_content_file(self, url):
        """Determine if this is a content file (HTML, PDF, etc.)"""
        if self.is_directory(url):
            return False

        parsed_url = urlparse(url)
        path = parsed_url.path
        
        # Check for common file extensions. PDFs are handled separately later.
        file_extensions = ['.htm', '.html', '.txt', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx']
        if any(path.lower().endswith(ext) for ext in file_extensions):
            return True
        
        # If it's not a directory and doesn't have a common file extension,
        # it's very likely an HTML page we want to scrape.
        # This handles "pretty URLs" like /about, /products/item1
        # Check if the last segment of the path contains a dot, implying a file extension.
        # If no dot, and not a directory, treat as a content page.
        if '.' not in path.split('/')[-1]:
            return True
            
        return False
    
    def is_within_scope(self, url, start_url):
        """Check if URL is within the scraping scope defined by the start_url."""
        return url.startswith(start_url)

    def get_links_in_page(self, soup, current_url, start_url):
        """Extract links that are within the scraping scope"""
        if not soup:
            return []
            
        links = []
        for link in soup.find_all('a', href=True):
            href = link['href']
            
            if href in ['.', '..', '#', ''] or href.startswith('mailto:') or href.startswith('javascript:'):
                continue
                
            full_url = urljoin(current_url, href)
            logger.debug(f"Found link - Href: {href}, Full URL: {full_url}")
                
            # Use the correct scope-checking method
            if not self.is_within_scope(full_url, start_url):
                logger.debug(f"Skipping link outside scope: {full_url}")
                continue
                
            # Avoid adding the same page again if href is empty or just a fragment
            if full_url == current_url:
                continue

            link_info = {
                'url': full_url,
                'href': href,
                'text': link.get_text(strip=True),
                'is_file': self.is_content_file(full_url),
                'is_directory': self.is_directory(full_url)
            }
            
            links.append(link_info)
            
        return links
    
    def crawl_website(self, start_url):
        """
        Crawl a website by processing each page for text and finding new links.
        
        Args:
            start_url: URL to start crawling from
        """
        logger.info(f"🔍 Starting to crawl from: {start_url}")
        
        # self.processed_urls is populated by __init__ from existing files
        queue = [(start_url, 0)]
        
        # URLs we have added to the queue during this session, to avoid duplicates
        queued_urls = {start_url}

        while queue:
            current_url, depth = queue.pop(0)
            
            # --- Depth Check ---
            if depth > self.max_depth:
                logger.info(f"⏭️ Skipping {current_url} - max depth ({self.max_depth}) reached")
                continue
            
            soup = None # Reset soup for each item in queue

            # --- Content Extraction ---
            # We only extract content if we haven't processed this URL before
            if current_url not in self.processed_urls:
                extracted_text = None
                content_type = 'unknown'

                if current_url.lower().endswith('.pdf'):
                    content_type = 'pdf'
                    extracted_text = self.extract_text_from_pdf(current_url)
                else: # Assumed to be HTML
                    soup = self.load_page(current_url)
                    if soup:
                        content_type = 'html'
                        extracted_text = self.extract_clean_text(soup)
                
                # If we got text, save it
                if extracted_text and len(extracted_text.strip()) > 0:
                    file_data = {
                        "url": current_url,
                        "type": content_type,
                        "text": extracted_text,
                        "text_length": len(extracted_text),
                        "depth": depth,
                        "timestamp": time.strftime("%Y-%m-%d %H:%M:%S")
                    }
                    self.extracted_documents.append(file_data)
                    self.processed_urls.add(current_url)
                    self._save_progress()
                else:
                    logger.warning(f"⚠️ No text extracted from {current_url}")

            # --- Link Finding ---
            # We only scan for links on HTML pages that are within crawling depth
            if not current_url.lower().endswith('.pdf') and depth < self.max_depth:
                # If we didn't load the page during content extraction, load it now
                if soup is None:
                    soup = self.load_page(current_url)
                
                if soup:
                    links = self.get_links_in_page(soup, current_url, start_url)
                    logger.info(f"🔗 Found {len(links)} links on page {current_url} to consider for queue.")
                    for link_info in links:
                        link_url = link_info['url']
                        # Add to queue only if it's new for this session
                        if link_url not in queued_urls:
                            queue.append((link_url, depth + 1))
                            queued_urls.add(link_url)
                            logger.debug(f"📝 Added to queue: {link_url} (depth: {depth + 1})")

            self.maybe_delay(0.5, 1)

        logger.info(f"✅ Website crawling completed: {len(self.extracted_documents)} documents extracted")
        return self.extracted_documents
    
    def close(self):
        """Close the browser"""
        if self.driver:
            self.driver.quit()
            logger.info("🔒 Browser closed")
        
        # Clean up temp directory
        if hasattr(self, 'download_dir') and os.path.exists(self.download_dir):
            shutil.rmtree(self.download_dir)
            logger.info(f"🗑️ Removed temp directory: {self.download_dir}")

def crawl_and_extract(start_url, output_directory, max_depth=1, use_delays=False):
    """
    Generalized function to crawl a website, scrape text, and output to a flat JSON.

    Args:
        start_url (str): The URL to start crawling from.
        output_directory (str): The directory to save the output JSON file.
        max_depth (int): Maximum depth to crawl from the start_url.
        use_delays (bool): Whether to use delays between requests.
    """
    
    # Ensure output directory exists
    os.makedirs(output_directory, exist_ok=True)
    
    # Create a unique progress filename
    progress_filename = os.path.join(output_directory, f"progress_{re.sub(r'[^a-zA-Z0-9_.-]', '_', start_url)[:50]}.json")
    
    # Create a unique filename for the output
    # Sanitize URL for filename
    sanitized_url = re.sub(r'[^a-zA-Z0-9_.-]', '_', start_url)
    output_filename = os.path.join(output_directory, f"scraped_data_{sanitized_url[:50]}_{int(time.time())}.json") # Limit URL length for filename

    scraper = GenericWebScraper(use_delays=use_delays, max_depth=max_depth, progress_filepath=progress_filename, output_directory=output_directory)
    all_extracted_data = []

    try:
        all_extracted_data = scraper.crawl_website(
            start_url=start_url 
        )
        
        # Save the collected data to a single flat JSON file
        with open(output_filename, 'w', encoding='utf-8') as f:
            json.dump(all_extracted_data, f, indent=2, ensure_ascii=False)
            
        logger.info(f"🎉 Crawling and extraction completed! Data saved to: {output_filename}")
        logger.info(f"📚 Total documents extracted: {len(all_extracted_data)}")
        
        # Remove progress file on successful completion
        if os.path.exists(progress_filename):
            os.remove(progress_filename)
            logger.info(f"🗑️ Removed progress file: {progress_filename}")
        
    except Exception as e:
        logger.error(f"❌ An error occurred during the crawling process: {e}")
    finally:
        scraper.close() 

# Example Usage:
if __name__ == "__main__":
    start_url = "https://www.capitol.hawaii.gov/sessions/session2025/bills/"  # Replace with the URL you want to crawl
    output_directory = "output" # Directory to save the extracted data

    # Create the output directory if it doesn't exist
    os.makedirs(output_directory, exist_ok=True)

    crawl_and_extract(
        start_url=start_url,
        output_directory=output_directory,
        max_depth=1,        # Only process links on the start page
        use_delays=True     # Set to True to use delays between requests (slower but safer)
    )

In [None]:
ai_crawler("https://www.capitol.hawaii.gov/sessions/session2025/bills/", 8, 1)


=== WEB CRAWLER STARTING ===
Starting URL: https://www.capitol.hawaii.gov/sessions/session2025/bills/
Base URL for filtering: https://www.capitol.hawaii.gov/
Number of workers: 8
Max depth: 1
AI extraction enabled: False
Output will be saved to: /Users/rodericktabalba/Documents/GitHub/RAG-system/scraped_data/web_scrape_raw_1755740881.json

=== PROCESSING DEPTH LEVEL 0 ===
URLs to process: 1

Processing (depth 0): https://www.capitol.hawaii.gov/sessions/session2025/bills/
Page content length: 1499321 characters
