In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import json
import time


def scrape_case(url, output_filename, wait_timeout=10):
    """Scrape a judgment page and save its sections to a JSON file.

    Loads ``url`` in headless Chrome, takes the text of the first
    ``<div class="judgments">`` element, splits it on a fixed sequence of
    headings and writes the resulting mapping to ``output_filename`` as
    UTF-8 JSON. If the container cannot be found, a message is printed and
    no file is written.

    Args:
        url: Address of the page to load.
        output_filename: Path of the file that receives the JSON output.
        wait_timeout: Maximum number of seconds to wait for the judgment
            container to appear in the DOM before giving up.

    Returns:
        None.
    """
    # Imported here so the function carries the extra Selenium helpers it
    # needs without relying on changes to the module-level import block.
    from selenium.common.exceptions import TimeoutException
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.support.ui import WebDriverWait

    # (output key, heading text searched for in the page), in document order.
    # Each section runs from its own heading up to the next entry's heading;
    # the last one runs to the end of the text.
    section_markers = [
        ("Facts", "FACTS"),
        ("Issues", "ISSUES"),
        ("Petitioner's Arguments", "Petitioner's Arguments"),
        ("Analysis of the law", "Analysis of the law"),
        ("Precedent Analysis", "Precedent Analysis"),
        ("Court's Reasoning", "Court's Reasoning"),
        ("Conclusion", "Conclusion"),
    ]

    # Set up headless Chrome browser
    options = Options()
    options.add_argument("--headless")  # Run in headless mode
    options.add_argument("--disable-blink-features=AutomationControlled")

    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=options)

    try:
        driver.get(url)

        # Wait for the container explicitly instead of sleeping a fixed time:
        # returns as soon as the element exists, and tolerates slow loads up
        # to wait_timeout seconds.
        try:
            WebDriverWait(driver, wait_timeout).until(
                EC.presence_of_element_located((By.CLASS_NAME, "judgments"))
            )
        except TimeoutException:
            # Fall through: the lookup below reports the missing container.
            pass

        soup = BeautifulSoup(driver.page_source, "html.parser")
        case_text = soup.find("div", class_="judgments")

        if not case_text:
            print("Could not find case text.")
            return

        text = case_text.get_text("\n", strip=True)

        headings = [heading for _, heading in section_markers]
        following = headings[1:] + [None]
        sections = {
            key: extract_section(text, heading, next_heading)
            for (key, heading), next_heading in zip(section_markers, following)
        }

        with open(output_filename, "w", encoding="utf-8") as file:
            json.dump(sections, file, indent=4, ensure_ascii=False)

        print(f"Case data saved to {output_filename}")

    finally:
        # Always shut the browser down, even if loading or parsing failed.
        driver.quit()


def extract_section(text, start, end):
    """Return the text between the ``start`` heading and the ``end`` heading.

    Args:
        text: Full text to search.
        start: Heading that opens the section; its first occurrence is used.
        end: Heading that closes the section, or ``None`` (or empty) to take
            everything up to the end of ``text``.

    Returns:
        The stripped text following ``start`` up to the first occurrence of
        ``end`` that comes after it. If ``end`` does not occur after
        ``start``, the section extends to the end of ``text``. Returns ``""``
        when ``start`` is not found.
    """
    start_index = text.find(start)
    if start_index == -1:
        return ""

    body_start = start_index + len(start)

    # Look for the closing heading only after the opening one, so an earlier
    # mention of the same words elsewhere in the text cannot hide the real
    # section boundary.
    end_index = text.find(end, body_start) if end else -1
    if end_index == -1:
        end_index = len(text)

    return text[body_start:end_index].strip()


# Example run: scrape a single document and store its extracted sections.
# Note that scrape_case writes JSON, even though the target file is named .txt.
case_url = "https://indiankanoon.org/doc/173448784/"  # Swap in the case URL to scrape
scrape_case(url=case_url, output_filename="case_data.txt")

Could not find case text.
