In [5]:
import requests
from bs4 import BeautifulSoup
import json


def scrape_case(url, output_filename):
    """Download a judgment page and save its labelled sections as JSON.

    The page is fetched with a browser-like User-Agent, the text of the
    ``<div class="judgments">`` element is extracted, and the text is cut
    into sections at a fixed list of headings. The resulting mapping is
    written to *output_filename* as indented UTF-8 JSON.

    Args:
        url: Address of the judgment page to download.
        output_filename: Path of the file the JSON is written to.

    Returns:
        None. A message is printed on success, on a non-200 response, and
        when the judgment ``div`` is missing; in the two failure cases no
        file is written.

    Raises:
        requests.RequestException: Propagated from ``requests.get`` (for
            example on connection failure or when the timeout expires).
    """
    # A timeout keeps a stalled server from blocking this call forever.
    response = requests.get(
        url, headers={"User-Agent": "Mozilla/5.0"}, timeout=30
    )
    if response.status_code != 200:
        print("Failed to retrieve the page.")
        return

    soup = BeautifulSoup(response.text, "html.parser")
    case_text = soup.find("div", class_="judgments")  # Main judgment content

    if case_text is None:
        print("Could not find case text.")
        return

    text = case_text.get_text("\n", strip=True)

    # (output key, heading searched for in the page text), in document order.
    # Simple heuristic-based extraction (modify as needed).
    markers = [
        ("Facts", "FACTS"),
        ("Issues", "ISSUES"),
        ("Petitioner's Arguments", "Petitioner's Arguments"),
        ("Analysis of the law", "Analysis of the law"),
        ("Precedent Analysis", "Precedent Analysis"),
        ("Court's Reasoning", "Court's Reasoning"),
        ("Conclusion", "Conclusion"),
    ]
    sections = {key: "" for key, _ in markers}

    # Only headings that actually occur take part, so each section is bounded
    # by the next heading that is really present. A missing intermediate
    # heading therefore no longer blanks the section that precedes it.
    present = [(key, heading) for key, heading in markers if heading in text]
    following = [heading for _, heading in present[1:]] + [None]
    for (key, heading), next_heading in zip(present, following):
        sections[key] = extract_section(text, heading, next_heading)

    # Save output as JSON in a .txt file
    with open(output_filename, "w", encoding="utf-8") as file:
        json.dump(sections, file, indent=4, ensure_ascii=False)

    print(f"Case data saved to {output_filename}")


def extract_section(text, start, end):
    """Return the stripped text between the *start* and *end* markers.

    Args:
        text: The full text to search.
        start: Marker that opens the section; the first occurrence is used.
        end: Marker that closes the section, or a falsy value (``None`` or
            ``""``) to read through to the end of *text*.

    Returns:
        The text after *start* up to the first occurrence of *end* that
        follows it, with surrounding whitespace stripped. If *end* does not
        occur after *start*, the section runs to the end of *text*. Returns
        ``""`` when *start* is not found.
    """
    start_index = text.find(start)
    if start_index == -1:
        return ""

    body_start = start_index + len(start)
    # Look for the end marker only after the start marker: an earlier mention
    # of the same words must not void the section.
    end_index = text.find(end, body_start) if end else -1
    if end_index == -1:
        # No closing marker after the section: keep everything that remains
        # rather than discarding the section.
        end_index = len(text)
    return text[body_start:end_index].strip()


# Demo run: scrape a single judgment page and write its sections to case_data.txt
case_url = "https://indiankanoon.org/doc/173448784/"  # Replace with actual case URL
scrape_case(url=case_url, output_filename="case_data.txt")

Failed to retrieve the page.
