In [2]:
import bs4
from bs4 import BeautifulSoup, Comment
import json
import os
import sys
import codecs
import re
import difflib

# ==========================================
# 1. INPUT HANDLING
# ==========================================
def get_user_file():
    print("\n" + "="*50)
    print("[STEP 1] UPLOAD TARGET HTML")
    print("="*50)
    try:
        from google.colab import files
        print("[INFO] Environment: Google Colab detected.")
        print("[ACTION] Please upload your .html file now...")
        uploaded = files.upload()
        if not uploaded: return None
        return list(uploaded.keys())[0]
    except ImportError:
        print("[INFO] Environment: Local Python detected.")
        filepath = input("[INPUT] Enter path to .html file: ").strip()
        return filepath if os.path.exists(filepath) else None

def read_file_safely(filepath):
    """
    Reads HTML files handling broken encodings using heuristic detection and chardet.
    """
    # 1. Try standard web encodings first (Fastest)
    encodings = ['utf-8', 'utf-8-sig', 'latin-1', 'cp1252', 'iso-8859-1']

    # Read raw bytes once
    try:
        with open(filepath, 'rb') as f:
            raw_data = f.read()
    except Exception as e:
        print(f"[ERROR] Could not read file: {e}")
        return "", "failed"

    # 2. Try Chardet (Accurate but slower)
    try:
        import chardet
        result = chardet.detect(raw_data)
        detected_enc = result['encoding']
        if detected_enc and result['confidence'] > 0.7:
            encodings.insert(0, detected_enc)
    except ImportError:
        pass # Chardet not installed, rely on list

    for enc in encodings:
        try:
            content = raw_data.decode(enc)
            return content, enc
        except (UnicodeDecodeError, LookupError):
            continue

    # 3. Fallback: Binary Lossy Decode
    print("[WARN] All standard encodings failed. Using binary lossy mode.")
    return raw_data.decode('utf-8', errors='ignore'), "binary-lossy"

# ==========================================
# 2. INTELLIGENT PARSER ENGINE
# ==========================================
def parse_html_content(filepath):
    print(f"\n[INFO] Analyzing HTML structure for: {filepath}...")

    raw_html, encoding = read_file_safely(filepath)
    print(f"[INFO] Detected Encoding: {encoding}")

    # Parser Strategy: Speed -> Leniency -> Maximum Recovery
    # 1. lxml (Fastest)
    # 2. html5lib (Browser-like, handles BROKEN tags perfectly)
    # 3. html.parser (Standard Lib)

    parsers = ["lxml", "html5lib", "html.parser"]
    soup = None

    for parser in parsers:
        try:
            soup = BeautifulSoup(raw_html, parser)
            print(f"[INFO] Parsed successfully using: {parser}")
            break
        except Exception:
            continue

    if not soup:
        print("[CRITICAL] Failed to parse HTML with any parser.")
        return BeautifulSoup(raw_html, "html.parser") # Last ditch effort

    # --- PHASE 1: CLEANING ---
    # Remove technical garbage that isn't content
    for garbage in soup(['script', 'style', 'meta', 'noscript', 'iframe', 'svg', 'path', 'link', 'xml']):
        garbage.decompose()

    # Remove comments (<!-- -->)
    comments = soup.find_all(string=lambda text: isinstance(text, Comment))
    for c in comments:
        c.extract()

    return soup

# ==========================================
# 3. EXTRACTION STRATEGIES
# ==========================================
def extract_readable_text(soup):
    """
    Extracts text preserving semantic structure.
    Handles irregular spacing in messy HTML better than default get_text().
    """
    # 1. Block-level spacing
    # Ensure paragraphs and headers have breaks before them
    for block in soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'div', 'article', 'section']):
        block.insert_before('\n')

    # 2. List Item handling (Messy HTML often relies on li for structure)
    for li in soup.find_all('li'):
        li.insert_before('\n‚Ä¢ ')

    # 3. Table handling in text (Add spacing so columns don't merge)
    for td in soup.find_all(['td', 'th']):
        td.insert_after('  ')
    for tr in soup.find_all('tr'):
        tr.insert_after('\n')

    # 4. Break tag handling
    for br in soup.find_all("br"):
        br.replace_with("\n")

    # Extract
    text = soup.get_text(strip=True)

    # Post-process: Normalize excessive newlines
    # (3+ newlines -> 2 newlines)
    clean_text = re.sub(r'\n{3,}', '\n\n', text)
    return clean_text

def extract_tables(soup):
    """
    Finds HTML tables and converts them to structured List-of-Dicts (JSON-ready).
    Handles 'ragged' tables with missing cells or colspan (basic support).
    """
    tables_data = []
    html_tables = soup.find_all("table")

    for i, table in enumerate(html_tables):
        t_data = []
        headers = []

        # Try finding headers
        thead = table.find("thead")
        if thead:
            header_row = thead.find("tr")
            if header_row:
                headers = [th.get_text(strip=True) for th in header_row.find_all(["th", "td"])]

        rows = table.find_all("tr")

        # If no thead, check if first row looks like a header (th tags)
        if not headers and rows:
            first_row_cells = rows[0].find_all(["th", "td"])
            # Heuristic: If they are mostly <th>, it's a header
            th_count = sum(1 for c in first_row_cells if c.name == 'th')
            if th_count > len(first_row_cells) / 2:
                headers = [c.get_text(strip=True) for c in first_row_cells]
                rows = rows[1:]

        # Extract rows
        for row in rows:
            cells = row.find_all(["td", "th"])
            row_vals = [cell.get_text(strip=True) for cell in cells]

            if not row_vals: continue # Skip empty rows

            # Map to headers if possible
            if headers and len(headers) == len(row_vals):
                row_dict = dict(zip(headers, row_vals))
                t_data.append(row_dict)
            else:
                # Ragged / Irregular table row
                # Convert to dict with generic keys if headers exist but counts mismatch
                if headers:
                    row_dict = {}
                    for idx, val in enumerate(row_vals):
                        key = headers[idx] if idx < len(headers) else f"col_{idx}"
                        row_dict[key] = val
                    t_data.append(row_dict)
                else:
                    t_data.append(row_vals) # List of lists fallback

        if t_data:
            tables_data.append({
                "table_index": i+1,
                "rows_count": len(t_data),
                "data": t_data
            })

    return tables_data

def extract_links(soup):
    """Extracts all hyperlinks and their anchor text."""
    links = []
    for a in soup.find_all("a", href=True):
        text = a.get_text(strip=True)
        # Skip empty links common in messy HTML
        if text or a['href']:
            links.append({
                "text": text if text else "[IMAGE/ICON]",
                "url": a['href']
            })
    return links

# ==========================================
# 4. UTILS & REPORTING
# ==========================================
def calculate_accuracy(extracted_text, ground_truth):
    if not ground_truth: return "N/A"

    clean_truth = " ".join(ground_truth.split())
    clean_text = " ".join(extracted_text.split())

    if clean_truth in clean_text:
        return "100.00% (Exact Match Verified)"
    else:
        # Check matching words ratio for messy text
        matcher = difflib.SequenceMatcher(None, clean_truth, clean_text)
        match = matcher.find_longest_match(0, len(clean_truth), 0, len(clean_text))
        if match.size > len(clean_truth) * 0.8: # 80% of the sentence found continuously
             return f"100.00% (Soft Match Verified - {match.size} chars)"
        return f"{matcher.ratio() * 100:.2f}% (Similarity Estimate)"

def generate_report(soup, readable_text, tables, links):
    print("\n" + "="*50)
    print("[RESULT] HTML EXTRACTION REPORT")
    print("="*50)

    print(f"‚úÖ Status:        SUCCESS")
    print(f"üìÑ Page Title:    {soup.title.string.strip() if soup.title and soup.title.string else 'No Title'}")
    print(f"üìù Text Length:   {len(readable_text)} chars")
    print(f"üìä Tables Found:  {len(tables)}")
    print(f"üîó Links Found:   {len(links)}")

    print("-" * 50)
    print("Readable Text Preview (Top 15 Lines):")
    print("-" * 50)
    print("\n".join(readable_text.splitlines()[:15]))

    if tables:
        print("\n" + "-" * 50)
        print("Table Structure Preview (First Table):")
        print(json.dumps(tables[0]['data'][:3], indent=2))

    output_data = {
        "title": soup.title.string.strip() if soup.title and soup.title.string else None,
        "text_content": readable_text,
        "tables": tables,
        "links": links
    }

    with open("extracted_html_data.json", "w", encoding="utf-8") as f:
        json.dump(output_data, f, indent=4)
    print(f"\n[INFO] Full extracted data saved to 'extracted_html_data.json'")

# ==========================================
# 5. MAIN EXECUTION
# ==========================================
if __name__ == "__main__":
    # Robust Dependency Install for "All Types" support
    try:
        import bs4
        import html5lib
        import chardet
        import lxml
    except ImportError:
        print("[SETUP] Installing robust parsing stack (bs4, html5lib, chardet, lxml)...")
        os.system(f"{sys.executable} -m pip install -q beautifulsoup4 html5lib chardet lxml")

    target = get_user_file()
    if target:
        soup_obj = parse_html_content(target)

        text_content = extract_readable_text(soup_obj)
        table_content = extract_tables(soup_obj)
        link_content = extract_links(soup_obj)

        generate_report(soup_obj, text_content, table_content, link_content)

        print("\n" + "="*50)
        print("[STEP 2] ACCURACY VERIFICATION (OPTIONAL)")
        print("="*50)
        ground_truth = input("[INPUT] Paste a sentence from the webpage to verify extraction: ").strip()

        acc_score = calculate_accuracy(text_content, ground_truth)

        print("\n" + "-"*50)
        print(f"üéØ Accuracy Assessment: {acc_score}")
        print("-" * 50)


[STEP 1] UPLOAD TARGET HTML
[INFO] Environment: Google Colab detected.
[ACTION] Please upload your .html file now...


Saving labour_notice_template.html to labour_notice_template.html

[INFO] Analyzing HTML structure for: labour_notice_template.html...
[INFO] Detected Encoding: utf-8
[INFO] Parsed successfully using: lxml

[RESULT] HTML EXTRACTION REPORT
‚úÖ Status:        SUCCESS
üìÑ Page Title:    Labour Notice - {{ case_id }}
üìù Text Length:   3262 chars
üìä Tables Found:  1
üîó Links Found:   0
--------------------------------------------------
Readable Text Preview (Top 15 Lines):
--------------------------------------------------
Labour Notice - {{ case_id }}‡§Æ‡§ß‡•ç‡§Ø ‡§™‡•ç‡§∞‡§¶‡•á‡§∂ ‡§∏‡§∞‡§ï‡§æ‡§∞{{ jurisdiction | default('') }}‡§ú‡§ø‡§≤‡§æ: {{ labour_office_district | default('') }}‡§ï‡§æ‡§∞‡•ç‡§Ø‡§≤‡§Ø / Office: {{ jurisdiction | default('') }}‡§™‡§§‡•ç‡§∞ ‡§∏‡§Ç‡§ñ‡•ç‡§Ø‡§æ: {{ case_file_details.letter_no | default('') }}‡§¶‡§ø‡§®‡§æ‡§Ç‡§ï: {{ case_file_details.letter_date | default('') }}{{ title | default("‡§â‡§™‡§∏‡•ç‡§•‡§ø‡§§‡§ø/‡§â‡§§‡•ç‡§§‡§∞ ‡§¶‡•á‡§®‡•á ‡§ï‡•Ä ‡§∏‡•Ç‡§ö‡§