In [4]:
import xml.etree.ElementTree as ET
import json
import os
import sys
import codecs
import re
import difflib
from collections import defaultdict

# ==========================================
# 1. INPUT HANDLING
# ==========================================
def get_user_file():
    """
    Ask the user which XML file to analyse and return its path.

    When ``google.colab`` can be imported, the Colab upload widget is used and
    the first uploaded file name is returned. Otherwise the path is read from
    standard input.

    Returns:
        The selected file name/path, or None when nothing was uploaded or the
        entered path is not an existing regular file.
    """
    print("\n" + "="*50)
    print("[STEP 1] UPLOAD TARGET XML")
    print("="*50)

    try:
        from google.colab import files
    except ImportError:
        print("[INFO] Environment: Local Python detected.")
        filepath = input("[INPUT] Enter path to .xml file: ").strip()
        # isfile rather than exists: a directory passes an existence check
        # but cannot be opened for reading afterwards.
        if os.path.isfile(filepath):
            return filepath
        print(f"[ERROR] No such file: {filepath!r}")
        return None

    print("[INFO] Environment: Google Colab detected.")
    print("[ACTION] Please upload your .xml file now...")
    uploaded = files.upload()
    if not uploaded:
        return None
    # The upload result is keyed by file name; take the first one.
    return next(iter(uploaded))

def read_file_safely(filepath):
    """
    Read a file as text, trying several encodings and stripping a UTF-8 BOM.

    The file is read once as bytes and decoded in memory, so line endings
    are returned exactly as stored. Encodings are tried from strictest to
    most permissive:

    * ``utf-8-sig`` - UTF-8 with or without a BOM (also covers plain ASCII);
    * ``cp1252``    - Windows-1252; fails only on its five undefined bytes;
    * ``latin-1``   - maps every byte to a code point, so it always succeeds.

    ``cp1252`` must come before ``latin-1``: because ``latin-1`` never fails,
    anything listed after it is never reached.

    Args:
        filepath: Path of the file to read.

    Returns:
        A ``(text, encoding_label)`` tuple. ``text`` has leading/trailing
        whitespace stripped; ``encoding_label`` names the codec that worked.
    """
    with open(filepath, 'rb') as f:
        raw_bytes = f.read()

    for enc in ('utf-8-sig', 'cp1252', 'latin-1'):
        try:
            return raw_bytes.decode(enc).strip(), enc
        except UnicodeDecodeError:
            continue

    # Safety net only: with 'latin-1' in the list above this is not reached,
    # but it keeps the function total if the list is ever edited.
    print("[WARN] All standard encodings failed. Using lossy binary recovery.")
    return raw_bytes.decode('utf-8', errors='ignore').strip(), "binary-lossy"

# ==========================================
# 2. ADVANCED CLEANING UTILITIES
# ==========================================
# Complement of the XML 1.0 "Char" production:
#   #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
# The supplementary range needs the 8-digit \U escape; "\u10000" would be
# read as "\u1000" followed by a literal "0".
_ILLEGAL_XML_CHARS_RE = re.compile(
    '[^\u0009\u000A\u000D\u0020-\uD7FF\uE000-\uFFFD\U00010000-\U0010FFFF]'
)


def sanitize_xml_chars(text):
    """
    Remove characters that are not legal in an XML 1.0 document.

    Control characters such as NUL or vertical tab, plus U+FFFE/U+FFFF, are
    dropped. Tab, line feed, carriage return and all other characters,
    including supplementary-plane ones such as emoji, are kept.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with every illegal character removed.
    """
    return _ILLEGAL_XML_CHARS_RE.sub('', text)

# Leading "<?xml ... ?>" declaration. The lookahead keeps ordinary processing
# instructions such as "<?xml-stylesheet ...?>" from matching.
_XML_DECLARATION_RE = re.compile(r'^\s*<\?xml(?=[\s?]).*?\?>', re.DOTALL)


def wrap_fragment(text):
    """
    Wrap content in a synthetic ``<recovered_root>`` element.

    This gives a single root to input that has several top-level elements.
    A leading XML declaration is removed first: a declaration is only
    allowed at the very start of a document, so leaving it inside the
    wrapper would make the result ill-formed.

    Args:
        text: XML text, possibly starting with an XML declaration.

    Returns:
        The text, minus any leading declaration, enclosed in
        ``<recovered_root>...</recovered_root>``.
    """
    body = _XML_DECLARATION_RE.sub('', text, count=1)
    return f"<recovered_root>{body}</recovered_root>"

# ==========================================
# 3. MULTI-STAGE PARSER ENGINE
# ==========================================
def parse_xml_content(filepath):
    """
    Parse a possibly damaged XML file by trying increasingly lenient parsers.

    The file is read with ``read_file_safely``. XML comments and a leading
    XML declaration are removed, then each strategy is tried twice: on the
    cleaned text, and on the same text passed through ``wrap_fragment``.
    The first attempt that yields an element wins.

    Strategies, in order:
      1. strict ``xml.etree.ElementTree`` parse;
      2. the same after ``sanitize_xml_chars``;
      3/4. lxml with ``recover=True`` (if lxml is importable), without and
           with sanitising;
      5/6. BeautifulSoup in XML and HTML mode (if bs4 is importable).

    Args:
        filepath: Path of the XML file to analyse.

    Returns:
        ``(root, raw_xml)`` where ``root`` is an ``ElementTree`` element and
        ``raw_xml`` is the decoded file text, or ``(None, None)`` when every
        strategy failed.
    """
    print(f"\n[INFO] Analyzing XML structure for: {filepath}...")

    # 1. Read Raw Content
    raw_xml, encoding = read_file_safely(filepath)
    print(f"[INFO] Detected Encoding: {encoding}")

    # 2. Basic Cleaning
    # Remove XML comments which can break parsers if malformed
    clean_xml = re.sub(r'<!--.*?-->', '', raw_xml, flags=re.DOTALL)
    # Drop a leading XML declaration. The text is already decoded, so its
    # encoding hint carries no information; lxml refuses str input that
    # declares an encoding, and a declaration inside the fake root used by
    # the retry would not be well-formed.
    clean_xml = re.sub(
        r'^\s*<\?xml(?=[\s?]).*?\?>', '', clean_xml, count=1, flags=re.DOTALL
    )

    # 3. Define Recovery Strategies
    strategies = [
        ("Level 1: Strict Standard Parse", lambda x: ET.fromstring(x)),
        ("Level 2: Sanitized Chars Parse", lambda x: ET.fromstring(sanitize_xml_chars(x))),
    ]

    # Add LXML Recovery (tolerates broken tags). The recovered tree is
    # serialised and re-parsed so callers always get a stdlib element.
    try:
        from lxml import etree as lxml_etree
        parser = lxml_etree.XMLParser(recover=True)
        strategies.append(("Level 3: LXML Recovery Mode", lambda x: ET.fromstring(lxml_etree.tostring(lxml_etree.fromstring(x, parser=parser)))))
        strategies.append(("Level 4: LXML + Sanitized", lambda x: ET.fromstring(lxml_etree.tostring(lxml_etree.fromstring(sanitize_xml_chars(x), parser=parser)))))
    except ImportError:
        print("[INFO] LXML not found. Skipping specialized recovery layers.")

    # Add BeautifulSoup (For tag soup/mismatched tags)
    try:
        from bs4 import BeautifulSoup
        strategies.append(("Level 5: BeautifulSoup (XML Mode)", lambda x: ET.fromstring(str(BeautifulSoup(x, "xml")))))
        # HTML parser is very lenient and can often fix XML that looks like HTML
        strategies.append(("Level 6: BeautifulSoup (HTML Mode)", lambda x: ET.fromstring(str(BeautifulSoup(x, "html.parser")))))
    except ImportError:
        print("[INFO] BeautifulSoup not found. Skipping tag-soup recovery layers.")

    # 4. Execution Loop
    # Every strategy gets the plain text first and, if that fails, the text
    # wrapped in a fake root (fixes "multiple root elements" errors).
    wrapped_xml = wrap_fragment(clean_xml)
    final_root = None
    success_method = "None"
    failures = []

    for name, strategy_func in strategies:
        attempts = (
            (name, clean_xml),
            (f"{name} (with Root Wrapper)", wrapped_xml),
        )
        for label, candidate in attempts:
            try:
                final_root = strategy_func(candidate)
            except Exception as exc:
                # Deliberately broad: each backend raises its own error
                # types. The reason is kept for the failure report below.
                failures.append(f"{label}: {type(exc).__name__}: {exc}")
                continue
            success_method = label
            break
        if final_root is not None:
            break

    if final_root is not None:
        print(f"[SUCCESS] XML Structure recovered using: {success_method}")
        return final_root, raw_xml

    print("[CRITICAL ERROR] File is too damaged. All recovery strategies failed.")
    for reason in failures:
        print(f"[DEBUG]   {reason}")
    return None, None

def element_to_dict(element):
    """
    Recursively convert an XML element into a plain dictionary.

    Layout of the result:
      * ``"@attributes"`` - a copy of the element's attributes, if any;
      * ``"#text"``       - the element's stripped leading text, if non-blank;
      * ``"#tail"``       - stripped text that follows a child's closing tag
                            (mixed content), stored on that child's dict;
      * one key per child tag with the namespace part (``{uri}``) removed;
        a tag that occurs once maps to a dict, a repeated tag to a list.

    Children whose tag is not a string (comments, processing instructions)
    are skipped.

    Args:
        element: An ``xml.etree.ElementTree`` element.

    Returns:
        A dict describing ``element`` and its descendants.
    """
    node = {}

    # Copy so that editing the result never mutates the source tree.
    if element.attrib:
        node["@attributes"] = dict(element.attrib)

    text = element.text
    if text and text.strip():
        node["#text"] = text.strip()

    grouped = defaultdict(list)
    for child in element:
        if not isinstance(child.tag, str):
            # Comment / processing-instruction nodes carry a callable tag.
            continue
        parsed_child = element_to_dict(child)
        tail = child.tail
        if tail and tail.strip():
            # Text after the child's end tag would otherwise be lost.
            parsed_child["#tail"] = tail.strip()
        # "{http://url}Tag" -> "Tag"
        grouped[child.tag.rsplit('}', 1)[-1]].append(parsed_child)

    for tag, items in grouped.items():
        node[tag] = items[0] if len(items) == 1 else items

    return node

# ==========================================
# 4. UTILS
# ==========================================
def calculate_accuracy(json_data, ground_truth):
    """
    Estimate how well a user-supplied snippet is represented in the extracted data.

    The extracted structure is serialised to JSON text and compared with the
    snippet in three stages:

      A. whitespace-normalised substring match of the whole snippet;
      B. component match - values between tags (``>value<``) and quoted
         values are pulled from the snippet and each is looked up in the
         JSON text; if none are found in the snippet, the whole snippet is
         the single token;
      C. if no token matches, a ``difflib`` similarity ratio.

    A token counts as found when it appears literally, after XML/HTML
    entities are resolved (``&amp;`` -> ``&``), or in its JSON-escaped
    form (``"`` -> ``\\"``), because the haystack is JSON text.

    Args:
        json_data: JSON-serialisable structure extracted from the XML.
        ground_truth: Snippet pasted by the user; may be empty.

    Returns:
        A human-readable assessment string, or ``"N/A"`` when
        ``ground_truth`` is empty.
    """
    import html  # local import: not among the file-level imports

    if not ground_truth:
        return "N/A"

    # 1. Convert JSON back to a string for text searching
    extracted_text = json.dumps(json_data, ensure_ascii=False)

    # 2. Normalize strings (collapse whitespace/newlines)
    clean_truth = " ".join(ground_truth.split())
    clean_extracted = " ".join(extracted_text.split())

    # 3. Strategy A: Exact Match
    if clean_truth in clean_extracted:
        return "100.00% (Exact Match Verified)"

    # 4. Strategy B: Smart Component Match (ignores XML vs JSON syntax)
    # Matches >Value< or "Value" or 'Value'; each match is a 2-tuple with one
    # empty group, so blanks are filtered while flattening.
    data_tokens = re.findall(r'>([^<]+)<|["\']([^"\']+)["\']', ground_truth)
    tokens_to_find = [
        part.strip()
        for groups in data_tokens
        for part in groups
        if part.strip()
    ]

    # Plain-text input without tags or quotes: use the whole string
    if not tokens_to_find:
        tokens_to_find = [ground_truth.strip()]

    def _appears(token):
        """Return True if the token occurs in the JSON text in any equivalent form."""
        for variant in (token, html.unescape(token)):
            if variant in extracted_text:
                return True
            # Strip the surrounding quotes json.dumps adds to a string.
            if json.dumps(variant, ensure_ascii=False)[1:-1] in extracted_text:
                return True
        return False

    matched_items = [token for token in tokens_to_find if _appears(token)]
    found_count = len(matched_items)
    total = len(tokens_to_find)

    if found_count == total and found_count > 0:
        return f"100.00% (Content Verified - Format Ignored)\n[INFO] Matched Elements: {matched_items}"

    # 5. Strategy C: partial hit rate, else fuzzy similarity
    if found_count > 0:
        percentage = (found_count / total) * 100
        return f"{percentage:.2f}% (Found {found_count}/{total} data points)\n[INFO] Matched: {matched_items}"

    ratio = difflib.SequenceMatcher(None, clean_truth, clean_extracted).ratio()
    return f"{ratio * 100:.2f}% (Similarity Estimate)"

# ==========================================
# 5. REPORTING
# ==========================================
def generate_xml_report(root, raw_xml, output_filename="recovered_structure.json"):
    """
    Print a summary of the recovered XML tree and save it as JSON.

    The tree is converted with ``element_to_dict`` and keyed by the root tag
    with its namespace removed. The first 3000 characters of the JSON are
    printed and the full JSON is written to ``output_filename`` as UTF-8.
    Non-ASCII characters are written as-is rather than as ``\\uXXXX``
    escapes so the preview and the file stay readable.

    Args:
        root: Root element of the recovered tree, or None.
        raw_xml: Decoded text of the source file; its length in characters
            is used for the size figure.
        output_filename: Where to write the JSON. Defaults to
            ``recovered_structure.json`` in the working directory.

    Returns:
        ``{root_tag: <converted tree>}``, or None when ``root`` is None.
    """
    if root is None:
        return None

    print("\n" + "="*50)
    print("[RESULT] XML STRUCTURE REPORT")
    print("="*50)

    # "{http://url}Tag" -> "Tag"
    root_tag = root.tag.split('}')[-1] if '}' in root.tag else root.tag

    xml_dict = {root_tag: element_to_dict(root)}
    json_output = json.dumps(xml_dict, indent=4, ensure_ascii=False)

    total_len = len(raw_xml)

    print("✅ Status:        RECOVERED")
    print(f"📊 Root Element:  <{root_tag}>")
    print(f"📏 Size:          {total_len/1024:.2f} KB")

    print("-" * 50)
    print("Exact Structure Preview (JSON Representation):")
    print(json_output[:3000])
    if len(json_output) > 3000:
        print("\n... [Output Truncated] ...")

    with open(output_filename, "w", encoding="utf-8") as f:
        f.write(json_output)
    print(f"\n[INFO] Full recovered structure saved to '{output_filename}'")

    return xml_dict

# ==========================================
# 6. MAIN
# ==========================================
if __name__ == "__main__":
    # Ensure the optional recovery libraries are present; the parser still
    # works without them, with fewer strategies.
    try:
        import lxml  # noqa: F401 - availability probe only
        import bs4  # noqa: F401 - availability probe only
    except ImportError:
        import subprocess

        print("[SETUP] Installing advanced recovery libraries (lxml, beautifulsoup4)...")
        # Argument list instead of a shell string: an interpreter path that
        # contains spaces would otherwise be split by the shell.
        install = subprocess.run(
            [sys.executable, "-m", "pip", "install", "-q", "lxml", "beautifulsoup4"],
            check=False,
        )
        if install.returncode == 0:
            print("[SETUP] Libraries installed.")
        else:
            print(f"[WARN] pip exited with status {install.returncode}; "
                  "continuing with the parsers that are available.")

    target = get_user_file()
    if target:
        root_node, raw_content = parse_xml_content(target)
        extracted_data = generate_xml_report(root_node, raw_content)

        # Accuracy Check Step (skipped when nothing could be recovered)
        if extracted_data:
            print("\n" + "="*50)
            print("[STEP 2] ACCURACY VERIFICATION (OPTIONAL)")
            print("="*50)
            print("[INFO] Paste a snippet from the original file.")
            print("[INFO] The tool will verify if the DATA exists, even if the format (XML vs JSON) differs.")
            ground_truth = input("[INPUT] Paste snippet: ").strip()

            acc_score = calculate_accuracy(extracted_data, ground_truth)

            print("\n" + "-"*50)
            print(f"🎯 Accuracy Assessment: {acc_score}")
            print("-" * 50)


[STEP 1] UPLOAD TARGET XML
[INFO] Environment: Google Colab detected.
[ACTION] Please upload your .xml file now...


Saving critical_test_payload.xml to critical_test_payload (3).xml

[INFO] Analyzing XML structure for: critical_test_payload (3).xml...
[INFO] Detected Encoding: utf-8-sig
[SUCCESS] XML Structure recovered using: Level 5: BeautifulSoup (XML Mode)

[RESULT] XML STRUCTURE REPORT
✅ Status:        RECOVERED
📊 Root Element:  <root>
📏 Size:          3.48 KB
--------------------------------------------------
Exact Structure Preview (JSON Representation):
{
    "root": {
        "metadata": {
            "@attributes": {
                "id": "meta_001"
            },
            "version": {
                "#text": "2.5.1"
            },
            "timestamp": {
                "#text": "2025-11-29T23:59:59Z"
            },
            "source": {
                "#text": "Legacy_Mainframe_Export"
            }
        },
        "inventory": {
            "table": {
                "@attributes": {
                    "border": "1"
                },
                "tr": {
      