In [7]:
import re
import json
import os
import sys
import csv
import time
import difflib
import codecs

# ==========================================
# 1. INPUT HANDLING & UTILITIES
# ==========================================
def get_user_file():
    """
    Ask the user for the input file and return its path.

    When ``google.colab`` can be imported, the Colab upload widget is shown
    and the first key of the mapping returned by ``files.upload()`` is
    returned. Otherwise the path is read from standard input.

    Returns:
        The file name/path (str), or None when nothing was uploaded or the
        entered path does not refer to an existing regular file.
    """
    print("\n" + "="*50)
    print("[STEP 1] FILE UPLOAD SELECTION")
    print("="*50)

    # Only the import decides which environment we are in, so only the import
    # is guarded; errors raised later by the upload itself are not mistaken
    # for "running locally".
    try:
        from google.colab import files
    except ImportError:
        print("[INFO] Environment: Local Python detected.")
        filepath = input("[INPUT] Enter the full path to your .txt file: ").strip()
        # isfile() rather than exists(): a directory path exists but cannot be
        # read as a text file and would crash later in the pipeline.
        return filepath if os.path.isfile(filepath) else None

    print("[INFO] Environment: Google Colab detected.")
    print("[ACTION] Please upload your .txt file now...")
    uploaded = files.upload()
    if not uploaded:
        return None
    return next(iter(uploaded))

def read_file_safely(filepath):
    """
    Read a text file of unknown encoding and return its decoded content.

    The raw bytes are read once and decoded with each candidate encoding in
    turn; the first candidate that decodes without error wins. Line endings
    are returned exactly as stored in the file (no newline translation).
    A leading UTF-8 byte-order mark is removed from the returned text.

    Returns: content (str), encoding_used (str)
    """
    with open(filepath, 'rb') as f:
        raw = f.read()

    # Order matters: strictest first. 'latin-1' maps every possible byte to a
    # character and therefore never fails, so it must come last; anything
    # listed after it could never be tried. 'cp1252' rejects a handful of
    # undefined bytes, which lets it act as a real intermediate step.
    encodings = ['utf-8', 'cp1252', 'latin-1']

    for enc in encodings:
        try:
            content = raw.decode(enc)
        except UnicodeDecodeError:
            continue
        # A BOM is not part of the data; left in place it would end up glued
        # to the first token of the file (e.g. the first table header).
        if enc == 'utf-8' and content.startswith('\ufeff'):
            content = content[1:]
        return content, enc

    # Defensive fallback only: with 'latin-1' in the list the loop above
    # always returns. Kept so the function still yields text if the candidate
    # list is ever edited.
    return raw.decode('utf-8', errors='ignore'), "unknown (lossy)"

# ==========================================
# 2. STRATEGY A: STRUCTURED DATA MINING
# ==========================================
def extract_structured_blocks(filepath):
    """
    Scans for tabular data blocks using heuristic delimiter density analysis.
    Supported Delimiters: Pipe (|), Comma (,), Tab (\t), Semicolon (;)

    Consecutive non-blank lines that each look like a table row are grouped
    into a block; blank lines are skipped and do not end a block. A block of
    at least two lines is handed to parse_block(), and every non-empty result
    is collected.

    Returns: tables (list of list of dict), metrics (dict with the keys
    "time", "tables_found" and "total_rows")
    """
    print(f"\n[INFO] Initiating structured data scan on: {filepath}...")
    start_time = time.time()

    content, _ = read_file_safely(filepath)

    extracted_tables = []
    current_block = []

    # Heuristic: a line is "structured" if it contains at least two
    # occurrences of one of the supported delimiters.
    def is_likely_table_row(line):
        if len(line) > 1000:
            return False  # Skip massive text dumps
        return any(line.count(delim) >= 2 for delim in ('|', '\t', ';', ','))

    # Parse the pending block if it is big enough to be a table (header plus
    # at least one row), then ALWAYS discard it. Discarding unconditionally is
    # the important part: otherwise a single delimiter-heavy sentence would
    # linger and be glued on as the "header" of the next real table.
    def flush_block():
        if len(current_block) > 1:
            parsed_data = parse_block(current_block)
            if parsed_data:
                extracted_tables.append(parsed_data)
        current_block.clear()

    for line in content.splitlines():
        stripped = line.strip()
        if not stripped:
            continue

        if is_likely_table_row(stripped):
            current_block.append(stripped)
        else:
            flush_block()

    # Capture trailing block
    flush_block()

    end_time = time.time()

    metrics = {
        "time": end_time - start_time,
        "tables_found": len(extracted_tables),
        "total_rows": sum(len(t) for t in extracted_tables)
    }

    return extracted_tables, metrics

def parse_block(block_lines):
    """
    Attempts to parse a block of text lines into a list of dictionaries.

    The delimiter is auto-detected with csv.Sniffer from the first five lines
    (restricted to pipe, comma, tab and semicolon). The first line is used as
    the header. Keys and values are whitespace-stripped, and cells whose key
    or value is empty after stripping are dropped; rows left with no cells
    are dropped as well.

    Returns: list of dict (one per data row), or [] when the block cannot be
    interpreted as delimited data.
    """
    try:
        # Auto-detect the delimiter from a small sample at the top of the block
        sample = "\n".join(block_lines[:5])
        dialect = csv.Sniffer().sniff(sample, delimiters='|,\t;')

        data = []
        for row in csv.DictReader(block_lines, dialect=dialect):
            clean_row = {}
            for key, value in row.items():
                # DictReader files surplus cells as a list under the None key
                # and pads short rows with None values; neither is real data.
                if not isinstance(key, str) or not isinstance(value, str):
                    continue
                # Strip BEFORE testing for emptiness so whitespace-only cells
                # are treated as empty instead of being kept as "".
                key, value = key.strip(), value.strip()
                if key and value:
                    clean_row[key] = value
            if clean_row:
                data.append(clean_row)
        return data
    except csv.Error:
        # Raised by the sniffer when no delimiter can be determined, or by
        # the reader on malformed input: the block is simply not a table.
        return []

# ==========================================
# 3. STRATEGY B: ADVANCED REGEX EXTRACTION
# ==========================================
def extract_unstructured(filepath):
    """
    Extracts entities from free text using a fixed set of regex patterns.

    Every pattern is applied to the whole file content, case-insensitively
    and in multi-line mode. For each category the complete matched text is
    reported (surrounding whitespace removed), de-duplicated, in order of
    first appearance. Categories without any match are omitted.

    Returns: results (dict: category name -> list of str), metrics (dict with
    the keys "time" and "entities"), content (str, the text that was scanned)
    """
    print("\n[INFO] Running natural language entity extraction...")
    start_time = time.time()

    content, encoding = read_file_safely(filepath)
    print(f"[INFO] File read successfully using encoding: {encoding}")

    # Professional-grade patterns
    patterns = {
        "Emails": r'(?:[a-z0-9!#$%&\'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&\'*+/=?^_`{|}~-]+)*|"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])*")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\[(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?|[a-z0-9-]*[a-z0-9]:(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\])',
        "URLs": r'https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)',
        "IPv4 Addresses": r'\b(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\b',
        "IPv6 Addresses": r'([0-9a-fA-F]{1,4}:){7,7}[0-9a-fA-F]{1,4}|([0-9a-fA-F]{1,4}:){1,7}:|([0-9a-fA-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4}|([0-9a-fA-F]{1,4}:){1,5}(:[0-9a-fA-F]{1,4}){1,2}|([0-9a-fA-F]{1,4}:){1,4}(:[0-9a-fA-F]{1,4}){1,3}|([0-9a-fA-F]{1,4}:){1,3}(:[0-9a-fA-F]{1,4}){1,4}|([0-9a-fA-F]{1,4}:){1,2}(:[0-9a-fA-F]{1,4}){1,5}|[0-9a-fA-F]{1,4}:((:[0-9a-fA-F]{1,4}){1,6})|:((:[0-9a-fA-F]{1,4}){1,7}|:)|fe80:(:[0-9a-fA-F]{0,4}){0,4}%[0-9a-zA-Z]{1,}|::(ffff(:0{1,4}){0,1}:){0,1}((25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])|([0-9a-fA-F]{1,4}:){1,4}:((25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])',
        "Phone Numbers": r'\+?1?\s*\(?-*\.*(\d{3})\)?\.*-*\s*(\d{3})\.*-*\s*(\d{4})',
        "Dates": r'\b(?:\d{4}[-/]\d{2}[-/]\d{2})|(?:\d{2}[-/]\d{2}[-/]\d{4})|(?:\b(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]* \d{1,2}(?:st|nd|rd|th)?(?:, \d{4})?)\b',
        "Currency": r'\$\s?\d+(?:,\d{3})*(?:\.\d{2})?',
        "MAC Addresses": r'([0-9A-Fa-f]{2}[:-]){5}([0-9A-Fa-f]{2})',
        "System Error Codes": r'\b[A-Z0-9]+-[0-9]{3,}\b',
        "Key-Value Pairs": r'^([a-zA-Z0-9_]+)\s*[:=]\s*(.+)$' # e.g. "Status: Active" or "id=500"
    }

    results = {}
    match_count = 0

    # Use re.IGNORECASE for broader matching
    flags = re.IGNORECASE | re.MULTILINE

    for key, pattern in patterns.items():
        # finditer + group(0) always yields the ENTIRE match. findall would
        # return only the capture groups for patterns that contain them,
        # reducing e.g. a URL to "www." or a phone number to its area code.
        # strip() removes the leading whitespace some patterns can swallow
        # and a trailing "\r" from CRLF files.
        found = (m.group(0).strip() for m in re.finditer(pattern, content, flags))

        # dict.fromkeys de-duplicates while keeping first-seen order, so the
        # report is stable from run to run (a set would shuffle it).
        unique_matches = list(dict.fromkeys(text for text in found if text))

        if unique_matches:
            results[key] = unique_matches
            match_count += len(unique_matches)

    end_time = time.time()

    metrics = {
        "time": end_time - start_time,
        "entities": match_count
    }
    return results, metrics, content

# ==========================================
# 4. UTILS & MAIN EXECUTION
# ==========================================
def calculate_accuracy(extracted_text, ground_truth):
    """
    Report how well a known snippet is represented in the extracted text.

    Both inputs are whitespace-normalised (runs of whitespace collapse to a
    single space) before comparison.

    Args:
        extracted_text: the full text that was read from the file.
        ground_truth: a snippet expected to occur in that text.

    Returns:
        "N/A" when no usable snippet was given (empty or whitespace only);
        "100.00% (Exact Match Verified)" when the normalised snippet occurs
        verbatim in the normalised text; otherwise
        "<pct>% (Similarity Estimate)", where pct is the difflib similarity
        between the snippet and the best-aligned, equally long window of the
        text.
    """
    if not ground_truth:
        return "N/A"

    # Normalize for comparison
    clean_truth = " ".join(ground_truth.split())
    if not clean_truth:
        # Whitespace-only input normalises to "", and "" is a substring of
        # everything; without this guard it would be reported as a verified
        # exact match.
        return "N/A"
    clean_text = " ".join(extracted_text.split())

    # Check for exact substring presence
    if clean_truth in clean_text:
        return "100.00% (Exact Match Verified)"

    # Fallback: fuzzy comparison against the most promising region only.
    # Comparing the snippet with the entire text would make the score depend
    # mostly on the file's length rather than on how well the snippet matches.
    # The longest common run anchors the alignment. autojunk is disabled so
    # frequent characters are not ignored on long inputs.
    locator = difflib.SequenceMatcher(None, clean_text, clean_truth, autojunk=False)
    anchor = locator.find_longest_match(0, len(clean_text), 0, len(clean_truth))
    window_start = max(0, anchor.a - anchor.b)
    window = clean_text[window_start:window_start + len(clean_truth)]

    ratio = difflib.SequenceMatcher(None, clean_truth, window, autojunk=False).ratio()
    return f"{ratio * 100:.2f}% (Similarity Estimate)"

if __name__ == "__main__":
    # Interactive driver: pick a file, optionally take a known snippet, run
    # both extraction strategies and print a report for each.
    target_file = get_user_file()

    if target_file:
        print("\n" + "="*50)
        print("[STEP 2] ACCURACY VERIFICATION (OPTIONAL)")
        print("="*50)
        ground_truth = input("[INPUT] Paste a known snippet from the file to verify accuracy (Enter to skip): ").strip()

        # Strategy 1: Tables
        tables, t_metrics = extract_structured_blocks(target_file)

        print("\n" + "-"*50)
        print("[REPORT] STRUCTURED DATA EXTRACTION")
        print("-" * 50)
        print(f"Time Taken: {t_metrics['time']:.4f}s")
        print(f"Tables Detected: {t_metrics['tables_found']}")

        if tables:
            # Only a short preview per table keeps the report readable
            for i, table in enumerate(tables):
                print(f"\n--- Table Block {i+1} Preview (First 2 Rows) ---")
                print(json.dumps(table[:2], indent=4))
        else:
            print("[INFO] No structured table blocks identified.")

        # Strategy 2: Entities
        entities, e_metrics, raw_text = extract_unstructured(target_file)

        print("\n" + "-"*50)
        print("[REPORT] ENTITY EXTRACTION")
        print("-" * 50)
        print(f"Time Taken: {e_metrics['time']:.4f}s")
        print(f"Entities Found: {e_metrics['entities']}")

        if entities:
            print(json.dumps(entities, indent=4))

        # Accuracy: compare the snippet against the text that was actually read
        acc = calculate_accuracy(raw_text, ground_truth)
        print("\n" + "="*50)
        print(f"[FINAL VERDICT] Accuracy Assessment: {acc}")
        print("=" * 50)
    else:
        # get_user_file() returns None when nothing was uploaded or the path
        # was invalid; say so instead of ending without any output.
        print("\n[ERROR] No valid input file was provided. Nothing to process.")


[STEP 1] FILE UPLOAD SELECTION
[INFO] Environment: Google Colab detected.
[ACTION] Please upload your .txt file now...


Saving test_data.txt to test_data (3).txt

[STEP 2] ACCURACY VERIFICATION (OPTIONAL)
[INPUT] Paste a known snippet from the file to verify accuracy (Enter to skip): Wire the outstanding balance of $3,250.50 to the external contractor account.

[INFO] Initiating structured data scan on: test_data (3).txt...

--------------------------------------------------
[REPORT] STRUCTURED DATA EXTRACTION
--------------------------------------------------
Time Taken: 0.0032s
Tables Detected: 2

--- Table Block 1 Preview (First 2 Rows) ---
[
    {
        "ID": "1001",
        "TIMESTAMP": "2025-01-15 08:30:00",
        "SEVERITY": "INFO",
        "USER_EMAIL": "admin.sys@gateway.org",
        "COST_IMPACT": "$0.00",
        "STATUS": "ACTIVE"
    },
    {
        "ID": "1002",
        "TIMESTAMP": "2025-01-15 08:45:12",
        "SEVERITY": "WARN",
        "USER_EMAIL": "j.doe_dev@startup.io",
        "COST_IMPACT": "$150.50",
        "STATUS": "REVIEW"
    }
]

--- Table Block 2 Preview (First 2 Ro