<a href="https://colab.research.google.com/github/Ravikumarkatta/Simple-Inventory-Management-System/blob/main/Pdf_split5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# =============================================================================
# 🎯 VERTICAL LINE DETECTOR - Perfect split using actual PDF lines
# =============================================================================

import os
import time
import gc
from pathlib import Path
from typing import Optional, Tuple, List, Dict
from datetime import datetime
import numpy as np

try:
    import fitz  # PyMuPDF
    from tqdm.auto import tqdm
    from google.colab import files
    import ipywidgets as widgets
    from IPython.display import display, HTML, clear_output
    print("✅ All imports successful!")
except ImportError as e:
    print(f"❌ Import error: {e}")
    raise

class VerticalLineDetector:
    """Detects vertical lines in PDF for perfect split positioning.

    Three heuristics are run on each sampled page (drawing paths, the page's
    SVG rendering, and text glyphs that look like vertical bars). The
    x-positions they report are pooled, converted to fractions of the page
    width, clustered, and the most populated cluster is proposed as the split.
    """

    def __init__(self):
        # When True, per-page findings and detector errors are printed.
        self.debug_mode = True
        print("📏 Vertical Line Detector initialized!")

    def detect_vertical_lines(self, pdf_path: str, sample_pages: int = 10) -> Dict:
        """
        Detect vertical lines in PDF pages to find exact split positions

        Args:
            pdf_path: Path of the PDF to open.
            sample_pages: Maximum number of leading pages to inspect; capped
                at the document's page count.

        Returns:
        - detected_lines: List of vertical line positions
        - optimal_split: Best split position based on detected lines
        - confidence: How confident we are in the detection
        - line_consistency: How consistent lines are across pages
        (plus the diagnostic keys documented on ``_analyze_detected_lines``)
        """

        doc = fitz.open(pdf_path)

        try:
            total_pages = len(doc)
            sample_pages = min(sample_pages, total_pages)

            print(f"📏 Analyzing {sample_pages} pages for vertical lines...")

            all_vertical_lines = []
            page_analyses = []

            for page_num in range(sample_pages):
                page = doc[page_num]

                # Method 1: Drawing/Path Detection
                lines_from_paths = self._detect_lines_from_paths(page)

                # Method 2: Vector Graphics Detection
                lines_from_vectors = self._detect_lines_from_vectors(page)

                # Method 3: Text-based Line Detection (like "|" characters)
                lines_from_text = self._detect_lines_from_text(page)

                # Combine all detected lines for this page, dropping exact
                # duplicates and ordering left to right.
                page_lines = sorted(
                    set(lines_from_paths + lines_from_vectors + lines_from_text)
                )

                page_analysis = {
                    'page_num': page_num,
                    'lines': page_lines,
                    'page_width': page.rect.width,
                    'page_height': page.rect.height,
                    'methods': {
                        'paths': len(lines_from_paths),
                        'vectors': len(lines_from_vectors),
                        'text': len(lines_from_text)
                    }
                }

                page_analyses.append(page_analysis)
                all_vertical_lines.extend(page_lines)

                if self.debug_mode:
                    print(f"📄 Page {page_num + 1}: Found {len(page_lines)} vertical lines")
                    if page_lines:
                        ratios = [line / page.rect.width for line in page_lines]
                        print(f"   • Line positions: {[f'{r:.1%}' for r in ratios]}")

            # Analyze all detected lines
            final_analysis = self._analyze_detected_lines(all_vertical_lines, page_analyses)

            return final_analysis

        finally:
            doc.close()

    def _detect_lines_from_paths(self, page) -> List[float]:
        """Detect vertical lines from PDF drawing paths.

        Reads ``page.get_drawings()`` and returns the x-position of every
        near-vertical line segment longer than half the page height and of
        every thin rectangle taller than 30% of the page height. Any error is
        reported (in debug mode) and whatever was collected so far is returned.
        """
        vertical_lines = []

        try:
            # Get page drawings/paths
            drawings = page.get_drawings()

            for drawing in drawings:
                items = drawing.get("items", [])

                for item in items:
                    if item[0] == "l":  # Line drawing command
                        # item format: ("l", point1, point2)
                        if len(item) >= 3:
                            p1, p2 = item[1], item[2]

                            # Check if it's a vertical line (same x-coordinate)
                            if abs(p1.x - p2.x) < 2:  # Allow 2-unit tolerance
                                # Check if it's long enough to be a page divider
                                line_length = abs(p2.y - p1.y)
                                if line_length > page.rect.height * 0.5:  # At least 50% of page height
                                    vertical_lines.append(p1.x)

                    elif item[0] == "re":  # Rectangle (might be a thin vertical rectangle)
                        if len(item) >= 2:
                            rect = item[1]
                            # Check for thin vertical rectangles
                            if rect.width <= 3 and rect.height > page.rect.height * 0.3:
                                vertical_lines.append(rect.x0 + rect.width / 2)

        except Exception as e:
            if self.debug_mode:
                print(f"   ⚠️ Path detection error: {e}")

        return vertical_lines

    def _detect_lines_from_vectors(self, page) -> List[float]:
        """Detect lines from vector graphics elements.

        Regex-scans the text returned by ``page.get_svg_image()`` for
        ``<line>`` and ``<rect>`` elements whose attributes appear in the
        order the patterns expect.
        NOTE(review): SVG coordinates are compared directly against
        ``page.rect.height``; this assumes the SVG uses page units — confirm.
        """
        vertical_lines = []

        try:
            # Convert page to SVG to analyze vector elements
            svg_text = page.get_svg_image()

            # Local import: this cell does not import `re` at module level.
            import re

            # Look for vertical line patterns in SVG
            # Pattern: <line x1="x" y1="y1" x2="x" y2="y2" ... /> where x1 ≈ x2
            line_pattern = r'<line[^>]*x1="([^"]*)"[^>]*y1="([^"]*)"[^>]*x2="([^"]*)"[^>]*y2="([^"]*)"[^>]*/?>'

            matches = re.findall(line_pattern, svg_text)

            for match in matches:
                try:
                    x1, y1, x2, y2 = map(float, match)

                    # Check if it's vertical (x1 ≈ x2)
                    if abs(x1 - x2) < 2:
                        # Check if it's long enough
                        line_length = abs(y2 - y1)
                        if line_length > page.rect.height * 0.3:
                            vertical_lines.append(x1)

                except ValueError:
                    # Non-numeric attribute value (e.g. a unit suffix): skip.
                    continue

            # Also look for <rect> elements that might be vertical lines
            rect_pattern = r'<rect[^>]*x="([^"]*)"[^>]*y="([^"]*)"[^>]*width="([^"]*)"[^>]*height="([^"]*)"[^>]*/?>'

            rect_matches = re.findall(rect_pattern, svg_text)

            for match in rect_matches:
                try:
                    x, y, width, height = map(float, match)

                    # Check for thin vertical rectangles
                    if width <= 3 and height > page.rect.height * 0.3:
                        vertical_lines.append(x + width / 2)

                except ValueError:
                    continue

        except Exception as e:
            if self.debug_mode:
                print(f"   ⚠️ Vector detection error: {e}")

        return vertical_lines

    def _detect_lines_from_text(self, page) -> List[float]:
        """Detect vertical lines from text characters like |, ǀ, ⏐.

        Two passes over the spans of ``page.get_text("dict")``:
        1. a span containing a bar character is reported when its bounding box
           is taller than 20 units or the character occurs more than 5 times;
        2. spans are grouped by rounded horizontal centre and a centre is
           reported when its spans hold at least 3 bar characters in total.
        A position may be reported more than once; the caller de-duplicates.
        """
        vertical_lines = []

        try:
            # Get text with positions (extracted once and reused by both passes)
            text_dict = page.get_text("dict")
            blocks = text_dict.get("blocks", [])

            # Characters that might represent vertical lines
            line_chars = ['|', '┃', '┋', '┇', '┆', '│', '║', 'ǀ', '⏐', 'ǁ']

            for block in blocks:
                if "lines" in block:
                    for line in block["lines"]:
                        for span in line.get("spans", []):
                            text = span.get("text", "")
                            bbox = span.get("bbox", [0, 0, 0, 0])

                            # Check if this span contains line characters
                            for char in line_chars:
                                if char in text:
                                    # Check if it's positioned like a page divider
                                    char_x = (bbox[0] + bbox[2]) / 2
                                    char_y_span = bbox[3] - bbox[1]

                                    # Tall span, or many repetitions of the character
                                    if char_y_span > 20 or text.count(char) > 5:
                                        vertical_lines.append(char_x)

            # Also look for repeated pipe characters in a vertical column:
            # group text by similar x-coordinates
            x_groups = {}

            for block in blocks:
                if "lines" in block:
                    for line in block["lines"]:
                        for span in line.get("spans", []):
                            text = span.get("text", "").strip()
                            if any(char in text for char in line_chars):
                                bbox = span.get("bbox", [0, 0, 0, 0])
                                x_pos = round((bbox[0] + bbox[2]) / 2)
                                x_groups.setdefault(x_pos, []).append(text)

            # Check for x-positions with multiple line characters
            for x_pos, texts in x_groups.items():
                line_char_count = sum(sum(text.count(char) for char in line_chars) for text in texts)
                if line_char_count >= 3:  # At least 3 line characters in this column
                    vertical_lines.append(float(x_pos))

        except Exception as e:
            if self.debug_mode:
                print(f"   ⚠️ Text-based line detection error: {e}")

        return vertical_lines

    @staticmethod
    def _fallback_analysis(all_lines: List[float], page_analyses: List[Dict]) -> Dict:
        """Build the "nothing usable detected" result: split at 50%, low confidence.

        Carries every key of a successful analysis so that callers can read
        ``line_ratios``, ``clusters`` and ``detection_summary`` without
        special-casing the fallback.
        """
        return {
            'optimal_split': 0.5,
            'confidence': 0.1,
            'detected_lines': [],
            'line_ratios': [],
            'clusters': [],
            'best_cluster': [],
            'method': 'fallback',
            'line_consistency': 0.0,
            'frequency': 0,
            'pages_analyzed': len(page_analyses),
            'detection_summary': {
                'total_lines_found': len(all_lines),
                'valid_ratios': 0,
                'clusters_found': 0,
                'best_cluster_size': 0
            }
        }

    def _analyze_detected_lines(self, all_lines: List[float], page_analyses: List[Dict]) -> Dict:
        """Analyze all detected lines to find the best split position.

        Args:
            all_lines: x-positions pooled from every sampled page.
            page_analyses: Per-page dicts as built by ``detect_vertical_lines``
                (``page_width`` and ``methods`` are read here).

        Returns:
            Dict with ``optimal_split`` (fraction of page width),
            ``confidence`` (0..1), ``detected_lines``, ``line_ratios``,
            ``clusters``, ``best_cluster``, ``method``, ``line_consistency``,
            ``frequency``, ``pages_analyzed`` and ``detection_summary``.
            When no usable line exists, ``method`` is ``'fallback'`` and the
            split defaults to 0.5.
        """

        # Without lines or page data there is nothing to analyse (and the
        # per-page confidence terms below would divide by zero).
        if not all_lines or not page_analyses:
            return self._fallback_analysis(all_lines, page_analyses)

        # Get page width (use from first page)
        page_width = page_analyses[0]['page_width']
        if page_width <= 0:
            return self._fallback_analysis(all_lines, page_analyses)

        # Convert to ratios, ignoring anything in the outer 10% margins
        line_ratios = [line / page_width for line in all_lines if 0.1 <= line / page_width <= 0.9]

        if not line_ratios:
            return self._fallback_analysis(all_lines, page_analyses)

        # Cluster similar line positions (within 5% of any existing member)
        clusters = []
        line_ratios.sort()

        for ratio in line_ratios:
            # Find if this ratio belongs to an existing cluster
            added_to_cluster = False
            for cluster in clusters:
                if any(abs(ratio - existing_ratio) <= 0.05 for existing_ratio in cluster):
                    cluster.append(ratio)
                    added_to_cluster = True
                    break

            if not added_to_cluster:
                clusters.append([ratio])

        # Find the most consistent cluster (most occurrences)
        best_cluster = max(clusters, key=len)
        optimal_split = sum(best_cluster) / len(best_cluster)  # Average of cluster

        # Confidence is the mean of three terms:
        # 1. How many times this line appears, relative to pages sampled
        # 2. Share of sampled pages on which any detector reported something
        # 3. How tight the winning cluster is

        line_frequency = len(best_cluster)
        total_pages = len(page_analyses)
        frequency_confidence = min(1.0, line_frequency / total_pages)

        # Count pages where at least one detection method found a line
        method_count = sum(
            1 for page_analysis in page_analyses
            if any(page_analysis['methods'].values())
        )

        method_confidence = min(1.0, method_count / total_pages)

        # Calculate how tight the cluster is (consistency)
        if len(best_cluster) > 1:
            cluster_tightness = 1 - (max(best_cluster) - min(best_cluster))
        else:
            cluster_tightness = 1.0

        final_confidence = (frequency_confidence + method_confidence + cluster_tightness) / 3

        return {
            'optimal_split': optimal_split,
            'confidence': final_confidence,
            'detected_lines': all_lines,
            'line_ratios': line_ratios,
            'clusters': clusters,
            'best_cluster': best_cluster,
            'method': 'vertical_line_detection',
            'line_consistency': cluster_tightness,
            'frequency': line_frequency,
            'pages_analyzed': total_pages,
            'detection_summary': {
                'total_lines_found': len(all_lines),
                'valid_ratios': len(line_ratios),
                'clusters_found': len(clusters),
                'best_cluster_size': len(best_cluster)
            }
        }

class LineBasedPDFSplitter:
    """PDF Splitter using vertical line detection.

    Every input page is cut at a single x-ratio chosen by
    ``VerticalLineDetector`` and written out as two pages: left part first,
    then right part.
    """

    def __init__(self):
        self.detector = VerticalLineDetector()
        print("📏 Line-Based PDF Splitter initialized!")

    def split_using_detected_lines(self, input_path: str, output_path: str) -> bool:
        """Split PDF using detected vertical lines.

        Args:
            input_path: PDF to read.
            output_path: Where the split PDF is saved.

        Returns:
            True when the output was saved; False when detection, opening or
            saving failed (the error is printed, not raised). Pages that fail
            individually are reported and skipped.
        """

        try:
            # Step 1: Detect vertical lines
            print("📏 Step 1: Detecting vertical lines...")
            line_analysis = self.detector.detect_vertical_lines(input_path)

            optimal_ratio = line_analysis['optimal_split']
            confidence = line_analysis['confidence']
            # The detector's fallback result may omit the diagnostic keys, so
            # read them defensively instead of failing the whole split.
            summary = line_analysis.get('detection_summary', {})

            print(f"\n📊 Line Detection Results:")
            print(f"   • Optimal split position: {optimal_ratio:.1%}")
            print(f"   • Detection confidence: {confidence:.1%}")
            print(f"   • Line consistency: {line_analysis.get('line_consistency', 0.0):.1%}")
            print(f"   • Total lines found: {summary.get('total_lines_found', 0)}")
            print(f"   • Best cluster size: {summary.get('best_cluster_size', 0)}")

            if confidence < 0.3:
                print("⚠️ Low confidence in line detection. Results may not be optimal.")
            elif confidence > 0.8:
                print("🎯 High confidence! Line detection is very reliable.")

            # Step 2: Apply the split
            print(f"\n✂️ Step 2: Splitting at {optimal_ratio:.1%}...")

            input_doc = fitz.open(input_path)
            output_doc = None

            # One finally for everything after the input is open, so both
            # documents are released even if the split loop itself raises.
            try:
                output_doc = fitz.open()

                page_count = len(input_doc)
                success_count = 0

                with tqdm(total=page_count, desc="📄 Line-Based Splitting") as pbar:
                    for page_num in range(page_count):
                        try:
                            page = input_doc[page_num]
                            rect = page.rect

                            # Apply the detected split position
                            split_pos = rect.width * optimal_ratio
                            right_width = rect.width - split_pos

                            # Left half
                            left_clip = fitz.Rect(0, 0, split_pos, rect.height)
                            left_page = output_doc.new_page(width=split_pos, height=rect.height)
                            left_page.show_pdf_page(fitz.Rect(0, 0, split_pos, rect.height),
                                                    input_doc, page_num, clip=left_clip)

                            # Right half
                            right_clip = fitz.Rect(split_pos, 0, rect.width, rect.height)
                            right_page = output_doc.new_page(width=right_width, height=rect.height)
                            right_page.show_pdf_page(fitz.Rect(0, 0, right_width, rect.height),
                                                     input_doc, page_num, clip=right_clip)

                            success_count += 1

                        except Exception as page_error:
                            print(f"⚠️ Page {page_num + 1} error: {page_error}")

                        # Advance for failed pages too so the bar reaches 100%.
                        pbar.update(1)

                # Step 3: Save
                print("💾 Step 3: Saving line-based split...")

                try:
                    output_doc.save(output_path, garbage=4, deflate=True)

                    # Results
                    output_size = os.path.getsize(output_path) / (1024 * 1024)
                    created_pages = len(output_doc)
                    success_rate = success_count / page_count if page_count else 0.0

                    print(f"\n🎉 Line-Based Split Complete!")
                    print(f"📄 Created: {created_pages} pages from {page_count} original")
                    print(f"📁 Size: {output_size:.2f} MB")
                    print(f"🎯 Success rate: {success_rate:.1%}")
                    print(f"📏 Split position: {optimal_ratio:.1%} (based on detected vertical lines)")

                    return True

                except Exception as save_error:
                    print(f"❌ Save error: {save_error}")
                    return False

            finally:
                input_doc.close()
                if output_doc is not None:
                    output_doc.close()
                gc.collect()

        except Exception as e:
            print(f"❌ Error: {e}")
            return False

def test_line_detection():
    """Test vertical line detection on your PDF.

    Picks the alphabetically first PDF in the current directory, skipping
    files this notebook generates (``SMART_`` / ``LINE_SPLIT_`` prefixes),
    runs ``VerticalLineDetector`` on up to 10 pages and prints a report.

    Returns:
        The detector's analysis dict, or None when no input PDF is present.
    """

    print("📏 VERTICAL LINE DETECTION TEST")
    print("=" * 50)

    # Find PDF. Generated outputs are excluded so a previous split is never
    # analysed as if it were the source; sorting makes the choice repeatable
    # (os.listdir order is arbitrary).
    generated_prefixes = ('SMART_', 'LINE_SPLIT_')
    pdf_files = sorted(
        f for f in os.listdir('.')
        if f.lower().endswith('.pdf') and not f.startswith(generated_prefixes)
    )

    if not pdf_files:
        print("❌ No PDF files found.")
        return

    pdf_file = pdf_files[0]
    print(f"📁 Testing: {pdf_file}")

    # Create detector
    detector = VerticalLineDetector()

    # Detect lines
    analysis = detector.detect_vertical_lines(pdf_file, sample_pages=10)

    # Show detailed results
    print(f"\n📊 DETAILED RESULTS:")
    print("=" * 50)
    print(f"Optimal split position: {analysis['optimal_split']:.1%}")
    print(f"Detection confidence: {analysis['confidence']:.1%}")
    print(f"Line consistency: {analysis['line_consistency']:.1%}")
    print(f"Total lines detected: {len(analysis['detected_lines'])}")

    # A fallback analysis (nothing detected) may not carry these keys.
    line_ratios = analysis.get('line_ratios', [])
    clusters = analysis.get('clusters', [])

    if line_ratios:
        print(f"\nAll detected line positions:")
        for number, ratio in enumerate(line_ratios, start=1):
            print(f"  Line {number}: {ratio:.1%}")

    if clusters:
        print(f"\nLine clusters found:")
        for number, cluster in enumerate(clusters, start=1):
            avg_pos = sum(cluster) / len(cluster)
            print(f"  Cluster {number}: {avg_pos:.1%} (appears {len(cluster)} times)")

    print(f"\n🎯 RECOMMENDATION:")
    print(f"Split at {analysis['optimal_split']:.1%} with {analysis['confidence']:.1%} confidence")

    return analysis

def main_line_based_split():
    """Main function using line detection.

    Splits the alphabetically first PDF in the current directory (ignoring
    files carrying this notebook's ``SMART_`` / ``LINE_SPLIT_`` output
    prefixes) and, on success, offers the result as a Colab download.
    Returns None in every case; progress and errors are printed.
    """

    print("📏 LINE-BASED PDF SPLITTER")
    print("=" * 40)
    print("🎯 Perfect for PDFs with vertical divider lines")
    print("=" * 40)

    # Find PDF. This function writes LINE_SPLIT_*.pdf into the same
    # directory, so those must be skipped or a second run could split its own
    # output; sorting makes the selection repeatable.
    generated_prefixes = ('SMART_', 'LINE_SPLIT_')
    pdf_files = sorted(
        f for f in os.listdir('.')
        if f.lower().endswith('.pdf') and not f.startswith(generated_prefixes)
    )

    if not pdf_files:
        print("❌ No PDF files found.")
        return

    pdf_file = pdf_files[0]

    # Create splitter
    splitter = LineBasedPDFSplitter()

    # Generate output name (time-of-day suffix keeps repeated runs apart)
    timestamp = datetime.now().strftime("%H%M%S")
    output_file = f"LINE_SPLIT_{Path(pdf_file).stem}_{timestamp}.pdf"

    # Split using line detection
    if splitter.split_using_detected_lines(pdf_file, output_file):
        try:
            files.download(output_file)
            print("✅ Download successful!")
        except Exception as e:
            print(f"⚠️ Download failed: {e}")

# Cell banner: confirms the definitions above loaded and lists the entry points.
print("✅ Vertical Line Detector loaded!")
# The old expression tested `'__name__' in globals()`, which is always true,
# so it printed the module name ("__main__") instead of the user.
print("👤 User: Ravi-katta-dev")
# Report the real clock (UTC) rather than a timestamp frozen into the source.
print(f"🕐 Current time: {time.strftime('%Y-%m-%d %H:%M:%S', time.gmtime())} UTC")
print("\n📏 Specialized for PDFs with vertical divider lines!")
print("\n🎯 Available functions:")
print("• test_line_detection() - Analyze your PDF's vertical lines")
print("• main_line_based_split() - Split using detected lines")
print("=" * 50)

✅ All imports successful!
✅ Vertical Line Detector loaded!
👤 User: __main__
🕐 Current time: 2025-08-08 12:13:25 UTC

📏 Specialized for PDFs with vertical divider lines!

🎯 Available functions:
• test_line_detection() - Analyze your PDF's vertical lines
• main_line_based_split() - Split using detected lines


In [None]:
# =============================================================================
# 🎯 COMPLETE INTELLIGENT PDF SPLITTER - Full Featured Solution
# =============================================================================
# Author: Advanced PDF Processing System
# Date: 2025-08-08 12:18:45 UTC
# User: Ravi-katta-dev
# Version: 4.0 - Complete Solution
# =============================================================================

import os
import time
import gc
import zipfile
import re
from pathlib import Path
from typing import Optional, Tuple, List, Dict, Any
from datetime import datetime
import numpy as np

try:
    import fitz  # PyMuPDF
    from tqdm.auto import tqdm
    from google.colab import files
    import ipywidgets as widgets
    from IPython.display import display, HTML, clear_output
    print("✅ All imports successful!")
    print(f"🕐 Session started: 2025-08-08 12:18:45 UTC")
    print(f"👤 User: Ravi-katta-dev")
except ImportError as e:
    print(f"❌ Import error: {e}")
    print("Please install required packages:")
    print("!pip install PyMuPDF tqdm ipywidgets")
    raise

class CompletePDFSplitter:
    """Complete PDF Splitter with all advanced features"""

    def __init__(self):
        self.processed_files = []
        self.debug_mode = True
        self.session_id = datetime.now().strftime("%Y%m%d_%H%M%S")
        print(f"🎯 Complete PDF Splitter v4.0 initialized!")
        print(f"📱 Session ID: {self.session_id}")

    # =============================================================================
    # 📏 VERTICAL LINE DETECTION SYSTEM
    # =============================================================================

    def detect_vertical_lines(self, pdf_path: str, sample_pages: int = 10) -> Dict:
        """Scan the leading pages of a PDF for vertical divider lines.

        Args:
            pdf_path: Path of the PDF to open.
            sample_pages: Upper bound on the number of pages inspected,
                counted from the first page; capped at the document length.

        Returns:
            The result of ``_analyze_all_detected_lines`` applied to the
            pooled line positions and the per-page summaries built here.
        """
        doc = fitz.open(pdf_path)

        try:
            sample_pages = min(sample_pages, len(doc))
            print(f"📏 Analyzing {sample_pages} pages for vertical lines...")

            all_vertical_lines = []
            page_analyses = []

            for index in range(sample_pages):
                page = doc[index]
                display_no = index + 1

                # Run the four detectors; the dict keeps them in call order,
                # which is also the order their hits are concatenated in.
                hits_by_method = {
                    'paths': self._detect_lines_from_paths(page),
                    'vectors': self._detect_lines_from_vectors(page),
                    'text': self._detect_lines_from_text(page),
                    'images': self._detect_lines_from_images(page),
                }
                pooled = [x for hits in hits_by_method.values() for x in hits]

                # Drop out-of-range and near-duplicate positions.
                page_lines = self._filter_and_clean_lines(pooled, page.rect.width)

                page_analyses.append({
                    'page_num': display_no,
                    'lines': page_lines,
                    'page_width': page.rect.width,
                    'page_height': page.rect.height,
                    'methods_count': {
                        method: len(hits) for method, hits in hits_by_method.items()
                    },
                })
                all_vertical_lines.extend(page_lines)

                if not self.debug_mode:
                    continue

                print(f"📄 Page {display_no}: Found {len(page_lines)} vertical lines")
                if page_lines:
                    # Only the first five positions are echoed.
                    shown = [f'{x / page.rect.width:.1%}' for x in page_lines[:5]]
                    print(f"   • Positions: {shown}")

            return self._analyze_all_detected_lines(all_vertical_lines, page_analyses)

        finally:
            doc.close()

    def _detect_lines_from_paths(self, page) -> List[float]:
        """Method 1: Detect lines from PDF drawing paths"""
        vertical_lines = []

        try:
            drawings = page.get_drawings()

            for drawing in drawings:
                items = drawing.get("items", [])

                for item in items:
                    if item[0] == "l":  # Line command
                        if len(item) >= 3:
                            p1, p2 = item[1], item[2]

                            # Check for vertical line
                            if abs(p1.x - p2.x) < 3:  # Tolerance for vertical
                                line_length = abs(p2.y - p1.y)
                                if line_length > page.rect.height * 0.3:
                                    vertical_lines.append(p1.x)

                    elif item[0] == "re":  # Rectangle (thin vertical)
                        if len(item) >= 2:
                            rect = item[1]
                            if rect.width <= 5 and rect.height > page.rect.height * 0.2:
                                vertical_lines.append(rect.x0 + rect.width / 2)

                    elif item[0] == "c":  # Curve (might be decorative line)
                        if len(item) >= 4:
                            p1, p2, p3, p4 = item[1], item[2], item[3], item[4]
                            # Check if curve is essentially vertical
                            x_variance = max(p1.x, p2.x, p3.x, p4.x) - min(p1.x, p2.x, p3.x, p4.x)
                            if x_variance < 5:
                                avg_x = (p1.x + p2.x + p3.x + p4.x) / 4
                                vertical_lines.append(avg_x)

        except Exception as e:
            if self.debug_mode:
                print(f"   ⚠️ Path detection error: {e}")

        return vertical_lines

    def _detect_lines_from_vectors(self, page) -> List[float]:
        """Method 2: Detect lines from SVG vector graphics"""
        vertical_lines = []

        try:
            svg_text = page.get_svg_image()

            # Enhanced SVG pattern matching
            patterns = [
                r'<line[^>]*x1="([^"]*)"[^>]*y1="([^"]*)"[^>]*x2="([^"]*)"[^>]*y2="([^"]*)"[^>]*/?>',
                r'<rect[^>]*x="([^"]*)"[^>]*y="([^"]*)"[^>]*width="([^"]*)"[^>]*height="([^"]*)"[^>]*/?>',
                r'<path[^>]*d="M\s*([^,\s]+)[,\s]+([^,\s]+)\s*L\s*([^,\s]+)[,\s]+([^,\s]+)"[^>]*/?>',
            ]

            for pattern in patterns:
                matches = re.findall(pattern, svg_text)

                for match in matches:
                    try:
                        if len(match) == 4:
                            if 'line' in pattern or 'path' in pattern:
                                x1, y1, x2, y2 = map(float, match)
                                if abs(x1 - x2) < 3:  # Vertical line
                                    line_length = abs(y2 - y1)
                                    if line_length > page.rect.height * 0.2:
                                        vertical_lines.append(x1)

                            elif 'rect' in pattern:
                                x, y, width, height = map(float, match)
                                if width <= 5 and height > page.rect.height * 0.2:
                                    vertical_lines.append(x + width / 2)

                    except (ValueError, IndexError):
                        continue

        except Exception as e:
            if self.debug_mode:
                print(f"   ⚠️ Vector detection error: {e}")

        return vertical_lines

    def _detect_lines_from_text(self, page) -> List[float]:
        """Method 3: Detect lines from text characters"""
        vertical_lines = []

        try:
            text_dict = page.get_text("dict")

            # Extended line characters
            line_chars = ['|', '┃', '┋', '┇', '┆', '│', '║', 'ǀ', '⏐', 'ǁ', '︱', '丨', '｜']

            # Collect all text spans with line characters
            line_positions = {}

            for block in text_dict.get("blocks", []):
                if "lines" in block:
                    for line in block["lines"]:
                        for span in line.get("spans", []):
                            text = span.get("text", "")
                            bbox = span.get("bbox", [0, 0, 0, 0])

                            for char in line_chars:
                                if char in text:
                                    char_x = round((bbox[0] + bbox[2]) / 2)

                                    if char_x not in line_positions:
                                        line_positions[char_x] = 0
                                    line_positions[char_x] += text.count(char)

            # Filter positions with enough line characters
            for x_pos, count in line_positions.items():
                if count >= 3:  # At least 3 line characters
                    vertical_lines.append(float(x_pos))

            # Also detect repeated patterns that might indicate borders
            full_text = page.get_text()

            # Look for repeated sequences that might be borders
            border_patterns = ['|', '-|', '|-', '||', '│', '──']

            for pattern in border_patterns:
                if full_text.count(pattern) > 5:  # Pattern appears multiple times
                    # This suggests there might be a structured layout
                    # Add common positions for such patterns
                    page_width = page.rect.width
                    common_ratios = [0.25, 0.33, 0.4, 0.5, 0.6, 0.67, 0.75]

                    for ratio in common_ratios:
                        if any(abs(line/page_width - ratio) < 0.05 for line in vertical_lines):
                            # Only add if we already detected something near this ratio
                            vertical_lines.append(page_width * ratio)

        except Exception as e:
            if self.debug_mode:
                print(f"   ⚠️ Text line detection error: {e}")

        return vertical_lines

    def _detect_lines_from_images(self, page) -> List[float]:
        """Method 4: Detect lines from embedded images"""
        vertical_lines = []

        try:
            # Get images on the page
            image_list = page.get_images()

            for img in image_list:
                try:
                    bbox = page.get_image_bbox(img)
                    if bbox:
                        # Check if image is thin and vertical (might be a line)
                        if bbox.width <= 10 and bbox.height > page.rect.height * 0.2:
                            vertical_lines.append(bbox.x0 + bbox.width / 2)

                        # Check for images that might contain line graphics
                        if bbox.width > 20 and bbox.height > page.rect.height * 0.5:
                            # This might be a large image with embedded lines
                            # Add common split positions within the image
                            img_center = bbox.x0 + bbox.width / 2

                            # Check if image spans a significant portion of the page
                            if bbox.width > page.rect.width * 0.3:
                                vertical_lines.append(img_center)

                except Exception:
                    continue

        except Exception as e:
            if self.debug_mode:
                print(f"   ⚠️ Image line detection error: {e}")

        return vertical_lines

    def _filter_and_clean_lines(self, lines: List[float], page_width: float) -> List[float]:
        """Filter and clean detected lines"""
        if not lines:
            return []

        # Remove duplicates with tolerance
        cleaned_lines = []
        lines.sort()

        for line in lines:
            # Check if this line is valid (within page bounds and reasonable position)
            if 0.1 * page_width <= line <= 0.9 * page_width:
                # Check if it's too close to an existing line
                if not any(abs(line - existing) < page_width * 0.02 for existing in cleaned_lines):
                    cleaned_lines.append(line)

        return cleaned_lines

    def _analyze_all_detected_lines(self, all_lines: List[float], page_analyses: List[Dict]) -> Dict:
        """Analyze all detected lines to find optimal split"""

        if not all_lines or not page_analyses:
            return self._get_fallback_analysis()

        page_width = page_analyses[0]['page_width']

        # Convert to ratios and filter
        line_ratios = []
        for line in all_lines:
            ratio = line / page_width
            if 0.15 <= ratio <= 0.85:  # Reasonable split range
                line_ratios.append(ratio)

        if not line_ratios:
            return self._get_fallback_analysis()

        # Advanced clustering with multiple methods
        clusters = self._perform_advanced_clustering(line_ratios)

        if not clusters:
            return self._get_fallback_analysis()

        # Select best cluster
        best_cluster = self._select_best_cluster(clusters, page_analyses)

        # Calculate final metrics
        optimal_split = sum(best_cluster['ratios']) / len(best_cluster['ratios'])
        confidence = self._calculate_confidence(best_cluster, clusters, page_analyses)
        consistency = self._calculate_consistency(best_cluster)

        return {
            'optimal_split': optimal_split,
            'confidence': confidence,
            'line_consistency': consistency,
            'detected_lines': all_lines,
            'line_ratios': line_ratios,
            'clusters': clusters,
            'best_cluster': best_cluster,
            'method': 'advanced_line_detection',
            'pages_analyzed': len(page_analyses),
            'detection_summary': {
                'total_lines_found': len(all_lines),
                'valid_ratios': len(line_ratios),
                'clusters_found': len(clusters),
                'best_cluster_size': len(best_cluster['ratios'])
            },
            'page_details': page_analyses
        }

    def _perform_advanced_clustering(self, ratios: List[float]) -> List[Dict]:
        """Perform advanced clustering of line positions"""
        if not ratios:
            return []

        ratios_sorted = sorted(ratios)
        clusters = []

        # Dynamic tolerance based on data distribution
        tolerance = max(0.02, (max(ratios) - min(ratios)) / 10)

        for ratio in ratios_sorted:
            added = False

            for cluster in clusters:
                cluster_center = sum(cluster['ratios']) / len(cluster['ratios'])
                if abs(ratio - cluster_center) <= tolerance:
                    cluster['ratios'].append(ratio)
                    added = True
                    break

            if not added:
                clusters.append({
                    'ratios': [ratio],
                    'center': ratio,
                    'weight': 1
                })

        # Update cluster centers and weights
        for cluster in clusters:
            cluster['center'] = sum(cluster['ratios']) / len(cluster['ratios'])
            cluster['weight'] = len(cluster['ratios'])
            cluster['spread'] = max(cluster['ratios']) - min(cluster['ratios']) if len(cluster['ratios']) > 1 else 0

        return clusters

    def _select_best_cluster(self, clusters: List[Dict], page_analyses: List[Dict]) -> Dict:
        """Select the best cluster based on multiple criteria"""

        def score_cluster(cluster):
            # Weight (frequency)
            weight_score = cluster['weight'] / len(page_analyses)

            # Consistency (low spread)
            spread_score = 1 - min(1, cluster['spread'] / 0.1)

            # Position preference (avoid extreme edges)
            center = cluster['center']
            position_score = 1 - abs(center - 0.5) * 2  # Prefer center positions

            # Combined score
            return (weight_score * 0.5 + spread_score * 0.3 + position_score * 0.2)

        return max(clusters, key=score_cluster)

    def _calculate_confidence(self, best_cluster: Dict, all_clusters: List[Dict], page_analyses: List[Dict]) -> float:
        """Calculate confidence in the detection"""

        # Frequency confidence
        frequency_conf = min(1.0, best_cluster['weight'] / len(page_analyses))

        # Consistency confidence
        consistency_conf = 1 - min(1, best_cluster['spread'] / 0.05)

        # Method diversity confidence
        total_methods = sum(
            sum(page['methods_count'].values())
            for page in page_analyses
        )
        method_conf = min(1.0, total_methods / (len(page_analyses) * 2))

        # Dominance confidence (how much better is this cluster than others)
        if len(all_clusters) > 1:
            other_weights = [c['weight'] for c in all_clusters if c != best_cluster]
            dominance_conf = best_cluster['weight'] / (max(other_weights) + 1) if other_weights else 1.0
            dominance_conf = min(1.0, dominance_conf)
        else:
            dominance_conf = 1.0

        # Combined confidence
        confidence = (frequency_conf * 0.3 + consistency_conf * 0.3 +
                     method_conf * 0.2 + dominance_conf * 0.2)

        return confidence

    def _calculate_consistency(self, cluster: Dict) -> float:
        """Calculate consistency of the cluster"""
        if len(cluster['ratios']) <= 1:
            return 1.0

        return 1 - min(1, cluster['spread'] / 0.1)

    def _get_fallback_analysis(self) -> Dict:
        """Fallback analysis when line detection fails"""
        return {
            'optimal_split': 0.5,
            'confidence': 0.1,
            'line_consistency': 0.0,
            'detected_lines': [],
            'method': 'fallback',
            'detection_summary': {
                'total_lines_found': 0,
                'valid_ratios': 0,
                'clusters_found': 0,
                'best_cluster_size': 0
            }
        }

    # =============================================================================
    # 🧠 CONTENT ANALYSIS SYSTEM
    # =============================================================================

    def analyze_content_layout(self, pdf_path: str, sample_pages: int = 5) -> Dict:
        """Run every per-page content analyser on the first pages of a PDF.

        Args:
            pdf_path: Path of the PDF to open.
            sample_pages: Upper bound on the number of leading pages analysed.

        Returns:
            The combined result produced by `_combine_content_analyses`.
        """
        doc = fitz.open(pdf_path)

        try:
            sample_pages = min(sample_pages, len(doc))
            print(f"🧠 Analyzing content layout from {sample_pages} pages...")

            # Result bucket -> analyser, listed in the order they run per page.
            analyzers = (
                ('text_density_map', self._analyze_text_density),
                ('layout_patterns', self._analyze_layout_patterns),
                ('font_analysis', self._analyze_fonts_and_formatting),
                ('image_positions', self._analyze_images_and_graphics),
                ('whitespace_analysis', self._analyze_whitespace_distribution),
            )
            content_analysis = {bucket: [] for bucket, _ in analyzers}

            for page_num in range(sample_pages):
                page = doc[page_num]
                for bucket, analyze in analyzers:
                    content_analysis[bucket].append(analyze(page))

            return self._combine_content_analyses(content_analysis)

        finally:
            # Always release the document, even when an analyser fails.
            doc.close()

    def _analyze_text_density(self, page) -> Dict:
        """Analyze text density across the page"""
        rect = page.rect

        # Create density grid
        grid_cols, grid_rows = 20, 20
        density_grid = [[0 for _ in range(grid_cols)] for _ in range(grid_rows)]

        # Get text blocks
        text_dict = page.get_text("dict")

        for block in text_dict.get("blocks", []):
            if "lines" in block:
                bbox = block["bbox"]
                text_length = 0

                for line in block["lines"]:
                    for span in line.get("spans", []):
                        text_length += len(span.get("text", "").strip())

                if text_length > 0:
                    # Map to grid
                    col = min(int((bbox[0] + bbox[2]) / 2 / rect.width * grid_cols), grid_cols - 1)
                    row = min(int((bbox[1] + bbox[3]) / 2 / rect.height * grid_rows), grid_rows - 1)

                    density_grid[row][col] += text_length

        # Analyze vertical distribution
        vertical_density = [sum(density_grid[row][col] for row in range(grid_rows)) for col in range(grid_cols)]

        # Find optimal vertical split
        min_density_col = 0
        min_density = float('inf')

        for col in range(int(grid_cols * 0.2), int(grid_cols * 0.8)):
            if vertical_density[col] < min_density:
                min_density = vertical_density[col]
                min_density_col = col

        optimal_split = (min_density_col + 0.5) / grid_cols

        return {
            'density_grid': density_grid,
            'vertical_density': vertical_density,
            'optimal_split': optimal_split,
            'total_text': sum(vertical_density)
        }

    def _analyze_layout_patterns(self, page) -> Dict:
        """Analyze layout patterns and structure"""
        text_dict = page.get_text("dict")

        patterns = {
            'columns': 0,
            'alignment_patterns': {'left': 0, 'center': 0, 'right': 0},
            'spacing_patterns': [],
            'block_sizes': []
        }

        block_positions = []

        for block in text_dict.get("blocks", []):
            if "lines" in block:
                bbox = block["bbox"]
                block_width = bbox[2] - bbox[0]
                block_height = bbox[3] - bbox[1]

                patterns['block_sizes'].append({
                    'width': block_width,
                    'height': block_height,
                    'x': bbox[0],
                    'y': bbox[1]
                })

                block_positions.append(bbox[0])  # Left edge

                # Analyze alignment
                page_width = page.rect.width
                if bbox[0] < page_width * 0.1:
                    patterns['alignment_patterns']['left'] += 1
                elif bbox[2] > page_width * 0.9:
                    patterns['alignment_patterns']['right'] += 1
                else:
                    patterns['alignment_patterns']['center'] += 1

        # Detect column structure
        if block_positions:
            block_positions.sort()
            gaps = []

            for i in range(1, len(block_positions)):
                gap = block_positions[i] - block_positions[i-1]
                if gap > 20:  # Significant gap
                    gaps.append(gap)

            patterns['spacing_patterns'] = gaps
            patterns['columns'] = len(set(round(pos / 50) * 50 for pos in block_positions))

        return patterns

    def _analyze_fonts_and_formatting(self, page) -> Dict:
        """Analyze fonts and text formatting"""
        text_dict = page.get_text("dict")

        font_analysis = {
            'font_sizes': {},
            'font_families': {},
            'text_styles': {'bold': 0, 'italic': 0, 'normal': 0},
            'color_distribution': {}
        }

        for block in text_dict.get("blocks", []):
            if "lines" in block:
                for line in block["lines"]:
                    for span in line.get("spans", []):
                        # Font size
                        size = span.get("size", 12)
                        size_key = f"{size:.1f}"
                        font_analysis['font_sizes'][size_key] = font_analysis['font_sizes'].get(size_key, 0) + 1

                        # Font family
                        font = span.get("font", "unknown")
                        font_analysis['font_families'][font] = font_analysis['font_families'].get(font, 0) + 1

                        # Text style
                        flags = span.get("flags", 0)
                        if flags & 2**4:  # Bold
                            font_analysis['text_styles']['bold'] += 1
                        elif flags & 2**1:  # Italic
                            font_analysis['text_styles']['italic'] += 1
                        else:
                            font_analysis['text_styles']['normal'] += 1

        return font_analysis

    def _analyze_images_and_graphics(self, page) -> Dict:
        """Analyze images and graphics distribution"""
        images = page.get_images()
        drawings = page.get_drawings()

        image_analysis = {
            'image_count': len(images),
            'drawing_count': len(drawings),
            'image_positions': [],
            'drawing_positions': []
        }

        # Analyze image positions
        for img in images:
            try:
                bbox = page.get_image_bbox(img)
                if bbox:
                    image_analysis['image_positions'].append({
                        'x': bbox.x0,
                        'y': bbox.y0,
                        'width': bbox.width,
                        'height': bbox.height,
                        'center_x': bbox.x0 + bbox.width / 2
                    })
            except:
                continue

        # Analyze drawing positions
        for drawing in drawings:
            try:
                rect = drawing.get("rect")
                if rect:
                    image_analysis['drawing_positions'].append({
                        'x': rect.x0,
                        'y': rect.y0,
                        'width': rect.width,
                        'height': rect.height,
                        'center_x': rect.x0 + rect.width / 2
                    })
            except:
                continue

        return image_analysis

    def _analyze_whitespace_distribution(self, page) -> Dict:
        """Analyze whitespace distribution"""
        text_dict = page.get_text("dict")
        page_width = page.rect.width
        page_height = page.rect.height

        # Create occupancy map
        occupied_areas = []

        for block in text_dict.get("blocks", []):
            if "lines" in block:
                bbox = block["bbox"]
                occupied_areas.append(bbox)

        # Add images
        images = page.get_images()
        for img in images:
            try:
                bbox = page.get_image_bbox(img)
                if bbox:
                    occupied_areas.append([bbox.x0, bbox.y0, bbox.x1, bbox.y1])
            except:
                continue

        # Find largest vertical gaps
        vertical_gaps = []

        if occupied_areas:
            # Sort by x-coordinate
            x_positions = []
            for area in occupied_areas:
                x_positions.extend([area[0], area[2]])

            x_positions.sort()

            for i in range(1, len(x_positions)):
                gap = x_positions[i] - x_positions[i-1]
                if gap > page_width * 0.02:  # Significant gap
                    gap_center = (x_positions[i] + x_positions[i-1]) / 2
                    vertical_gaps.append({
                        'position': gap_center,
                        'size': gap,
                        'ratio': gap_center / page_width
                    })

        return {
            'vertical_gaps': vertical_gaps,
            'largest_gap': max(vertical_gaps, key=lambda g: g['size']) if vertical_gaps else None
        }

    def _combine_content_analyses(self, content_analysis: Dict) -> Dict:
        """Combine all content analyses"""

        # Average the results across pages
        combined = {
            'recommended_split': 0.5,
            'confidence': 0.1,
            'method': 'content_analysis',
            'evidence': []
        }

        split_suggestions = []
        confidences = []

        # From text density
        for density in content_analysis['text_density_map']:
            split_suggestions.append(density['optimal_split'])
            confidences.append(0.3)  # Base confidence for density analysis

        # From whitespace analysis
        for whitespace in content_analysis['whitespace_analysis']:
            if whitespace['largest_gap']:
                gap = whitespace['largest_gap']
                if 0.2 <= gap['ratio'] <= 0.8:  # Reasonable range
                    split_suggestions.append(gap['ratio'])
                    confidences.append(0.7)  # Higher confidence for clear gaps

        # Calculate weighted average
        if split_suggestions and confidences:
            total_weight = sum(confidences)
            weighted_split = sum(s * c for s, c in zip(split_suggestions, confidences)) / total_weight
            avg_confidence = sum(confidences) / len(confidences)

            combined['recommended_split'] = weighted_split
            combined['confidence'] = avg_confidence

        return combined

    # =============================================================================
    # 🎯 MULTI-METHOD DETECTION SYSTEM
    # =============================================================================

    def detect_optimal_split_multi_method(self, pdf_path: str) -> Dict:
        """Run all four detection methods on a PDF and merge their results.

        The methods run in a fixed order (line detection, content layout,
        visual patterns, document structure), each announced on stdout, and
        their results are handed to `_combine_all_methods` in that order.

        Args:
            pdf_path: Path of the PDF to analyse.

        Returns:
            The combined analysis dict.
        """
        print("🎯 Running comprehensive multi-method analysis...")

        # (progress message, analysis routine) in execution order.
        stages = (
            ("📏 Method 1: Vertical line detection...", self.detect_vertical_lines),
            ("🧠 Method 2: Content layout analysis...", self.analyze_content_layout),
            ("👁️ Method 3: Visual pattern recognition...", self._analyze_visual_patterns),
            ("📋 Method 4: Document structure analysis...", self._analyze_document_structure),
        )

        results = []
        for message, run_analysis in stages:
            print(message)
            results.append(run_analysis(pdf_path))

        print("🔄 Combining all analysis methods...")
        return self._combine_all_methods(*results)

    def _analyze_visual_patterns(self, pdf_path: str) -> Dict:
        """Suggest split ratios from empty vertical bands between content blocks.

        On each of the first (up to three) pages, the horizontal extents of
        all blocks are merged into occupied intervals. Every free band between
        two neighbouring intervals that is wider than 5% of the page and
        centred within 20%-80% of the page width yields one suggestion.
        Measuring between merged intervals keeps the inside of a block from
        being mistaken for a gap.

        Args:
            pdf_path: Path of the PDF to open.

        Returns:
            A dict with 'optimal_split' (mean of all suggestions, 0.5 if there
            are none), 'confidence' (suggestion count / 3 capped at 1.0, or
            0.1 without suggestions), 'method' and the raw 'suggestions'.
        """
        doc = fitz.open(pdf_path)

        try:
            sample_pages = min(3, len(doc))
            all_suggestions = []

            for page_num in range(sample_pages):
                page = doc[page_num]
                page_width = page.rect.width

                # Horizontal extent (x0, x1) of every block, left to right.
                extents = sorted(
                    (block["bbox"][0], block["bbox"][2])
                    for block in page.get_text("dict").get("blocks", [])
                    if "bbox" in block
                )
                if not extents:
                    continue

                covered_until = extents[0][1]
                for start, end in extents[1:]:
                    gap = start - covered_until
                    if gap > page_width * 0.05:  # Significant gap
                        gap_ratio = (start + covered_until) / 2 / page_width
                        if 0.2 <= gap_ratio <= 0.8:
                            all_suggestions.append(gap_ratio)
                    # Overlapping extents merge into one occupied range.
                    covered_until = max(covered_until, end)

            if all_suggestions:
                optimal_split = sum(all_suggestions) / len(all_suggestions)
                # More suggestions = higher confidence
                confidence = min(1.0, len(all_suggestions) / 3)
            else:
                optimal_split = 0.5
                confidence = 0.1

            return {
                'optimal_split': optimal_split,
                'confidence': confidence,
                'method': 'visual_pattern_analysis',
                'suggestions': all_suggestions
            }

        finally:
            doc.close()

    def _analyze_document_structure(self, pdf_path: str) -> Dict:
        """Guess a split ratio from keywords found in the first pages' text.

        The text of up to five leading pages is searched, case-insensitively
        and by plain substring, for three keyword groups. Each hit adds a hint
        named '<group>_<keyword>'.

        * exam hints plus social-media hints: split 0.6, confidence 0.8
        * exam hints only: split 0.5, confidence 0.4
        * no exam hints: split 0.5, confidence 0.2

        Args:
            pdf_path: Path of the PDF to open.

        Returns:
            A dict with 'optimal_split', 'confidence', 'method',
            'structure_hints' and 'analysis' (page count, sampled page
            dimensions, an empty 'text_statistics' dict and the same hints).
        """
        doc = fitz.open(pdf_path)

        try:
            structure_analysis = {
                'page_count': len(doc),
                'page_dimensions': [],
                'text_statistics': {},
                'structure_hints': []
            }

            # Collect page geometry and text of the leading pages.
            page_texts = []
            for page_num in range(min(5, len(doc))):
                page = doc[page_num]
                bounds = page.rect

                structure_analysis['page_dimensions'].append({
                    'width': bounds.width,
                    'height': bounds.height,
                    'ratio': bounds.width / bounds.height
                })
                page_texts.append(page.get_text())

            haystack = "".join(page_texts).lower()

            # Hint prefix -> keywords, in the order the hints are emitted.
            keyword_groups = (
                ('exam_paper', ['question', 'answer', 'select', 'choose', 'option', 'practice', 'test', 'exam']),
                ('social_media', ['telegram', 'join', 'click here', 'open', 'subscribe']),
                ('column_layout', ['column', 'left', 'right', 'side']),
            )
            structure_hints = [
                f"{prefix}_{keyword}"
                for prefix, keywords in keyword_groups
                for keyword in keywords
                if keyword.lower() in haystack
            ]
            structure_analysis['structure_hints'] = structure_hints

            looks_like_exam = any('exam_paper' in hint for hint in structure_hints)
            mentions_social = any('social_media' in hint for hint in structure_hints)

            if looks_like_exam and mentions_social:
                # Questions on the left (60%), social media panel on the right (40%).
                suggested_split, confidence = 0.6, 0.8
            elif looks_like_exam:
                # Pure exam paper: no evidence for an asymmetric split.
                suggested_split, confidence = 0.5, 0.4
            else:
                # Unknown structure
                suggested_split, confidence = 0.5, 0.2

            return {
                'optimal_split': suggested_split,
                'confidence': confidence,
                'method': 'document_structure_analysis',
                'structure_hints': structure_hints,
                'analysis': structure_analysis
            }

        finally:
            doc.close()

    def _combine_all_methods(self, line_analysis: Dict, content_analysis: Dict,
                           visual_analysis: Dict, structure_analysis: Dict) -> Dict:
        """Combine results from all analysis methods"""

        methods = [
            ('line_detection', line_analysis),
            ('content_analysis', content_analysis),
            ('visual_analysis', visual_analysis),
            ('structure_analysis', structure_analysis)
        ]

        # Weight methods by their confidence
        weighted_splits = []
        weighted_confidences = []
        method_details = {}

        for method_name, analysis in methods:
            split = analysis.get('optimal_split', 0.5)
            confidence = analysis.get('confidence', 0.1)

            weighted_splits.append(split * confidence)
            weighted_confidences.append(confidence)

            method_details[method_name] = {
                'split': split,
                'confidence': confidence,
                'details': analysis
            }

        # Calculate final weighted average
        total_weight = sum(weighted_confidences)

        if total_weight > 0:
            final_split = sum(weighted_splits) / total_weight
            final_confidence = sum(weighted_confidences) / len(weighted_confidences)
        else:
            final_split = 0.5
            final_confidence = 0.1

        # Adjust confidence based on method agreement
        splits_only = [analysis.get('optimal_split', 0.5) for _, analysis in methods]
        split_variance = np.var(splits_only) if len(splits_only) > 1 else 0
        agreement_factor = max(0.1, 1 - split_variance * 5)  # Higher variance = lower agreement

        final_confidence *= agreement_factor

        return {
            'optimal_split': final_split,
            'confidence': final_confidence,
            'agreement_factor': agreement_factor,
            'method': 'multi_method_combined',
            'individual_methods': method_details,
            'summary': {
                'line_detection_confidence': line_analysis.get('confidence', 0),
                'content_analysis_confidence': content_analysis.get('confidence', 0),
                'visual_analysis_confidence': visual_analysis.get('confidence', 0),
                'structure_analysis_confidence': structure_analysis.get('confidence', 0),
                'methods_agreement': agreement_factor,
                'recommended_split': final_split
            }
        }

    # =============================================================================
    # ✂️ PDF SPLITTING ENGINE
    # =============================================================================

    def split_pdf_with_analysis(self, input_path: str, output_path: str,
                               method: str = "multi_method") -> bool:
        """Split PDF using comprehensive analysis"""

        input_doc = None
        output_doc = None

        try:
            # Step 1: Comprehensive Analysis
            print("🔍 Step 1: Comprehensive PDF Analysis")
            print("=" * 50)

            if method == "multi_method":
                analysis = self.detect_optimal_split_multi_method(input_path)
            elif method == "line_detection":
                analysis = self.detect_vertical_lines(input_path)
            elif method == "content_analysis":
                analysis = self.analyze_content_layout(input_path)
            else:
                # Fallback to multi-method
                analysis = self.detect_optimal_split_multi_method(input_path)

            optimal_ratio = analysis.get('optimal_split', 0.5)
            confidence = analysis.get('confidence', 0.1)

            print(f"\n📊 Analysis Results:")
            print(f"   • Detection method: {analysis.get('method', 'unknown')}")
            print(f"   • Optimal split ratio: {optimal_ratio:.1%}")
            print(f"   • Detection confidence: {confidence:.1%}")

            if 'summary' in analysis:
                summary = analysis['summary']
                print(f"   • Line detection confidence: {summary.get('line_detection_confidence', 0):.1%}")
                print(f"   • Content analysis confidence: {summary.get('content_analysis_confidence', 0):.1%}")
                print(f"   • Methods agreement: {summary.get('methods_agreement', 0):.1%}")

            # Validate the detected ratio
            if optimal_ratio < 0.15 or optimal_ratio > 0.85:
                print(f"⚠️ Warning: Detected ratio {optimal_ratio:.1%} seems extreme. Using safer ratio.")
                optimal_ratio = max(0.2, min(0.8, optimal_ratio))

            if confidence < 0.3:
                print(f"⚠️ Warning: Low confidence ({confidence:.1%}). Results may not be optimal.")

            # Step 2: Apply Split
            print(f"\n✂️ Step 2: Applying Split at {optimal_ratio:.1%}")
            print("=" * 50)

            input_doc = fitz.open(input_path)
            output_doc = fitz.open()

            page_count = len(input_doc)
            success_count = 0

            start_time = time.time()

            with tqdm(total=page_count, desc="📄 Processing", unit="page") as pbar:
                for page_num in range(page_count):
                    try:
                        page = input_doc[page_num]
                        rect = page.rect

                        # Calculate split position
                        split_pos = rect.width * optimal_ratio

                        # Create left half
                        left_clip = fitz.Rect(0, 0, split_pos, rect.height)
                        left_page = output_doc.new_page(width=split_pos, height=rect.height)
                        left_page.show_pdf_page(fitz.Rect(0, 0, split_pos, rect.height),
                                               input_doc, page_num, clip=left_clip)

                        # Create right half
                        right_clip = fitz.Rect(split_pos, 0, rect.width, rect.height)
                        right_page = output_doc.new_page(width=rect.width - split_pos, height=rect.height)
                        right_page.show_pdf_page(fitz.Rect(0, 0, rect.width - split_pos, rect.height),
                                                input_doc, page_num, clip=right_clip)

                        success_count += 1
                        pbar.set_postfix({
                            'Split': f"{optimal_ratio:.1%}",
                            'Success': f"{success_count}/{page_count}"
                        })
                        pbar.update(1)

                        # Memory management
                        if (page_num + 1) % 10 == 0:
                            gc.collect()

                    except Exception as page_error:
                        print(f"⚠️ Page {page_num + 1} error: {page_error}")
                        continue

            # Step 3: Save Results
            print("\n💾 Step 3: Saving Results")
            print("=" * 50)

            if len(output_doc) == 0:
                print("❌ No pages were processed successfully")
                return False

            try:
                # Save with optimization
                output_doc.save(output_path, garbage=4, deflate=True)

                # Calculate performance metrics
                elapsed_time = time.time() - start_time
                output_size = os.path.getsize(output_path) / (1024 * 1024)
                created_pages = len(output_doc)

                print(f"✅ Processing Complete!")
                print(f"\n📊 Results Summary:")
                print(f"   • Input file: {os.path.basename(input_path)}")
                print(f"   • Output file: {os.path.basename(output_path)}")
                print(f"   • Pages processed: {success_count}/{page_count}")
                print(f"   • Pages created: {created_pages}")
                print(f"   • Output size: {output_size:.2f} MB")
                print(f"   • Processing time: {elapsed_time:.2f} seconds")
                print(f"   • Speed: {page_count/elapsed_time:.1f} pages/second")
                print(f"   • Split ratio used: {optimal_ratio:.1%}")
                print(f"   • Detection confidence: {confidence:.1%}")
                print(f"   • Success rate: {success_count/page_count:.1%}")

                return True

            except Exception as save_error:
                print(f"❌ Save error: {save_error}")
                # Try basic save as fallback
                try:
                    output_doc.save(output_path)
                    print("✅ Saved with basic options")
                    return True
                except Exception as basic_error:
                    print(f"❌ Basic save failed: {basic_error}")
                    return False

        except Exception as e:
            print(f"❌ Error during processing: {e}")
            import traceback
            if self.debug_mode:
                print(f"🔍 Debug traceback:\n{traceback.format_exc()}")
            return False

        finally:
            # Cleanup
            if input_doc:
                try:
                    input_doc.close()
                except:
                    pass

            if output_doc:
                try:
                    output_doc.close()
                except:
                    pass

            gc.collect()

    # =============================================================================
    # 🎛️ USER INTERFACE AND CONTROL FUNCTIONS
    # =============================================================================

    def create_advanced_interface(self):
        """Build, display and return the splitter's configuration widgets.

        The panel contains a drop-down for the analysis method, a ratio slider
        that is only shown while the 'custom' method is selected, a checkbox
        for detailed output, a slider for the number of pages to analyze and
        an output area.

        Returns:
            dict: The created widgets keyed by 'method_selector',
            'custom_ratio_slider', 'debug_checkbox', 'sample_pages_slider'
            and 'results_output'.
        """
        print("🎛️ ADVANCED PDF SPLITTER INTERFACE")
        print("=" * 60)

        # (label, value) pairs offered by the method drop-down
        analysis_choices = [
            ('🎯 Multi-Method Analysis (Recommended)', 'multi_method'),
            ('📏 Line Detection Only', 'line_detection'),
            ('🧠 Content Analysis Only', 'content_analysis'),
            ('👁️ Visual Analysis Only', 'visual_analysis'),
            ('🔧 Custom Ratio', 'custom')
        ]

        method_dropdown = widgets.Dropdown(
            options=analysis_choices,
            value='multi_method',
            description='Analysis Method:',
            style={'description_width': '150px'},
            layout={'width': '400px'}
        )

        # Starts hidden; revealed by the observer below when 'custom' is picked
        ratio_slider = widgets.FloatSlider(
            value=0.5,
            min=0.1,
            max=0.9,
            step=0.01,
            description='Custom Split Ratio:',
            style={'description_width': '150px'},
            readout_format='.0%',
            layout={'width': '400px', 'display': 'none'}
        )

        verbose_checkbox = widgets.Checkbox(
            value=True,
            description='Enable detailed analysis output',
            style={'description_width': 'initial'}
        )

        pages_slider = widgets.IntSlider(
            value=10,
            min=1,
            max=20,
            description='Pages to analyze:',
            style={'description_width': '150px'},
            layout={'width': '400px'}
        )

        output_area = widgets.Output()

        def toggle_ratio_slider(change):
            # Show the ratio slider only for the 'custom' method
            ratio_slider.layout.display = 'block' if change['new'] == 'custom' else 'none'

        method_dropdown.observe(toggle_ratio_slider, names='value')

        controls = {
            'method_selector': method_dropdown,
            'custom_ratio_slider': ratio_slider,
            'debug_checkbox': verbose_checkbox,
            'sample_pages_slider': pages_slider,
            'results_output': output_area
        }

        # Stack the heading, the controls and the output area vertically
        panel_children = [
            widgets.HTML("<h2>🎯 Advanced PDF Splitter Configuration</h2>"),
            widgets.HTML("<p>Select analysis method and configure options:</p>"),
            method_dropdown,
            ratio_slider,
            verbose_checkbox,
            pages_slider,
            widgets.HTML("<hr>"),
            output_area
        ]
        display(widgets.VBox(panel_children))

        return controls

    def validate_pdf_file(self, pdf_path: str) -> Tuple[bool, str, Dict]:
        """Check that ``pdf_path`` is a usable PDF and collect basic facts about it.

        Args:
            pdf_path: Path of the file to inspect.

        Returns:
            ``(is_valid, message, pdf_info)``. ``message`` is a printable
            status line; on success any warnings are appended to it.
            ``pdf_info`` is ``{}`` on failure and otherwise holds
            'file_size_mb', 'page_count', 'page_dimensions', 'metadata',
            'file_modified' and 'estimated_processing_time'.

        Exceptions raised while inspecting the file are caught and reported
        through ``message`` instead of propagating.
        """

        try:
            if not os.path.exists(pdf_path):
                return False, "❌ File not found", {}

            if not pdf_path.lower().endswith('.pdf'):
                return False, "❌ Not a PDF file", {}

            # Get file info
            file_size = os.path.getsize(pdf_path) / (1024 * 1024)  # MB
            file_modified = datetime.fromtimestamp(os.path.getmtime(pdf_path))

            # Open and analyze PDF
            doc = fitz.open(pdf_path)

            try:
                # Check for a password before touching pages or metadata, so a
                # locked file is reported as such rather than as a generic
                # analysis error.
                if doc.needs_pass:
                    return False, "❌ PDF is password protected", {}

                page_count = len(doc)

                if page_count == 0:
                    return False, "❌ PDF has no pages", {}

                # Get detailed info. The metadata dict may be None, and its
                # keys are normally present with None/empty values, so a
                # ``.get(key, default)`` default would never be used.
                metadata = doc.metadata or {}
                first_page = doc[0]
                page_width = first_page.rect.width
                page_height = first_page.rect.height
                aspect_ratio = page_width / page_height

                pdf_info = {
                    'file_size_mb': file_size,
                    'page_count': page_count,
                    'page_dimensions': {
                        'width': page_width,
                        'height': page_height,
                        'aspect_ratio': aspect_ratio
                    },
                    'metadata': {
                        key: metadata.get(key) or 'Unknown'
                        for key in ('title', 'author', 'creator', 'producer')
                    },
                    'file_modified': file_modified.strftime("%Y-%m-%d %H:%M:%S"),
                    'estimated_processing_time': page_count * 0.1  # seconds
                }

                # Check for potential issues
                warnings = []

                if file_size > 50:  # Large file
                    warnings.append(f"⚠️ Large file ({file_size:.1f} MB) - processing may take longer")

                if page_count > 100:
                    warnings.append(f"⚠️ Many pages ({page_count}) - consider processing in batches")

                if aspect_ratio < 1.2:
                    warnings.append("⚠️ Pages seem narrow - vertical split may not be optimal")

                success_message = f"✅ Valid PDF: {page_count} pages, {file_size:.1f} MB"
                if warnings:
                    success_message += f"\n   {'   '.join(warnings)}"

                return True, success_message, pdf_info

            finally:
                doc.close()

        except Exception as e:
            return False, f"❌ Error analyzing PDF: {str(e)}", {}

    def download_with_retry(self, file_path: str, max_retries: int = 3) -> bool:
        """Hand ``file_path`` to ``files.download``, retrying when it raises.

        Args:
            file_path: Local file to download.
            max_retries: Maximum number of download attempts.

        Returns:
            True as soon as one attempt completes without raising; False when
            the file does not exist, when every attempt raised, or when
            ``max_retries`` allows no attempt at all.
        """

        if not os.path.exists(file_path):
            print(f"❌ File not found: {file_path}")
            return False

        size_mb = os.path.getsize(file_path) / (1024 * 1024)
        print(f"📥 Preparing download: {os.path.basename(file_path)} ({size_mb:.2f} MB)")

        attempt_no = 0
        while attempt_no < max_retries:
            attempt_no += 1
            try:
                print(f"   Attempt {attempt_no}/{max_retries}...")
                files.download(file_path)
                print("✅ Download successful!")
                return True

            except Exception as err:
                print(f"   ⚠️ Attempt {attempt_no} failed: {str(err)[:100]}")

                if attempt_no >= max_retries:
                    # Out of attempts: report and give up
                    print(f"❌ All download attempts failed")
                    print(f"💡 File is still available locally: {file_path}")
                    return False

                # The pause grows with each failed attempt: 2s, 4s, 6s, ...
                delay = attempt_no * 2
                print(f"   ⏳ Waiting {delay} seconds before retry...")
                time.sleep(delay)

        return False

# =============================================================================
# 🚀 MAIN EXECUTION FUNCTIONS
# =============================================================================

def test_comprehensive_analysis():
    """Run the multi-method split analysis on one PDF in the working directory.

    The first input PDF (alphabetically) is validated with
    ``validate_pdf_file`` and analyzed with
    ``detect_optimal_split_multi_method``; the result and a recommendation
    based on its confidence are printed.

    Returns:
        The analysis dict, or None when no input PDF exists or validation
        fails.
    """

    print("🧪 COMPREHENSIVE PDF ANALYSIS TEST")
    print("=" * 60)
    print(f"🕐 Started: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print(f"👤 User: Ravi-katta-dev")
    print("=" * 60)

    # Find PDF files. Skip every file-name prefix used for generated output
    # in this file, so a previous result is never analyzed as if it were an
    # input. Sorting makes the pick deterministic (os.listdir order is
    # arbitrary).
    output_prefixes = ('SMART_', 'LINE_', 'PRECISION_', 'INTELLIGENT_',
                       'COMPREHENSIVE_', 'CUSTOM_', 'QUICK_', 'BATCH_')
    pdf_files = sorted(
        f for f in os.listdir('.')
        if f.lower().endswith('.pdf') and not f.startswith(output_prefixes)
    )

    if not pdf_files:
        print("❌ No PDF files found.")
        print("💡 Please upload a PDF file first.")
        return None

    pdf_file = pdf_files[0]
    print(f"📁 Analyzing: {pdf_file}")

    # Create splitter instance
    splitter = CompletePDFSplitter()

    # Validate PDF
    is_valid, message, pdf_info = splitter.validate_pdf_file(pdf_file)
    print(f"\n📋 PDF Validation:")
    print(message)

    if not is_valid:
        return None

    if pdf_info:
        print(f"\n📊 PDF Information:")
        print(f"   • Pages: {pdf_info['page_count']}")
        print(f"   • Size: {pdf_info['file_size_mb']:.2f} MB")
        print(f"   • Dimensions: {pdf_info['page_dimensions']['width']:.0f} x {pdf_info['page_dimensions']['height']:.0f}")
        print(f"   • Aspect ratio: {pdf_info['page_dimensions']['aspect_ratio']:.2f}")
        print(f"   • Estimated processing time: {pdf_info['estimated_processing_time']:.1f} seconds")

    # Run comprehensive analysis
    print(f"\n🔍 Running Comprehensive Analysis...")
    print("=" * 60)

    analysis_result = splitter.detect_optimal_split_multi_method(pdf_file)

    confidence = analysis_result['confidence']
    split_ratio = analysis_result['optimal_split']

    # Display detailed results
    print(f"\n📊 COMPREHENSIVE ANALYSIS RESULTS")
    print("=" * 60)
    print(f"🎯 Optimal Split Ratio: {split_ratio:.1%}")
    print(f"🔒 Overall Confidence: {confidence:.1%}")
    print(f"🤝 Methods Agreement: {analysis_result.get('agreement_factor', 0):.1%}")

    if 'summary' in analysis_result:
        summary = analysis_result['summary']
        print(f"\n📋 Individual Method Confidences:")
        print(f"   📏 Line Detection: {summary.get('line_detection_confidence', 0):.1%}")
        print(f"   🧠 Content Analysis: {summary.get('content_analysis_confidence', 0):.1%}")
        print(f"   👁️ Visual Analysis: {summary.get('visual_analysis_confidence', 0):.1%}")
        print(f"   📋 Structure Analysis: {summary.get('structure_analysis_confidence', 0):.1%}")

    # Recommendations, tiered by overall confidence
    print(f"\n💡 RECOMMENDATIONS:")
    print("=" * 30)

    if confidence > 0.8:
        print("🟢 EXCELLENT: Very high confidence in detection")
        print(f"   ✅ Proceed with split at {split_ratio:.1%}")
    elif confidence > 0.6:
        print("🟡 GOOD: High confidence in detection")
        print(f"   ✅ Recommended split at {split_ratio:.1%}")
    elif confidence > 0.4:
        print("🟠 MODERATE: Moderate confidence")
        print(f"   ⚠️ Consider manual review of {split_ratio:.1%} split")
    else:
        print("🔴 LOW: Low confidence in detection")
        print(f"   ⚠️ Manual inspection recommended")
        print(f"   💭 Suggested fallback: {split_ratio:.1%}")

    # Special cases: a split far from the page centre is flagged
    if split_ratio < 0.25 or split_ratio > 0.75:
        print(f"   ⚠️ Unusual split ratio detected: {split_ratio:.1%}")
        print("   💭 This might indicate special document layout")

    return analysis_result

def run_intelligent_split():
    """Split one PDF from the working directory using the multi-method analysis.

    The first input PDF (alphabetically) is passed to
    ``split_pdf_with_analysis`` with method "multi_method"; the result is
    written as ``INTELLIGENT_SPLIT_<stem>_<HHMMSS>.pdf`` and, on success,
    offered for download. Returns None in every case.
    """

    print("🎯 INTELLIGENT PDF SPLITTER v4.0")
    print("=" * 60)
    print("🧠 Complete Multi-Method Analysis System")
    print(f"🕐 Session: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print(f"👤 User: Ravi-katta-dev")
    print("=" * 60)

    # Find PDF files. Skip every file-name prefix used for generated output
    # in this file (including this function's own INTELLIGENT_ prefix), so a
    # re-run never splits a previous result. Sorting makes the pick
    # deterministic (os.listdir order is arbitrary).
    output_prefixes = ('SMART_', 'LINE_', 'PRECISION_', 'INTELLIGENT_',
                       'COMPREHENSIVE_', 'CUSTOM_', 'QUICK_', 'BATCH_')
    pdf_files = sorted(
        f for f in os.listdir('.')
        if f.lower().endswith('.pdf') and not f.startswith(output_prefixes)
    )

    if not pdf_files:
        print("❌ No PDF files found.")
        print("💡 Please upload a PDF file first.")
        return

    pdf_file = pdf_files[0]

    # Create splitter
    splitter = CompletePDFSplitter()

    # Generate output filename
    timestamp = datetime.now().strftime("%H%M%S")
    output_file = f"INTELLIGENT_SPLIT_{Path(pdf_file).stem}_{timestamp}.pdf"

    print(f"📁 Input: {pdf_file}")
    print(f"📁 Output: {output_file}")

    # Process with comprehensive analysis
    success = splitter.split_pdf_with_analysis(pdf_file, output_file, "multi_method")

    if not success:
        # Make the failure visible instead of ending silently
        print("❌ Processing failed.")
        return

    print(f"\n🎉 SUCCESS! PDF split completed.")
    print("=" * 40)

    # Attempt download
    if splitter.download_with_retry(output_file):
        print("✅ File downloaded successfully!")
    else:
        print("⚠️ Download failed, but file is ready locally")
def run_interactive_mode():
    """Walk through discovery, preview, method choice, split and download.

    Steps:
        1. Find input PDFs in the working directory and validate the first
           one (alphabetically).
        2. Run a 3-page ``detect_vertical_lines`` preview.
        3. Recommend "line_detection" when the preview confidence exceeds
           0.7, otherwise "multi_method".
        4. Split with the chosen method via ``split_pdf_with_analysis``.
        5. Report file sizes and attempt the download.

    The option menu is printed, but the choice is currently fixed to
    option 1 (the recommended method); the other branches are kept for when
    real user input is wired in. Returns None in every case.
    """
    from datetime import timezone  # local import: only ``datetime`` is imported at file level

    print("🎮 INTERACTIVE PDF SPLITTER MODE")
    print("=" * 60)
    # Show the real current time rather than a frozen literal
    print(f"🕐 Current Time: {datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S')} UTC")
    print(f"👤 Current User: Ravi-katta-dev")
    print("🎯 Interactive Step-by-Step Processing")
    print("=" * 60)

    # Step 1: Find and validate PDF
    print("\n📋 STEP 1: PDF Discovery and Validation")
    print("-" * 40)

    # Skip every file-name prefix used for generated output in this file
    # (this function itself writes SMART_/COMPREHENSIVE_/LINE_/CUSTOM_
    # files), so a previous result is never taken as input. Sorting makes
    # the selection deterministic (os.listdir order is arbitrary).
    output_prefixes = ('SMART_', 'LINE_', 'PRECISION_', 'INTELLIGENT_',
                       'COMPREHENSIVE_', 'CUSTOM_', 'QUICK_', 'BATCH_')
    pdf_files = sorted(
        f for f in os.listdir('.')
        if f.lower().endswith('.pdf') and not f.startswith(output_prefixes)
    )

    if not pdf_files:
        print("❌ No PDF files found in current directory.")
        print("\n💡 To upload a PDF:")
        print("   1. Click the 📁 folder icon in the left sidebar")
        print("   2. Select 'Upload to session storage'")
        print("   3. Choose your PDF file")
        print("   4. Wait for upload to complete")
        print("   5. Re-run this function")
        return

    print(f"✅ Found PDF files:")
    for i, pdf in enumerate(pdf_files, 1):
        file_size = os.path.getsize(pdf) / (1024 * 1024)
        print(f"   {i}. {pdf} ({file_size:.2f} MB)")

    # Use first PDF
    selected_pdf = pdf_files[0]
    print(f"\n🎯 Selected: {selected_pdf}")

    # Create the splitter only once there is something to process
    splitter = CompletePDFSplitter()

    # Validate PDF
    is_valid, validation_message, pdf_info = splitter.validate_pdf_file(selected_pdf)
    print(f"\n📊 Validation Result:")
    print(validation_message)

    if not is_valid:
        print("❌ Cannot proceed with invalid PDF.")
        return

    # Step 2: Quick Analysis Preview
    print(f"\n🔍 STEP 2: Quick Analysis Preview")
    print("-" * 40)

    print("Running quick line detection preview...")
    quick_analysis = splitter.detect_vertical_lines(selected_pdf, sample_pages=3)

    quick_split = quick_analysis.get('optimal_split', 0.5)
    quick_confidence = quick_analysis.get('confidence', 0.1)

    print(f"📏 Quick line detection result:")
    print(f"   • Suggested split: {quick_split:.1%}")
    print(f"   • Confidence: {quick_confidence:.1%}")

    if quick_confidence > 0.6:
        print("✅ High confidence - line detection working well!")
    elif quick_confidence > 0.3:
        print("⚠️ Moderate confidence - will use comprehensive analysis")
    else:
        print("❌ Low confidence - will run full multi-method analysis")

    # Step 3: User Decision Point
    print(f"\n🤔 STEP 3: Processing Decision")
    print("-" * 40)

    if quick_confidence > 0.7:
        print("🚀 RECOMMENDATION: Use quick line detection")
        print(f"   Split ratio: {quick_split:.1%}")
        print("   This should work perfectly for your PDF!")
        recommended_method = "line_detection"
    else:
        print("🧠 RECOMMENDATION: Use comprehensive multi-method analysis")
        print("   This will analyze your PDF using all available methods")
        recommended_method = "multi_method"

    print(f"\n📝 Available options:")
    print("   1. 🚀 Use recommended method (fastest)")
    print("   2. 🧠 Force comprehensive analysis (most accurate)")
    print("   3. 📏 Line detection only")
    print("   4. 🎛️ Custom ratio")
    print("   5. 🧪 Test all methods (analysis only)")

    # Auto-select recommended method for demo
    choice = "1"  # Simulating user choosing recommended method
    print(f"🎯 Auto-selecting option 1 (recommended method)")

    # Step 4: Execute Processing
    print(f"\n⚙️ STEP 4: Processing Execution")
    print("-" * 40)

    # Generate output filename
    timestamp = datetime.now().strftime("%H%M%S")
    stem = Path(selected_pdf).stem

    if choice == "1":
        method = recommended_method
        output_file = f"SMART_SPLIT_{stem}_{timestamp}.pdf"
    elif choice == "2":
        method = "multi_method"
        output_file = f"COMPREHENSIVE_SPLIT_{stem}_{timestamp}.pdf"
    elif choice == "3":
        method = "line_detection"
        output_file = f"LINE_SPLIT_{stem}_{timestamp}.pdf"
    elif choice == "4":
        # NOTE(review): no ratio value is passed to the splitter here; only
        # the method name "custom" is forwarded - confirm how
        # split_pdf_with_analysis obtains the ratio.
        method = "custom"
        output_file = f"CUSTOM_SPLIT_{stem}_{timestamp}.pdf"
    else:
        # Test mode
        print("🧪 Running comprehensive test...")
        test_comprehensive_analysis()
        print("✅ Test completed! Check results above.")
        return

    print(f"📁 Output file: {output_file}")
    print(f"🔧 Processing method: {method}")

    # Execute the split
    print(f"\n🚀 Starting processing...")
    success = splitter.split_pdf_with_analysis(selected_pdf, output_file, method)

    # Step 5: Results and Download
    print(f"\n📦 STEP 5: Results and Download")
    print("-" * 40)

    if success:
        print("🎉 SUCCESS! PDF processing completed.")

        # File statistics
        if os.path.exists(output_file):
            output_size = os.path.getsize(output_file) / (1024 * 1024)
            input_size = os.path.getsize(selected_pdf) / (1024 * 1024)

            print(f"\n📊 File Statistics:")
            print(f"   • Input size: {input_size:.2f} MB")
            print(f"   • Output size: {output_size:.2f} MB")
            print(f"   • Size efficiency: {(1 - output_size/input_size):.1%} reduction")

        # Attempt download
        print(f"\n📥 Attempting download...")
        download_success = splitter.download_with_retry(output_file, max_retries=3)

        if download_success:
            print("✅ Download completed successfully!")
            print(f"💾 Your split PDF has been downloaded: {output_file}")
        else:
            print("⚠️ Download failed, but processing was successful")
            print(f"💡 File is available locally: {output_file}")
            print("🔄 You can try downloading manually or re-run the download")
    else:
        print("❌ Processing failed. Please check the error messages above.")

        # Troubleshooting suggestions
        print(f"\n🔧 Troubleshooting suggestions:")
        print("   1. Check if PDF is valid and not corrupted")
        print("   2. Try restarting the runtime (Runtime → Restart Runtime)")
        print("   3. Re-upload the PDF file")
        print("   4. Try a different processing method")

    print(f"\n✅ Interactive session completed!")
    # Use a UTC-aware clock so the "UTC" label is accurate on any machine
    print(f"🕐 Session ended: {datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S')} UTC")

def quick_split_now():
    """Split one PDF from the working directory with minimal output, then download it.

    The first input PDF (alphabetically) is split with the "multi_method"
    analysis (debug output disabled) into ``QUICK_SPLIT_<stem>_<HHMMSS>.pdf``,
    which is then passed to ``files.download``. Returns None in every case.
    """
    from datetime import timezone  # local import: only ``datetime`` is imported at file level

    print("⚡ QUICK SPLIT - FAST PROCESSING")
    print("=" * 50)
    # Show the real current time rather than a frozen literal
    print(f"🕐 Time: {datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S')} UTC")
    print(f"👤 User: Ravi-katta-dev")
    print("⚡ Fast processing with smart defaults")
    print("=" * 50)

    # Find PDF. Skip every file-name prefix used for generated output in this
    # file (including this function's own QUICK_ prefix), so a re-run never
    # splits a previous result. Sorting makes the pick deterministic
    # (os.listdir order is arbitrary).
    output_prefixes = ('SMART_', 'LINE_', 'PRECISION_', 'INTELLIGENT_',
                       'COMPREHENSIVE_', 'CUSTOM_', 'QUICK_', 'BATCH_')
    pdf_files = sorted(
        f for f in os.listdir('.')
        if f.lower().endswith('.pdf') and not f.startswith(output_prefixes)
    )

    if not pdf_files:
        print("❌ No PDF files found.")
        return

    pdf_file = pdf_files[0]
    print(f"📁 Processing: {pdf_file}")

    # Quick validation: the file may have vanished since the listing
    if not os.path.exists(pdf_file):
        print("❌ File not found.")
        return

    file_size = os.path.getsize(pdf_file) / (1024 * 1024)
    print(f"📊 File size: {file_size:.2f} MB")

    # Create splitter and process
    splitter = CompletePDFSplitter()
    splitter.debug_mode = False  # Reduce output for quick mode

    # Generate output
    timestamp = datetime.now().strftime("%H%M%S")
    output_file = f"QUICK_SPLIT_{Path(pdf_file).stem}_{timestamp}.pdf"

    print(f"🚀 Quick processing...")

    # Use multi-method for best results
    success = splitter.split_pdf_with_analysis(pdf_file, output_file, "multi_method")

    if not success:
        print("❌ Processing failed.")
        return

    print(f"✅ Done! Downloading {output_file}...")

    try:
        files.download(output_file)
        print("🎉 Success! Your split PDF has been downloaded.")
    except Exception as e:
        # The split itself succeeded, so point the user at the local file
        print(f"⚠️ Download error: {e}")
        print(f"💡 File saved as: {output_file}")

def batch_process_pdfs():
    """Split every input PDF in the working directory and download the results.

    Each input is split with the "multi_method" analysis into
    ``BATCH_SPLIT_<index>_<stem>_<HHMMSS>.pdf``. A failure on one file is
    recorded and does not stop the batch. When fewer than two inputs exist
    the function only prints a hint and returns. Returns None in every case.
    """
    from datetime import timezone  # local import: only ``datetime`` is imported at file level

    print("📦 BATCH PDF PROCESSING")
    print("=" * 50)
    # Show the real current time rather than a frozen literal
    print(f"🕐 Time: {datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S')} UTC")
    print(f"👤 User: Ravi-katta-dev")
    print("📦 Process multiple PDFs automatically")
    print("=" * 50)

    # Find all PDFs. Skip every file-name prefix used for generated output in
    # this file (including this function's own BATCH_ prefix), so a second
    # run does not re-split the results of the first. Sorting gives a stable
    # processing order (os.listdir order is arbitrary).
    output_prefixes = ('SMART_', 'LINE_', 'PRECISION_', 'INTELLIGENT_',
                       'COMPREHENSIVE_', 'CUSTOM_', 'QUICK_', 'BATCH_')
    all_pdfs = sorted(f for f in os.listdir('.') if f.lower().endswith('.pdf'))
    input_pdfs = [f for f in all_pdfs if not f.startswith(output_prefixes)]

    if not input_pdfs:
        print("❌ No input PDF files found.")
        return

    if len(input_pdfs) == 1:
        print(f"📄 Only one PDF found: {input_pdfs[0]}")
        print("💡 Use quick_split_now() for single file processing")
        return

    total = len(input_pdfs)
    print(f"📚 Found {total} PDFs to process:")
    for i, pdf in enumerate(input_pdfs, 1):
        file_size = os.path.getsize(pdf) / (1024 * 1024)
        print(f"   {i}. {pdf} ({file_size:.2f} MB)")

    # Create splitter
    splitter = CompletePDFSplitter()

    # Process each PDF
    successful_files = []  # output paths of completed splits
    failed_files = []      # input paths that could not be split

    for i, pdf_file in enumerate(input_pdfs, 1):
        print(f"\n🔄 Processing {i}/{total}: {pdf_file}")
        print("-" * 40)

        # Generate output name
        timestamp = datetime.now().strftime("%H%M%S")
        output_file = f"BATCH_SPLIT_{i}_{Path(pdf_file).stem}_{timestamp}.pdf"

        try:
            success = splitter.split_pdf_with_analysis(pdf_file, output_file, "multi_method")

            if success:
                successful_files.append(output_file)
                print(f"✅ {pdf_file} processed successfully")
            else:
                failed_files.append(pdf_file)
                print(f"❌ {pdf_file} processing failed")

        except Exception as e:
            # Keep going: one bad file must not abort the whole batch
            failed_files.append(pdf_file)
            print(f"❌ {pdf_file} error: {e}")

    # Results summary
    print(f"\n📊 BATCH PROCESSING SUMMARY")
    print("=" * 40)
    print(f"✅ Successful: {len(successful_files)}/{total}")
    print(f"❌ Failed: {len(failed_files)}/{total}")

    if successful_files:
        print(f"\n📥 Downloading successful files...")

        for output_file in successful_files:
            try:
                files.download(output_file)
                print(f"✅ Downloaded: {output_file}")
            except Exception as e:
                print(f"⚠️ Download failed: {output_file} - {e}")

    if failed_files:
        print(f"\n❌ Failed files:")
        for failed_file in failed_files:
            print(f"   • {failed_file}")

def diagnose_pdf_issues():
    """Print a diagnostic report for every input PDF in the working directory.

    For each file the report covers: validation via ``validate_pdf_file``,
    first-page size and text length, a 3-page ``detect_vertical_lines``
    test, the left/right distribution of first-page text-block edges, a
    recommendation based on the line-detection confidence, and a list of
    potential issues. An error on one file is printed and the loop moves on
    to the next file. Returns None in every case.
    """
    from datetime import timezone  # local import: only ``datetime`` is imported at file level

    print("🔍 PDF DIAGNOSTIC TOOL")
    print("=" * 50)
    # Show the real current time rather than a frozen literal
    print(f"🕐 Time: {datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S')} UTC")
    print(f"👤 User: Ravi-katta-dev")
    print("🔍 Comprehensive PDF analysis and issue detection")
    print("=" * 50)

    # Find PDFs. Skip every file-name prefix used for generated output in
    # this file, so split results are not diagnosed as if they were inputs.
    # Sorting gives a stable report order (os.listdir order is arbitrary).
    output_prefixes = ('SMART_', 'LINE_', 'PRECISION_', 'INTELLIGENT_',
                       'COMPREHENSIVE_', 'CUSTOM_', 'QUICK_', 'BATCH_')
    pdf_files = sorted(
        f for f in os.listdir('.')
        if f.lower().endswith('.pdf') and not f.startswith(output_prefixes)
    )

    if not pdf_files:
        print("❌ No PDF files found for diagnosis.")
        return

    splitter = CompletePDFSplitter()

    for pdf_file in pdf_files:
        print(f"\n🔍 DIAGNOSING: {pdf_file}")
        print("=" * 60)

        # Basic file info
        try:
            file_size = os.path.getsize(pdf_file) / (1024 * 1024)
            print(f"📁 File size: {file_size:.2f} MB")

            # Detailed validation
            is_valid, message, pdf_info = splitter.validate_pdf_file(pdf_file)
            print(f"📋 Validation: {message}")

            if not is_valid:
                continue

            # Open PDF for detailed analysis
            doc = fitz.open(pdf_file)

            try:
                # Page analysis
                print(f"\n📄 Page Analysis:")
                print(f"   • Total pages: {len(doc)}")

                # validate_pdf_file rejects documents without pages, so the
                # names bound in this branch are available to the sections
                # further down.
                if len(doc) > 0:
                    first_page = doc[0]
                    print(f"   • Page size: {first_page.rect.width:.0f} x {first_page.rect.height:.0f}")
                    print(f"   • Aspect ratio: {first_page.rect.width/first_page.rect.height:.2f}")

                    # Text analysis
                    text_content = first_page.get_text()
                    print(f"   • Text length (first page): {len(text_content)} characters")

                    if len(text_content) < 50:
                        print("   ⚠️ Warning: Very little text detected")

                    # Check for common keywords (case-insensitive)
                    keywords = ['question', 'answer', 'telegram', 'click', 'practice', 'exam']
                    lowered_text = text_content.lower()
                    found_keywords = [kw for kw in keywords if kw in lowered_text]

                    if found_keywords:
                        print(f"   • Content type indicators: {', '.join(found_keywords)}")

                # Quick line detection test
                print(f"\n📏 Line Detection Test:")
                line_test = splitter.detect_vertical_lines(pdf_file, sample_pages=3)

                confidence = line_test.get('confidence', 0)
                split_ratio = line_test.get('optimal_split', 0.5)

                print(f"   • Lines detected: {len(line_test.get('detected_lines', []))}")
                print(f"   • Suggested split: {split_ratio:.1%}")
                print(f"   • Confidence: {confidence:.1%}")

                if confidence > 0.7:
                    print("   ✅ Excellent line detection")
                elif confidence > 0.4:
                    print("   ⚠️ Moderate line detection")
                else:
                    print("   ❌ Poor line detection - may need manual ratio")

                # Content structure analysis
                print(f"\n🧠 Content Structure:")

                blocks = first_page.get_text("dict").get("blocks", [])
                text_blocks = [b for b in blocks if "lines" in b]

                print(f"   • Text blocks: {len(text_blocks)}")

                if text_blocks:
                    # Collect the left and right x-edge of every text block
                    x_positions = []
                    for block in text_blocks:
                        bbox = block["bbox"]
                        x_positions.extend([bbox[0], bbox[2]])

                    page_width = first_page.rect.width

                    # Count edges on each side of the page centre
                    left_content = sum(1 for x in x_positions if x < page_width * 0.5)
                    right_content = len(x_positions) - left_content

                    print(f"   • Content distribution: {left_content} left, {right_content} right")

                    if abs(left_content - right_content) > 5:
                        print("   ✅ Uneven distribution - good for splitting")
                    else:
                        print("   ⚠️ Even distribution - splitting may not be beneficial")

                # Recommendations
                print(f"\n💡 RECOMMENDATIONS:")

                if confidence > 0.7:
                    print(f"   🟢 PROCEED: Use automatic detection ({split_ratio:.1%})")
                elif confidence > 0.4:
                    print(f"   🟡 CAUTION: Review suggested split ({split_ratio:.1%})")
                else:
                    print(f"   🔴 MANUAL: Consider manual inspection")

                # Check for potential issues
                issues = []

                if file_size > 100:
                    issues.append("Very large file - processing may be slow")

                if len(doc) > 200:
                    issues.append("Many pages - consider batch processing")

                if split_ratio < 0.2 or split_ratio > 0.8:
                    issues.append(f"Unusual split ratio ({split_ratio:.1%})")

                if len(text_content) < 100:
                    issues.append("Very little text - might be image-based PDF")

                if issues:
                    print(f"\n⚠️ POTENTIAL ISSUES:")
                    for issue in issues:
                        print(f"   • {issue}")
                else:
                    print(f"\n✅ No issues detected - PDF looks good for processing!")

            finally:
                doc.close()

        except Exception as e:
            # Report and move on to the next file
            print(f"❌ Diagnostic error: {e}")

def show_help_and_usage():
    """Print the complete help and usage guide for the PDF splitter.

    Output consists of a short header (current UTC time, user name and a
    session stamp) followed by a static guide covering the available
    functions, the recommended workflow, the detection methods,
    troubleshooting hints and general tips.

    Returns:
        None. Everything is written to standard output.
    """
    # Imported locally so the header can show a timezone-aware "now" without
    # depending on what the surrounding notebook cell happened to import.
    from datetime import datetime, timezone

    # The header used to print a frozen timestamp captured when the notebook
    # was authored; report the actual current UTC time instead.
    current_utc = datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S')

    print("📚 COMPLETE PDF SPLITTER - HELP & USAGE GUIDE")
    print("=" * 70)
    print(f"🕐 Time: {current_utc} UTC")
    print("👤 User: Ravi-katta-dev")
    # Session stamp intentionally keeps using local time, as before.
    print(f"📱 Session: {datetime.now().strftime('%Y%m%d_%H%M%S')}")
    print("=" * 70)

    print("""
🎯 AVAILABLE FUNCTIONS:

1. 🚀 QUICK PROCESSING:
   quick_split_now()                    - Fast split with smart defaults

2. 🎮 INTERACTIVE MODE:
   run_interactive_mode()               - Step-by-step guided processing

3. 🧠 COMPREHENSIVE ANALYSIS:
   test_comprehensive_analysis()        - Test all detection methods
   run_intelligent_split()              - Full multi-method processing

4. 📦 BATCH PROCESSING:
   batch_process_pdfs()                 - Process multiple PDFs

5. 🔍 DIAGNOSTIC TOOLS:
   diagnose_pdf_issues()                - Analyze PDF structure and issues

6. ℹ️ HELP & INFO:
   show_help_and_usage()                - This help guide

═══════════════════════════════════════════════════════════════════════

🎯 RECOMMENDED WORKFLOW:

For First-Time Users:
1. Upload your PDF file
2. Run: diagnose_pdf_issues()          # Check for any issues
3. Run: test_comprehensive_analysis()   # See what methods work best
4. Run: run_interactive_mode()         # Process with guidance

For Quick Processing:
1. Upload your PDF file
2. Run: quick_split_now()              # Fast processing

For Multiple Files:
1. Upload multiple PDF files
2. Run: batch_process_pdfs()           # Process all at once

═══════════════════════════════════════════════════════════════════════

🔧 DETECTION METHODS:

📏 Line Detection:
   - Finds actual vertical lines in your PDF
   - Perfect for documents with visible dividers
   - High accuracy for structured documents

🧠 Content Analysis:
   - Analyzes text density and distribution
   - Good for documents without visible lines
   - Works with natural content boundaries

👁️ Visual Pattern Recognition:
   - Recognizes visual layout patterns
   - Detects column structures and alignments
   - Useful for complex layouts

📋 Document Structure Analysis:
   - Understands document type and purpose
   - Optimizes for exam papers, forms, etc.
   - Context-aware processing

🎯 Multi-Method Fusion:
   - Combines all methods for best accuracy
   - Highest confidence results
   - Recommended for unknown document types

═══════════════════════════════════════════════════════════════════════

💡 TROUBLESHOOTING:

❌ "No PDF files found":
   - Upload a PDF file first
   - Check file has .pdf extension
   - Restart runtime if needed

❌ "Processing failed":
   - Run diagnose_pdf_issues() to check PDF
   - Try different detection method
   - Check if PDF is password protected

❌ "Download failed":
   - File is still processed locally
   - Try downloading manually
   - Check internet connection

❌ "Low confidence detection":
   - PDF might have unusual layout
   - Try manual ratio adjustment
   - Use visual inspection

═══════════════════════════════════════════════════════════════════════

🎓 TIPS FOR BEST RESULTS:

✅ PDF Quality:
   - Use high-quality, text-based PDFs
   - Avoid heavily compressed files
   - Ensure text is selectable

✅ Document Type:
   - Works best with structured documents
   - Exam papers, forms, and reports ideal
   - Two-column layouts process excellently

✅ File Size:
   - Files under 50MB process fastest
   - Large files may take longer
   - Consider batch processing for multiple files

✅ Split Ratios:
   - 40-60% typically work best
   - Extreme ratios (20% or 80%) may indicate issues
   - Trust high-confidence detections

═══════════════════════════════════════════════════════════════════════

🆘 SUPPORT:

If you encounter issues:
1. Run diagnose_pdf_issues() first
2. Check the troubleshooting section above
3. Try different processing methods
4. Consider manual ratio adjustment

For best results with your exam papers:
- Use quick_split_now() for simple cases
- Use run_interactive_mode() for guidance
- Use test_comprehensive_analysis() to verify detection

═══════════════════════════════════════════════════════════════════════
""")

# =============================================================================
# 🚀 SYSTEM INITIALIZATION AND STATUS
# =============================================================================

def system_status():
    """Print a status report for the current working directory and runtime.

    The report contains, in order:
      * a header with the current UTC time, user and PyMuPDF version,
      * the PDF files found in the current directory, split into inputs and
        already-processed outputs (recognised by their file-name prefix),
      * available memory (only when ``psutil`` can be imported),
      * a short recommendation on what to run next.

    Returns:
        None. Everything is written to standard output.
    """
    # Imported locally so the header can show a timezone-aware "now".
    from datetime import datetime, timezone

    # File-name prefixes that mark a PDF as produced by this tool.
    output_prefixes = (
        'SMART_', 'LINE_', 'PRECISION_', 'INTELLIGENT_',
        'COMPREHENSIVE_', 'CUSTOM_', 'QUICK_', 'BATCH_',
    )

    print("📊 SYSTEM STATUS REPORT")
    print("=" * 50)
    # Report the real current time rather than a frozen authoring timestamp.
    print(f"🕐 Current Time: {datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S')} UTC")
    print("👤 Current User: Ravi-katta-dev")
    print("🐍 Python Environment: Google Colab")
    print(f"📦 PyMuPDF Version: {fitz.version[0] if hasattr(fitz, 'version') else 'Unknown'}")
    print("=" * 50)

    # Check current directory contents
    all_files = os.listdir('.')
    # Only regular files: a directory or broken symlink ending in ".pdf"
    # is not a PDF and would make the size lookup below misleading or fail.
    pdf_files = [
        f for f in all_files
        if f.lower().endswith('.pdf') and os.path.isfile(f)
    ]

    print("📁 Current Directory Analysis:")
    print(f"   • Total files: {len(all_files)}")
    print(f"   • PDF files: {len(pdf_files)}")

    # Bound unconditionally so the recommendation logic below never depends
    # on which branch ran.
    input_pdfs = []
    output_pdfs = []

    if pdf_files:
        print("\n📋 PDF Files Found:")

        for pdf in pdf_files:
            file_size = os.path.getsize(pdf) / (1024 * 1024)  # bytes -> MB

            if pdf.startswith(output_prefixes):
                output_pdfs.append((pdf, file_size))
            else:
                input_pdfs.append((pdf, file_size))

        if input_pdfs:
            print(f"\n📥 Input PDFs ({len(input_pdfs)}):")
            for pdf, size in input_pdfs:
                print(f"   📄 {pdf} ({size:.2f} MB)")

        if output_pdfs:
            print(f"\n📤 Processed PDFs ({len(output_pdfs)}):")
            for pdf, size in output_pdfs:
                print(f"   📄 {pdf} ({size:.2f} MB)")
    else:
        print("\n❌ No PDF files found")
        print("💡 Upload a PDF file to get started")

    # Memory status (best effort: psutil is optional)
    print("\n💾 Memory Status:")
    try:
        import psutil
        memory = psutil.virtual_memory()
        print(f"   • Available: {memory.available / (1024**3):.1f} GB")
        print(f"   • Usage: {memory.percent:.1f}%")
    except Exception:
        # Narrower than a bare except: Ctrl-C / SystemExit still propagate.
        print("   • Memory info not available")

    # Recommendations
    print("\n💡 RECOMMENDATIONS:")

    if not pdf_files:
        print("   🔸 Upload a PDF file first")
        print("   🔸 Run show_help_and_usage() for guidance")
    elif input_pdfs and not output_pdfs:
        print("   🔸 Ready to process! Try quick_split_now()")
        print("   🔸 Or run run_interactive_mode() for guided processing")
    elif input_pdfs and output_pdfs:
        print("   🔸 Some files already processed")
        print("   🔸 Check output files or process remaining inputs")
    else:
        print("   🔸 Only output files found")
        print("   🔸 Upload new input PDFs or download existing outputs")

# Initialize the complete system
# Imported here so the banner can report the real current UTC time instead of
# a timestamp frozen when the notebook was written.
from datetime import datetime, timezone

print("🎯 COMPLETE PDF SPLITTER v4.0 - FULLY LOADED!")
print("=" * 70)
print(f"🕐 System Time: {datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S')} UTC")
print("👤 Current User: Ravi-katta-dev")
print("🚀 Status: All systems operational")
print("=" * 70)

# Show system status
system_status()

# Cheat sheet of the entry points a user can call next
print("\n🎮 QUICK START COMMANDS:")
print("-" * 30)
print("🚀 quick_split_now()              # Fast processing")
print("🎮 run_interactive_mode()         # Step-by-step guidance")
print("🧪 test_comprehensive_analysis()  # Test all methods")
print("🔍 diagnose_pdf_issues()          # Check PDF health")
print("📚 show_help_and_usage()          # Complete guide")
print("📊 system_status()                # System information")

print("\n✨ Ready for processing! Choose a command above to get started.")

✅ All imports successful!
🕐 Session started: 2025-08-08 12:18:45 UTC
👤 User: Ravi-katta-dev
🎯 COMPLETE PDF SPLITTER v4.0 - FULLY LOADED!
🕐 System Time: 2025-08-08 12:22:52 UTC
👤 Current User: Ravi-katta-dev
🚀 Status: All systems operational
📊 SYSTEM STATUS REPORT
🕐 Current Time: 2025-08-08 12:22:52 UTC
👤 Current User: Ravi-katta-dev
🐍 Python Environment: Google Colab
📦 PyMuPDF Version: 1.26.3
📁 Current Directory Analysis:
   • Total files: 2
   • PDF files: 0

❌ No PDF files found
💡 Upload a PDF file to get started

💾 Memory Status:
   • Available: 11.4 GB
   • Usage: 9.9%

💡 RECOMMENDATIONS:
   🔸 Upload a PDF file first
   🔸 Run show_help_and_usage() for guidance

🎮 QUICK START COMMANDS:
------------------------------
🚀 quick_split_now()              # Fast processing
🎮 run_interactive_mode()         # Step-by-step guidance
🧪 test_comprehensive_analysis()  # Test all methods
🔍 diagnose_pdf_issues()          # Check PDF health
📚 show_help_and_usage()          # Complete guide
📊 system_status()                # System information

In [None]:
# Print the full help & usage guide (function list, workflow, troubleshooting, tips).
show_help_and_usage()

📚 COMPLETE PDF SPLITTER - HELP & USAGE GUIDE
🕐 Time: 2025-08-08 12:22:52 UTC
👤 User: Ravi-katta-dev
📱 Session: 20250808_124149

🎯 AVAILABLE FUNCTIONS:

1. 🚀 QUICK PROCESSING:
   quick_split_now()                    - Fast split with smart defaults
   
2. 🎮 INTERACTIVE MODE:
   run_interactive_mode()               - Step-by-step guided processing
   
3. 🧠 COMPREHENSIVE ANALYSIS:
   test_comprehensive_analysis()        - Test all detection methods
   run_intelligent_split()              - Full multi-method processing
   
4. 📦 BATCH PROCESSING:
   batch_process_pdfs()                 - Process multiple PDFs
   
5. 🔍 DIAGNOSTIC TOOLS:
   diagnose_pdf_issues()                - Analyze PDF structure and issues
   
6. ℹ️ HELP & INFO:
   show_help_and_usage()                - This help guide

═══════════════════════════════════════════════════════════════════════

🎯 RECOMMENDED WORKFLOW:

For First-Time Users:
1. Upload your PDF file
2. Run: diagnose_pdf_issues()          # Check for any issues

In [None]:
# Start the step-by-step guided processing flow (described that way in the help
# guide; the function itself is defined elsewhere in the notebook).
run_interactive_mode()

In [None]:
# Try all detection methods on the uploaded PDF (per the help guide; the
# function itself is defined elsewhere in the notebook).
test_comprehensive_analysis()

In [None]:
# NOTE(review): judging by the recorded output, this launches the v3.0
# "Intelligent PDF Splitter" configuration widgets and an upload prompt — confirm.
main_intelligent()  # Interactive mode with UI

🎯 INTELLIGENT PDF SPLITTER v3.0
🧠 AI-Powered Content-Aware PDF Splitting for Google Colab
🎯 Intelligent PDF Splitter initialized!
🎯 Intelligent PDF Splitter Configuration


VBox(children=(HTML(value='<h3>🎯 Configuration Options</h3>'), Dropdown(description='Split Mode:', options=(('…


📁 Upload your PDF files:
💡 The AI will analyze content layout automatically


In [None]:
# NOTE(review): judging by the recorded output, this looks for PDFs already in
# the working directory and retries downloading a split result — confirm
# against the definition, which is not in this part of the notebook.
quick_fix()

🔧 QUICK FIX MODE
📁 Found existing PDF files: ['SPLIT_Tech Practice1-80.pdf', 'Tech Practice1-80.pdf']
📥 Attempting to download: SPLIT_Tech Practice1-80.pdf
📥 Downloading: SPLIT_Tech Practice1-80.pdf (attempt 1)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

✅ Download successful!


In [None]:
# =============================================================================
# 🚀 ENHANCED PDF SPLITTER - GOOGLE COLAB SETUP
# =============================================================================
# Run this cell first to install all dependencies and set up the environment

# Announce that the setup cell has started, followed by a 50-character divider.
print("🔧 Setting up Enhanced PDF Splitter...")
print("=" * 50)

# Install required packages
import subprocess
import sys

def install_package(package):
    """Ensure *package* is installed, installing it with pip when it is missing.

    Presence is checked against installed distribution metadata rather than by
    importing a module named after the package. Distribution and import names
    often differ (for example "Pillow" is imported as ``PIL``), so an
    import-based check reports such packages as missing and reinstalls them
    on every run.

    Args:
        package: A pip requirement string such as ``"tqdm"``,
            ``"name[extra]"`` or ``"name>=1.0"``. The full string is passed
            to pip; only the bare distribution name is used for the
            "already installed?" check (versions are not compared).

    Returns:
        True if the package was already installed or pip installed it
        successfully; False if the installation failed for any reason.
    """
    # Local imports keep this helper self-contained within the setup cell.
    import re
    from importlib import metadata

    # Strip extras, version specifiers and environment markers to get the
    # bare distribution name, e.g. "foo[bar]>=1.0" -> "foo".
    dist_name = re.split(r"[\[<>=!~;\s]", package.strip(), maxsplit=1)[0]

    try:
        metadata.version(dist_name)
    except (metadata.PackageNotFoundError, ValueError):
        # Not installed (or the name was unusable): fall through to pip.
        pass
    else:
        print(f"✅ {package} already installed")
        return True

    print(f"📦 Installing {package}...")
    try:
        subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", package])
    except Exception as e:
        # The install must be guarded by its own try block: an exception
        # raised inside an `except` handler is not caught by sibling handlers,
        # so failures would otherwise propagate instead of returning False.
        print(f"❌ Failed to install {package}: {e}")
        return False

    print(f"✅ {package} installed successfully!")
    return True

# Distributions the splitter depends on, in installation order
packages = [
    "pymupdf",      # PDF processing
    "Pillow",       # Image processing
    "tqdm",         # Progress bars
]

print("\n📦 Installing dependencies...")

# Attempt every package (no short-circuiting) and record each outcome so one
# failure does not prevent the remaining installs from being tried.
install_results = []
for package in packages:
    install_results.append(bool(install_package(package)))
all_installed = all(install_results)

# Summarise the overall result for the user
if not all_installed:
    print("\n❌ Some packages failed to install. Please try restarting the runtime.")
else:
    print("\n✅ All dependencies installed successfully!")
    print("🎉 Ready to load the PDF Splitter!")

print(f"\n{'=' * 50}")
print("Next: Run the second cell to load the PDF Splitter code")

🔧 Setting up Enhanced PDF Splitter...

📦 Installing dependencies...
📦 Installing pymupdf...
✅ pymupdf installed successfully!
📦 Installing Pillow...
✅ Pillow installed successfully!
✅ tqdm already installed

✅ All dependencies installed successfully!
🎉 Ready to load the PDF Splitter!

Next: Run the second cell to load the PDF Splitter code


In [None]:
# Run the fast split with smart defaults (per the help guide; the function
# itself is defined elsewhere in the notebook).
quick_split_now()

⚡ QUICK SPLIT - FAST PROCESSING
🕐 Time: 2025-08-08 12:22:52 UTC
👤 User: Ravi-katta-dev
⚡ Fast processing with smart defaults
📁 Processing: Tech Practice1-80.pdf
📊 File size: 3.53 MB
🎯 Complete PDF Splitter v4.0 initialized!
📱 Session ID: 20250808_124712
🚀 Quick processing...
🔍 Step 1: Comprehensive PDF Analysis
🎯 Running comprehensive multi-method analysis...
📏 Method 1: Vertical line detection...
📏 Analyzing 10 pages for vertical lines...
🧠 Method 2: Content layout analysis...
🧠 Analyzing content layout from 5 pages...
👁️ Method 3: Visual pattern recognition...
📋 Method 4: Document structure analysis...
🔄 Combining all analysis methods...

📊 Analysis Results:
   • Detection method: multi_method_combined
   • Optimal split ratio: 54.3%
   • Detection confidence: 78.4%
   • Line detection confidence: 86.0%
   • Content analysis confidence: 50.0%
   • Methods agreement: 99.3%

✂️ Step 2: Applying Split at 54.3%


📄 Processing:   0%|          | 0/80 [00:00<?, ?page/s]


💾 Step 3: Saving Results
✅ Processing Complete!

📊 Results Summary:
   • Input file: Tech Practice1-80.pdf
   • Output file: QUICK_SPLIT_Tech Practice1-80_124712.pdf
   • Pages processed: 80/80
   • Pages created: 160
   • Output size: 3.42 MB
   • Processing time: 1.08 seconds
   • Speed: 74.2 pages/second
   • Split ratio used: 54.3%
   • Detection confidence: 78.4%
   • Success rate: 100.0%
✅ Done! Downloading QUICK_SPLIT_Tech Practice1-80_124712.pdf...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

🎉 Success! Your split PDF has been downloaded.
