In [6]:
!python -m pip install paddlepaddle-gpu==3.0.0 -i https://www.paddlepaddle.org.cn/packages/stable/cu118/
!pip install paddleocr==3.2.0

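# Install PyTorch 2.2.0 (CUDA 11.8 build).
# NOTE: per the pip output below, torch 2.2.0+cu118 pins nvidia-cudnn-cu11==8.7.0.84 while
# paddlepaddle-gpu 3.0.0 pins nvidia-cudnn-cu11==8.9.6.50, so pip reports a dependency conflict.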
!pip install torch==2.2.0+cu118 torchvision==0.17.0+cu118 torchaudio==2.2.0+cu118 --index-url https://download.pytorch.org/whl/cu118
# !pip install nvidia-cudnn-cu11==8.7.0.84

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torch 2.2.0+cu118 requires nvidia-cudnn-cu11==8.7.0.84; platform_system == "Linux" and platform_machine == "x86_64", but you have nvidia-cudnn-cu11 8.9.6.50 which is incompatible.[0m[31m
[0mSuccessfully installed nvidia-cudnn-cu11-8.9.6.50
Looking in indexes: https://download.pytorch.org/whl/cu118
Collecting nvidia-cudnn-cu11==8.7.0.84 (from torch==2.2.0+cu118)
  Using cached https://download.pytorch.org/whl/cu118/nvidia_cudnn_cu11-8.7.0.84-py3-none-manylinux1_x86_64.whl (728.5 MB)
Installing collected packages: nvidia-cudnn-cu11
  Attempting uninstall: nvidia-cudnn-cu11
    Found existing installation: nvidia-cudnn-cu11 8.9.6.50
    Uninstalling nvidia-cudnn-cu11-8.9.6.50:
      Successfully uninstalled nvidia-cudnn-cu11-8.9.6.50
[31mERROR: pip's dependency resolver does not currently take into account a

In [None]:
# !pip install "numpy<2.0"

In [1]:
# Step 1: Install required PaddleX dependencies for PP-StructureV3
!pip install "paddlex[ocr]"

# Step 2: Install additional dependencies that might be missing
!pip install layoutparser
!pip install "unstructured[all-docs]"
!pip install opencv-python-headless
!pip install pillow
!pip install scikit-image
!pip install matplotlib

# Step 3: Restart the runtime (kernel) manually after installation so the newly installed packages are loaded

Collecting numpy (from layoutparser)
  Downloading numpy-2.2.6-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (62 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.0/62.0 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
Downloading numpy-2.2.6-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (16.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.5/16.5 MB[0m [31m114.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 1.26.4
    Uninstalling numpy-1.26.4:
      Successfully uninstalled numpy-1.26.4
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
paddlepaddle-gpu 3.0.0 requires nvidia-cudnn-cu11==8.9.6.50; platform_system == "Linux" and platform_machine == "x86_64", but you have nvidia-cudnn-cu11 8



# Flask-specific dependencies

In [4]:
!pip install flask flask-cors pyngrok PyMuPDF pandas

Collecting flask-cors
  Downloading flask_cors-6.0.1-py3-none-any.whl.metadata (5.3 kB)
Collecting pyngrok
  Downloading pyngrok-7.3.0-py3-none-any.whl.metadata (8.1 kB)
Collecting PyMuPDF
  Downloading pymupdf-1.26.4-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Downloading flask_cors-6.0.1-py3-none-any.whl (13 kB)
Downloading pyngrok-7.3.0-py3-none-any.whl (25 kB)
Downloading pymupdf-1.26.4-cp39-abi3-manylinux_2_28_x86_64.whl (24.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m48.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyngrok, PyMuPDF, flask-cors
Successfully installed PyMuPDF-1.26.4 flask-cors-6.0.1 pyngrok-7.3.0


In [1]:
import os
import json
import glob
import threading
import time
from datetime import datetime
from pathlib import Path
import zipfile
import tempfile
import shutil
from typing import Dict, List, Optional, Tuple
import uuid
import logging
from concurrent.futures import ThreadPoolExecutor, as_completed
import traceback

from flask import Flask, request, jsonify, render_template, send_file
from flask_cors import CORS
import fitz  # PyMuPDF for PDF processing
import cv2
import numpy as np

# Set CUDA device
# Restricts CUDA-aware libraries loaded by this process to GPU index 0.
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

# Configure logging
# INFO level, "<timestamp> - <level> - <message>" format; every function below logs through `logger`.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Flask application object; the @app.route handlers further down register on it.
app = Flask(__name__)
# Applies flask_cors to the app with its default settings.
CORS(app)

# Global variables for processing state
processing_queue = []  # not referenced by any code visible in this cell
processing_status = {}  # job_id -> JobStatus, populated by process_batch_job()
current_job_id = None  # id of the job currently running, or None when idle
paddle_pipeline = None  # PPStructureV3 instance, assigned by initialize_paddle_ocr()
magic_dictionary = {}  # correction dictionary, assigned by load_magic_dictionary()

class JobStatus:
    """Mutable progress record for a single batch OCR job.

    An instance is created when a job starts and is updated in place while
    its files and pages are processed.
    """

    def __init__(self, job_id: str, total_files: int):
        # --- identity and lifecycle ---
        self.job_id = job_id
        self.start_time = datetime.now()
        self.status = "queued"  # queued, processing, completed, failed
        self.error_message = ""

        # --- file-level progress ---
        self.total_files = total_files
        self.processed_files = 0
        self.current_file = ""

        # --- page-level progress ---
        self.total_pages = 0
        self.processed_pages = 0
        self.failed_pages = []          # one dict per failed page/file
        self.low_confidence_pages = []  # one dict per page below the threshold

        # --- where results are written ---
        self.output_directory = ""

def initialize_paddle_ocr():
    """Create the shared PP-StructureV3 pipeline and store it in ``paddle_pipeline``.

    The constructor arguments are grouped into three dictionaries (model
    names, feature switches, tuning values) and passed together as keyword
    arguments.

    Returns:
        bool: True when the pipeline was built; False when importing
        ``paddleocr`` or constructing the pipeline raised (the error is logged).
    """
    global paddle_pipeline

    # Model backing each stage of the pipeline.
    stage_models = dict(
        # layout detection
        layout_detection_model_name="PP-DocLayout_plus-L",
        # text detection / recognition
        text_detection_model_name="PP-OCRv5_server_det",
        text_recognition_model_name="PP-OCRv5_server_rec",
        # document preprocessing
        doc_orientation_classify_model_name="PP-LCNet_x1_0_doc_ori",
        doc_unwarping_model_name="UVDoc",
        textline_orientation_model_name="PP-LCNet_x1_0_textline_ori",
        # table recognition
        table_classification_model_name="PP-LCNet_x1_0_table_cls",
        wired_table_structure_recognition_model_name="SLANeXt_wired",
        wireless_table_structure_recognition_model_name="SLANeXt_wireless",
        wired_table_cells_detection_model_name="RT-DETR-L_wired_table_cell_det",
        wireless_table_cells_detection_model_name="RT-DETR-L_wireless_table_cell_det",
        # formula and seal recognition
        formula_recognition_model_name="UniMERNet",
        seal_text_detection_model_name="PP-OCRv4_server_seal_det",
        # chart parsing
        chart_recognition_model_name="PP-Chart2Table",
    )

    # Every optional sub-pipeline is switched on.
    feature_switches = dict(
        use_doc_orientation_classify=True,
        use_doc_unwarping=True,
        use_textline_orientation=True,
        use_table_recognition=True,
        use_formula_recognition=True,
        use_seal_recognition=True,
        use_chart_recognition=True,
        use_region_detection=True,
    )

    # Thresholds, size limits and batch sizes.
    tuning = dict(
        layout_threshold=0.3,
        layout_nms=True,
        layout_unclip_ratio=1.5,
        layout_merge_bboxes_mode="large",

        text_det_limit_side_len=1280,
        text_det_limit_type='max',
        text_det_thresh=0.15,
        text_det_box_thresh=0.4,
        text_det_unclip_ratio=2.2,

        text_rec_score_thresh=0.0,
        text_recognition_batch_size=1,

        seal_det_limit_side_len=960,
        seal_det_thresh=0.1,
        seal_det_box_thresh=0.4,
        seal_det_unclip_ratio=1.0,
        seal_rec_score_thresh=0.0,

        formula_recognition_batch_size=1,
        chart_recognition_batch_size=1,
        textline_orientation_batch_size=1,
    )

    try:
        # Imported lazily so a missing paddleocr install is reported via the
        # return value instead of breaking the whole cell at import time.
        from paddleocr import PPStructureV3
        logger.info("Initializing PP-StructureV3 with maximum power settings...")

        paddle_pipeline = PPStructureV3(
            **stage_models,
            **feature_switches,
            **tuning,
            device='gpu',
        )

        logger.info("✅ PP-StructureV3 initialized successfully!")
        return True

    except Exception as e:
        logger.error(f"❌ Failed to initialize PaddleOCR: {e}")
        return False

def load_magic_dictionary(dictionary_path: str) -> Dict:
    """Load the correction dictionary from a JSON or CSV file into ``magic_dictionary``.

    Args:
        dictionary_path: Path to a ``.json`` or ``.csv`` file (extension is
            matched case-insensitively).

    Returns:
        The loaded dictionary object (a dict, or a list of row dicts for CSV),
        or ``{}`` when the file is missing, has an unsupported extension, has
        an unusable top-level type, or fails to parse. On any of those
        failures the previously loaded global dictionary is left untouched.
    """
    global magic_dictionary

    if not os.path.exists(dictionary_path):
        logger.warning(f"Dictionary file not found: {dictionary_path}")
        return {}

    # Compare the extension case-insensitively so "TERMS.JSON" is accepted too.
    suffix = os.path.splitext(dictionary_path)[1].lower()

    try:
        if suffix == '.json':
            with open(dictionary_path, 'r', encoding='utf-8') as f:
                loaded = json.load(f)
        elif suffix == '.csv':
            import pandas as pd
            df = pd.read_csv(dictionary_path)
            # NOTE(review): each record is keyed by the CSV column names, and
            # find_best_dictionary_match() compares words against those keys.
            # Confirm the intended CSV layout.
            loaded = df.to_dict('records') if not df.empty else {}
        else:
            # Previously this fell through and reported the stale dictionary
            # as freshly "loaded".
            logger.warning(f"Unsupported dictionary format: {dictionary_path}")
            return {}

        # The matching code only understands dict and list-of-dict shapes.
        if not isinstance(loaded, (dict, list)):
            logger.error(f"❌ Failed to load dictionary: unsupported top-level type {type(loaded).__name__}")
            return {}

        magic_dictionary = loaded
        logger.info(f"✅ Loaded magic dictionary with {len(magic_dictionary)} entries")
        return magic_dictionary

    except Exception as e:
        logger.error(f"❌ Failed to load dictionary: {e}")
        return {}

def extract_pdf_pages(pdf_path: str, temp_dir: str) -> List[str]:
    """Render every page of a PDF to a PNG file inside ``temp_dir``.

    Args:
        pdf_path: Path of the PDF to open.
        temp_dir: Existing directory that receives ``page_NNNN.png`` files.

    Returns:
        The image paths in page order, or ``[]`` if opening or rendering
        raised (the error is logged).
    """
    page_images = []

    try:
        # 2x zoom for better OCR; identical for every page, so build it once.
        zoom = fitz.Matrix(2.0, 2.0)

        # The context manager closes the document even when a page fails to
        # render; the explicit close() used before was skipped on exceptions.
        with fitz.open(pdf_path) as doc:
            for page_num in range(len(doc)):
                page = doc.load_page(page_num)
                pix = page.get_pixmap(matrix=zoom)

                img_path = os.path.join(temp_dir, f"page_{page_num + 1:04d}.png")
                pix.save(img_path)
                page_images.append(img_path)

        logger.info(f"✅ Extracted {len(page_images)} pages from PDF")
        return page_images

    except Exception as e:
        logger.error(f"❌ Failed to extract PDF pages: {e}")
        return []

def process_single_page(image_path: str, page_number: int, book_name: str,
                       confidence_threshold: float = 0.85) -> Dict:
    """Run the global pipeline on one page image and build a structured record.

    Args:
        image_path: Image file passed to ``paddle_pipeline.predict``.
        page_number: 1-based page number stored in the output.
        book_name: Book identifier stored in the output.
        confidence_threshold: Tokens whose recognition score is below this
            value are counted as low-confidence.

    Returns:
        On success a dict with ``book``, ``page``, ``text``, ``tokens``,
        ``layout``, ``processing_metadata`` and ``raw_paddle_result``.
        On failure a dict containing an ``error`` key (callers test for it).
    """
    try:
        # Maximum power inference with ultra-sensitive settings
        result = paddle_pipeline.predict(
            input=image_path,
            # Ultra-sensitive layout detection
            layout_threshold=0.25,
            layout_unclip_ratio=1.8,

            # Ultra-sensitive text detection
            text_det_thresh=0.1,
            text_det_box_thresh=0.3,
            text_det_unclip_ratio=2.5,
            text_rec_score_thresh=0.0,

            # Ultra-sensitive seal detection
            seal_det_thresh=0.08,
            seal_det_box_thresh=0.3,
            seal_rec_score_thresh=0.0,

            # Enable all advanced features
            use_doc_orientation_classify=True,
            use_doc_unwarping=True,
            use_textline_orientation=True,
            use_table_recognition=True,
            use_formula_recognition=True,
            use_seal_recognition=True,
            use_chart_recognition=True,
            use_region_detection=True
        )

        # Materialise the result in case predict() returns a lazy iterable
        # rather than a list, so the emptiness check and [0] are both safe.
        results = list(result) if result is not None else []
        if not results:
            return {"error": "No OCR results returned"}

        json_data = results[0].json  # first (only) page of this image

        res = json_data.get('res', {})
        ocr_res = res.get('overall_ocr_res', {})
        rec_texts = ocr_res.get('rec_texts', []) or []
        # Recognition scores run parallel to rec_texts. Ignoring them made
        # every token confidence 1.0, so nothing was ever "low confidence".
        rec_scores = ocr_res.get('rec_scores', []) or []
        layout_elements = res.get('layout_det_res', {}).get('boxes', [])

        text_parts = []
        tokens = []
        low_confidence_words = []

        for idx, text_item in enumerate(rec_texts):
            if isinstance(text_item, dict):
                # Tolerate per-line dicts carrying their own score.
                text = text_item.get('text', '')
                confidence = text_item.get('score', 1.0)
            else:
                text = str(text_item)
                # Fall back to 1.0 only when no score exists for this line.
                confidence = float(rec_scores[idx]) if idx < len(rec_scores) else 1.0

            text_parts.append(text)

            # Every word inherits the score of the line it came from.
            for word in text.split():
                word_data = {
                    "word": word.lower(),
                    "confidence": confidence,
                    "original": word
                }
                tokens.append(word_data)

                if confidence < confidence_threshold:
                    low_confidence_words.append(word_data)

        corrected_tokens = apply_magic_dictionary_corrections(tokens)
        layout_info = analyze_layout_structure(layout_elements)

        return {
            "book": book_name,
            "page": page_number,
            "text": " ".join(text_parts).strip(),
            "tokens": corrected_tokens,
            "layout": layout_info,
            "processing_metadata": {
                "timestamp": datetime.now().isoformat(),
                "total_tokens": len(tokens),
                "low_confidence_tokens": len(low_confidence_words),
                "confidence_threshold": confidence_threshold,
                "magic_corrections_applied": sum(1 for t in corrected_tokens if t.get('corrected', False))
            },
            "raw_paddle_result": json_data
        }

    except Exception as e:
        logger.error(f"❌ Failed to process page {page_number}: {e}")
        return {
            "error": f"Processing failed: {str(e)}",
            "book": book_name,
            "page": page_number,
            "timestamp": datetime.now().isoformat()
        }

def apply_magic_dictionary_corrections(tokens: List[Dict],
                                       correction_threshold: float = 0.9,
                                       confidence_boost: float = 0.1,
                                       boosted_confidence_cap: float = 0.95) -> List[Dict]:
    """Replace low-confidence words with their magic-dictionary match.

    Args:
        tokens: Token dicts with at least ``word`` and ``confidence`` keys.
        correction_threshold: Only tokens whose confidence is below this value
            are looked up (default 0.9, the previously hard-coded value).
        confidence_boost: Amount added to a corrected token's confidence
            (default 0.1).
        boosted_confidence_cap: Upper limit for a boosted confidence
            (default 0.95).

    Returns:
        A new list in which corrected tokens are copies carrying ``corrected``,
        ``original_word`` and ``correction_reason``; all other tokens are the
        original objects. When no dictionary is loaded the input list itself
        is returned.
    """
    if not magic_dictionary:
        return tokens

    corrected_tokens = []

    for token in tokens:
        word = token['word']
        confidence = token['confidence']

        best_match = None
        if confidence < correction_threshold:
            best_match = find_best_dictionary_match(word)

        if not best_match:
            corrected_tokens.append(token)
            continue

        # Copy so the caller's token list keeps the uncorrected values.
        corrected_token = token.copy()
        corrected_token['word'] = best_match['corrected']
        # Boost, but never above the cap and never below the original score
        # (the latter only matters for non-default cap values).
        corrected_token['confidence'] = max(
            confidence, min(boosted_confidence_cap, confidence + confidence_boost)
        )
        corrected_token['corrected'] = True
        corrected_token['original_word'] = word
        corrected_token['correction_reason'] = best_match['reason']
        corrected_tokens.append(corrected_token)

    return corrected_tokens

def find_best_dictionary_match(word: str) -> Optional[Dict]:
    """Look up ``word`` in the magic dictionary, exactly first and then fuzzily.

    The dictionary may be a mapping, or a list of mappings; both shapes get
    case-insensitive direct matching and edit-distance matching.

    Args:
        word: The token text to look up.

    Returns:
        ``{"corrected": value, "reason": "direct_match"}`` for a
        case-insensitive key match; otherwise, for words longer than three
        characters, the closest key within edit distance 2 as
        ``{"corrected", "distance", "reason"}`` (first such key wins ties);
        otherwise ``None``.
    """
    if not magic_dictionary:
        return None

    max_distance = 2
    word_lower = word.lower()

    # Fast path: O(1) hit on an exactly matching (already lowercase) key.
    if isinstance(magic_dictionary, dict) and word_lower in magic_dictionary:
        return {
            "corrected": magic_dictionary[word_lower],
            "reason": "direct_match"
        }

    def iter_entries():
        """Yield (key, value) pairs from either supported dictionary shape."""
        if isinstance(magic_dictionary, dict):
            yield from magic_dictionary.items()
        elif isinstance(magic_dictionary, list):
            for entry in magic_dictionary:
                if isinstance(entry, dict):
                    yield from entry.items()

    def edit_distance(s1: str, s2: str) -> int:
        """Levenshtein distance computed with two rolling rows."""
        if len(s1) < len(s2):
            s1, s2 = s2, s1
        if not s2:
            return len(s1)

        previous_row = list(range(len(s2) + 1))
        for i, c1 in enumerate(s1):
            current_row = [i + 1]
            for j, c2 in enumerate(s2):
                insertions = previous_row[j + 1] + 1
                deletions = current_row[j] + 1
                substitutions = previous_row[j] + (c1 != c2)
                current_row.append(min(insertions, deletions, substitutions))
            previous_row = current_row
        return previous_row[-1]

    # Fuzzy matching is only attempted for words longer than 3 characters;
    # the test does not depend on the dictionary key, so evaluate it once.
    allow_fuzzy = len(word) > 3
    best_candidate = None

    for key, value in iter_entries():
        if not isinstance(key, str):
            continue  # cannot compare non-text keys
        key_lower = key.lower()

        # An exact case-insensitive hit beats any fuzzy candidate.
        if key_lower == word_lower:
            return {"corrected": value, "reason": "direct_match"}

        if not allow_fuzzy:
            continue
        # The length difference is a lower bound on the edit distance.
        if abs(len(key_lower) - len(word_lower)) > max_distance:
            continue

        distance = edit_distance(word_lower, key_lower)
        if distance <= max_distance and (
            best_candidate is None or distance < best_candidate["distance"]
        ):
            best_candidate = {
                "corrected": value,
                "distance": distance,
                "reason": f"fuzzy_match_distance_{distance}"
            }

    return best_candidate

def analyze_layout_structure(layout_elements: List[Dict]) -> Dict:
    """Summarise layout-detection boxes: label counts, overall bbox, column count.

    Each element's box is read from its ``bbox`` key or, when that is absent,
    from ``coordinate`` (the key PaddleOCR layout results use). Elements with
    no usable box still count towards the label totals but do not affect the
    overall bounding box.

    Args:
        layout_elements: Box dicts from the layout detection result.

    Returns:
        Dict with ``columns`` (int), ``bbox`` ([min_x, min_y, max_x, max_y]),
        ``elements`` (label -> count) and ``total_elements``. Empty input
        yields the same keys with neutral values.
    """
    if not layout_elements:
        # Same shape as the populated result so consumers need no special case.
        return {"columns": 1, "bbox": [0, 0, 0, 0], "elements": {}, "total_elements": 0}

    def element_box(element: Dict):
        """Return the element's [x1, y1, x2, y2] box, or None if it has none."""
        for key in ('bbox', 'coordinate'):
            box = element.get(key)
            if box is not None and len(box) >= 4:
                return list(box[:4])
        return None

    element_counts = {}
    total_bbox = [float('inf'), float('inf'), 0, 0]  # [min_x, min_y, max_x, max_y]
    found_box = False
    text_elements = []

    for element in layout_elements:
        label = element.get('label', 'unknown')
        element_counts[label] = element_counts.get(label, 0) + 1

        box = element_box(element)
        if box is not None:
            found_box = True
            total_bbox[0] = min(total_bbox[0], box[0])
            total_bbox[1] = min(total_bbox[1], box[1])
            total_bbox[2] = max(total_bbox[2], box[2])
            total_bbox[3] = max(total_bbox[3], box[3])

        if 'text' in element.get('label', '').lower():
            # Hand the column estimator a copy with the box under 'bbox', so it
            # sees the coordinates whichever key the element used.
            text_elements.append({**element, 'bbox': box} if box is not None else element)

    # Simple column detection (can be enhanced)
    columns = estimate_column_count(text_elements)

    return {
        "columns": columns,
        "bbox": total_bbox if found_box else [0, 0, 0, 0],
        "elements": element_counts,
        "total_elements": len(layout_elements)
    }

def estimate_column_count(text_elements: List[Dict]) -> int:
    """Estimate the number of text columns from the x-centres of text boxes.

    Boxes are read from ``bbox`` or, when absent, ``coordinate`` (the key
    PaddleOCR layout results use). Elements without a usable box are ignored
    rather than being treated as a box at the origin.

    Heuristic: sort the x-centres, then count gaps between neighbours that
    exceed twice the average gap; each such gap is taken as a column boundary.

    Args:
        text_elements: Layout elements classified as text.

    Returns:
        Estimated column count, between 1 and 3 inclusive.
    """
    if not text_elements:
        return 1

    centers = []
    for element in text_elements:
        box = element.get('bbox')
        if box is None:
            box = element.get('coordinate')
        if box is None or len(box) < 4:
            continue
        centers.append((box[0] + box[2]) / 2)

    # Fewer than two centres means there are no gaps to analyse.
    if len(centers) < 2:
        return 1

    centers.sort()
    gaps = [right - left for left, right in zip(centers, centers[1:])]

    avg_gap = sum(gaps) / len(gaps)
    significant_gaps = sum(1 for gap in gaps if gap > avg_gap * 2)

    return min(significant_gaps + 1, 3)  # Cap at 3 columns

def process_single_file(file_path: str, output_dir: str, book_name: str,
                       job_status: JobStatus, confidence_threshold: float = 0.85) -> bool:
    """OCR one input file (PDF, ZIP of images, or single image) page by page.

    For every page a ``page_NNNN.json`` and ``page_NNNN.txt`` are written to
    ``<output_dir>/<book_name>/``, followed by ``<book_name>_complete.txt``.
    Progress, failures and low-confidence pages are recorded on ``job_status``.

    Args:
        file_path: Input file; the extension selects how pages are obtained.
        output_dir: Root output directory.
        book_name: Sub-directory and file-name prefix for this input.
        job_status: Shared progress record, updated in place. If its
            ``status`` becomes ``"stopped"`` the remaining pages are skipped.
        confidence_threshold: Pages whose mean token confidence is below this
            value are listed as low-confidence.

    Returns:
        True if the file was processed to the end; False if it was
        unsupported, had no pages, was stopped early, or raised.
    """
    temp_dir = None
    try:
        file_ext = Path(file_path).suffix.lower()
        page_images = []

        if file_ext == '.pdf':
            # Extract PDF pages
            temp_dir = tempfile.mkdtemp()
            page_images = extract_pdf_pages(file_path, temp_dir)

        elif file_ext == '.zip':
            # Extract ZIP contents
            temp_dir = tempfile.mkdtemp()
            with zipfile.ZipFile(file_path, 'r') as zip_ref:
                zip_ref.extractall(temp_dir)

            # Collect into a set: on case-insensitive filesystems the lower-
            # and upper-case patterns match the same files twice.
            image_extensions = ['*.jpg', '*.jpeg', '*.png', '*.bmp', '*.tiff']
            found = set()
            for ext in image_extensions:
                found.update(glob.glob(os.path.join(temp_dir, '**', ext), recursive=True))
                found.update(glob.glob(os.path.join(temp_dir, '**', ext.upper()), recursive=True))

            page_images = sorted(found)

        elif file_ext in ['.jpg', '.jpeg', '.png', '.bmp', '.tiff']:
            # Single image file
            page_images = [file_path]

        else:
            logger.error(f"Unsupported file format: {file_ext}")
            return False

        if not page_images:
            logger.error(f"No processable pages found in {file_path}")
            return False

        job_status.total_pages += len(page_images)

        # Create book output directory
        book_output_dir = os.path.join(output_dir, book_name)
        os.makedirs(book_output_dir, exist_ok=True)

        all_pages_text = []
        stopped = False

        for page_idx, image_path in enumerate(page_images):
            # The stop endpoint only flips the status flag; honour it here so
            # a stop request actually ends the work after the current page.
            if job_status.status == "stopped":
                logger.info(f"Stop requested; skipping remaining pages of {book_name}")
                stopped = True
                break

            try:
                job_status.current_file = f"{book_name} - Page {page_idx + 1}/{len(page_images)}"

                page_result = process_single_page(
                    image_path,
                    page_idx + 1,
                    book_name,
                    confidence_threshold
                )

                if "error" in page_result:
                    job_status.failed_pages.append({
                        "book": book_name,
                        "page": page_idx + 1,
                        "error": page_result["error"]
                    })
                    continue

                # Mean token confidence; skipped for pages without tokens
                # (a mean over nothing would be NaN plus a runtime warning).
                confidences = [t['confidence'] for t in page_result.get('tokens', [])]
                if confidences:
                    avg_confidence = float(sum(confidences) / len(confidences))
                    if avg_confidence < confidence_threshold:
                        job_status.low_confidence_pages.append({
                            "book": book_name,
                            "page": page_idx + 1,
                            "confidence": avg_confidence
                        })

                # Save individual page outputs
                page_name = f"page_{page_idx + 1:04d}"

                json_path = os.path.join(book_output_dir, f"{page_name}.json")
                with open(json_path, 'w', encoding='utf-8') as f:
                    json.dump(page_result, f, indent=2, ensure_ascii=False)

                txt_path = os.path.join(book_output_dir, f"{page_name}.txt")
                with open(txt_path, 'w', encoding='utf-8') as f:
                    f.write(page_result.get('text', ''))

                all_pages_text.append(page_result.get('text', ''))
                job_status.processed_pages += 1

            except Exception as e:
                logger.error(f"❌ Failed to process page {page_idx + 1}: {e}")
                job_status.failed_pages.append({
                    "book": book_name,
                    "page": page_idx + 1,
                    "error": str(e)
                })

        # Save complete book text (whatever was processed, even after a stop)
        complete_book_path = os.path.join(book_output_dir, f"{book_name}_complete.txt")
        with open(complete_book_path, 'w', encoding='utf-8') as f:
            f.write("\n\n--- PAGE BREAK ---\n\n".join(all_pages_text))

        if stopped:
            return False

        job_status.processed_files += 1
        return True

    except Exception as e:
        logger.error(f"❌ Failed to process file {file_path}: {e}")
        job_status.failed_pages.append({
            "book": book_name,
            "file": file_path,
            "error": str(e)
        })
        return False

    finally:
        # Always remove extracted pages; the early returns and the exception
        # path used to leave the temporary directory behind.
        if temp_dir and os.path.exists(temp_dir):
            shutil.rmtree(temp_dir, ignore_errors=True)

def process_batch_job(job_id: str, input_files: List[str], output_dir: str,
                     dictionary_path: str = "", confidence_threshold: float = 0.85,
                     max_workers: int = 2):
    """Run one batch job to completion; intended as a background-thread target.

    Registers a ``JobStatus`` in ``processing_status``, optionally loads the
    magic dictionary, processes every input file through a thread pool,
    writes the batch report, and records the final status.

    Args:
        job_id: Identifier under which the job is registered.
        input_files: Paths of the files to process.
        output_dir: Root directory for outputs and the report.
        dictionary_path: Optional dictionary file to load before processing.
        confidence_threshold: Forwarded to ``process_single_file``.
        max_workers: Thread-pool size; values below 1 are treated as 1.

    Returns:
        None. The outcome is recorded on the job's status object:
        ``"completed"``, ``"failed"``, or ``"stopped"`` when a stop request
        was observed.
    """
    global processing_status, current_job_id

    current_job_id = job_id
    job_status = JobStatus(job_id, len(input_files))
    job_status.status = "processing"
    job_status.output_directory = output_dir
    processing_status[job_id] = job_status

    try:
        # Load magic dictionary if provided
        if dictionary_path and os.path.exists(dictionary_path):
            load_magic_dictionary(dictionary_path)

        os.makedirs(output_dir, exist_ok=True)

        # ThreadPoolExecutor rejects max_workers <= 0, so clamp it.
        pool_size = max(1, int(max_workers))
        with ThreadPoolExecutor(max_workers=pool_size) as executor:
            futures = [
                executor.submit(
                    process_single_file,
                    file_path,
                    output_dir,
                    Path(file_path).stem,
                    job_status,
                    confidence_threshold
                )
                for file_path in input_files
            ]

            for future in as_completed(futures):
                if future.cancelled():
                    continue  # cancelled below after a stop request
                try:
                    future.result()
                except Exception as e:
                    logger.error(f"❌ Thread execution failed: {e}")

                # After a stop request, drop files that have not started yet.
                if job_status.status == "stopped":
                    for pending in futures:
                        pending.cancel()

        # Generate final report (also for stopped jobs, covering what ran)
        generate_batch_report(job_status, output_dir)

        if job_status.status == "stopped":
            # Keep the "stopped" marker instead of overwriting it.
            logger.info(f"Batch job {job_id} stopped before completion")
        else:
            job_status.status = "completed"
            logger.info(f"✅ Batch job {job_id} completed successfully!")

    except Exception as e:
        job_status.status = "failed"
        job_status.error_message = str(e)
        logger.error(f"❌ Batch job {job_id} failed: {e}")
        logger.error(traceback.format_exc())

    finally:
        # Only release the slot if it is still ours: after a stop request a
        # newer job may already have claimed current_job_id.
        if current_job_id == job_id:
            current_job_id = None

def generate_batch_report(job_status: JobStatus, output_dir: str):
    """Write the JSON report and the plain-text log for a batch job.

    Two files are created in ``output_dir``:
    ``batch_report_<job_id>.json`` and ``batch_log_<job_id>.log``.

    Args:
        job_status: Progress record of the job being reported.
        output_dir: Directory that receives both files.
    """
    finished_at = datetime.now()
    elapsed = finished_at - job_status.start_time
    # Floor the divisor so a near-instant job cannot divide by zero.
    elapsed_hours = max(elapsed.total_seconds() / 3600, 0.001)

    failures = job_status.failed_pages
    low_confidence = job_status.low_confidence_pages
    job_id = job_status.job_id

    report = {
        "job_id": job_id,
        "processing_summary": {
            "total_files": job_status.total_files,
            "processed_files": job_status.processed_files,
            "total_pages": job_status.total_pages,
            "processed_pages": job_status.processed_pages,
            "failed_pages": len(failures),
            "low_confidence_pages": len(low_confidence)
        },
        "timing": {
            "start_time": job_status.start_time.isoformat(),
            "end_time": finished_at.isoformat(),
            "duration_seconds": elapsed.total_seconds(),
            "pages_per_hour": job_status.processed_pages / elapsed_hours
        },
        "failed_pages": failures,
        "low_confidence_pages": low_confidence,
        "magic_dictionary_stats": {
            "dictionary_loaded": len(magic_dictionary) > 0,
            "dictionary_entries": len(magic_dictionary)
        }
    }

    # Machine-readable report
    json_target = os.path.join(output_dir, f"batch_report_{job_id}.json")
    with open(json_target, 'w', encoding='utf-8') as handle:
        json.dump(report, handle, indent=2, ensure_ascii=False)

    # Human-readable log
    log_target = os.path.join(output_dir, f"batch_log_{job_id}.log")
    with open(log_target, 'w', encoding='utf-8') as handle:
        handle.writelines([
            "OCR Batch Processing Log\n",
            f"Job ID: {job_id}\n",
            f"Started: {job_status.start_time}\n",
            f"Completed: {finished_at}\n",
            f"Duration: {elapsed}\n\n",
        ])

        handle.writelines([
            "SUMMARY:\n",
            f"- Files processed: {job_status.processed_files}/{job_status.total_files}\n",
            f"- Pages processed: {job_status.processed_pages}/{job_status.total_pages}\n",
            f"- Failed pages: {len(failures)}\n",
            f"- Low confidence pages: {len(low_confidence)}\n",
            f"- Processing rate: {report['timing']['pages_per_hour']:.1f} pages/hour\n\n",
        ])

        if failures:
            handle.write("FAILED PAGES:\n")
            # Generator keeps the writes incremental, one line per entry.
            handle.writelines(
                f"- {item.get('book', 'Unknown')}, Page {item.get('page', '?')}: {item.get('error', 'Unknown error')}\n"
                for item in failures
            )

        if low_confidence:
            handle.write("\nLOW CONFIDENCE PAGES (Manual Review Recommended):\n")
            handle.writelines(
                f"- {item.get('book', 'Unknown')}, Page {item.get('page', '?')}: {item.get('confidence', 0):.2f}\n"
                for item in low_confidence
            )

# Flask Routes
@app.route('/')
def index():
    """Serve the root URL by rendering the ``index.html`` template."""
    return render_template('index.html')

@app.route('/api/status')
def get_status():
    """Return progress for the running job, or ``{"status": "idle"}``.

    Returns:
        JSON with ``job_id``, ``status``, ``progress`` and ``stats`` when a
        job is registered as current; otherwise ``{"status": "idle"}``.
    """
    # Read the global once: the worker thread and the stop endpoint may reset
    # it between a membership test and a lookup, which would raise KeyError.
    active_id = current_job_id
    job = processing_status.get(active_id) if active_id else None

    if job is None:
        return jsonify({"status": "idle"})

    # One timestamp so elapsed time and rate describe the same instant.
    elapsed = datetime.now() - job.start_time
    elapsed_hours = max(elapsed.total_seconds() / 3600, 0.001)  # avoid /0

    return jsonify({
        "job_id": job.job_id,
        "status": job.status,
        "progress": {
            "files": f"{job.processed_files}/{job.total_files}",
            "pages": f"{job.processed_pages}/{job.total_pages}",
            "current_file": job.current_file
        },
        "stats": {
            "failed_pages": len(job.failed_pages),
            "low_confidence_pages": len(job.low_confidence_pages),
            "elapsed_time": str(elapsed),
            "pages_per_hour": job.processed_pages / elapsed_hours
        }
    })

@app.route('/api/initialize', methods=['POST'])
def initialize_ocr():
    """Build the OCR pipeline on request and report the outcome as JSON."""
    ok = initialize_paddle_ocr()

    if ok:
        message = "PaddleOCR initialized successfully!"
    else:
        message = "Failed to initialize PaddleOCR"

    return jsonify({"success": ok, "message": message})

@app.route('/api/upload-dictionary', methods=['POST'])
def upload_dictionary():
    """Store an uploaded dictionary file under ``./dictionaries`` and load it.

    Expects a multipart upload with a ``dictionary`` file part whose name
    ends in ``.json`` or ``.csv``.

    Returns:
        JSON with ``success``, ``message`` and the saved ``path``; or a 400
        response with an ``error`` message for a missing, unnamed, unsafe or
        unsupported file.
    """
    if 'dictionary' not in request.files:
        return jsonify({"error": "No dictionary file provided"}), 400

    file = request.files['dictionary']
    if not file.filename:
        return jsonify({"error": "No file selected"}), 400

    # The filename is client-controlled: keep only its final path component so
    # values such as "../../x.json" cannot write outside the dictionary folder.
    safe_name = os.path.basename(file.filename.replace('\\', '/'))
    if safe_name in ('', '.', '..'):
        return jsonify({"error": "Invalid dictionary file name"}), 400

    # Reject types the loader cannot read instead of saving them and then
    # reporting success.
    if os.path.splitext(safe_name)[1].lower() not in ('.json', '.csv'):
        return jsonify({"error": "Dictionary must be a .json or .csv file"}), 400

    # Save dictionary file
    dict_dir = os.path.join(os.getcwd(), 'dictionaries')
    os.makedirs(dict_dir, exist_ok=True)

    dict_path = os.path.join(dict_dir, safe_name)
    file.save(dict_path)

    # Load dictionary
    dictionary = load_magic_dictionary(dict_path)

    return jsonify({
        "success": True,
        "message": f"Dictionary loaded with {len(dictionary)} entries",
        "path": dict_path
    })

@app.route('/api/start-batch', methods=['POST'])
def start_batch_processing():
    """Validate a batch request and launch it in a background thread.

    Expected JSON body keys: ``input_directory``, ``output_directory``, and
    optionally ``dictionary_path``, ``confidence_threshold`` (float) and
    ``max_workers`` (int >= 1).

    Returns:
        JSON with ``success``, ``job_id``, ``files_found`` and ``message`` on
        success; a 400 response with an ``error`` message when a job is
        already running, the pipeline is not initialised, or the request is
        invalid.
    """
    global current_job_id

    if current_job_id:
        return jsonify({"error": "Another job is already running"}), 400

    if not paddle_pipeline:
        return jsonify({"error": "PaddleOCR not initialized. Please initialize first."}), 400

    # A missing or malformed body yields None; answer 400 rather than
    # crashing on None.get(...).
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        return jsonify({"error": "Request body must be a JSON object"}), 400

    input_directory = data.get('input_directory', '')
    output_directory = data.get('output_directory', '')
    dictionary_path = data.get('dictionary_path', '')

    try:
        confidence_threshold = float(data.get('confidence_threshold', 0.85))
        max_workers = int(data.get('max_workers', 2))
    except (TypeError, ValueError):
        return jsonify({"error": "confidence_threshold and max_workers must be numeric"}), 400

    if max_workers < 1:
        return jsonify({"error": "max_workers must be at least 1"}), 400

    if not input_directory or not os.path.exists(input_directory):
        return jsonify({"error": "Invalid input directory"}), 400

    if not output_directory:
        return jsonify({"error": "Output directory not specified"}), 400

    # Find all processable files. With recursive=True the '**' segment also
    # matches zero directories, so one recursive glob per pattern covers the
    # top level and every sub-directory.
    supported_extensions = ['*.pdf', '*.zip', '*.jpg', '*.jpeg', '*.png', '*.bmp', '*.tiff']
    found = set()
    for ext in supported_extensions:
        for pattern in (ext, ext.upper()):
            found.update(glob.glob(os.path.join(input_directory, '**', pattern), recursive=True))

    # Sorted so files are processed in a reproducible order.
    input_files = sorted(found)

    if not input_files:
        return jsonify({"error": "No processable files found in input directory"}), 400

    job_id = str(uuid.uuid4())

    # Claim the job slot before the thread starts; otherwise a second request
    # arriving before the worker sets current_job_id would also be accepted.
    current_job_id = job_id

    thread = threading.Thread(
        target=process_batch_job,
        args=(job_id, input_files, output_directory, dictionary_path, confidence_threshold, max_workers)
    )
    thread.daemon = True
    try:
        thread.start()
    except Exception:
        current_job_id = None  # release the slot if the worker never started
        raise

    return jsonify({
        "success": True,
        "job_id": job_id,
        "files_found": len(input_files),
        "message": f"Started processing {len(input_files)} files"
    })

@app.route('/api/stop-batch', methods=['POST'])
def stop_batch_processing():
    """Flag the active batch job as stopped and clear the active-job marker.

    Responds with a 400 JSON error when no job id is currently recorded in
    ``current_job_id``. Otherwise the matching entry in ``processing_status``
    (if there is one) gets its ``status`` set to ``"stopped"``,
    ``current_job_id`` is reset to ``None`` and a success payload is returned.
    """
    global current_job_id

    active_job_id = current_job_id
    if not active_job_id:
        return jsonify({"error": "No job currently running"}), 400

    # Only touch the status record when the job is actually tracked.
    if active_job_id in processing_status:
        tracked_job = processing_status[active_job_id]
        tracked_job.status = "stopped"

    current_job_id = None

    response_payload = {"success": True, "message": "Batch processing stopped"}
    return jsonify(response_payload)

@app.route('/api/jobs')
def get_all_jobs():
    """Return a JSON array with one summary object per entry in ``processing_status``.

    Failed and low-confidence pages are reported as counts (``len`` of the
    respective collections), and ``start_time`` is serialised with
    ``isoformat()``.
    """
    job_summaries = [
        {
            "job_id": tracked_id,
            "status": tracked_job.status,
            "total_files": tracked_job.total_files,
            "processed_files": tracked_job.processed_files,
            "total_pages": tracked_job.total_pages,
            "processed_pages": tracked_job.processed_pages,
            "failed_pages": len(tracked_job.failed_pages),
            "low_confidence_pages": len(tracked_job.low_confidence_pages),
            "start_time": tracked_job.start_time.isoformat(),
            "output_directory": tracked_job.output_directory,
        }
        for tracked_id, tracked_job in processing_status.items()
    ]
    return jsonify(job_summaries)

@app.route('/api/download-report/<job_id>')
def download_report(job_id):
    """Send the JSON batch report of ``job_id`` as a file attachment.

    The report is looked up as ``batch_report_<job_id>.json`` inside the job's
    recorded output directory. A 404 JSON error is returned when the job is
    not present in ``processing_status`` or the report file does not exist.
    """
    if job_id in processing_status:
        report_name = "batch_report_{}.json".format(job_id)
        report_path = os.path.join(
            processing_status[job_id].output_directory, report_name
        )
        if os.path.exists(report_path):
            return send_file(report_path, as_attachment=True)
        not_found_reason = "Report not found"
    else:
        not_found_reason = "Job not found"

    return jsonify({"error": not_found_reason}), 404

@app.route('/api/test-single', methods=['POST'])
def test_single_page():
    """Run OCR on a single uploaded image and return the result as JSON.

    Expects a multipart upload in the ``image`` form field. The upload is
    written into a fresh temporary directory, handed to
    ``process_single_page`` and the directory is removed again whether or not
    processing succeeds.

    Returns:
        200 with ``{"success": True, "result": ...}`` on success.
        400 when the pipeline is not initialised, no file was supplied, or the
        supplied filename is unusable.
        500 with an ``error`` message when saving or processing raises.
    """
    if not paddle_pipeline:
        return jsonify({"error": "PaddleOCR not initialized"}), 400

    if 'image' not in request.files:
        return jsonify({"error": "No image file provided"}), 400

    file = request.files['image']
    # Covers both an empty string and a missing (None) filename.
    if not file.filename:
        return jsonify({"error": "No file selected"}), 400

    # The filename is client-controlled. Keep only its final path component so
    # that absolute paths or "../" segments cannot make os.path.join point
    # outside the temporary directory. Backslashes are normalised first so
    # Windows-style separators are handled as well.
    safe_name = os.path.basename(file.filename.replace('\\', '/'))
    if safe_name in ('', '.', '..'):
        return jsonify({"error": "Invalid file name"}), 400

    temp_dir = tempfile.mkdtemp()
    try:
        temp_path = os.path.join(temp_dir, safe_name)
        file.save(temp_path)

        # Process single page
        result = process_single_page(temp_path, 1, "test_image", 0.85)

        return jsonify({
            "success": True,
            "result": result
        })
    except Exception as e:
        return jsonify({
            "error": f"Processing failed: {str(e)}"
        }), 500
    finally:
        # Single cleanup point; a failed cleanup must not mask the response.
        shutil.rmtree(temp_dir, ignore_errors=True)

@app.route('/api/validate-paths', methods=['POST'])
def validate_paths():
    """Validate the input and output directories named in the JSON request body.

    Reads ``input_directory`` and ``output_directory`` from the request JSON.

    Returns a JSON object with:
        input_valid: whether the input path is an existing directory.
        output_valid: ``True`` unless creating the output directory failed.
            Note that a non-empty output path is actually created on disk as
            part of this check.
        input_files_found: number of distinct files under the input directory
            (searched recursively) matching the supported extensions.
    """
    # A JSON body of `null`, or one that is not an object, has no `.get`.
    data = request.get_json() or {}
    if not isinstance(data, dict):
        data = {}

    input_dir = data.get('input_directory', '')
    output_dir = data.get('output_directory', '')

    # Non-string values must not reach os.path / glob: an int, for instance,
    # would be interpreted as a file descriptor. Treat them as "not given".
    if not isinstance(input_dir, str):
        input_dir = ''
    if not isinstance(output_dir, str):
        output_dir = ''

    validation = {
        # isdir() already implies existence.
        "input_valid": os.path.isdir(input_dir),
        "output_valid": True,  # We can create output directory
        "input_files_found": 0
    }

    if validation["input_valid"]:
        # Count processable files
        supported_extensions = ['*.pdf', '*.zip', '*.jpg', '*.jpeg', '*.png', '*.bmp', '*.tiff']
        input_files = set()

        for ext in supported_extensions:
            # Lower- and upper-case spellings, top level and subdirectories.
            for pattern in (ext, ext.upper()):
                input_files.update(glob.glob(os.path.join(input_dir, pattern)))
                input_files.update(
                    glob.glob(os.path.join(input_dir, '**', pattern), recursive=True)
                )

        validation["input_files_found"] = len(input_files)

    # Test output directory creation
    if output_dir:
        try:
            os.makedirs(output_dir, exist_ok=True)
            validation["output_valid"] = True
        except (OSError, ValueError):
            # OSError: permissions, path exists as a file, etc.
            # ValueError: e.g. an embedded NUL byte in the path.
            validation["output_valid"] = False

    return jsonify(validation)

# if __name__ == '__main__':
#     # Setup ngrok
#     if NGROK_AUTH_TOKEN:
#         ngrok.set_auth_token("NGROK_AUTH_TOKEN")
#         logger.info("ngrok auth token set successfully.")
#     else:
#         logger.warning("NGROK_AUTH_TOKEN not set. ngrok might not work.")

#     # Start ngrok tunnel
#     public_url = ngrok.connect(FLASK_PORT).public_url
#     logger.info(f"🚀 Your Magic OCR app is live at: {public_url}")

#     # Start Flask app
#     # Setting use_reloader=False is important for environments like Colab
#     app.run(host="0.0.0.0", port=FLASK_PORT, debug=FLASK_DEBUG, use_reloader=False)

In [None]:
# --- Core Setup (Move this BEFORE ngrok functions) ---
import logging
import os
import subprocess
import sys

# Set up logging first so that everything below can report through `logger`.
LOGGING_LEVEL = os.getenv("LOGGING_LEVEL", "INFO").upper()
logging.basicConfig(level=getattr(logging, LOGGING_LEVEL, logging.INFO),
                   format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger("flask_qwen_vl_app")

# --- Ngrok Setup ---
from pyngrok import ngrok

# Secrets must not live in the notebook source: the auth token is read from
# the environment (set NGROK_AUTH_TOKEN before running this cell). An empty
# value means "no token configured".
NGROK_AUTH_TOKEN = os.getenv("NGROK_AUTH_TOKEN", "")

# Static domain, overridable through the environment. Only the bare host name
# is wanted, so a scheme prefix such as "https://" and any surrounding
# slashes or whitespace are stripped.
NGROK_STATIC_DOMAIN = (
    os.getenv("NGROK_STATIC_DOMAIN", "adsd-adsd-adsd.ngrok-free.app")
    .strip()
    .split("://", 1)[-1]
    .strip("/")
)

FLASK_PORT = int(os.getenv("FLASK_PORT", 5000))  # Use 5000 as default to match your original code

def install_ngrok():
    """Ensure the ``pyngrok`` package is importable, installing it with pip if needed.

    Raises:
        subprocess.CalledProcessError: if the pip invocation exits non-zero.
        ImportError: if ``pyngrok`` still cannot be imported after installation.
    """
    import importlib

    try:
        import pyngrok  # noqa: F401 -- imported only to probe availability
        logger.info("pyngrok is already installed")
    except ImportError:
        logger.info("Installing pyngrok...")
        subprocess.check_call([sys.executable, "-m", "pip", "install", "pyngrok"])
        # The package was installed while this interpreter was running; the
        # import system's finders may hold stale caches, so invalidate them
        # before retrying the import.
        importlib.invalidate_caches()
        import pyngrok  # noqa: F401
        logger.info("pyngrok installed successfully")

def setup_ngrok():
    """Open an ngrok tunnel to ``FLASK_PORT`` and return its public URL.

    Steps: make sure pyngrok is installed, register the auth token when one is
    configured, kill any running ngrok process, then connect using
    ``NGROK_STATIC_DOMAIN`` as hostname when it is set, or without a hostname
    otherwise.

    Returns:
        The tunnel's public URL, or ``None`` when any step raised; the failure
        is logged and the caller is expected to continue locally.
    """
    try:
        install_ngrok()

        if not NGROK_AUTH_TOKEN:
            logger.warning("No NGROK_AUTH_TOKEN provided - ngrok may not work without authentication")
        else:
            ngrok.set_auth_token(NGROK_AUTH_TOKEN)
            logger.info("ngrok auth token set successfully")

        # Start from a clean slate before opening the new tunnel.
        ngrok.kill()

        if NGROK_STATIC_DOMAIN:
            tunnel = ngrok.connect(FLASK_PORT, hostname=NGROK_STATIC_DOMAIN)
            domain_note = f"🌐 Using static domain: {NGROK_STATIC_DOMAIN}"
        else:
            tunnel = ngrok.connect(FLASK_PORT)
            domain_note = "🌐 Using random domain"

        tunnel_url = tunnel.public_url
        logger.info(domain_note)
        logger.info(f"🔗 Your Flask app is now accessible at: {tunnel_url}")
        return tunnel_url

    except Exception as exc:
        # Best effort: a tunnel failure must not prevent the app from starting.
        logger.error(f"Failed to setup ngrok: {exc}")
        logger.info("App will run locally only")
        return None

# Entry point: try to open the ngrok tunnel, then serve the Flask app.
if __name__ == '__main__':
    # setup_ngrok() sets the auth token itself and returns None on failure.
    public_url = setup_ngrok()

    logger.info(
        f"🚀 Your Magic OCR app is live at: {public_url}"
        if public_url
        else f"🏠 App running locally at: http://localhost:{FLASK_PORT}"
    )

    # use_reloader=False is important for environments like Colab.
    app.run(
        host="0.0.0.0",
        port=FLASK_PORT,
        debug=False,
        use_reloader=False,
    )

 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on all addresses (0.0.0.0)
 * Running on http://127.0.0.1:5000
 * Running on http://172.28.0.12:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m
INFO:werkzeug:127.0.0.1 - - [31/Aug/2025 08:31:11] "GET / HTTP/1.1" 200 -

A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.2.6 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "/usr/lib/python3.12/threading.py", line 1032, in _bootstrap
    self._bootstrap_inner()
  File "/usr/lib/python3.12/threading.py", line 1075, in _bootstrap_inner
    self.run()
  File "/usr/lib/python3.12/threading.py", line 1012, in run
    self._target(*self._args, **self._kwargs)
  File

Fetching 6 files:   0%|          | 0/6 [00:00<?, ?it/s]

inference.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

README.md: 0.00B [00:00, ?B/s]

.gitattributes: 0.00B [00:00, ?B/s]

inference.yml:   0%|          | 0.00/766 [00:00<?, ?B/s]

inference.pdiparams:   0%|          | 0.00/6.75M [00:00<?, ?B/s]

[32mCreating model: ('UVDoc', None)[0m
[32mUsing official model (UVDoc), the model files will be automatically downloaded and saved in `/root/.paddlex/official_models/UVDoc`.[0m


Fetching 6 files:   0%|          | 0/6 [00:00<?, ?it/s]

README.md: 0.00B [00:00, ?B/s]

.gitattributes: 0.00B [00:00, ?B/s]

inference.json: 0.00B [00:00, ?B/s]

inference.yml:   0%|          | 0.00/330 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

inference.pdiparams:   0%|          | 0.00/32.1M [00:00<?, ?B/s]

[32mCreating model: ('PP-DocBlockLayout', None)[0m
[32mUsing official model (PP-DocBlockLayout), the model files will be automatically downloaded and saved in `/root/.paddlex/official_models/PP-DocBlockLayout`.[0m


Fetching 6 files:   0%|          | 0/6 [00:00<?, ?it/s]

INFO:werkzeug:127.0.0.1 - - [31/Aug/2025 08:31:32] "GET /api/jobs HTTP/1.1" 200 -


README.md: 0.00B [00:00, ?B/s]

inference.pdiparams:   0%|          | 0.00/129M [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

inference.json: 0.00B [00:00, ?B/s]

inference.yml: 0.00B [00:00, ?B/s]

.gitattributes: 0.00B [00:00, ?B/s]

[32mCreating model: ('PP-DocLayout_plus-L', None)[0m
[32mUsing official model (PP-DocLayout_plus-L), the model files will be automatically downloaded and saved in `/root/.paddlex/official_models/PP-DocLayout_plus-L`.[0m


Fetching 6 files:   0%|          | 0/6 [00:00<?, ?it/s]

config.json: 0.00B [00:00, ?B/s]

inference.pdiparams:   0%|          | 0.00/129M [00:00<?, ?B/s]

.gitattributes: 0.00B [00:00, ?B/s]

inference.json: 0.00B [00:00, ?B/s]

README.md: 0.00B [00:00, ?B/s]

inference.yml: 0.00B [00:00, ?B/s]

INFO:werkzeug:127.0.0.1 - - [31/Aug/2025 08:31:42] "GET /api/jobs HTTP/1.1" 200 -
[32mCreating model: ('PP-LCNet_x1_0_textline_ori', None)[0m
[32mUsing official model (PP-LCNet_x1_0_textline_ori), the model files will be automatically downloaded and saved in `/root/.paddlex/official_models/PP-LCNet_x1_0_textline_ori`.[0m


Fetching 6 files:   0%|          | 0/6 [00:00<?, ?it/s]

.gitattributes: 0.00B [00:00, ?B/s]

README.md: 0.00B [00:00, ?B/s]

inference.yml:   0%|          | 0.00/735 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

inference.json: 0.00B [00:00, ?B/s]

inference.pdiparams:   0%|          | 0.00/6.74M [00:00<?, ?B/s]

[32mCreating model: ('PP-OCRv5_server_det', None)[0m
[32mUsing official model (PP-OCRv5_server_det), the model files will be automatically downloaded and saved in `/root/.paddlex/official_models/PP-OCRv5_server_det`.[0m


Fetching 6 files:   0%|          | 0/6 [00:00<?, ?it/s]

inference.yml:   0%|          | 0.00/903 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

.gitattributes: 0.00B [00:00, ?B/s]

inference.json: 0.00B [00:00, ?B/s]

README.md: 0.00B [00:00, ?B/s]

inference.pdiparams:   0%|          | 0.00/87.9M [00:00<?, ?B/s]

[32mCreating model: ('PP-OCRv5_server_rec', None)[0m
[32mUsing official model (PP-OCRv5_server_rec), the model files will be automatically downloaded and saved in `/root/.paddlex/official_models/PP-OCRv5_server_rec`.[0m


Fetching 6 files:   0%|          | 0/6 [00:00<?, ?it/s]

.gitattributes: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

README.md: 0.00B [00:00, ?B/s]

inference.yml: 0.00B [00:00, ?B/s]

inference.pdiparams:   0%|          | 0.00/84.4M [00:00<?, ?B/s]

inference.json: 0.00B [00:00, ?B/s]

[32mCreating model: ('PP-OCRv4_server_seal_det', None)[0m
INFO:werkzeug:127.0.0.1 - - [31/Aug/2025 08:31:53] "GET /api/jobs HTTP/1.1" 200 -
[32mUsing official model (PP-OCRv4_server_seal_det), the model files will be automatically downloaded and saved in `/root/.paddlex/official_models/PP-OCRv4_server_seal_det`.[0m


Fetching 6 files:   0%|          | 0/6 [00:00<?, ?it/s]

inference.pdiparams:   0%|          | 0.00/114M [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

inference.json: 0.00B [00:00, ?B/s]

inference.yml:   0%|          | 0.00/925 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

.gitattributes: 0.00B [00:00, ?B/s]