# OCR Test with Tesseract

## Requirements
1. **Install Tesseract OCR**: 
   - Install Tesseract: `winget install --id UB-Mannheim.TesseractOCR` 
ion    - add it to system variables 
2. **Python packages**: 
   - `uv add opencv-python pytesseract pillow`
   - `uv add nltk` (for advanced text correction)
3. **Additional packages** (for AI correction):
   - `uv add requests` (for API-based corrections)
   - `uv add transformers torch` (for local model corrections - optional)

## Only Pdf (textual format)

In [1]:
import pdfplumber
import os

def extract_pdf(input_pdf: str, output_txt: str):
    if not os.path.exists(input_pdf):
        raise FileNotFoundError(f'Input PDF not found: {input_pdf}')

    out_dir = os.path.dirname(output_txt) or '.'
    os.makedirs(out_dir, exist_ok=True)

    pages = []
    with pdfplumber.open(input_pdf) as pdf:
        for page in pdf.pages:
            pages.append(page.extract_text() or '')

    text = '\n'.join(pages).strip()

    with open(output_txt, 'w', encoding='utf-8') as f:
        f.write(text)

    return text

# Defaults (change as needed)
input_pdf = './Tests files/MC test.pdf'
output_txt = './Tests output/output_from_MC test.txt'

# Run
extracted_text = extract_pdf(input_pdf, output_txt)
print(f'Saved to {output_txt}')
print('\n\nExtracted text:\n', extracted_text)

Saved to ./Tests output/output_from_MC test.txt


Extracted text:
 Mobile Communication & Computing (MU-Sem. 7-Comp) 6-10 Long Term Evolution of 3GPP
6.4.2(d) Coordinated Multipoint (CoMP)
One of the key issues with many cellular systems is that of poor performance at the cell edges. To improve the
performance at cell edges, LTE-Advanced introduces coordinated multipoint (CoMP) scheme.
In CoMP there are two important components :
1. TX (Transmit) points
2. RX (Receive) Points
 A number of TX points provide coordinated transmission in the DL (DownLink).
 Similarly a number of RX points provide coordinated reception in the UL (UpLink).
 A TX/RX-point constitutes of a set of co-located TX/RX antennas providing coverage in the same sector.
 The set of TX/RX-points used in CoMP can either be at different locations, or co-sited but providing coverage in
different sectors. They can also belong to the same or different eNBs.
 In Fig. 6.4.5 two simplified examples for DL CoMP is shown.
(a)

## Only Image (clear Text)

In [4]:
import cv2
import pytesseract
import os

def run_ocr(input_path: str, output_path: str):
    if not os.path.exists(input_path):
        raise FileNotFoundError(f'Input file not found: {input_path}')
    
    out_dir = os.path.dirname(output_path) or '.'
    os.makedirs(out_dir, exist_ok=True)
    
    img = cv2.imread(input_path)
    if img is None:
        raise ValueError(f'Could not read image: {input_path}')
    
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    gray = cv2.GaussianBlur(gray, (5,5), 0)
    gray = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2)
    
    text = pytesseract.image_to_string(gray)
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(text)
    
    return text

# Defaults (change as needed)
input_pdf = './Tests files/MC test.pdf'
output_txt = './Tests output/output_from_MC test2.txt'

# Run
text = run_ocr(input_pdf, output_txt)
print('Saved to ./Tests output/output_from_image.txt')
print('\n\nExtracted text:\n',text)

ValueError: Could not read image: ./Tests files/MC test.pdf

## Pdf (Mixed photos and text)

In [3]:
import fitz  # PyMuPDF
import pytesseract
import cv2
import numpy as np
from PIL import Image

def extract_pdf_with_images(pdf_path, out_txt):
    doc = fitz.open(pdf_path)
    final_text = []

    for page_num, page in enumerate(doc, start=1):
        blocks = []

        # --- Text blocks (with bbox) ---
        for b in page.get_text("blocks"):
            x0, y0, x1, y1, text, *_ = b
            if text.strip():
                blocks.append({
                    "bbox": (x0, y0, x1, y1),
                    "type": "text",
                    "content": text.strip()
                })

        # --- Image blocks ---
        raw_dict = page.get_text("rawdict")
        for block in raw_dict["blocks"]:
            if block["type"] == 1:  # image
                bbox = block["bbox"]
                img = page.get_pixmap(matrix=fitz.Matrix(2, 2), clip=fitz.Rect(bbox))
                img_pil = Image.frombytes("RGB", [img.width, img.height], img.samples)

                # OCR on image
                img_cv = cv2.cvtColor(np.array(img_pil), cv2.COLOR_RGB2BGR)
                ocr_text = pytesseract.image_to_string(img_cv).strip()
                if ocr_text:
                    blocks.append({
                        "bbox": bbox,
                        "type": "image",
                        "content": ocr_text
                    })

        # --- Sort by layout order ---
        blocks.sort(key=lambda b: (round(b["bbox"][1]), round(b["bbox"][0])))

        # --- Merge page ---
        page_text = []
        for b in blocks:
            if b["type"] == "image":
                page_text.append(f"[IMAGE OCR] {b['content']}")
            else:
                page_text.append(b["content"])
        final_text.append("\n".join(page_text))

    merged_text = "\n\n--- Page Break ---\n\n".join(final_text)

    with open(out_txt, "w", encoding="utf-8") as f:
        f.write(merged_text)

    return merged_text


# Example run
input_pdf = './Tests files/MC test.pdf'
output_txt = './Tests output/output_from_MC test1.txt'

text = extract_pdf_with_images(input_pdf, output_txt)
print('Extracted text:\n',text)

Extracted text:
 [IMAGE OCR] =
Mobile Communication & Computing (MU-Sem. 7-Comp)  6-10 
Long Term Evolution of 3GPP
6.4.2(d) Coordinated Multipoint (CoMP)
One of the key issues with many cellular systems is that of poor performance at the cell edges. To improve the
performance at cell edges, LTE-Advanced introduces coordinated multipoint (CoMP) scheme.
In CoMP there are two important components :
1. 
TX (Transmit) points
2. 
RX (Receive) Points
 
A number of TX points provide coordinated transmission in the DL (DownLink).
 
Similarly a number of RX  points provide coordinated reception in the UL (UpLink).
 
A TX/RX-point constitutes of a set of co-located TX/RX antennas providing coverage in the same sector.
 
The set of TX/RX-points used in CoMP can either be at different locations, or co-sited but providing coverage in
different sectors. They can also belong to the same or different eNBs.
 
In Fig. 6.4.5 two simplified examples for DL CoMP is shown.
[IMAGE OCR] SN, ee
CTT

Both 

In [5]:
import io
import pdfplumber
import fitz  # PyMuPDF
import pytesseract
import cv2
import numpy as np
from PIL import Image
import os

def improved_ocr(image):
    """Enhanced OCR with better preprocessing"""
    # Convert to grayscale if needed
    if len(image.shape) == 3:
        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    else:
        gray = image
        
    # Noise removal with bilateral filter (preserves edges)
    filtered = cv2.bilateralFilter(gray, 11, 17, 17)
    
    # Otsu's thresholding for better binarization
    _, binary = cv2.threshold(filtered, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    
    # Morphological operations to clean up text
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (2, 2))
    morph = cv2.morphologyEx(binary, cv2.MORPH_CLOSE, kernel)
    
    # OCR with custom configuration
    custom_config = r'--oem 3 --psm 6 -l eng'
    text = pytesseract.image_to_string(morph, config=custom_config)
    
    return text.strip()

def extract_pdf_hybrid(pdf_path, output_path):
    """
    Hybrid approach: Use pdfplumber for native text and improved OCR for images
    """
    if not os.path.exists(pdf_path):
        raise FileNotFoundError(f'Input PDF not found: {pdf_path}')
    
    out_dir = os.path.dirname(output_path) or '.'
    os.makedirs(out_dir, exist_ok=True)
    
    text_blocks = []
    image_blocks = []
    
    # Step 1: Extract native text with pdfplumber (best for regular text)
    with pdfplumber.open(pdf_path) as pdf:
        for page_num, page in enumerate(pdf.pages, 1):
            extracted_text = page.extract_text() or ''
            if extracted_text.strip():
                text_blocks.append(f"=== PAGE {page_num} - TEXT ===\n{extracted_text}")
    
    # Step 2: Extract and OCR images with PyMuPDF
    pdf_doc = fitz.open(pdf_path)
    for page_num, page in enumerate(pdf_doc, 1):
        # Get all images on this page
        image_list = page.get_images(full=True)
        
        for img_idx, img_info in enumerate(image_list):
            try:
                xref = img_info[0]
                base_img = pdf_doc.extract_image(xref)
                image_bytes = base_img["image"]
                
                # Convert to PIL Image
                image_pil = Image.open(io.BytesIO(image_bytes))
                image_np = np.array(image_pil)
                
                # Convert to OpenCV format
                if len(image_np.shape) == 2:  # Grayscale
                    image_cv = image_np
                else:
                    image_cv = cv2.cvtColor(image_np, cv2.COLOR_RGB2BGR)
                
                # Apply improved OCR
                image_text = improved_ocr(image_cv)
                
                if image_text.strip():
                    image_blocks.append(f"=== PAGE {page_num} - IMAGE {img_idx+1} ===\n{image_text}")
                    
            except Exception as e:
                print(f"Error processing image {img_idx+1} on page {page_num}: {e}")
                continue
    
    pdf_doc.close()
    
    # Step 3: Combine all content
    all_content = []
    all_content.extend(text_blocks)
    
    if image_blocks:
        all_content.append("\n" + "="*50)
        all_content.append("CONTENT FROM IMAGES")
        all_content.append("="*50)
        all_content.extend(image_blocks)
    
    final_text = "\n\n".join(all_content)
    
    # Save to file
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(final_text)
    
    return final_text

def extract_pdf_advanced_layout(pdf_path, output_path):
    """
    Advanced layout-preserving extraction using PyMuPDF with improved OCR
    """
    if not os.path.exists(pdf_path):
        raise FileNotFoundError(f'Input PDF not found: {pdf_path}')
    
    out_dir = os.path.dirname(output_path) or '.'
    os.makedirs(out_dir, exist_ok=True)
    
    doc = fitz.open(pdf_path)
    final_text = []

    for page_num, page in enumerate(doc, start=1):
        blocks = []

        # Get text blocks with bounding boxes
        for b in page.get_text("blocks"):
            x0, y0, x1, y1, text, *_ = b
            if text.strip():
                blocks.append({
                    "bbox": (x0, y0, x1, y1),
                    "type": "text",
                    "content": text.strip()
                })

        # Get image blocks with improved OCR
        raw_dict = page.get_text("rawdict")
        for block in raw_dict["blocks"]:
            if block["type"] == 1:  # image block
                bbox = block["bbox"]
                # Higher resolution for better OCR
                img = page.get_pixmap(matrix=fitz.Matrix(3, 3), clip=fitz.Rect(bbox))
                img_pil = Image.frombytes("RGB", [img.width, img.height], img.samples)

                # Convert to OpenCV and apply improved OCR
                img_cv = cv2.cvtColor(np.array(img_pil), cv2.COLOR_RGB2BGR)
                ocr_text = improved_ocr(img_cv)
                
                if ocr_text:
                    blocks.append({
                        "bbox": bbox,
                        "type": "image",
                        "content": ocr_text
                    })

        # Sort blocks by reading order (top to bottom, left to right)
        blocks.sort(key=lambda b: (round(b["bbox"][1] / 10) * 10, round(b["bbox"][0])))

        # Format page content
        page_content = []
        for b in blocks:
            if b["type"] == "image":
                page_content.append(f"[IMAGE] {b['content']}")
            else:
                page_content.append(b["content"])
        
        final_text.append(f"=== PAGE {page_num} ===\n" + "\n".join(page_content))

    doc.close()
    
    merged_text = "\n\n".join(final_text)

    with open(output_path, "w", encoding="utf-8") as f:
        f.write(merged_text)

    return merged_text

# Test both approaches
input_pdf = './Tests files/MC test.pdf'

print("Testing Hybrid Approach (pdfplumber + improved OCR)...")
hybrid_output = './Tests output/hybrid_output.txt'
hybrid_text = extract_pdf_hybrid(input_pdf, hybrid_output)
print(f"Saved hybrid result to: {hybrid_output}")
print(f"Hybrid text preview:\n{hybrid_text[:500]}...\n")

print("\nTesting Advanced Layout Approach...")
advanced_output = './Tests output/advanced_layout_output.txt'
advanced_text = extract_pdf_advanced_layout(input_pdf, advanced_output)
print(f"Saved advanced result to: {advanced_output}")
print(f"Advanced text preview:\n{advanced_text[:500]}...")

Testing Hybrid Approach (pdfplumber + improved OCR)...
Saved hybrid result to: ./Tests output/hybrid_output.txt
Hybrid text preview:
=== PAGE 1 - TEXT ===
Mobile Communication & Computing (MU-Sem. 7-Comp) 6-10 Long Term Evolution of 3GPP
6.4.2(d) Coordinated Multipoint (CoMP)
One of the key issues with many cellular systems is that of poor performance at the cell edges. To improve the
performance at cell edges, LTE-Advanced introduces coordinated multipoint (CoMP) scheme.
In CoMP there are two important components :
1. TX (Transmit) points
2. RX (Receive) Points
 A number of TX points provide coordinated transmission in the ...


Testing Advanced Layout Approach...
Saved hybrid result to: ./Tests output/hybrid_output.txt
Hybrid text preview:
=== PAGE 1 - TEXT ===
Mobile Communication & Computing (MU-Sem. 7-Comp) 6-10 Long Term Evolution of 3GPP
6.4.2(d) Coordinated Multipoint (CoMP)
One of the key issues with many cellular systems is that of poor performance at the cell edges. To impr

## Hybrid Approach with Advanced Image Processing

In [6]:
import io
import pdfplumber
import fitz  # PyMuPDF
import pytesseract
import cv2
import numpy as np
from PIL import Image, ImageEnhance, ImageFilter
import os

def enhanced_image_preprocessing(image):
    """Multiple preprocessing techniques to handle different image types"""
    results = []
    
    # Convert to grayscale if needed
    if len(image.shape) == 3:
        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    else:
        gray = image.copy()
    
    # Technique 1: Standard preprocessing with bilateral filter
    filtered = cv2.bilateralFilter(gray, 11, 17, 17)
    _, binary1 = cv2.threshold(filtered, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    text1 = pytesseract.image_to_string(binary1, config=r'--oem 3 --psm 6 -l eng').strip()
    if text1:
        results.append(("Standard", text1))
    
    # Technique 2: Adaptive thresholding for varying lighting
    adaptive = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2)
    text2 = pytesseract.image_to_string(adaptive, config=r'--oem 3 --psm 6 -l eng').strip()
    if text2:
        results.append(("Adaptive", text2))
    
    # Technique 3: Morphological operations for noisy text
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3))
    morph = cv2.morphologyEx(gray, cv2.MORPH_CLOSE, kernel)
    _, binary3 = cv2.threshold(morph, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    text3 = pytesseract.image_to_string(binary3, config=r'--oem 3 --psm 6 -l eng').strip()
    if text3:
        results.append(("Morphological", text3))
    
    # Technique 4: Contrast enhancement
    enhanced = cv2.convertScaleAbs(gray, alpha=1.5, beta=30)
    _, binary4 = cv2.threshold(enhanced, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    text4 = pytesseract.image_to_string(binary4, config=r'--oem 3 --psm 6 -l eng').strip()
    if text4:
        results.append(("Enhanced", text4))
    
    # Technique 5: Different PSM modes for various text layouts
    psm_modes = [3, 6, 8, 11, 13]  # Different page segmentation modes
    for psm in psm_modes:
        try:
            text_psm = pytesseract.image_to_string(binary1, config=f'--oem 3 --psm {psm} -l eng').strip()
            if text_psm and text_psm not in [r[1] for r in results]:
                results.append((f"PSM-{psm}", text_psm))
        except:
            continue
    
    return results

def advanced_ocr_with_confidence(image):
    """OCR with confidence scoring to get the best result"""
    # Convert to grayscale if needed
    if len(image.shape) == 3:
        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    else:
        gray = image.copy()
    
    # Apply preprocessing
    filtered = cv2.bilateralFilter(gray, 11, 17, 17)
    _, binary = cv2.threshold(filtered, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    
    # Get OCR data with confidence scores
    try:
        data = pytesseract.image_to_data(binary, output_type=pytesseract.Output.DICT, config=r'--oem 3 --psm 6 -l eng')
        
        # Filter words by confidence (keep words with confidence > 30)
        high_conf_words = []
        for i, word in enumerate(data['text']):
            if int(data['conf'][i]) > 30 and word.strip():
                high_conf_words.append(word)
        
        high_conf_text = ' '.join(high_conf_words)
        
        # Also get the full text for comparison
        full_text = pytesseract.image_to_string(binary, config=r'--oem 3 --psm 6 -l eng').strip()
        
        # Return the longer/better text
        if len(high_conf_text) > len(full_text) * 0.7:  # If high-confidence text is substantial
            return high_conf_text
        else:
            return full_text
            
    except Exception as e:
        # Fallback to standard OCR
        return pytesseract.image_to_string(binary, config=r'--oem 3 --psm 6 -l eng').strip()

def extract_pdf_enhanced_hybrid(pdf_path, output_path):
    """
    Enhanced hybrid approach with multiple OCR techniques
    """
    if not os.path.exists(pdf_path):
        raise FileNotFoundError(f'Input PDF not found: {pdf_path}')
    
    out_dir = os.path.dirname(output_path) or '.'
    os.makedirs(out_dir, exist_ok=True)
    
    text_blocks = []
    image_blocks = []
    
    # Step 1: Extract native text with pdfplumber
    print("Extracting native text with pdfplumber...")
    with pdfplumber.open(pdf_path) as pdf:
        for page_num, page in enumerate(pdf.pages, 1):
            extracted_text = page.extract_text() or ''
            if extracted_text.strip():
                text_blocks.append(f"=== PAGE {page_num} - TEXT ===\n{extracted_text}")
    
    # Step 2: Extract and OCR images with enhanced preprocessing
    print("Processing images with enhanced OCR...")
    pdf_doc = fitz.open(pdf_path)
    for page_num, page in enumerate(pdf_doc, 1):
        image_list = page.get_images(full=True)
        
        for img_idx, img_info in enumerate(image_list):
            try:
                print(f"Processing image {img_idx+1} on page {page_num}...")
                xref = img_info[0]
                base_img = pdf_doc.extract_image(xref)
                image_bytes = base_img["image"]
                
                # Convert to PIL Image and then to OpenCV
                image_pil = Image.open(io.BytesIO(image_bytes))
                
                # Enhance image quality before OCR
                if image_pil.mode != 'RGB':
                    image_pil = image_pil.convert('RGB')
                
                # Apply PIL enhancements
                enhancer = ImageEnhance.Contrast(image_pil)
                image_pil = enhancer.enhance(1.5)
                
                enhancer = ImageEnhance.Sharpness(image_pil)
                image_pil = enhancer.enhance(2.0)
                
                # Convert to OpenCV format
                image_np = np.array(image_pil)
                image_cv = cv2.cvtColor(image_np, cv2.COLOR_RGB2BGR)
                
                # Try multiple OCR approaches
                ocr_results = enhanced_image_preprocessing(image_cv)
                
                # Also try confidence-based OCR
                conf_text = advanced_ocr_with_confidence(image_cv)
                if conf_text:
                    ocr_results.append(("Confidence-based", conf_text))
                
                # Combine and format results
                if ocr_results:
                    combined_text = []
                    combined_text.append(f"=== PAGE {page_num} - IMAGE {img_idx+1} ===")
                    
                    # Find the best result (longest meaningful text)
                    best_result = max(ocr_results, key=lambda x: len(x[1]) if len(x[1]) > 10 else 0)
                    combined_text.append(f"BEST RESULT ({best_result[0]}):")
                    combined_text.append(best_result[1])
                    
                    # Add other significant results
                    other_results = [r for r in ocr_results if r != best_result and len(r[1]) > 5]
                    if other_results:
                        combined_text.append("\nALTERNATIVE RESULTS:")
                        for method, text in other_results:
                            if text != best_result[1]:  # Avoid duplicates
                                combined_text.append(f"- {method}: {text}")
                    
                    image_blocks.append("\n".join(combined_text))
                    
            except Exception as e:
                print(f"Error processing image {img_idx+1} on page {page_num}: {e}")
                continue
    
    pdf_doc.close()
    
    # Step 3: Combine all content
    all_content = []
    all_content.extend(text_blocks)
    
    if image_blocks:
        all_content.append("\n" + "="*60)
        all_content.append("CONTENT FROM IMAGES (ENHANCED PROCESSING)")
        all_content.append("="*60)
        all_content.extend(image_blocks)
    
    final_text = "\n\n".join(all_content)
    
    # Save to file
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(final_text)
    
    return final_text

# Test the enhanced hybrid approach
input_pdf = './Tests files/test_mix.pdf'
enhanced_output = './Tests output/enhanced_hybrid_output.txt'

print("Testing Enhanced Hybrid Approach with multiple OCR techniques...")
enhanced_text = extract_pdf_enhanced_hybrid(input_pdf, enhanced_output)
print(f"\nSaved enhanced result to: {enhanced_output}")
print(f"\nEnhanced text preview:\n{enhanced_text[:800]}...")

Testing Enhanced Hybrid Approach with multiple OCR techniques...
Extracting native text with pdfplumber...
Processing images with enhanced OCR...
Processing image 1 on page 1...
Processing image 2 on page 1...
Processing image 3 on page 1...
Processing image 4 on page 1...
Processing image 1 on page 2...
Processing image 2 on page 2...
Processing image 3 on page 2...
Processing image 4 on page 2...
Processing image 1 on page 3...
Processing image 2 on page 3...
Processing image 1 on page 4...
Processing image 2 on page 4...
Processing image 3 on page 4...
Processing image 4 on page 4...
Processing image 1 on page 5...
Processing image 2 on page 5...
Processing image 1 on page 6...
Processing image 2 on page 6...

Saved enhanced result to: ./Tests output/enhanced_hybrid_output.txt

Enhanced text preview:
=== PAGE 1 - TEXT ===
Mobile Communication & Computing (MU-Sem. 7-Comp) 6-10 Long Term Evolution of 3GPP
6.4.2(d) Coordinated Multipoint (CoMP)
One of the key issues with many cellular 

## Rule Based OCR Text Correction

In [7]:
import re
import requests
import json
from typing import List, Optional

# Option 1: Using local transformers model (offline)
def setup_local_text_correction():
    """Setup local model for OCR correction - requires transformers library"""
    try:
        from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
        
        # Use a grammar correction model (T5-based)
        model_name = "pszemraj/flan-t5-large-grammar-synthesis"
        # Alternative: "vennify/t5-base-grammar-correction"
        
        print("Loading grammar correction model...")
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
        corrector = pipeline("text2text-generation", model=model, tokenizer=tokenizer)
        
        print("Model loaded successfully!")
        return corrector
    except ImportError:
        print("transformers library not installed. Install with: pip install transformers torch")
        return None
    except Exception as e:
        print(f"Error loading model: {e}")
        return None

# Option 2: Using OpenAI API (requires API key)
def correct_text_with_openai(text: str, api_key: str) -> str:
    """Correct OCR text using OpenAI API"""
    try:
        headers = {
            'Authorization': f'Bearer {api_key}',
            'Content-Type': 'application/json'
        }
        
        prompt = f"""Fix the OCR errors in this text. Correct spelling mistakes, fix spacing issues, and make the text readable while preserving the original meaning:

Text: {text}

Corrected text:"""
        
        data = {
            'model': 'gpt-3.5-turbo',
            'messages': [
                {'role': 'user', 'content': prompt}
            ],
            'max_tokens': len(text) + 100,
            'temperature': 0.1
        }
        
        response = requests.post('https://api.openai.com/v1/chat/completions', 
                               headers=headers, json=data)
        
        if response.status_code == 200:
            return response.json()['choices'][0]['message']['content'].strip()
        else:
            print(f"OpenAI API error: {response.status_code}")
            return text
            
    except Exception as e:
        print(f"Error with OpenAI correction: {e}")
        return text

# Option 3: Rule-based OCR correction (offline, no dependencies)
def rule_based_ocr_correction(text: str) -> str:
    """Apply common OCR error corrections using regex patterns"""
    
    corrections = [
        # Common OCR character mistakes
        (r'\b0\b', 'O'),  # Zero to O
        (r'\b1\b', 'I'),  # One to I (in some contexts)
        (r'rn', 'm'),     # rn often misread as m
        (r'cl', 'd'),     # cl often misread as d
        (r'vv', 'w'),     # vv often misread as w
        (r'(\w)1(\w)', r'\1l\2'),  # 1 between letters often should be l
        (r'(\w)0(\w)', r'\1o\2'),  # 0 between letters often should be o
        
        # Fix spacing issues
        (r'([a-z])([A-Z])', r'\1 \2'),  # Add space between lowercase and uppercase
        (r'([a-zA-Z])(\d)', r'\1 \2'),  # Add space between letter and number
        (r'(\d)([a-zA-Z])', r'\1 \2'),  # Add space between number and letter
        (r'\s+', ' '),  # Multiple spaces to single space
        
        # Common word corrections
        (r'\btlle\b', 'the'),
        (r'\btl1e\b', 'the'),
        (r'\bfrom\b', 'from'),
        (r'\bw1th\b', 'with'),
        (r'\bth1s\b', 'this'),
        (r'\bthat\b', 'that'),
        (r'\bwh1ch\b', 'which'),
        (r'\bwhere\b', 'where'),
        (r'\bwhen\b', 'when'),
        (r'\bhow\b', 'how'),
        (r'\bwhy\b', 'why'),
        (r'\bwhat\b', 'what'),
        
        # Fix punctuation
        (r'\s+([,.!?;:])', r'\1'),  # Remove space before punctuation
        (r'([,.!?;:])\s*([a-zA-Z])', r'\1 \2'),  # Ensure space after punctuation
    ]
    
    corrected_text = text
    for pattern, replacement in corrections:
        corrected_text = re.sub(pattern, replacement, corrected_text)
    
    return corrected_text.strip()

# Option 4: Using Hugging Face API (requires API key)
def correct_text_with_huggingface(text: str, api_key: str) -> str:
    """Correct text using Hugging Face Inference API"""
    try:
        API_URL = "https://api-inference.huggingface.co/models/pszemraj/flan-t5-large-grammar-synthesis"
        headers = {"Authorization": f"Bearer {api_key}"}
        
        payload = {
            "inputs": f"grammar: {text}",
            "parameters": {"max_length": len(text) + 50}
        }
        
        response = requests.post(API_URL, headers=headers, json=payload)
        
        if response.status_code == 200:
            result = response.json()
            if isinstance(result, list) and len(result) > 0:
                return result[0].get('generated_text', text)
        else:
            print(f"Hugging Face API error: {response.status_code}")
            
        return text
    except Exception as e:
        print(f"Error with Hugging Face correction: {e}")
        return text

def extract_pdf_with_ai_correction(pdf_path, output_path, correction_method="rule_based", api_key=None):
    """
    Enhanced hybrid extraction with AI-powered text correction
    """
    if not os.path.exists(pdf_path):
        raise FileNotFoundError(f'Input PDF not found: {pdf_path}')
    
    out_dir = os.path.dirname(output_path) or '.'
    os.makedirs(out_dir, exist_ok=True)
    
    # Setup correction method
    corrector = None
    if correction_method == "local":
        corrector = setup_local_text_correction()
        if corrector is None:
            print("Falling back to rule-based correction...")
            correction_method = "rule_based"
    
    text_blocks = []
    image_blocks = []
    
    # Step 1: Extract native text with pdfplumber
    print("Extracting native text with pdfplumber...")
    with pdfplumber.open(pdf_path) as pdf:
        for page_num, page in enumerate(pdf.pages, 1):
            extracted_text = page.extract_text() or ''
            if extracted_text.strip():
                text_blocks.append(f"=== PAGE {page_num} - TEXT ===\n{extracted_text}")
    
    # Step 2: Extract and OCR images with correction
    print("Processing images with enhanced OCR and AI correction...")
    pdf_doc = fitz.open(pdf_path)
    for page_num, page in enumerate(pdf_doc, 1):
        image_list = page.get_images(full=True)
        
        for img_idx, img_info in enumerate(image_list):
            try:
                print(f"Processing image {img_idx+1} on page {page_num}...")
                xref = img_info[0]
                base_img = pdf_doc.extract_image(xref)
                image_bytes = base_img["image"]
                
                # Enhanced image processing (from previous function)
                image_pil = Image.open(io.BytesIO(image_bytes))
                if image_pil.mode != 'RGB':
                    image_pil = image_pil.convert('RGB')
                
                from PIL import ImageEnhance
                enhancer = ImageEnhance.Contrast(image_pil)
                image_pil = enhancer.enhance(1.5)
                enhancer = ImageEnhance.Sharpness(image_pil)
                image_pil = enhancer.enhance(2.0)
                
                image_np = np.array(image_pil)
                image_cv = cv2.cvtColor(image_np, cv2.COLOR_RGB2BGR)
                
                # Get raw OCR text
                raw_ocr_text = pytesseract.image_to_string(image_cv, config=r'--oem 3 --psm 6 -l eng').strip()
                
                if raw_ocr_text:
                    # Apply AI correction
                    corrected_text = raw_ocr_text
                    
                    if correction_method == "rule_based":
                        corrected_text = rule_based_ocr_correction(raw_ocr_text)
                    elif correction_method == "openai" and api_key:
                        corrected_text = correct_text_with_openai(raw_ocr_text, api_key)
                    elif correction_method == "huggingface" and api_key:
                        corrected_text = correct_text_with_huggingface(raw_ocr_text, api_key)
                    elif correction_method == "local" and corrector:
                        try:
                            result = corrector(f"grammar: {raw_ocr_text}", max_length=len(raw_ocr_text) + 50)
                            corrected_text = result[0]['generated_text'] if result else raw_ocr_text
                        except:
                            corrected_text = rule_based_ocr_correction(raw_ocr_text)
                    
                    # Format the result
                    result_block = []
                    result_block.append(f"=== PAGE {page_num} - IMAGE {img_idx+1} ===")
                    result_block.append("RAW OCR:")
                    result_block.append(raw_ocr_text)
                    result_block.append(f"\nCORRECTED ({correction_method.upper()}):")
                    result_block.append(corrected_text)
                    
                    image_blocks.append("\n".join(result_block))
                    
            except Exception as e:
                print(f"Error processing image {img_idx+1} on page {page_num}: {e}")
                continue
    
    pdf_doc.close()
    
    # Step 3: Combine all content
    all_content = []
    all_content.extend(text_blocks)
    
    if image_blocks:
        all_content.append("\n" + "="*60)
        all_content.append("CONTENT FROM IMAGES (WITH AI CORRECTION)")
        all_content.append("="*60)
        all_content.extend(image_blocks)
    
    final_text = "\n\n".join(all_content)
    
    # Save to file
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(final_text)
    
    return final_text

# Test different correction methods
input_pdf = './Tests files/MC test.pdf'

print("=== Testing Rule-Based Correction (No API needed) ===")
rule_output = './Tests output/rule_based_corrected.txt'
rule_text = extract_pdf_with_ai_correction(input_pdf, rule_output, "rule_based")
print(f"Saved to: {rule_output}")
print(f"Preview:\n{rule_text[:600]}...\n")

# Uncomment below to test with API keys (you need to provide your own keys)
"""
print("=== Testing OpenAI Correction ===")
openai_api_key = "your-openai-api-key-here"
openai_output = './Tests output/openai_corrected.txt'
openai_text = extract_pdf_with_ai_correction(input_pdf, openai_output, "openai", openai_api_key)
print(f"Saved to: {openai_output}")

print("=== Testing Hugging Face Correction ===")
hf_api_key = "your-huggingface-api-key-here"
hf_output = './Tests output/hf_corrected.txt'
hf_text = extract_pdf_with_ai_correction(input_pdf, hf_output, "huggingface", hf_api_key)
print(f"Saved to: {hf_output}")
"""

print("\n=== Available Correction Methods ===")
print("1. rule_based - Offline regex-based corrections (recommended for start)")
print("2. openai - Uses OpenAI GPT for correction (requires API key)")
print("3. huggingface - Uses HF grammar correction models (requires API key)")
print("4. local - Downloads and uses local transformer model (requires transformers library)")

=== Testing Rule-Based Correction (No API needed) ===
Extracting native text with pdfplumber...
Processing images with enhanced OCR and AI correction...
Processing image 1 on page 1...
Processing image 2 on page 1...
Processing image 3 on page 1...
Processing image 4 on page 1...
Processing image 1 on page 2...
Processing image 2 on page 2...
Processing image 3 on page 2...
Processing image 4 on page 2...
Processing image 1 on page 3...
Processing image 2 on page 3...
Processing image 1 on page 4...
Processing image 2 on page 4...
Processing image 3 on page 4...
Processing image 4 on page 4...
Processing image 1 on page 5...
Processing image 2 on page 5...
Processing image 1 on page 6...
Processing image 2 on page 6...
Saved to: ./Tests output/rule_based_corrected.txt
Preview:
=== PAGE 1 - TEXT ===
Mobile Communication & Computing (MU-Sem. 7-Comp) 6-10 Long Term Evolution of 3GPP
6.4.2(d) Coordinated Multipoint (CoMP)
One of the key issues with many cellular systems is that of poor per

## NLTK-Based OCR Text Correction

In [10]:
# Imports
import re
import string
import io
import os
from collections import Counter
import nltk
import pdfplumber
import fitz
import pytesseract
import cv2
import numpy as np
from PIL import Image, ImageEnhance

# NLTK setup and correction functions
def setup_nltk():
    """Setup required NLTK data with persistent storage"""
    try:
        # Set up NLTK data path for persistent storage
        import os
        nltk_data_path = os.path.expanduser('~/nltk_data')
        if nltk_data_path not in nltk.data.path:
            nltk.data.path.append(nltk_data_path)
        
        required_data = [
            ('punkt', 'tokenizers'),
            ('words', 'corpora'), 
            ('averaged_perceptron_tagger', 'taggers'),
            ('brown', 'corpora')
        ]
        
        missing_data = []
        for data_name, data_type in required_data:
            try:
                nltk.data.find(f'{data_type}/{data_name}')
                print(f"✓ {data_name} already available")
            except LookupError:
                missing_data.append(data_name)
        
        if missing_data:
            print(f"Downloading missing NLTK data: {', '.join(missing_data)}")
            for data_name in missing_data:
                nltk.download(data_name, quiet=False)
                print(f"✓ Downloaded {data_name}")
            print("✓ All NLTK data now permanently installed")
        else:
            print("✓ All required NLTK data already available")
        
        return True
    except ImportError:
        print("✗ NLTK not available")
        return False
    except Exception as e:
        print(f"✗ Error setting up NLTK: {e}")
        return False

def create_word_frequency_dict():
    """Create word frequency dictionary from NLTK corpora"""
    try:
        from nltk.corpus import brown, words
        english_words = set(words.words())
        brown_words = [word.lower() for word in brown.words() if word.isalpha()]
        word_freq = Counter(brown_words)
        return english_words, word_freq
    except:
        return set(), Counter()

def nltk_spell_check(word, english_words, word_freq, max_distance=2):
    """Find best spelling correction for a word using edit distance"""
    word = word.lower()
    
    # If word is already correct, return it
    if word in english_words:
        return word
    
    # Find words with similar spelling
    candidates = []
    for eng_word in english_words:
        if abs(len(eng_word) - len(word)) <= max_distance:
            distance = edit_distance(word, eng_word)
            if distance <= max_distance:
                # Use frequency as tie-breaker
                frequency = word_freq.get(eng_word, 1)
                candidates.append((eng_word, distance, frequency))
    
    if candidates:
        # Sort by edit distance first, then by frequency (descending)
        candidates.sort(key=lambda x: (x[1], -x[2]))
        return candidates[0][0]
    
    return word  # Return original if no candidates found

def edit_distance(s1, s2):
    """Calculate edit distance between two strings"""
    if len(s1) < len(s2):
        return edit_distance(s2, s1)
    
    if len(s2) == 0:
        return len(s1)
    
    previous_row = list(range(len(s2) + 1))
    for i, c1 in enumerate(s1):
        current_row = [i + 1]
        for j, c2 in enumerate(s2):
            insertions = previous_row[j + 1] + 1
            deletions = current_row[j] + 1
            substitutions = previous_row[j] + (c1 != c2)
            current_row.append(min(insertions, deletions, substitutions))
        previous_row = current_row
    
    return previous_row[-1]

def nltk_ocr_correction(text):
    """Comprehensive OCR correction using NLTK"""
    try:
        from nltk.tokenize import word_tokenize, sent_tokenize
        english_words, word_freq = create_word_frequency_dict()
        
        if not english_words:
            return rule_based_ocr_correction(text)
        
        sentences = sent_tokenize(text)
        corrected_sentences = []
        
        for sentence in sentences:
            words = word_tokenize(sentence)
            corrected_words = []
            
            for word in words:
                if word in string.punctuation:
                    corrected_words.append(word)
                    continue
                
                # Handle mixed alphanumeric OCR errors
                corrected_word = word
                if any(c.isdigit() for c in word) and any(c.isalpha() for c in word):
                    corrected_word = re.sub(r'0', 'o', corrected_word)
                    corrected_word = re.sub(r'1', 'l', corrected_word)
                    corrected_word = re.sub(r'5', 'S', corrected_word)
                    corrected_word = re.sub(r'8', 'B', corrected_word)
                
                # Apply spell checking
                if corrected_word.isalpha() and len(corrected_word) > 1:
                    spell_corrected = nltk_spell_check(corrected_word, english_words, word_freq)
                    corrected_words.append(spell_corrected)
                else:
                    corrected_words.append(corrected_word)
            
            # Reconstruct sentence with proper spacing
            corrected_sentence = ' '.join(corrected_words)
            corrected_sentence = re.sub(r'\s+([,.!?;:])', r'\1', corrected_sentence)
            corrected_sentence = re.sub(r'([,.!?;:])\s*([A-Za-z])', r'\1 \2', corrected_sentence)
            corrected_sentences.append(corrected_sentence)
        
        result = ' '.join(corrected_sentences)
        result = re.sub(r'\s+', ' ', result).strip()
        return result
        
    except:
        return rule_based_ocr_correction(text)

def advanced_nltk_correction(text):
    """Advanced NLTK correction with context awareness"""
    try:
        from nltk.tokenize import word_tokenize, sent_tokenize
        from nltk.tag import pos_tag
        from nltk.corpus import wordnet
        
        english_words, word_freq = create_word_frequency_dict()
        if not english_words:
            return nltk_ocr_correction(text)
        
        sentences = sent_tokenize(text)
        corrected_sentences = []
        
        for sentence in sentences:
            words = word_tokenize(sentence)
            pos_tags = pos_tag(words)
            corrected_words = []
            
            for word, pos in pos_tags:
                if word in string.punctuation:
                    corrected_words.append(word)
                    continue
                
                corrected_word = word
                
                # Context-aware OCR corrections
                if pos.startswith('NN'):
                    corrected_word = re.sub(r'rn', 'm', corrected_word)
                    corrected_word = re.sub(r'cl', 'd', corrected_word)
                elif pos.startswith('VB'):
                    corrected_word = re.sub(r'1ng', 'ing', corrected_word)
                elif pos.startswith('DT'):
                    if corrected_word.lower() in ['tlle', 'tl1e']:
                        corrected_word = 'the'
                
                # Apply spell checking
                if corrected_word.isalpha() and len(corrected_word) > 1:
                    spell_corrected = nltk_spell_check(corrected_word, english_words, word_freq)
                    corrected_words.append(spell_corrected)
                else:
                    corrected_words.append(corrected_word)
            
            corrected_sentence = ' '.join(corrected_words)
            corrected_sentence = re.sub(r'\s+([,.!?;:])', r'\1', corrected_sentence)
            corrected_sentence = re.sub(r'([,.!?;:])\s*([A-Za-z])', r'\1 \2', corrected_sentence)
            corrected_sentences.append(corrected_sentence)
        
        result = ' '.join(corrected_sentences)
        return re.sub(r'\s+', ' ', result).strip()
        
    except:
        return nltk_ocr_correction(text)

# Fallback rule-based correction (from previous cell)
def rule_based_ocr_correction(text: str) -> str:
    """Apply common OCR error corrections using regex patterns"""
    corrections = [
        (r'rn', 'm'), (r'cl', 'd'), (r'vv', 'w'),
        (r'(\w)1(\w)', r'\1l\2'), (r'(\w)0(\w)', r'\1o\2'),
        (r'([a-z])([A-Z])', r'\1 \2'), (r'\s+', ' '),
        (r'\btlle\b', 'the'), (r'\btl1e\b', 'the'),
        (r'\s+([,.!?;:])', r'\1'), (r'([,.!?;:])\s*([a-zA-Z])', r'\1 \2'),
    ]
    
    corrected_text = text
    for pattern, replacement in corrections:
        corrected_text = re.sub(pattern, replacement, corrected_text)
    
    return corrected_text.strip()

def extract_pdf_with_nltk_correction(pdf_path, output_path):
    """Enhanced hybrid extraction with NLTK-powered text correction"""
    if not os.path.exists(pdf_path):
        raise FileNotFoundError(f'Input PDF not found: {pdf_path}')
    
    os.makedirs(os.path.dirname(output_path) or '.', exist_ok=True)
    
    print("Setting up NLTK environment...")
    nltk_available = setup_nltk()
    
    if nltk_available:
        print("✓ NLTK correction enabled")
    else:
        print("✗ NLTK unavailable, using rule-based correction")
    
    text_blocks = []
    image_blocks = []
    
    # Extract native text
    with pdfplumber.open(pdf_path) as pdf:
        for page_num, page in enumerate(pdf.pages, 1):
            extracted_text = page.extract_text() or ''
            if extracted_text.strip():
                text_blocks.append(f"=== PAGE {page_num} - TEXT ===\n{extracted_text}")
    
    # Extract and OCR images
    pdf_doc = fitz.open(pdf_path)
    for page_num, page in enumerate(pdf_doc, 1):
        image_list = page.get_images(full=True)
        
        for img_idx, img_info in enumerate(image_list):
            try:
                xref = img_info[0]
                base_img = pdf_doc.extract_image(xref)
                image_bytes = base_img["image"]
                
                # Enhanced image processing
                image_pil = Image.open(io.BytesIO(image_bytes))
                if image_pil.mode != 'RGB':
                    image_pil = image_pil.convert('RGB')
                
                enhancer = ImageEnhance.Contrast(image_pil)
                image_pil = enhancer.enhance(1.5)
                enhancer = ImageEnhance.Sharpness(image_pil)
                image_pil = enhancer.enhance(2.0)
                
                image_np = np.array(image_pil)
                image_cv = cv2.cvtColor(image_np, cv2.COLOR_RGB2BGR)
                
                # OCR and correction
                raw_ocr_text = pytesseract.image_to_string(image_cv, config=r'--oem 3 --psm 6 -l eng').strip()
                
                if raw_ocr_text:
                    corrected_text = (advanced_nltk_correction(raw_ocr_text) if nltk_available 
                                    else rule_based_ocr_correction(raw_ocr_text))
                    
                    result_block = [
                        f"=== PAGE {page_num} - IMAGE {img_idx+1} ===",
                        "RAW OCR:", raw_ocr_text,
                        "\nCORRECTED (NLTK):", corrected_text
                    ]
                    image_blocks.append("\n".join(result_block))
                    
            except Exception as e:
                print(f"Error processing image {img_idx+1} on page {page_num}: {e}")
                continue
    
    pdf_doc.close()
    
    # Combine and save
    all_content = text_blocks.copy()
    if image_blocks:
        all_content.extend(["\n" + "="*60, "CONTENT FROM IMAGES (NLTK CORRECTED)", "="*60])
        all_content.extend(image_blocks)
    
    final_text = "\n\n".join(all_content)
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(final_text)
    
    return final_text

# Test NLTK-based correction
input_pdf = './Tests files/MC test.pdf'
nltk_output = './Tests output/nltk_corrected.txt'

print("Testing NLTK-Based OCR Correction...")
extract_pdf_with_nltk_correction(input_pdf, nltk_output)
print(f"✓ Results saved to: {nltk_output}")
print("✓ Features: Spell checking, POS tagging, word frequency analysis")

Testing NLTK-Based OCR Correction...
✓ Results saved to: ./Tests output/nltk_corrected.txt
✓ Features: Spell checking, POS tagging, word frequency analysis


In [None]:
# Enhanced Compact Version with Missing Features
import os, io, re, string
import pdfplumber, fitz, pytesseract, cv2
import numpy as np
from collections import Counter
from PIL import Image, ImageEnhance
import nltk

# Setup NLTK with better feedback
def setup_nltk():
    try:
        nltk_data_path = os.path.expanduser('~/nltk_data')
        if nltk_data_path not in nltk.data.path:
            nltk.data.path.append(nltk_data_path)
        
        required_data = [
            ('punkt', 'tokenizers'),
            ('words', 'corpora'), 
            ('averaged_perceptron_tagger', 'taggers'),
            ('brown', 'corpora')
        ]
        
        missing = []
        for data_name, data_type in required_data:
            try:
                nltk.data.find(f'{data_type}/{data_name}')
                print(f"✓ {data_name} available")
            except LookupError:
                missing.append(data_name)
        
        if missing:
            print(f"Downloading: {', '.join(missing)}")
            for data_name in missing:
                nltk.download(data_name, download_dir=nltk_data_path, quiet=False)
                print(f"✓ Downloaded {data_name}")
        
        return True
    except Exception as e:
        print(f"✗ NLTK setup failed: {e}")
        return False

# Word dictionary + frequencies
def create_dict():
    try:
        from nltk.corpus import brown, words
        return set(words.words()), Counter(w.lower() for w in brown.words() if w.isalpha())
    except:
        return set(), Counter()

# Enhanced edit distance
def edit_distance(s1, s2):
    if len(s1) < len(s2): return edit_distance(s2, s1)
    if not s2: return len(s1)
    prev = list(range(len(s2)+1))
    for i,c1 in enumerate(s1):
        curr = [i+1]
        for j,c2 in enumerate(s2):
            ins, dele, sub = prev[j+1]+1, curr[j]+1, prev[j]+(c1!=c2)
            curr.append(min(ins,dele,sub))
        prev = curr
    return prev[-1]

# Spell check with frequency ranking
def spell_check(word, english, freq, max_d=2):
    word = word.lower()
    if word in english: return word
    cands = []
    for w in english:
        if abs(len(w)-len(word)) <= max_d:
            dist = edit_distance(word, w)
            if dist <= max_d:
                cands.append((w, dist, freq.get(w, 1)))
    if cands: 
        return sorted(cands, key=lambda x: (x[1], -x[2]))[0][0]
    return word

# Advanced OCR correction with POS tagging
def correct_text(text, english, freq):
    try:
        from nltk.tokenize import word_tokenize, sent_tokenize
        from nltk.tag import pos_tag
        
        out = []
        for sent in sent_tokenize(text):
            words = word_tokenize(sent)
            pos_tags = pos_tag(words)
            corrected = []
            
            for word, pos in pos_tags:
                if word in string.punctuation: 
                    corrected.append(word)
                    continue
                
                # Context-aware corrections based on POS
                if pos.startswith('NN'):  # Nouns
                    word = re.sub(r'rn', 'm', word)
                    word = re.sub(r'cl', 'd', word)
                elif pos.startswith('VB'):  # Verbs
                    word = re.sub(r'1ng', 'ing', word)
                elif pos.startswith('DT'):  # Determiners
                    if word.lower() in ['tlle', 'tl1e']:
                        word = 'the'
                
                # Common OCR fixes
                if any(c.isdigit() for c in word) and any(c.isalpha() for c in word):
                    word = re.sub(r'0', 'o', word)
                    word = re.sub(r'1', 'l', word)
                    word = re.sub(r'5', 'S', word)
                    word = re.sub(r'8', 'B', word)
                
                # Spell check
                if word.isalpha() and len(word) > 1:
                    word = spell_check(word, english, freq)
                
                corrected.append(word)
            
            # Reconstruct sentence with proper spacing
            s = ' '.join(corrected)
            s = re.sub(r'\s+([,.!?;:])', r'\1', s)
            s = re.sub(r'([,.!?;:])\s*([A-Za-z])', r'\1 \2', s)
            out.append(s)
        
        return re.sub(r'\s+', ' ', ' '.join(out)).strip()
    except:
        # Fallback to simple correction
        return simple_correct(text, english, freq)

def simple_correct(text, english, freq):
    """Fallback correction without POS tagging"""
    from nltk.tokenize import word_tokenize, sent_tokenize
    out = []
    for sent in sent_tokenize(text):
        words = []
        for w in word_tokenize(sent):
            if w in string.punctuation: 
                words.append(w)
                continue
            # Basic corrections
            w = re.sub(r'0','o',w)
            w = re.sub(r'1','l',w)
            if w.isalpha() and len(w)>1: 
                w = spell_check(w,english,freq)
            words.append(w)
        s = ' '.join(words)
        s = re.sub(r'\s+([,.!?;:])',r'\1',s)
        out.append(s)
    return ' '.join(out)

# Main extraction with error handling
def extract_pdf(pdf_path, out_path):
    print("Setting up NLTK...")
    nltk_available = setup_nltk()
    
    if not nltk_available:
        print("✗ NLTK unavailable, using basic correction")
        return None
    
    print("✓ Creating word dictionaries...")
    english, freq = create_dict()
    
    if not english:
        print("✗ Could not load word dictionaries")
        return None
    
    print(f"✓ Loaded {len(english)} words")
    
    text_blocks, img_blocks = [], []
    
    # Extract native text
    print("Extracting text with pdfplumber...")
    try:
        with pdfplumber.open(pdf_path) as pdf:
            for i, p in enumerate(pdf.pages, 1):
                t = p.extract_text()
                if t and t.strip(): 
                    text_blocks.append(f"=== PAGE {i} - TEXT ===\n{t}")
    except Exception as e:
        print(f"Error with pdfplumber: {e}")
    
    # Extract and OCR images
    print("Processing images...")
    try:
        pdf_doc = fitz.open(pdf_path)
        for i, page in enumerate(pdf_doc, 1):
            images = page.get_images(full=True)
            for j, img in enumerate(images, 1):
                try:
                    xref = img[0]
                    base = pdf_doc.extract_image(xref)["image"]
                    pil = Image.open(io.BytesIO(base)).convert('RGB')
                    
                    # Enhanced image processing
                    pil = ImageEnhance.Contrast(pil).enhance(1.5)
                    pil = ImageEnhance.Sharpness(pil).enhance(2.0)
                    
                    img_cv = cv2.cvtColor(np.array(pil), cv2.COLOR_RGB2BGR)
                    raw = pytesseract.image_to_string(img_cv, config='--oem 3 --psm 6 -l eng').strip()
                    
                    if raw:
                        print(f"Processing image {j} on page {i}...")
                        corr = correct_text(raw, english, freq)
                        img_blocks.append(f"=== PAGE {i} - IMAGE {j} ===\nRAW OCR:\n{raw}\n\nCORRECTED (NLTK):\n{corr}")
                        
                except Exception as e:
                    print(f"Error processing image {j} on page {i}: {e}")
                    continue
        pdf_doc.close()
    except Exception as e:
        print(f"Error with image processing: {e}")
    
    # Combine and save
    os.makedirs(os.path.dirname(out_path) or '.', exist_ok=True)
    
    all_content = text_blocks.copy()
    if img_blocks:
        all_content.extend(["\n" + "="*60, "CONTENT FROM IMAGES (NLTK CORRECTED)", "="*60])
        all_content.extend(img_blocks)
    
    final = "\n\n".join(all_content)
    
    with open(out_path, 'w', encoding='utf-8') as f: 
        f.write(final)
    
    print(f"✓ Saved results to: {out_path}")
    return final

# Test the enhanced version
if __name__=="__main__":
    input_pdf = "./Tests files/MC test.pdf"
    output_file = "./Tests output/enhanced_nltk_corrected.txt"
    
    print("Testing Enhanced Compact OCR with NLTK...")
    result = extract_pdf(input_pdf, output_file)
    
    if result:
        print("✓ Enhanced extraction complete!")
        print("✓ Features: POS tagging, context-aware correction, enhanced image processing")
        print(f"✓ Preview:\n{result[:400]}...")
    else:
        print("✗ Extraction failed")

## Frontend Integration Pipeline - JSON Output for Firebase

In [18]:
import json
import os
import io
import re
import string
from datetime import datetime
from collections import Counter
import hashlib
import pdfplumber
import fitz
import pytesseract
import cv2
import numpy as np
from PIL import Image, ImageEnhance
import nltk

class OCRPipeline:
    def __init__(self):
        self.setup_nltk()
        self.english_words, self.word_freq, self.stop_words = self.create_dict()
        
    def setup_nltk(self):
        """Setup NLTK with quiet initialization"""
        try:
            nltk_data_path = os.path.expanduser('~/nltk_data')
            if nltk_data_path not in nltk.data.path:
                nltk.data.path.append(nltk_data_path)
            
            required_data = ['punkt', 'words', 'averaged_perceptron_tagger', 'brown', 'stopwords']
            missing = []
            
            for data in required_data:
                try:
                    nltk.data.find(f'tokenizers/{data}' if data == 'punkt' else 
                                  f'taggers/{data}' if 'tagger' in data else f'corpora/{data}')
                except LookupError:
                    missing.append(data)
            
            for data in missing:
                nltk.download(data, download_dir=nltk_data_path, quiet=True)
                
            self.nltk_available = True
        except Exception as e:
            self.nltk_available = False
    
    def create_dict(self):
        """Create word dictionaries and stopwords for text processing"""
        try:
            if not self.nltk_available:
                return set(), Counter(), set()
            
            from nltk.corpus import brown, words, stopwords
            english_words = set(words.words())
            brown_words = [w.lower() for w in brown.words() if w.isalpha()]
            word_freq = Counter(brown_words)
            stop_words = set(stopwords.words('english'))
            return english_words, word_freq, stop_words
        except:
            return set(), Counter(), set()
    
    def edit_distance(self, s1, s2):
        """Calculate edit distance between strings"""
        if len(s1) < len(s2): 
            return self.edit_distance(s2, s1)
        if not s2: 
            return len(s1)
        
        prev = list(range(len(s2) + 1))
        for i, c1 in enumerate(s1):
            curr = [i + 1]
            for j, c2 in enumerate(s2):
                ins, dele, sub = prev[j+1]+1, curr[j]+1, prev[j]+(c1!=c2)
                curr.append(min(ins, dele, sub))
            prev = curr
        return prev[-1]
    
    def spell_check(self, word, max_d=2):
        """Spell check using edit distance and frequency"""
        word = word.lower()
        if not self.english_words or word in self.english_words:
            return word
        
        candidates = []
        for w in self.english_words:
            if abs(len(w) - len(word)) <= max_d:
                dist = self.edit_distance(word, w)
                if dist <= max_d:
                    candidates.append((w, dist, self.word_freq.get(w, 1)))
        
        if candidates:
            return sorted(candidates, key=lambda x: (x[1], -x[2]))[0][0]
        return word
    
    def correct_text(self, text):
        """Advanced text correction with POS tagging"""
        if not self.nltk_available or not text.strip():
            return self.simple_correct(text)
        
        try:
            from nltk.tokenize import word_tokenize, sent_tokenize
            from nltk.tag import pos_tag
            
            sentences = sent_tokenize(text)
            corrected_sentences = []
            
            for sentence in sentences:
                words = word_tokenize(sentence)
                pos_tags = pos_tag(words)
                corrected = []
                
                for word, pos in pos_tags:
                    if word in string.punctuation:
                        corrected.append(word)
                        continue
                    
                    # Context-aware corrections
                    if pos.startswith('NN'):  # Nouns
                        word = re.sub(r'rn', 'm', word)
                        word = re.sub(r'cl', 'd', word)
                    elif pos.startswith('VB'):  # Verbs
                        word = re.sub(r'1ng', 'ing', word)
                    elif pos.startswith('DT'):  # Determiners
                        if word.lower() in ['tlle', 'tl1e']:
                            word = 'the'
                    
                    # OCR fixes
                    if any(c.isdigit() for c in word) and any(c.isalpha() for c in word):
                        word = re.sub(r'0', 'o', word)
                        word = re.sub(r'1', 'l', word)
                        word = re.sub(r'5', 'S', word)
                        word = re.sub(r'8', 'B', word)
                    
                    # Spell check
                    if word.isalpha() and len(word) > 1:
                        word = self.spell_check(word)
                    
                    corrected.append(word)
                
                # Reconstruct sentence
                s = ' '.join(corrected)
                s = re.sub(r'\s+([,.!?;:])', r'\1', s)
                s = re.sub(r'([,.!?;:])\s*([A-Za-z])', r'\1 \2', s)
                corrected_sentences.append(s)
            
            return re.sub(r'\s+', ' ', ' '.join(corrected_sentences)).strip()
        except:
            return self.simple_correct(text)
    
    def simple_correct(self, text):
        """Fallback correction without advanced features"""
        corrections = [
            (r'rn', 'm'), (r'cl', 'd'), (r'vv', 'w'),
            (r'(\w)1(\w)', r'\1l\2'), (r'(\w)0(\w)', r'\1o\2'),
            (r'\s+', ' '), (r'\btlle\b', 'the'), (r'\btl1e\b', 'the'),
            (r'\s+([,.!?;:])', r'\1')
        ]
        
        corrected = text
        for pattern, replacement in corrections:
            corrected = re.sub(pattern, replacement, corrected)
        return corrected.strip()
    
    def process_normal_text(self, text):
        """Process normal PDF text with NLTK cleaning and stopword filtering"""
        if not text.strip():
            return text.strip()
        
        try:
            # Clean up common PDF extraction issues
            text = re.sub(r'\s+', ' ', text)  # Multiple spaces
            text = re.sub(r'([a-z])([A-Z])', r'\1 \2', text)  # CamelCase splitting
            text = re.sub(r'(\w)([.!?])', r'\1\2 ', text)  # Punctuation spacing
            
            if not self.nltk_available:
                return text.strip()
            
            from nltk.tokenize import word_tokenize, sent_tokenize
            
            # Sentence tokenization and cleaning
            sentences = sent_tokenize(text)
            cleaned_sentences = []
            
            for sentence in sentences:
                # Remove very short sentences (likely extraction errors)
                if len(sentence.split()) < 3:
                    continue
                
                # Remove excessive stopwords from sentence
                words = word_tokenize(sentence.lower())
                
                # Keep sentence structure but filter out excessive stopwords
                # Only remove if more than 60% of words are stopwords
                content_words = [w for w in words if w not in self.stop_words and w.isalpha()]
                if len(content_words) > len(words) * 0.4:  # At least 40% content words
                    cleaned_sentences.append(sentence.strip())
            
            return ' '.join(cleaned_sentences)
        except:
            return text.strip()
    
    def analyze_content(self, text):
        """AI-like content analysis for frontend"""
        if not text.strip():
            return {
                "concepts": [],
                "difficulty": "Unknown",
                "word_count": 0,
                "estimated_reading_time": 0,
                "key_topics": [],
                "confidence_score": 0.0
            }
        
        words = text.split()
        word_count = len(words)
        
        # Simple concept extraction (can be enhanced with NLP models)
        concepts = []
        key_terms = []
        
        # Look for academic/technical terms
        if self.nltk_available:
            try:
                from nltk.tokenize import word_tokenize
                from nltk.tag import pos_tag
                
                tokens = word_tokenize(text.lower())
                pos_tags = pos_tag(tokens)
                
                # Extract nouns as potential concepts (excluding stopwords)
                nouns = [word for word, pos in pos_tags 
                        if pos.startswith('NN') and len(word) > 3 
                        and word not in self.stop_words]
                noun_freq = Counter(nouns)
                concepts = [word.title() for word, freq in noun_freq.most_common(8) if freq > 1]
                
                # Extract proper nouns as key topics
                proper_nouns = [word for word, pos in pos_tags 
                              if pos == 'NNP' and len(word) > 2]
                key_topics = list(set(proper_nouns))[:5]
                
            except:
                # Fallback to simple word analysis
                words_clean = [w.lower().strip('.,!?;:') for w in words 
                             if len(w) > 4 and w.lower() not in self.stop_words]
                word_freq = Counter(words_clean)
                concepts = [w.title() for w, f in word_freq.most_common(6)]
                key_topics = concepts[:3]
        else:
            # Simple analysis without NLTK
            basic_stopwords = {'the', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by', 'this', 'that', 'these', 'those'}
            words_clean = [w.lower().strip('.,!?;:') for w in words 
                         if len(w) > 4 and w.lower() not in basic_stopwords]
            word_freq = Counter(words_clean)
            concepts = [w.title() for w, f in word_freq.most_common(6)]
            key_topics = concepts[:3]
        
        # Estimate difficulty based on word complexity
        complex_words = [w for w in words if len(w) > 8]
        complexity_ratio = len(complex_words) / max(word_count, 1)
        
        if complexity_ratio > 0.15:
            difficulty = "Advanced"
        elif complexity_ratio > 0.08:
            difficulty = "Intermediate"
        else:
            difficulty = "Beginner"
        
        # Reading time (average 200 words per minute)
        reading_time = max(1, round(word_count / 200))
        
        # Confidence score based on text quality
        confidence_score = min(1.0, max(0.1, (word_count / 100) * 0.8 + (len(concepts) / 10) * 0.2))
        
        return {
            "concepts": concepts,
            "difficulty": difficulty,
            "word_count": word_count,
            "estimated_reading_time": reading_time,
            "key_topics": key_topics,
            "confidence_score": round(confidence_score, 2)
        }
    
    def process_file(self, file_path, user_id=None):
        """Main processing pipeline for frontend integration"""
        try:
            if not os.path.exists(file_path):
                raise FileNotFoundError(f"File not found: {file_path}")
            
            # Process based on file type
            file_ext = os.path.splitext(file_path)[1].lower()
            
            if file_ext == '.pdf':
                result = self.process_pdf(file_path)
            elif file_ext in ['.jpg', '.jpeg', '.png', '.bmp', '.tiff']:
                result = self.process_image(file_path)
            else:
                raise ValueError(f"Unsupported file type: {file_ext}")
            
            # Analyze content
            all_text = result['extracted_text']
            analysis = self.analyze_content(all_text)
            
            # Create Firebase-ready JSON structure (Firebase will add timestamp and ID)
            firebase_data = {
                "user_id": user_id or "anonymous",
                "file_info": {
                    "original_name": os.path.basename(file_path),
                    "file_type": file_ext,
                    "file_size": os.path.getsize(file_path),
                    "processing_method": result['processing_method']
                },
                "extraction_results": {
                    "raw_text": result['raw_text'],
                    "corrected_text": result['corrected_text'],
                    "extracted_text": all_text,
                    "pages_processed": result['pages_processed'],
                    "images_processed": result['images_processed']
                },
                "ai_analysis": analysis,
                "processing_metadata": {
                    "nltk_available": self.nltk_available,
                    "processing_time": result.get('processing_time', 0),
                    "corrections_applied": result.get('corrections_applied', 0)
                }
            }
            
            return firebase_data
            
        except Exception as e:
            # Return error structure for Firebase
            return {
                "user_id": user_id or "anonymous",
                "error": True,
                "error_message": str(e),
                "file_info": {
                    "original_name": os.path.basename(file_path) if os.path.exists(file_path) else "unknown",
                    "file_type": "unknown",
                    "processing_method": "failed"
                }
            }
    
    def process_pdf(self, pdf_path):
        """Process PDF file with hybrid approach"""
        start_time = datetime.now()
        text_blocks = []
        image_blocks = []
        raw_texts = []
        corrected_texts = []
        corrections_count = 0
        
        # Extract native text with pdfplumber
        try:
            with pdfplumber.open(pdf_path) as pdf:
                for page_num, page in enumerate(pdf.pages, 1):
                    text = page.extract_text() or ''
                    if text.strip():
                        # Process normal text with NLTK cleaning
                        cleaned_text = self.process_normal_text(text)
                        text_blocks.append({
                            "page": page_num,
                            "type": "native_text",
                            "content": cleaned_text
                        })
        except Exception as e:
            raise Exception(f"Error with pdfplumber: {e}")
        
        # Extract and OCR images
        try:
            pdf_doc = fitz.open(pdf_path)
            for page_num, page in enumerate(pdf_doc, 1):
                images = page.get_images(full=True)
                for img_idx, img_info in enumerate(images, 1):
                    try:
                        xref = img_info[0]
                        base_img = pdf_doc.extract_image(xref)
                        image_bytes = base_img["image"]
                        
                        # Process image
                        pil_img = Image.open(io.BytesIO(image_bytes)).convert('RGB')
                        pil_img = ImageEnhance.Contrast(pil_img).enhance(1.5)
                        pil_img = ImageEnhance.Sharpness(pil_img).enhance(2.0)
                        
                        img_cv = cv2.cvtColor(np.array(pil_img), cv2.COLOR_RGB2BGR)
                        raw_text = pytesseract.image_to_string(img_cv, config='--oem 3 --psm 6 -l eng').strip()
                        
                        if raw_text:
                            corrected_text = self.correct_text(raw_text)
                            corrections_count += len(raw_text.split()) - len(corrected_text.split())
                            
                            raw_texts.append(raw_text)
                            corrected_texts.append(corrected_text)
                            
                            image_blocks.append({
                                "page": page_num,
                                "image": img_idx,
                                "type": "ocr_text",
                                "raw_content": raw_text,
                                "corrected_content": corrected_text
                            })
                    except Exception:
                        continue
            pdf_doc.close()
        except Exception as e:
            raise Exception(f"Error with image processing: {e}")
        
        # Combine all text
        all_text_parts = []
        for block in text_blocks:
            all_text_parts.append(block['content'])
        for block in image_blocks:
            all_text_parts.append(block['corrected_content'])
        
        processing_time = (datetime.now() - start_time).total_seconds()
        
        return {
            "processing_method": "hybrid_pdf",
            "extracted_text": "\n\n".join(all_text_parts),
            "raw_text": "\n\n".join(raw_texts),
            "corrected_text": "\n\n".join(corrected_texts),
            "pages_processed": len(set([b['page'] for b in text_blocks + image_blocks])),
            "images_processed": len(image_blocks),
            "processing_time": round(processing_time, 2),
            "corrections_applied": abs(corrections_count),
            "detailed_blocks": text_blocks + image_blocks
        }
    
    def process_image(self, image_path):
        """Process single image file"""
        start_time = datetime.now()
        
        try:
            # Load and enhance image
            pil_img = Image.open(image_path).convert('RGB')
            pil_img = ImageEnhance.Contrast(pil_img).enhance(1.5)
            pil_img = ImageEnhance.Sharpness(pil_img).enhance(2.0)
            
            # Convert to OpenCV format
            img_cv = cv2.cvtColor(np.array(pil_img), cv2.COLOR_RGB2BGR)
            
            # OCR
            raw_text = pytesseract.image_to_string(img_cv, config='--oem 3 --psm 6 -l eng').strip()
            corrected_text = self.correct_text(raw_text) if raw_text else ""
            
            processing_time = (datetime.now() - start_time).total_seconds()
            corrections_count = len(raw_text.split()) - len(corrected_text.split()) if raw_text and corrected_text else 0
            
            return {
                "processing_method": "image_ocr",
                "extracted_text": corrected_text,
                "raw_text": raw_text,
                "corrected_text": corrected_text,
                "pages_processed": 1,
                "images_processed": 1,
                "processing_time": round(processing_time, 2),
                "corrections_applied": abs(corrections_count)
            }
        except Exception as e:
            raise Exception(f"Error processing image: {e}")

# Pipeline usage functions for frontend integration
def process_file_for_frontend(file_path, user_id=None, output_dir="./firebase_data"):
    """
    Main function to be called from frontend
    Returns JSON data ready for Firebase storage
    """
    pipeline = OCRPipeline()
    result = pipeline.process_file(file_path, user_id)
    
    # Save JSON file for Firebase upload (Firebase will generate document ID and timestamp)
    os.makedirs(output_dir, exist_ok=True)
    
    # Use filename-based naming since Firebase will handle document IDs
    filename = os.path.splitext(os.path.basename(file_path))[0]
    json_filename = f"{filename}_processed.json"
    json_path = os.path.join(output_dir, json_filename)
    
    with open(json_path, 'w', encoding='utf-8') as f:
        json.dump(result, f, indent=2, ensure_ascii=False)
    
    return result, json_path

# Test the pipeline
if __name__ == "__main__":
    test_file = "./Tests files/MC test.pdf"
    test_user = "user_123"
    
    print("🚀 Starting OCR Pipeline...")
    
    try:
        firebase_data, json_file = process_file_for_frontend(test_file, test_user)
        
        # Simple success output
        print(f"✅ Extraction successful!")
        print(f"   📄 {firebase_data['extraction_results']['pages_processed']} pages processed")
        print(f"   🖼️  {firebase_data['extraction_results']['images_processed']} images processed") 
        print(f"   📝 {firebase_data['ai_analysis']['word_count']} words extracted")
        print(f"   💾 JSON saved: {os.path.basename(json_file)}")
        
    except Exception as e:
        print(f"❌ Error: {e}")

🚀 Starting OCR Pipeline...
✅ Extraction successful!
   📄 6 pages processed
   🖼️  18 images processed
   📝 1994 words extracted
   💾 JSON saved: MC test_processed.json
✅ Extraction successful!
   📄 6 pages processed
   🖼️  18 images processed
   📝 1994 words extracted
   💾 JSON saved: MC test_processed.json


## Simple Flask Server for Frontend Integration

In [None]:
# Simple Flask server to integrate OCR pipeline with React frontend
# Run this in a separate terminal: python ocr_server.py

from flask import Flask, request, jsonify
from flask_cors import CORS
import os
import tempfile
import shutil

app = Flask(__name__)
CORS(app)  # Enable CORS for React frontend

@app.route('/api/ocr/process', methods=['POST'])
def process_ocr():
    try:
        # Check if file is in request
        if 'file' not in request.files:
            return jsonify({'error': 'No file provided'}), 400
        
        file = request.files['file']
        if file.filename == '':
            return jsonify({'error': 'No file selected'}), 400
        
        # Get user_id from form data (optional)
        user_id = request.form.get('user_id', 'anonymous')
        
        # Create temporary file
        with tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(file.filename)[1]) as temp_file:
            file.save(temp_file.name)
            temp_path = temp_file.name
        
        try:
            # Process file using our OCR pipeline
            result, json_path = process_file_for_frontend(temp_path, user_id)
            
            # Return the processed data
            response_data = {
                'success': True,
                'data': result,
                'message': 'File processed successfully'
            }
            
            return jsonify(response_data)
            
        finally:
            # Clean up temporary file
            if os.path.exists(temp_path):
                os.unlink(temp_path)
                
    except Exception as e:
        return jsonify({'error': str(e), 'success': False}), 500

@app.route('/api/ocr/health', methods=['GET'])
def health_check():
    return jsonify({'status': 'healthy', 'message': 'OCR service is running'})

if __name__ == '__main__':
    print("🚀 Starting OCR Server...")
    print("📡 Server will run on http://localhost:5000")
    print("🔗 Frontend can send POST requests to http://localhost:5000/api/ocr/process")
    app.run(debug=True, port=5000)

# To run this server:
# 1. Save this cell content to a file named 'ocr_server.py'
# 2. Install Flask: pip install flask flask-cors
# 3. Run: python ocr_server.py