# Unicode mismap identifier
Running simple OCR on the PDF and overlaying the PDF with the OCR text can help us work around the problem of Unicode mismapping, but it cannot tell us where the Unicode mapping is going wrong, or for which characters. Here we address that particular issue.

## Process
1. Get OCR extracted text
2. Get raw text (which maps to the current unicode)
3. Compare and identify

In [None]:
import os
from pdf2image import convert_from_path
import pytesseract
import pdfplumber
from difflib import HtmlDiff


def extract_ocr_text(pdf_path):
    """Run OCR on every page of a PDF and return the recognised text per page.

    Each page is rendered to an image with ``convert_from_path`` and passed
    to Tesseract using the ``--psm 6`` page segmentation mode.

    Args:
        pdf_path: Path of the PDF file to render and OCR.

    Returns:
        A list of strings, one per rendered page, in page order.
    """
    # Pass the rendered page image straight to pytesseract instead of saving
    # it as "page_<i>.jpg" in the working directory first. That avoids
    # overwriting (and then deleting) an unrelated file with the same name,
    # leaves nothing behind if OCR raises, and skips a lossy JPEG re-encode
    # that can only degrade recognition.
    return [
        pytesseract.image_to_string(page_image, config='--psm 6')
        for page_image in convert_from_path(pdf_path)
    ]


def extract_raw_text(pdf_path):
    """Return the embedded (non-OCR) text of each page of a PDF.

    Args:
        pdf_path: Path of the PDF file to open with pdfplumber.

    Returns:
        A list holding the result of ``page.extract_text()`` for every page,
        in page order.
    """
    with pdfplumber.open(pdf_path) as document:
        page_texts = [page.extract_text() for page in document.pages]

    return page_texts


def generate_diff_report(raw_text_list, ocr_text_list, output_path="diff_report.html"):
    """Write one HTML page comparing raw and OCR text, page by page.

    Args:
        raw_text_list: Per-page text taken from the PDF's text layer. An entry
            that is ``None`` or empty is treated as a page with no lines.
        ocr_text_list: Per-page OCR text, in the same page order.
        output_path: Destination of the UTF-8 encoded HTML report.

    Only pages present in both lists are compared; a warning is printed when
    the two lists differ in length.
    """
    if len(raw_text_list) != len(ocr_text_list):
        print(
            f"Warning: page count mismatch (raw: {len(raw_text_list)}, "
            f"OCR: {len(ocr_text_list)}); comparing only the common pages."
        )

    differ = HtmlDiff()

    # HtmlDiff.make_file() returns a complete HTML document, so concatenating
    # one per page yields several <html> documents in a single file. Build one
    # diff table per page with make_table() and wrap them all in one document.
    page_tables = []
    for page_no, (raw_page, ocr_page) in enumerate(
        zip(raw_text_list, ocr_text_list), start=1
    ):
        raw_lines = raw_page.splitlines() if raw_page else []
        ocr_lines = ocr_page.splitlines() if ocr_page else []
        page_tables.append(
            differ.make_table(
                raw_lines,
                ocr_lines,
                fromdesc=f'Raw Text Page {page_no}',
                todesc=f'OCR Text Page {page_no}',
                context=True,
            )
        )

    # CSS classes referenced by the markup that make_table() emits.
    header = (
        '<!DOCTYPE html>\n'
        '<html>\n'
        '<head>\n'
        '    <meta charset="utf-8" />\n'
        '    <title>Raw vs OCR text diff</title>\n'
        '    <style type="text/css">\n'
        '        table.diff {font-family:Courier; border:medium;}\n'
        '        .diff_header {background-color:#e0e0e0}\n'
        '        td.diff_header {text-align:right}\n'
        '        .diff_next {background-color:#c0c0c0}\n'
        '        .diff_add {background-color:#aaffaa}\n'
        '        .diff_chg {background-color:#ffff77}\n'
        '        .diff_sub {background-color:#ffaaaa}\n'
        '    </style>\n'
        '</head>\n'
        '<body>'
    )
    legend = (
        '<table class="diff" summary="Legends">\n'
        '    <tr><th>Colors</th></tr>\n'
        '    <tr><td class="diff_add">&nbsp;Added&nbsp;</td></tr>\n'
        '    <tr><td class="diff_chg">Changed</td></tr>\n'
        '    <tr><td class="diff_sub">Deleted</td></tr>\n'
        '</table>'
    )
    footer = '</body>\n</html>\n'

    report = "\n".join([header, *page_tables, legend, footer])

    with open(output_path, "w", encoding="utf-8") as f:
        f.write(report)

    print(f"Diff report generated: {output_path}")

# Entry point: run the whole raw-vs-OCR comparison for a single PDF
def process_pdf(pdf_path):
    """Extract OCR text and raw text from a PDF, then write their diff report.

    Progress messages are printed before each stage. The report is written
    to ``multi_page_diff_report.html``.

    Args:
        pdf_path: Path of the PDF file to analyse.
    """
    # Stage 1: text recognised from the rendered page images
    print("Extracting OCR text from PDF...")
    ocr_pages = extract_ocr_text(pdf_path)

    # Stage 2: text as stored in the PDF, read through pdfplumber
    print("Extracting raw text from PDF...")
    raw_pages = extract_raw_text(pdf_path)

    # Stage 3: HTML comparison of the two extractions
    print("Generating diff report...")
    generate_diff_report(
        raw_pages,
        ocr_pages,
        output_path="multi_page_diff_report.html",
    )


# Run the full pipeline on "apple.pdf". Both the input path and the report
# path used by process_pdf ("multi_page_diff_report.html") are relative, so
# they resolve against the current working directory.
process_pdf("apple.pdf")