In [None]:
!pip install PyMuPDF python-docx easyocr pywin32

In [None]:
import os
import numpy as np
import pymupdf 
import cv2
import easyocr
from docx import Document
from docx.oxml.ns import qn
import subprocess
import tempfile
import platform
import io
import zipfile
import tempfile
from PIL import Image
import win32com.client

In [None]:
def ProcessFile(file_path):
    """Extract text and image OCR results from a PDF, DOCX or legacy DOC file.

    The handler is picked from the file extension (case-insensitive). A
    ``.doc`` file is first converted to a temporary PDF, which is processed
    and then deleted.

    Args:
        file_path: Path of the document to process.

    Returns:
        The value returned by ``ProcessPdf`` / ``ProcessDocx`` for the file,
        or ``None`` when the path does not exist or the extension is not
        supported (a message is printed in both cases).

    Raises:
        Exception: Propagated from ``ConvertDocToPdf`` when a ``.doc`` file
            cannot be converted.
    """
    if not os.path.exists(file_path):
        print(f"Error: File '{file_path}' does not exist.")
        return None

    ext = os.path.splitext(file_path)[1].lower()

    if ext == '.pdf':
        return ProcessPdf(file_path)

    if ext == '.docx':
        return ProcessDocx(file_path)

    if ext == '.doc':
        pdf_path = ConvertDocToPdf(file_path)
        if not pdf_path:
            return None
        try:
            return ProcessPdf(pdf_path)
        finally:
            # Remove the intermediate PDF even if processing is interrupted;
            # a failed delete is only worth a warning.
            try:
                os.remove(pdf_path)
            except Exception as e:
                print(f"Warning: Could not delete temporary file {pdf_path}: {e}")

    print(f"Error: Unsupported file format '{ext}'")
    return None

In [None]:
def ConvertDocToPdf(doc_path):
    """Convert a ``.doc`` file to a PDF stored in the system temp directory.

    Microsoft Word (via ``ConvertWithWindows``) is used on Windows and
    LibreOffice (via ``ConvertWithLibreOffice``) on every other platform.

    Args:
        doc_path: Path of the document to convert; made absolute before it is
            handed to the converter.

    Returns:
        Path of the generated PDF. The caller is responsible for deleting it.

    Raises:
        Exception: If the converter raises, or finishes without producing the
            output file.
    """
    # Local import: only needed here, to build a unique file name through a
    # public API instead of the private tempfile._get_candidate_names().
    import uuid

    doc_path = os.path.abspath(doc_path)
    temp_pdf_path = os.path.join(tempfile.gettempdir(), f"{uuid.uuid4().hex}.pdf")

    if platform.system() == "Windows":
        ConvertWithWindows(doc_path, temp_pdf_path)
    else:
        ConvertWithLibreOffice(doc_path, temp_pdf_path)

    if not os.path.exists(temp_pdf_path):
        raise Exception("PDF conversion failed: output file not found")
    return temp_pdf_path

In [None]:
def ConvertWithWindows(doc_path, pdf_path):
    """Convert a document to PDF by automating Microsoft Word over COM.

    Args:
        doc_path: Path of the document to open in Word.
        pdf_path: Path the PDF is saved to.

    Raises:
        Exception: Wraps any error raised while opening or saving the
            document; the original error is chained as ``__cause__``.
            Errors raised while closing the document or quitting Word
            propagate as raised by the COM layer.
    """
    # Imported here because pywin32 is a Windows-only package and this
    # function is the only place that uses it.
    import win32com.client

    word = win32com.client.Dispatch("Word.Application")
    word.Visible = False

    doc = None
    try:
        doc = word.Documents.Open(doc_path)
        doc.SaveAs(pdf_path, FileFormat=17)  # 17 == wdFormatPDF
    except Exception as e:
        raise Exception(f"Error converting document with Word: {e}") from e
    finally:
        try:
            if doc is not None:
                # 0 == wdDoNotSaveChanges: close without a save prompt, which
                # nobody could answer while Word is hidden. Runs on the error
                # path too, so a failed save does not leave the file open.
                doc.Close(0)
        finally:
            word.Quit()

def ConvertWithLibreOffice(doc_path, pdf_path):
    """Convert a document to PDF with a headless LibreOffice process.

    LibreOffice is told to write into a private work directory created next
    to ``pdf_path``; the result is then moved onto ``pdf_path``. The output
    is expected to be named after the input's base name with a ``.pdf``
    extension. If no such file was produced the function returns without
    creating ``pdf_path``, so the caller must check for it.

    Args:
        doc_path: Path of the document to convert.
        pdf_path: Destination path for the PDF.

    Raises:
        Exception: If neither ``libreoffice`` nor ``soffice`` is on ``PATH``,
            or if the LibreOffice process exits with a non-zero status.
    """
    # Local import: shutil is not part of the notebook's import cell.
    import shutil

    # Look the executable up on PATH instead of spawning "--version" probes.
    libreoffice = next(
        (name for name in ("libreoffice", "soffice") if shutil.which(name)),
        None,
    )
    if libreoffice is None:
        raise Exception("LibreOffice not found. Please install LibreOffice to convert .doc files.")

    target_dir = os.path.dirname(os.path.abspath(pdf_path))
    os.makedirs(target_dir, exist_ok=True)

    # Converting into a fresh directory means an unrelated "<name>.pdf" that
    # already sits in target_dir is never overwritten or moved away.
    with tempfile.TemporaryDirectory(dir=target_dir) as work_dir:
        process = subprocess.run(
            [
                libreoffice,
                "--headless",
                "--convert-to", "pdf",
                "--outdir", work_dir,
                doc_path,
            ],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )

        if process.returncode != 0:
            # errors="replace": undecodable bytes in stderr must not hide the
            # real failure behind a UnicodeDecodeError.
            error_text = process.stderr.decode('utf-8', errors='replace')
            raise Exception(f"LibreOffice conversion failed: {error_text}")

        expected_filename = os.path.splitext(os.path.basename(doc_path))[0] + ".pdf"
        produced_path = os.path.join(work_dir, expected_filename)

        if os.path.exists(produced_path):
            # work_dir lives inside target_dir, so this is a same-filesystem
            # rename that also replaces an existing pdf_path.
            os.replace(produced_path, pdf_path)

In [None]:
def ExtractTextFromPdf(page):
    """Return the word strings found on ``page`` as a list.

    ``page.get_text("words")`` yields one record per word; the word itself is
    read from index 4 of each record, and the records' order is kept.
    """
    tokens = []
    for record in page.get_text("words"):
        tokens.append(record[4])
    return tokens

In [None]:
def ExtractTextFromDocx(doc):
    """Collect the non-empty text of every run in ``doc.paragraphs``.

    Paragraphs and their runs are visited in order; runs whose ``text`` is
    empty are skipped.
    """
    return [
        chunk
        for paragraph in doc.paragraphs
        for chunk in (piece.text for piece in paragraph.runs)
        if chunk
    ]

In [None]:
_FACE_CASCADE = None  # created on first use, then shared by every DetectFace call


def _get_face_cascade():
    """Load the frontal-face Haar cascade once and reuse it afterwards."""
    global _FACE_CASCADE
    if _FACE_CASCADE is None:
        _FACE_CASCADE = cv2.CascadeClassifier(
            cv2.data.haarcascades + 'haarcascade_frontalface_default.xml'
        )
    return _FACE_CASCADE


def DetectFace(image_np):
    """Report whether the Haar frontal-face cascade finds a face in an image.

    Args:
        image_np: Image array. A 2-D array is used as grayscale as-is, a
            4-channel array is converted as RGBA, anything else is converted
            as RGB. NOTE(review): the cascade presumably needs 8-bit data;
            confirm callers never pass other dtypes.

    Returns:
        ``True`` if at least one face is detected, otherwise ``False``.
    """
    if image_np.ndim == 2:
        # Already single-channel; cvtColor would reject this input.
        gray = image_np
    elif image_np.shape[2] == 4:
        gray = cv2.cvtColor(image_np, cv2.COLOR_RGBA2GRAY)
    else:
        gray = cv2.cvtColor(image_np, cv2.COLOR_RGB2GRAY)

    faces = _get_face_cascade().detectMultiScale(
        gray,
        scaleFactor=1.1,
        minNeighbors=5,
        minSize=(30, 30)
    )

    return len(faces) > 0

In [None]:
def ocr(image_data, reader=None):
    """Run OCR and face detection on one encoded image.

    Args:
        image_data: Raw bytes of an image file.
        reader: An existing ``easyocr.Reader`` to reuse. When ``None`` a new
            English, CPU-only reader is created.

    Returns:
        A tuple ``(ocr_text, reader, has_face)``: the recognised text joined
        with spaces (or ``"No text detected in image"`` when nothing was
        recognised), the reader that was used (so the caller can pass it to
        the next call), and the result of ``DetectFace``.
    """
    if reader is None:
        reader = easyocr.Reader(['en'], gpu=False)

    # Normalise to RGB so palette, grayscale, alpha and CMYK images all reach
    # the detector and the OCR engine as a 3-channel array; the context
    # manager releases the image handle afterwards.
    with Image.open(io.BytesIO(image_data)) as image:
        image_np = np.array(image.convert("RGB"))

    has_face = DetectFace(image_np)

    ocr_results = reader.readtext(image_np, detail=0)
    # `or` also covers results that are nothing but whitespace.
    ocr_text = " ".join(ocr_results).strip() or "No text detected in image"

    return ocr_text, reader, has_face

In [None]:
def ProcessPdf(pdf_path):
    """Extract per-page text and OCR every embedded image of a PDF.

    Args:
        pdf_path: Path of the PDF file.

    Returns:
        A tuple ``(extracted_text, ocr_results)``.

        ``extracted_text`` has one string per page: the page's words joined
        with spaces, or ``"No text found."``.

        ``ocr_results`` has one dict per successfully processed image, with
        the keys ``"page"`` (1-based), ``"image_index"`` (0-based, counted
        across the whole document), ``"text"`` and ``"has_face"``.

        An image that fails is reported and skipped. Any other error is
        printed and ``([], [])`` is returned.
    """
    try:
        extracted_text = []
        ocr_results = []
        ocr_reader = None
        img_index = 0

        # The context manager closes the document on the error path as well.
        with pymupdf.open(pdf_path) as doc:
            for page_num in range(len(doc)):
                page = doc.load_page(page_num)
                page_text = ExtractTextFromPdf(page)
                images = page.get_images(full=True)

                extracted_text.append(" ".join(page_text) if page_text else "No text found.")

                for img in images:
                    xref = img[0]
                    try:
                        base_image = doc.extract_image(xref)
                        if base_image:
                            # Keep the returned reader so it is built only once.
                            ocr_text, ocr_reader, has_face = ocr(base_image["image"], ocr_reader)
                            ocr_results.append({
                                "page": page_num + 1,
                                "image_index": img_index,
                                "text": ocr_text,
                                "has_face": has_face
                            })
                            img_index += 1
                    except Exception as img_error:
                        print(f"Error extracting image {xref} on page {page_num + 1}: {img_error}")
                        continue

        return extracted_text, ocr_results

    except Exception as e:
        print(f"Error processing PDF: {e}")
        return [], []

In [None]:
def ProcessDocx(docx_path):
    """Extract the text of a DOCX file and OCR its embedded media.

    Args:
        docx_path: Path of the ``.docx`` file.

    Returns:
        A tuple ``(extracted_text, ocr_results)``.

        ``extracted_text`` holds a single string: all run texts joined with
        spaces, or ``"No Text Found."``.

        ``ocr_results`` has one dict per successfully processed file under
        ``word/media/``, with the keys ``"image_index"`` (0-based),
        ``"text"`` and ``"has_face"``.

        A media file that cannot be processed is reported and skipped. Any
        other error is printed and ``([], [])`` is returned.
    """
    try:
        doc = Document(docx_path)
        extracted_text = []
        ocr_results = []
        ocr_reader = None
        img_index = 0

        page_text = ExtractTextFromDocx(doc)
        extracted_text.append(" ".join(page_text) if page_text else "No Text Found.")

        # A .docx is a zip archive; embedded media live under word/media/.
        with zipfile.ZipFile(docx_path, 'r') as zip_ref:
            for file in zip_ref.namelist():
                if not file.startswith('word/media/') or file.endswith('/'):
                    continue
                try:
                    image_data = zip_ref.read(file)
                    # Keep the returned reader so it is built once, not once
                    # per image.
                    ocr_text, ocr_reader, has_face = ocr(image_data, ocr_reader)
                except Exception as img_error:
                    # One unreadable media file should not discard the rest
                    # of the document.
                    print(f"Error processing image {file}: {img_error}")
                    continue

                ocr_results.append({
                    "image_index": img_index,  # starts from 0
                    "text": ocr_text,
                    "has_face": has_face
                })
                img_index += 1

        return extracted_text, ocr_results

    except Exception as e:
        print(f"ERROR processing DOCX: {e}")
        return [], []

In [None]:
# Demo cell: run the extraction pipeline on one sample file and print the result.
if __name__ == "__main__":
    # Hard-coded sample path (looks like a Kaggle input dataset); point it at your own file.
    file_path = "/kaggle/input/testing-files/test_pdf.pdf" 
    # ProcessFile returns None when the path is missing or the extension is unsupported.
    file = ProcessFile(file_path)
    print(file)