In [6]:
import fitz
import pdf2image
import pytesseract
import os
from typing import Optional
import docx

In [7]:
class ResumeRouter:
    """Route a resume file to the matching text-extraction pipeline.

    The file extension selects the pipeline:

    * ``.docx`` / ``.doc`` -> python-docx paragraph and table extraction
    * ``.pdf``             -> PyMuPDF text extraction, or Tesseract OCR when
      the first page carries (almost) no embedded text
    """

    # A first page yielding fewer stripped characters than this is treated
    # as image-only, i.e. a scanned PDF that needs OCR.
    MIN_NATIVE_TEXT_CHARS = 5

    def __init__(self, tesseract_cmd: Optional[str] = None):
        """
        Args:
            tesseract_cmd: Optional path to the Tesseract executable. When
                given, it overrides pytesseract's module-level default.
        """
        if tesseract_cmd:
            pytesseract.pytesseract.tesseract_cmd = tesseract_cmd

    def route_and_parse(self, file_path: str) -> str:
        """Detect the file type and return the text extracted from it.

        Args:
            file_path: Path to a ``.pdf``, ``.docx`` or ``.doc`` file.

        Returns:
            str: The extracted text.

        Raises:
            FileNotFoundError: If ``file_path`` does not exist.
            ValueError: If the extension is not one of the supported ones.
        """
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"The file {file_path} does not exist.")

        # Compare extensions case-insensitively (".PDF" == ".pdf").
        _, ext = os.path.splitext(file_path)
        ext = ext.lower()

        if ext in (".docx", ".doc"):
            # NOTE(review): python-docx presumably reads only OOXML (.docx);
            # a legacy binary .doc will likely fail inside docx.Document.
            # Routing is kept as-is for backward compatibility - confirm.
            print("[LOG] DOCX detected → DOCX parser")
            return self._parse_docx(file_path)

        if ext == ".pdf":
            if self._is_scanned_pdf(file_path):
                print("[LOG] Scanned PDF detected → OCR pipeline")
                return self._pipeline_ocr(file_path)
            print("[LOG] Native PDF detected → Text extraction pipeline")
            return self._pipeline_native_pdf(file_path)

        raise ValueError(f"Unsupported file format: {ext}")

    def _is_scanned_pdf(self, pdf_path: str) -> bool:
        """Heuristically decide whether a PDF is scanned (image-based).

        Only the first page is inspected: if its extracted text, stripped of
        surrounding whitespace, is shorter than ``MIN_NATIVE_TEXT_CHARS``,
        the PDF is considered scanned. A PDF with zero pages is also
        reported as scanned.

        Any error while opening or reading the PDF is logged and reported as
        scanned, so the caller falls back to the OCR pipeline.

        Returns:
            bool: True if scanned, False otherwise.
        """
        try:
            with fitz.open(pdf_path) as doc:
                if len(doc) == 0:
                    return True
                first_page_text = doc[0].get_text().strip()
                return len(first_page_text) < self.MIN_NATIVE_TEXT_CHARS
        except Exception as e:
            # The exception must be bound here: the log line formats it.
            print(f"[ERROR] PDF scan check failed: {e}")
            return True

    def _pipeline_native_pdf(self, pdf_path: str) -> str:
        """Extract embedded text from every page of a text-based PDF.

        Returns:
            str: Page texts in document order, joined with newlines.
        """
        with fitz.open(pdf_path) as doc:
            return "\n".join(page.get_text() for page in doc)

    def _pipeline_ocr(self, pdf_path: str) -> str:
        """Extract text from a scanned PDF using OCR.

        Steps:
        1. Convert the PDF pages to images with pdf2image.
        2. Run Tesseract OCR on each image.

        Returns:
            str: OCR text of all pages, joined with newlines.
        """
        text_content = []
        # Only the pdf2image module itself is imported by this notebook, so
        # the converter has to be reached through the module.
        page_images = pdf2image.convert_from_path(pdf_path)

        for idx, page_image in enumerate(page_images, start=1):
            print(f"[LOG] OCR processing page {idx}")
            text_content.append(pytesseract.image_to_string(page_image))
        return "\n".join(text_content)

    def _parse_docx(self, pdf_path) -> str:
        """Extract paragraph and table text from a Word document.

        Non-blank paragraphs are emitted first, one per line. Each table row
        then follows as one line whose non-empty cells are joined by " | ".

        Args:
            pdf_path: Path to the Word document. The parameter name is
                historical and kept for backward compatibility.

        Returns:
            str: The collected lines joined with newlines.
        """
        doc = docx.Document(pdf_path)
        content = []

        for para in doc.paragraphs:
            if para.text.strip():
                content.append(para.text)

        for table in doc.tables:
            for row in table.rows:
                row_text = [
                    cell.text.strip() for cell in row.cells if cell.text.strip()
                ]
                if row_text:
                    content.append(" | ".join(row_text))

        return "\n".join(content)

In [10]:
# -------------------------------
# Example usage: parse one resume and print the extracted text
# -------------------------------
if __name__ == "__main__":
    # Machine-specific sample file; point this at a resume on your own disk.
    file_path = r"C:\Start-Up\dishasetu-ai\data\resume\with_table_no_ocr.pdf"

    router = ResumeRouter()
    raw_text = router.route_and_parse(file_path)

    print("\n--- Extracted Text Preview ---\n")
    # The full text is printed; no truncation is applied.
    print(raw_text)

[LOG] Native PDF detected → Text extraction pipeline

--- Extracted Text Preview ---

First Name Last Name
+91-Your Phone Number
Bachelor of Technology
your email
in Chemical Engineering
GitHub
Guru Gobind Singh Indraprastha University, New Delhi
LinkedIn
Education
Year
Degree/Certificate
Institute
CGPA/Percentage
2019-2023
B.Tech Chemical Engineering
Guru Gobind Singh Indraprastha University
7.96
2018
CBSE(XII)
Modern School, New Delhi
90.4%
2016
CBSE(X)
Modern School, New Delhi
9.2
Experience
• Company Name
June 2021 - July 2021
Position
Location
– About work
– About Work
Projects
• Project Name
Mar 2021 - Apr 2021
Course or faculty
Github
– About it
– About it
• Project Name
Mar 2021 - Apr 2021
Course or faculty
Github
– About it
– About it
Technical Skills
• Programming Languages: C/C++, Python
• Tools and Frameworks:Jupyter, Visual Studio & Figma
• Operating Systems:Windows, Linux & Android
Key courses taken
• CSE & Maths:Introduction to C, Fundamentals of Computers, Object Orient