In [1]:
%pip install fastapi uvicorn python-multipart

Defaulting to user installation because normal site-packages is not writeable
Collecting fastapi
  Downloading fastapi-0.122.0-py3-none-any.whl.metadata (30 kB)
Collecting uvicorn
  Downloading uvicorn-0.38.0-py3-none-any.whl.metadata (6.8 kB)
Collecting python-multipart
  Downloading python_multipart-0.0.20-py3-none-any.whl.metadata (1.8 kB)
Collecting starlette<0.51.0,>=0.40.0 (from fastapi)
  Downloading starlette-0.50.0-py3-none-any.whl.metadata (6.3 kB)
Collecting annotated-doc>=0.0.2 (from fastapi)
  Downloading annotated_doc-0.0.4-py3-none-any.whl.metadata (6.6 kB)
Collecting click>=7.0 (from uvicorn)
  Downloading click-8.3.1-py3-none-any.whl.metadata (2.6 kB)
Downloading fastapi-0.122.0-py3-none-any.whl (110 kB)
Downloading starlette-0.50.0-py3-none-any.whl (74 kB)
Downloading uvicorn-0.38.0-py3-none-any.whl (68 kB)
Downloading python_multipart-0.0.20-py3-none-any.whl (24 kB)
Downloading annotated_doc-0.0.4-py3-none-any.whl (5.3 kB)
Downloading click-8.3.1-py3-none-any.whl (10

In [None]:
import re
import pymupdf
import pytesseract
from pdf2image import convert_from_path

In [1]:
# Page-marker shapes recognized by clean_extracted_text. Each one is applied
# with fullmatch, so it has to account for the entire (stripped) line.
_PAGE_MARKER_PATTERNS = tuple(
    re.compile(pattern, re.IGNORECASE)
    for pattern in (
        r'page\s+\d+(?:\s+of\s+\d+)?',  # "Page 3", "Page 3 of 10"
        r'página\s+\d+',                # "Página 3"
        r'\d+',                         # a bare number alone on its line
        r'page\s*\|\s*\d+',             # "Page | 3"
        r'\d+\s*/\s*\d+',               # "3/10", "3 / 10"
        r'\d+\s+of\s+\d+',              # "3 of 10"
    )
)


def clean_extracted_text(text):
    """Normalize extracted text and drop lines that are only page markers.

    Each line is stripped; blank lines are removed; internal runs of
    whitespace are collapsed to a single space. A line is discarded as a page
    marker only when the *whole* line matches one of the marker patterns.
    Lines that merely start with something marker-like (for example a date
    range such as "01/2020 - 12/2021 Software Engineer") are kept.

    Args:
        text: Raw text, possibly multi-line. Falsy values (None, "") are
            accepted.

    Returns:
        The cleaned text with lines joined by a single newline, or "" when
        ``text`` is falsy.
    """
    if not text:
        return ""

    cleaned_lines = []
    for raw_line in text.split('\n'):
        line = raw_line.strip()
        if not line:
            continue
        # fullmatch (not match): a prefix hit must not delete real content.
        if any(marker.fullmatch(line) for marker in _PAGE_MARKER_PATTERNS):
            continue
        cleaned_lines.append(re.sub(r'\s+', ' ', line))

    # Every kept line is non-empty and already stripped, so the joined text
    # cannot contain blank-line runs or leading/trailing whitespace.
    return '\n'.join(cleaned_lines)

In [2]:
def _native_page_text(page):
    """Return a page's embedded text, blocks ordered top-to-bottom then left-to-right."""
    # Per the PyMuPDF docs each block is
    # (x0, y0, x1, y1, text, block_no, block_type).
    ordered_blocks = sorted(page.get_text("blocks"), key=lambda b: (b[1], b[0]))
    block_texts = []
    for block in ordered_blocks:
        # A non-zero block_type marks an image block whose "text" is only a
        # metadata placeholder; counting it could wrongly suppress OCR.
        if len(block) > 6 and block[6] != 0:
            continue
        content = block[4].strip()
        if content:
            block_texts.append(content)
    return '\n'.join(block_texts)


def _ocr_page_text(pdf_path, page_number, dpi, lang):
    """Rasterize one page (1-based ``page_number``) and return its OCR text."""
    images = convert_from_path(
        pdf_path, dpi=dpi, first_page=page_number, last_page=page_number
    )
    if not images:
        return ""
    return pytesseract.image_to_string(images[0], lang=lang)


def extract_text_hybrid_fixed(pdf_path, dpi=300, lang="eng", min_char=50):
    """Extract text from a PDF, using OCR for pages with little embedded text.

    For every page the embedded text is read first. When it is shorter than
    ``min_char`` characters the page is rasterized and passed to Tesseract.
    Pages are rasterized one at a time and only when OCR is actually needed.
    If OCR fails (for example because poppler or tesseract is missing) it is
    disabled for the remaining pages and extraction continues with embedded
    text only. When OCR yields nothing for a page, whatever short embedded
    text the page had is kept instead of being dropped.

    Args:
        pdf_path: Path to the PDF file.
        dpi: Resolution used when rasterizing a page for OCR.
        lang: Language code passed to ``pytesseract.image_to_string``.
        min_char: Minimum length of a page's embedded text for it to be used
            without attempting OCR.

    Returns:
        The combined text after ``clean_extracted_text``, or "" if the
        document could not be processed (the error is printed).
    """
    try:
        page_texts = []
        ocr_available = True
        # Context manager closes the document even if a page raises.
        with pymupdf.open(pdf_path) as doc:
            for page_index, page in enumerate(doc):
                native_text = _native_page_text(page)
                if len(native_text.strip()) >= min_char:
                    page_texts.append(native_text)
                    continue

                ocr_text = ""
                if ocr_available:
                    try:
                        # pdf2image page numbers are 1-based.
                        ocr_text = _ocr_page_text(pdf_path, page_index + 1, dpi, lang)
                    except Exception as ocr_error:
                        # Best-effort fallback: keep going without OCR.
                        print(f"OCR unavailable, continuing without it: {ocr_error}")
                        ocr_available = False

                chosen_text = ocr_text if ocr_text.strip() else native_text
                if chosen_text.strip():
                    page_texts.append(chosen_text)

        return clean_extracted_text("\n\n".join(page_texts))
    except Exception as e:
        print(f"Error processing PDF: {e}")
        return ""

In [None]:
def categorize_resume_text(text):
    """Return the categorization result structure for a resume's text.

    Args:
        text: Resume text. Falsy values (None, "") are treated as missing.

    Returns:
        ``{"error": "No text"}`` when ``text`` is falsy; otherwise a dict with
        the keys ``"categorized_sections"`` and ``"extracted_contacts"``, both
        currently empty dicts.

    NOTE(review): the keyword table below is built but never consulted, so no
    categorization happens yet. This looks like a placeholder; confirm the
    intended matching logic before relying on the output.
    """
    if not text:
        return {"error": "No text"}

    # Candidate heading keywords per resume section (not used yet).
    section_keywords = dict(
        contact_info=['email', 'phone', 'address', 'linkedin', 'github', 'contact'],
        summary=['summary', 'objective', 'profile', 'about', 'overview'],
        experience=['experience', 'employment', 'work history', 'work experience'],
        education=['education', 'academic', 'degree', 'university'],
        skills=['skills', 'technical skills', 'competencies', 'technologies'],
        projects=['projects', 'portfolio', 'achievements'],
        certifications=['certifications', 'certificates', 'awards'],
    )

    result = {"categorized_sections": {}, "extracted_contacts": {}}
    return result

def get_resume_text(pdf_path: str) -> str:
    """Tool for reading the text of a PDF file.

    Delegates to ``extract_text_hybrid_fixed`` using its default settings.

    Args:
        pdf_path: Path to the PDF file.

    Returns:
        The text returned by ``extract_text_hybrid_fixed`` for ``pdf_path``.
    """
    resume_text = extract_text_hybrid_fixed(pdf_path)
    return resume_text