In [1]:
import os
import json
from pathlib import Path
from docling.datamodel.base_models import InputFormat
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.datamodel.pipeline_options import PdfPipelineOptions, EasyOcrOptions
from langchain.schema import Document
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.prompts import PromptTemplate
from dotenv import load_dotenv
import getpass


  from .autonotebook import tqdm as notebook_tqdm
2025-10-24 19:27:54.535820: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [None]:
load_dotenv()

# Set up Google API Key
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
if not GOOGLE_API_KEY:
    GOOGLE_API_KEY = getpass.getpass("Enter your Google API key: ")
    os.environ["GOOGLE_API_KEY"] = GOOGLE_API_KEY

print("✅ Environment setup complete")

def initialize_docling_converter():
    """Initialize and return the Docling DocumentConverter with minimal options for maximum speed."""
    doc_converter = DocumentConverter()
    return doc_converter

def validate_and_format_cv_structure(raw_text):
    """
    Uses LLM to validate and format CV structure with proper headers.
    """
    STANDARD_CV_HEADERS = [
        "Personal Information",
        "Professional Summary",
        "Work Experience", 
        "Education",
        "Skills",
        "Projects",
        "Certifications",
        "Languages",
        "References"
    ]
    
    prompt_template = """
    You are a CV/Resume formatting expert. Your task is to analyze the extracted CV text and ensure it is properly structured with appropriate headers.

    STANDARD CV HEADERS:
    {standard_headers}

    EXTRACTED CV TEXT:
    {raw_text}

    INSTRUCTIONS:
    1. Analyze the existing structure and headers in the text
    2. If the text is already well-structured with clear headers that match the standard, return it as-is
    3. If headers are missing, poorly formatted, or non-standard:
       - Reorganize the content under appropriate standard headers
       - Preserve ALL original information
       - Add missing headers if content exists for them
       - Use markdown formatting with ## for headers
    4. Ensure the structured document flows logically

    Return ONLY the properly formatted CV text with appropriate headers.
    """
    
    try:
        print("🔧 Validating CV structure with LLM...")
        
        # Initialize LLM for structuring
        llm = ChatGoogleGenerativeAI(
            model="gemini-2.5-flash",
            temperature=0.1,
            convert_system_message_to_human=True
        )
        
        prompt = PromptTemplate(
            template=prompt_template,
            input_variables=["raw_text", "standard_headers"]
        )
        
        formatted_prompt = prompt.invoke({
            "raw_text": raw_text,
            "standard_headers": ", ".join(STANDARD_CV_HEADERS)
        })
        
        response = llm.invoke(formatted_prompt)
        print("✅ CV structure validation completed")
        return response.content
        
    except Exception as e:
        print(f"❌ LLM structuring failed: {e}")
        print("🔄 Using basic header formatting as fallback...")
        return add_basic_headers_fallback(raw_text)

def add_basic_headers_fallback(raw_text):
    """
    Basic fallback method that adds simple headers without LLM.
    """
    lines = raw_text.split('\n')
    structured_lines = []
    
    for line in lines:
        line = line.strip()
        if not line:
            continue
            
        lower_line = line.lower()
        if any(keyword in lower_line for keyword in ['experience', 'work', 'job', 'employment']):
            if not line.startswith('## '):
                structured_lines.append('## Work Experience')
        elif any(keyword in lower_line for keyword in ['education', 'degree', 'university', 'college']):
            if not line.startswith('## '):
                structured_lines.append('## Education')
        elif any(keyword in lower_line for keyword in ['skill', 'technical', 'programming']):
            if not line.startswith('## '):
                structured_lines.append('## Skills')
        elif any(keyword in lower_word for keyword in ['project', 'portfolio'] for lower_word in lower_line.split()):
            if not line.startswith('## '):
                structured_lines.append('## Projects')
        elif any(keyword in lower_line for keyword in ['certificat', 'license']):
            if not line.startswith('## '):
                structured_lines.append('## Certifications')
        elif any(keyword in lower_line for keyword in ['name', 'address', 'phone', 'email', 'contact']):
            if not line.startswith('## '):
                structured_lines.append('## Personal Information')
        
        structured_lines.append(line)
    
    return '\n'.join(structured_lines)

def demonstrate_cv_processing(pdf_path):
    """
    Demonstrate the CV processing pipeline up to document creation.
    """
    print("=" * 80)
    print("🎯 CV PROCESSING PIPELINE (UP TO DOCUMENT CREATION)")
    print("=" * 80)
    
    # Step 1: Docling Extraction
    print("\n1️⃣ STEP 1: DOCLING EXTRACTION")
    print("-" * 50)
    
    doc_converter = initialize_docling_converter()
    result = doc_converter.convert(pdf_path)
    raw_text = result.document.export_to_markdown()
    
    print(f"📊 Extracted {len(raw_text)} characters")
    print(f"📝 Sample of extracted text:")
    print("─" * 40)
    print(raw_text[:800] + "..." if len(raw_text) > 800 else raw_text)
    print("─" * 40)
    
    # Step 2: LLM Structure Validation
    print("\n2️⃣ STEP 2: LLM STRUCTURE VALIDATION & FORMATTING")
    print("-" * 50)
    
    structured_text = validate_and_format_cv_structure(raw_text)
    
    print(f"📊 After structuring: {len(structured_text)} characters")
    print(f"📝 Structured text preview:")
    print("─" * 40)
    print(structured_text[:1000] + "..." if len(structured_text) > 1000 else structured_text)
    print("─" * 40)
    
    # Step 3: Document Creation
    print("\n3️⃣ STEP 3: DOCUMENT CREATION")
    print("-" * 50)
    
    doc = Document(
        page_content=structured_text,
        metadata={
            "source": pdf_path,
            "page": 1,
            "processed": "structured"
        }
    )
    
    print(f"✅ Created LangChain Document")
    print(f"📏 Document length: {len(doc.page_content)} characters")
    print(f"📋 Metadata: {doc.metadata}")
    print("─" * 40)
    
    # Return all intermediate results
    return {
        'raw_text': raw_text,
        'structured_text': structured_text,
        'document': doc
    }

def show_detailed_comparison(pdf_path):
    """
    Show detailed before/after comparison with header analysis.
    """
    print("\n" + "=" * 80)
    print("🔄 DETAILED BEFORE vs AFTER COMPARISON")
    print("=" * 80)
    
    # Process the CV
    results = demonstrate_cv_processing(pdf_path)
    raw_text = results['raw_text']
    structured_text = results['structured_text']
    
    # Header Analysis
    print("\n4️⃣ HEADER ANALYSIS")
    print("-" * 50)
    
    # Find headers in structured text
    headers = [line.strip() for line in structured_text.split('\n') if line.startswith('## ')]
    
    print(f"📑 Headers found in structured CV: {len(headers)}")
    for i, header in enumerate(headers, 1):
        print(f"   {i}. {header}")
    
    # Content under each header
    print(f"\n📊 Content distribution:")
    sections = structured_text.split('## ')
    for section in sections[1:]:  # Skip first empty section
        if '\n' in section:
            header = section.split('\n')[0].strip()
            content = '\n'.join(section.split('\n')[1:]).strip()
            content_preview = content[:100] + "..." if len(content) > 100 else content
            print(f"   🎯 {header}: {len(content)} chars → '{content_preview}'")
    
    return results

✅ Environment setup complete


In [6]:
# Cell 1: Process a single CV and see the transformation
print("Processing CV through the pipeline...")
pdf_path = "/home/nurshed/Desktop/python/project/RAG Study/rag_chatbot_v5/resume1.pdf" 
results = demonstrate_cv_processing(pdf_path)

2025-10-24 19:29:51,008 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-10-24 19:29:51,012 - INFO - Going to convert document batch...
2025-10-24 19:29:51,013 - INFO - Initializing pipeline for StandardPdfPipeline with options hash 4f2edc0f7d9bb60b38ebfecf9a2609f5
2025-10-24 19:29:51,015 - INFO - Accelerator device: 'cpu'
[32m[INFO] 2025-10-24 19:29:51,060 [RapidOCR] base.py:22: Using engine_name: onnxruntime[0m
[32m[INFO] 2025-10-24 19:29:51,082 [RapidOCR] download_file.py:60: File exists and is valid: /home/nurshed/Desktop/python/venv/base/lib/python3.12/site-packages/rapidocr/models/ch_PP-OCRv4_det_infer.onnx[0m
[32m[INFO] 2025-10-24 19:29:51,084 [RapidOCR] main.py:53: Using /home/nurshed/Desktop/python/venv/base/lib/python3.12/site-packages/rapidocr/models/ch_PP-OCRv4_det_infer.onnx[0m


Processing CV through the pipeline...
🎯 CV PROCESSING PIPELINE (UP TO DOCUMENT CREATION)

1️⃣ STEP 1: DOCLING EXTRACTION
--------------------------------------------------


[32m[INFO] 2025-10-24 19:29:51,249 [RapidOCR] base.py:22: Using engine_name: onnxruntime[0m
[32m[INFO] 2025-10-24 19:29:51,256 [RapidOCR] download_file.py:60: File exists and is valid: /home/nurshed/Desktop/python/venv/base/lib/python3.12/site-packages/rapidocr/models/ch_ppocr_mobile_v2.0_cls_infer.onnx[0m
[32m[INFO] 2025-10-24 19:29:51,257 [RapidOCR] main.py:53: Using /home/nurshed/Desktop/python/venv/base/lib/python3.12/site-packages/rapidocr/models/ch_ppocr_mobile_v2.0_cls_infer.onnx[0m
[32m[INFO] 2025-10-24 19:29:51,355 [RapidOCR] base.py:22: Using engine_name: onnxruntime[0m
[32m[INFO] 2025-10-24 19:29:51,415 [RapidOCR] download_file.py:60: File exists and is valid: /home/nurshed/Desktop/python/venv/base/lib/python3.12/site-packages/rapidocr/models/ch_PP-OCRv4_rec_infer.onnx[0m
[32m[INFO] 2025-10-24 19:29:51,417 [RapidOCR] main.py:53: Using /home/nurshed/Desktop/python/venv/base/lib/python3.12/site-packages/rapidocr/models/ch_PP-OCRv4_rec_infer.onnx[0m
2025-10-24 19:29

📊 Extracted 2139 characters
📝 Sample of extracted text:
────────────────────────────────────────
## IM A. SAMPLE I

1234 North 55 Street Bellevue, Nebraska 68005 (402) 292-2345

imasample1@xxx.com

## SUMMARY OF QUALIFICATIONS

Exceptionally well organized and resourceful Professional with more than six years experience and a solid academic background in accounting and financial management; excellent analytical and problem solving skills; able to handle multiple projects while producing high quality work in a fast-paced, deadline-oriented environment.

## EDUCATION

Bachelor of Science

, Bellevue University, Bellevue, NE (In Progress)

Major:  Accounting

Minor:  Computer Information Systems

Expected Graduation Date:  January, 20xx

GPA to date:  3.95/4.00

## PROFESSIONAL ACCOMPLISHMENTS

## Accounting and Financial Management

-  Developed and maintained accounting records for up...
────────────────────────────────────────

2️⃣ STEP 2: LLM STRUCTURE VALIDATION & FORMATTING
-------

In [4]:
final_doc = detailed_results['document']
print(final_doc.page_content)

NameError: name 'detailed_results' is not defined