In [1]:
%pip install -U langchain-openai langchain langgraph langsmith azure-core azure-ai-documentintelligence duckdb

Collecting langchain-openai
  Downloading langchain_openai-0.3.31-py3-none-any.whl.metadata (2.4 kB)
Collecting langgraph
  Downloading langgraph-0.6.6-py3-none-any.whl.metadata (6.8 kB)
Collecting langsmith
  Downloading langsmith-0.4.16-py3-none-any.whl.metadata (14 kB)
Collecting azure-core
  Downloading azure_core-1.35.0-py3-none-any.whl.metadata (44 kB)
Collecting duckdb
  Downloading duckdb-1.3.2-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (7.0 kB)
Collecting langchain-core<1.0.0,>=0.3.74 (from langchain-openai)
  Downloading langchain_core-0.3.74-py3-none-any.whl.metadata (5.8 kB)
Collecting openai<2.0.0,>=1.99.9 (from langchain-openai)
  Downloading openai-1.101.0-py3-none-any.whl.metadata (29 kB)
Collecting langgraph-checkpoint<3.0.0,>=2.1.0 (from langgraph)
  Downloading langgraph_checkpoint-2.1.1-py3-none-any.whl.metadata (4.2 kB)
Collecting langgraph-prebuilt<0.7.0,>=0.6.0 (from langgraph)
  Downloading langgraph_prebuilt-0.6.4-py3-none-any.whl.meta

In [9]:
from typing import TypedDict, Annotated, List, Optional
import json
import logging
from pathlib import Path

from azure.core.credentials import AzureKeyCredential
from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.ai.documentintelligence.models import AnalyzeResult
from langchain_openai import AzureChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langgraph.graph import StateGraph, END
from pydantic import BaseModel, Field

In [10]:
# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

In [11]:
# Azure Document Intelligence configuration
DOCUMENT_INTELLIGENCE_KEY = "FM58vRr29iHOMbZsbsG1m63gkwUkAtQt8JpWGFyB53cEAqlX16p4JQQJ99BHACYeBjFXJ3w3AAALACOG4Tyz"
DOCUMENT_INTELLIGENCE_ENDPOINT = "https://tritdocintel.cognitiveservices.azure.com/"

# Initialize Azure Document Intelligence client
document_intelligence_client = DocumentIntelligenceClient(
    endpoint=DOCUMENT_INTELLIGENCE_ENDPOINT,
    credential=AzureKeyCredential(DOCUMENT_INTELLIGENCE_KEY)
)
document_intelligence_client

<azure.ai.documentintelligence._patch.DocumentIntelligenceClient at 0x7349119dd510>

In [12]:
# Khởi tạo LLM với Langchain (sử dụng Azure OpenAI)
llm = AzureChatOpenAI(
    api_key="B1NWhvI61o8yCmsHg4Fa3StgdbPusXhLoUnfRkbYEsNuBzgPbyWmJQQJ99BGACHYHv6XJ3w3AAABACOGHGvM",
    azure_endpoint="https://yvstritopenai.openai.azure.com/",
    deployment_name="gpt-4.1-mini",  # Thay bằng tên deployment thực tế của bạn
    api_version="2025-04-01-preview",
)

In [13]:
# Pydantic Models for Structured Output (same as before)
class ContactInfo(BaseModel):
    """Contact information for the candidate."""
    email: str = Field(description="Candidate's email address", default="")
    phone: str = Field(description="Candidate's phone number", default="")

class EmploymentRecord(BaseModel):
    """Employment history record."""
    position: str = Field(description="Job title/position", default="")
    employer: str = Field(description="Company/employer name", default="")
    start_date: str = Field(description="Start date (YYYY-MM-DD format)", default="")
    end_date: str = Field(description="End date (YYYY-MM-DD or 'Present')", default="")
    summary: str = Field(description="Job description and responsibilities", default="")

class EducationRecord(BaseModel):
    """Education history record."""
    institution: str = Field(description="Educational institution name", default="")
    degree: str = Field(description="Degree or field of study", default="")
    graduation_year: str = Field(description="Graduation year", default="")

class Project(BaseModel):
    """Project information."""
    name: str = Field(description="Project name", default="")
    description: str = Field(description="Project description", default="")

class Employment(BaseModel):
    """Employment information container."""
    history: List[EmploymentRecord] = Field(description="List of employment records", default_factory=list)

class Education(BaseModel):
    """Education information container."""
    history: List[EducationRecord] = Field(description="List of education records", default_factory=list)

class Resume(BaseModel):
    """Complete resume structure following industry standards."""
    name: str = Field(description="Full name of the candidate", default="")
    contact: ContactInfo = Field(description="Contact information", default_factory=ContactInfo)
    employment: Employment = Field(description="Employment history", default_factory=Employment)
    education: Education = Field(description="Education history", default_factory=Education)
    skills: List[str] = Field(description="List of skills and competencies", default_factory=list)
    projects: List[Project] = Field(description="List of notable projects", default_factory=list)

In [14]:
# LangGraph State Definition
class ProcessingState(TypedDict):
    """State container for the CV processing workflow."""
    pdf_path: str
    extracted_text: Annotated[str, "Document Intelligence extracted text"]
    structured_data: Annotated[Optional[dict], "Parsed resume data"]
    error_message: Annotated[Optional[str], "Error information"]
    processing_status: Annotated[str, "Current processing status"]


In [15]:
# Prompt Template with Clear Instructions
EXTRACTION_PROMPT = ChatPromptTemplate.from_template(
    """You are an expert HR data extraction specialist. Extract information from the following resume text and structure it according to the provided schema.

Resume Text:
{resume_text}

Instructions:
- Extract all available information accurately
- Use empty strings for missing text fields
- Use empty arrays for missing list fields
- Format dates as YYYY-MM-DD when possible
- For current positions, use "Present" as end date
- Be precise and avoid hallucinations

Resume text to process:
{resume_text}"""
)

In [16]:
def extract_text_from_pdf(state: ProcessingState) -> ProcessingState:
    """
    Extract text from PDF using Azure Document Intelligence.
    
    Args:
        state: Current processing state
        
    Returns:
        Updated state with extracted text or error information
    """
    try:
        logger.info(f"Starting Azure Document Intelligence extraction for: {state['pdf_path']}")
        
        # Validate file exists
        pdf_path = Path(state["pdf_path"])
        if not pdf_path.exists():
            raise FileNotFoundError(f"PDF file not found: {pdf_path}")
        
        # Read PDF file
        with open(pdf_path, "rb") as f:
            # Use prebuilt-layout model for comprehensive text extraction with layout information
            poller = document_intelligence_client.begin_analyze_document(
                "prebuilt-layout", 
                body=f
            )
            
            # Get the analysis result
            result: AnalyzeResult = poller.result()
        
        # Extract text from the analysis result
        extracted_text = ""
        
        # Method 1: Extract from paragraphs (preserves document structure)
        if result.paragraphs:
            logger.info(f"Found {len(result.paragraphs)} paragraphs")
            # Sort paragraphs by their position in the document
            sorted_paragraphs = sorted(
                result.paragraphs, 
                key=lambda p: (p.spans[0].offset if p.spans else 0)
            )
            
            for paragraph in sorted_paragraphs:
                if paragraph.content:
                    extracted_text += paragraph.content + "\n"
        
        # Method 2: Fallback to pages if no paragraphs found
        elif result.pages:
            logger.info(f"Found {len(result.pages)} pages, extracting from lines")
            for page in result.pages:
                if page.lines:
                    for line in page.lines:
                        extracted_text += line.content + "\n"
        
        # Method 3: Final fallback to raw content if available
        elif hasattr(result, 'content') and result.content:
            extracted_text = result.content
        
        if not extracted_text.strip():
            raise ValueError("No text could be extracted from the PDF using Azure Document Intelligence")
        
        logger.info(f"Successfully extracted {len(extracted_text)} characters using Azure Document Intelligence")
        
        return {
            **state,
            "extracted_text": extracted_text,
            "processing_status": "text_extracted"
        }
        
    except Exception as e:
        error_msg = f"Azure Document Intelligence extraction failed: {str(e)}"
        logger.error(error_msg)
        return {
            **state,
            "error_message": error_msg,
            "processing_status": "ocr_failed"
        }

In [17]:
def structure_resume_data(state: ProcessingState) -> ProcessingState:
    """
    Parse extracted text into structured resume data using LLM.
    
    Args:
        state: Current processing state with extracted text
        
    Returns:
        Updated state with structured data or error information
    """
    try:
        logger.info("Starting LLM-based data structuring")
        
        # Create structured output chain
        structured_llm = llm.with_structured_output(
            Resume)
        
        # Build processing chain
        extraction_chain = EXTRACTION_PROMPT | structured_llm
        
        # Process the extracted text
        structured_resume = extraction_chain.invoke({
            "resume_text": state["extracted_text"]
        })
        
        # Convert to dictionary for JSON serialization
        structured_data = structured_resume.dict()
        
        logger.info("Successfully structured resume data")
        
        return {
            **state,
            "structured_data": structured_data,
            "processing_status": "completed"
        }
        
    except Exception as e:
        error_msg = f"Data structuring failed: {str(e)}"
        logger.error(error_msg)
        return {
            **state,
            "error_message": error_msg,
            "processing_status": "structuring_failed"
        }

In [18]:
def should_continue_processing(state: ProcessingState) -> str:
    """
    Determine the next step in the workflow.
    
    Args:
        state: Current processing state
        
    Returns:
        Next node name or END
    """
    if state.get("error_message"):
        return END
    
    status = state.get("processing_status", "")
    
    if status == "text_extracted":
        return "structure_data"
    elif status in ["completed", "ocr_failed", "structuring_failed"]:
        return END
    else:
        return "extract_text"

In [19]:
# Build LangGraph Workflow
def create_cv_processing_workflow() -> StateGraph:
    """
    Create and configure the CV processing workflow.
    
    Returns:
        Compiled StateGraph for CV processing
    """
    workflow = StateGraph(ProcessingState)
    
    # Add processing nodes
    workflow.add_node("extract_text", extract_text_from_pdf)
    workflow.add_node("structure_data", structure_resume_data)
    
    # Define workflow edges
    workflow.set_entry_point("extract_text")
    workflow.add_conditional_edges(
        "extract_text",
        should_continue_processing,
        {
            "structure_data": "structure_data",
            END: END
        }
    )
    workflow.add_conditional_edges(
        "structure_data",
        should_continue_processing,
        {END: END}
    )
    
    return workflow.compile()

In [20]:
# Main Processing Function
class CVProcessor:
    """Main CV processing class with Azure Document Intelligence workflow management."""
    
    def __init__(self):
        """Initialize the CV processor with workflow."""
        self.workflow = create_cv_processing_workflow()
    
    def process_resume(
        self,
        pdf_path: str,
        output_path: Optional[str] = None
    ) -> dict:
        """
        Process a CV PDF file and extract structured data using Azure Document Intelligence.
        
        Args:
            pdf_path: Path to the PDF file
            output_path: Optional path to save JSON output
            
        Returns:
            Structured resume data as dictionary
            
        Raises:
            ValueError: If processing fails
        """
        # Initialize processing state
        initial_state: ProcessingState = {
            "pdf_path": pdf_path,
            "extracted_text": "",
            "structured_data": None,
            "error_message": None,
            "processing_status": "initialized"
        }
        
        # Execute workflow
        final_state = self.workflow.invoke(initial_state)
        
        # Check for errors
        if final_state.get("error_message"):
            raise ValueError(final_state["error_message"])
        
        structured_data = final_state.get("structured_data", {})
        
        # Save to file if requested
        if output_path:
            output_file = Path(output_path)
            with output_file.open('w', encoding='utf-8') as f:
                json.dump(structured_data, f, ensure_ascii=False, indent=2)
            logger.info(f"Structured data saved to: {output_file}")
        
        return structured_data

## Run

In [21]:
cv_processor = CVProcessor()

In [22]:
input_pdf = "10265057.pdf"
output_json = "structured_resume.json"

In [23]:
result = cv_processor.process_resume(input_pdf, output_json)

INFO:__main__:Starting Azure Document Intelligence extraction for: 10265057.pdf
INFO:azure.core.pipeline.policies.http_logging_policy:Request URL: 'https://tritdocintel.cognitiveservices.azure.com//documentintelligence/documentModels/prebuilt-layout:analyze?api-version=REDACTED'
Request method: 'POST'
Request headers:
    'content-type': 'application/octet-stream'
    'Accept': 'application/json'
    'x-ms-client-request-id': '0d41939c-80f0-11f0-8a88-00155db623cc'
    'User-Agent': 'azsdk-python-ai-documentintelligence/1.0.2 Python/3.11.13 (Linux-6.6.87.2-microsoft-standard-WSL2-x86_64-with-glibc2.39)'
    'Ocp-Apim-Subscription-Key': 'REDACTED'
A body is sent with the request
INFO:azure.core.pipeline.policies.http_logging_policy:Response status: 202
Response headers:
    'Content-Length': '0'
    'Operation-Location': 'REDACTED'
    'x-envoy-upstream-service-time': 'REDACTED'
    'apim-request-id': 'REDACTED'
    'Strict-Transport-Security': 'REDACTED'
    'x-content-type-options': 'R

# DuckDB Snowflake Schema and Import System for Resume Data

In [24]:
import duckdb
import json
from typing import Dict, List, Optional
from pathlib import Path
import logging

In [25]:
# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

In [26]:
class ResumeDatabase:
    """Handles DuckDB operations for resume data with snowflake schema."""
    
    def __init__(self, database_path: str = "resume_database.db"):
        """
        Initialize the resume database.
        
        Args:
            database_path: Path to DuckDB file (use ':memory:' for in-memory)
        """
        self.conn = duckdb.connect(database=database_path)
        self.create_snowflake_schema()
    
    def create_snowflake_schema(self):
        """Create the snowflake schema for resume data."""
        
        # Snowflake schema DDL
        schema_ddl = """
        -- Core candidate information (dimension table)
        CREATE TABLE IF NOT EXISTS candidates (
            candidate_id INTEGER PRIMARY KEY,
            name VARCHAR NOT NULL,
            email VARCHAR,
            phone VARCHAR,
            created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
        );
        
        -- Work experience fact table
        CREATE TABLE IF NOT EXISTS work_experience (
            work_exp_id INTEGER PRIMARY KEY,
            candidate_id INTEGER NOT NULL,
            job_title VARCHAR,
            company VARCHAR,
            start_date DATE,
            end_date DATE,
            description TEXT,
            created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            FOREIGN KEY (candidate_id) REFERENCES candidates(candidate_id)
        );
        
        -- Education fact table
        CREATE TABLE IF NOT EXISTS education (
            education_id INTEGER PRIMARY KEY,
            candidate_id INTEGER NOT NULL,
            degree VARCHAR,
            institution VARCHAR,
            graduation_year VARCHAR,
            created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            FOREIGN KEY (candidate_id) REFERENCES candidates(candidate_id)
        );
        
        -- Skills dimension table (normalized)
        CREATE TABLE IF NOT EXISTS skills_master (
            skill_id INTEGER PRIMARY KEY,
            skill_name VARCHAR UNIQUE NOT NULL
        );
        
        -- Candidate-Skills relationship table (many-to-many)
        CREATE TABLE IF NOT EXISTS candidate_skills (
            candidate_id INTEGER,
            skill_id INTEGER,
            created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            PRIMARY KEY (candidate_id, skill_id),
            FOREIGN KEY (candidate_id) REFERENCES candidates(candidate_id),
            FOREIGN KEY (skill_id) REFERENCES skills_master(skill_id)
        );
        
        -- Projects fact table
        CREATE TABLE IF NOT EXISTS projects (
            project_id INTEGER PRIMARY KEY,
            candidate_id INTEGER NOT NULL,
            project_name VARCHAR,
            description TEXT,
            created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            FOREIGN KEY (candidate_id) REFERENCES candidates(candidate_id)
        );
        
        -- Create sequences for primary keys
        CREATE SEQUENCE IF NOT EXISTS candidate_seq START 1;
        CREATE SEQUENCE IF NOT EXISTS work_exp_seq START 1;
        CREATE SEQUENCE IF NOT EXISTS education_seq START 1;
        CREATE SEQUENCE IF NOT EXISTS skill_seq START 1;
        CREATE SEQUENCE IF NOT EXISTS project_seq START 1;
        """
        
        # Execute schema creation
        statements = schema_ddl.split(';')
        for statement in statements:
            if statement.strip():
                self.conn.execute(statement)
        
        logger.info("Snowflake schema created successfully")
    
    def import_resume_data(self, structured_resume: Dict) -> int:
        """
        Import structured resume JSON data into the snowflake schema.
        
        Args:
            structured_resume: Dictionary containing structured resume data
            
        Returns:
            candidate_id of the inserted candidate
        """
        try:
            # Begin transaction
            self.conn.begin()
            
            # 1. Insert candidate (main dimension)
            candidate_id = self.conn.execute("SELECT nextval('candidate_seq')").fetchone()[0]
            
            candidate_data = {
                'candidate_id': candidate_id,
                'name': structured_resume.get('name', ''),
                'email': structured_resume.get('contact', {}).get('email', ''),
                'phone': structured_resume.get('contact', {}).get('phone', '')
            }
            
            self.conn.execute(
                """INSERT INTO candidates (candidate_id, name, email, phone) 
                   VALUES ($candidate_id, $name, $email, $phone)""",
                candidate_data
            )
            
            # 2. Insert work experience
            employment_history = structured_resume.get('employment', {}).get('history', [])
            for work in employment_history:
                work_exp_id = self.conn.execute("SELECT nextval('work_exp_seq')").fetchone()[0]
                
                # Parse dates properly
                start_date = self._parse_date(work.get('start', ''))
                end_date = self._parse_date(work.get('end', '')) if work.get('end', '').lower() != 'present' else None
                
                work_data = {
                    'work_exp_id': work_exp_id,
                    'candidate_id': candidate_id,
                    'job_title': work.get('position', ''),
                    'company': work.get('employer', ''),
                    'start_date': start_date,
                    'end_date': end_date,
                    'description': work.get('summary', '')
                }
                
                self.conn.execute(
                    """INSERT INTO work_experience 
                       (work_exp_id, candidate_id, job_title, company, start_date, end_date, description)
                       VALUES ($work_exp_id, $candidate_id, $job_title, $company, $start_date, $end_date, $description)""",
                    work_data
                )
            
            # 3. Insert education
            education_history = structured_resume.get('education', {}).get('history', [])
            for edu in education_history:
                education_id = self.conn.execute("SELECT nextval('education_seq')").fetchone()[0]
                
                edu_data = {
                    'education_id': education_id,
                    'candidate_id': candidate_id,
                    'degree': edu.get('area', ''),  # Using "area" as degree field
                    'institution': edu.get('institution', ''),
                    'graduation_year': edu.get('end', '')
                }
                
                self.conn.execute(
                    """INSERT INTO education (education_id, candidate_id, degree, institution, graduation_year)
                       VALUES ($education_id, $candidate_id, $degree, $institution, $graduation_year)""",
                    edu_data
                )
            
            # 4. Insert skills (normalized approach)
            skills = structured_resume.get('skills', [])
            for skill_name in skills:
                if skill_name:  # Skip empty skills
                    # Check if skill exists, if not create it
                    existing_skill = self.conn.execute(
                        "SELECT skill_id FROM skills_master WHERE skill_name = ?", [skill_name]
                    ).fetchone()
                    
                    if existing_skill:
                        skill_id = existing_skill[0]
                    else:
                        skill_id = self.conn.execute("SELECT nextval('skill_seq')").fetchone()[0]
                        self.conn.execute(
                            "INSERT INTO skills_master (skill_id, skill_name) VALUES (?, ?)",
                            [skill_id, skill_name]
                        )
                    
                    # Link candidate to skill
                    self.conn.execute(
                        "INSERT OR IGNORE INTO candidate_skills (candidate_id, skill_id) VALUES (?, ?)",
                        [candidate_id, skill_id]
                    )
            
            # 5. Insert projects
            projects = structured_resume.get('projects', [])
            for project in projects:
                project_id = self.conn.execute("SELECT nextval('project_seq')").fetchone()[0]
                
                project_data = {
                    'project_id': project_id,
                    'candidate_id': candidate_id,
                    'project_name': project.get('name', ''),
                    'description': project.get('description', '')
                }
                
                self.conn.execute(
                    """INSERT INTO projects (project_id, candidate_id, project_name, description)
                       VALUES ($project_id, $candidate_id, $project_name, $description)""",
                    project_data
                )
            
            # Commit transaction
            self.conn.commit()
            logger.info(f"Successfully imported resume data for candidate_id: {candidate_id}")
            return candidate_id
            
        except Exception as e:
            # Rollback on error
            self.conn.rollback()
            logger.error(f"Error importing resume data: {e}")
            raise
    
    def _parse_date(self, date_string: str) -> Optional[str]:
        """Parse date string to proper format."""
        if not date_string or date_string.lower() == 'present':
            return None
        
        # Simple date parsing - can be enhanced with dateutil
        try:
            # Assume format YYYY-MM-DD
            if len(date_string) == 10 and '-' in date_string:
                return date_string
            # Handle year-only format
            elif len(date_string) == 4 and date_string.isdigit():
                return f"{date_string}-01-01"
            else:
                return date_string
        except:
            return date_string
    
    def get_candidate_summary(self, candidate_id: int) -> Dict:
        """Get complete candidate information with joins."""
        
        query = """
        SELECT 
            c.candidate_id,
            c.name,
            c.email,
            c.phone,
            -- Work experience
            we.job_title,
            we.company,
            we.start_date,
            we.end_date,
            -- Education
            e.degree,
            e.institution,
            e.graduation_year,
            -- Skills (aggregated)
            GROUP_CONCAT(sm.skill_name) as skills,
            -- Projects
            p.project_name,
            p.description as project_description
        FROM candidates c
        LEFT JOIN work_experience we ON c.candidate_id = we.candidate_id
        LEFT JOIN education e ON c.candidate_id = e.candidate_id
        LEFT JOIN candidate_skills cs ON c.candidate_id = cs.candidate_id
        LEFT JOIN skills_master sm ON cs.skill_id = sm.skill_id
        LEFT JOIN projects p ON c.candidate_id = p.candidate_id
        WHERE c.candidate_id = ?
        GROUP BY c.candidate_id, c.name, c.email, c.phone, we.job_title, we.company, 
                 we.start_date, we.end_date, e.degree, e.institution, e.graduation_year,
                 p.project_name, p.description
        """
        
        result = self.conn.execute(query, [candidate_id]).fetchall()
        return result
    
    def search_candidates_by_skill(self, skill_name: str) -> List:
        """Search candidates by skill."""
        
        query = """
        SELECT DISTINCT c.candidate_id, c.name, c.email, c.phone
        FROM candidates c
        JOIN candidate_skills cs ON c.candidate_id = cs.candidate_id
        JOIN skills_master sm ON cs.skill_id = sm.skill_id
        WHERE LOWER(sm.skill_name) LIKE LOWER(?)
        """
        
        return self.conn.execute(query, [f"%{skill_name}%"]).fetchall()
    
    def get_database_stats(self) -> Dict:
        """Get database statistics."""
        
        stats = {
            'total_candidates': self.conn.execute("SELECT COUNT(*) FROM candidates").fetchone()[0],
            'total_work_experiences': self.conn.execute("SELECT COUNT(*) FROM work_experience").fetchone()[0],
            'total_educations': self.conn.execute("SELECT COUNT(*) FROM education").fetchone()[0],
            'total_skills': self.conn.execute("SELECT COUNT(*) FROM skills_master").fetchone()[0],
            'total_projects': self.conn.execute("SELECT COUNT(*) FROM projects").fetchone()[0]
        }
        
        return stats
    
    def close(self):
        """Close database connection."""
        self.conn.close()

In [27]:
# Integration with CV Processing System
def integrate_with_cv_processor():
    """Extended CV processor with database integration."""
    
    # Initialize database
    resume_db = ResumeDatabase("hr_resume_system.db")
    
    # Sample usage - this would integrate with your CV processor
    def process_and_store_resume(pdf_path: str) -> int:
        """Process resume PDF and store in database."""
        
        # This would use your existing CVProcessor
        cv_processor = CVProcessor()
        structured_data = cv_processor.process_resume(pdf_path)
        candidate_id = resume_db.import_resume_data(structured_data)
        return candidate_id
    return resume_db, process_and_store_resume

In [28]:
resume_db, process_resume_func = integrate_with_cv_processor()

INFO:__main__:Snowflake schema created successfully


In [29]:
candidate_id = process_resume_func("10265057.pdf")

INFO:__main__:Starting Azure Document Intelligence extraction for: 10265057.pdf
INFO:azure.core.pipeline.policies.http_logging_policy:Request URL: 'https://tritdocintel.cognitiveservices.azure.com//documentintelligence/documentModels/prebuilt-layout:analyze?api-version=REDACTED'
Request method: 'POST'
Request headers:
    'content-type': 'application/octet-stream'
    'Accept': 'application/json'
    'x-ms-client-request-id': '18a45c10-80f0-11f0-8a88-00155db623cc'
    'User-Agent': 'azsdk-python-ai-documentintelligence/1.0.2 Python/3.11.13 (Linux-6.6.87.2-microsoft-standard-WSL2-x86_64-with-glibc2.39)'
    'Ocp-Apim-Subscription-Key': 'REDACTED'
A body is sent with the request


INFO:azure.core.pipeline.policies.http_logging_policy:Response status: 202
Response headers:
    'Content-Length': '0'
    'Operation-Location': 'REDACTED'
    'x-envoy-upstream-service-time': 'REDACTED'
    'apim-request-id': 'REDACTED'
    'Strict-Transport-Security': 'REDACTED'
    'x-content-type-options': 'REDACTED'
    'x-ms-region': 'REDACTED'
    'Date': 'Sun, 24 Aug 2025 13:41:55 GMT'
INFO:azure.core.pipeline.policies.http_logging_policy:Request URL: 'https://tritdocintel.cognitiveservices.azure.com/documentintelligence/documentModels/prebuilt-layout/analyzeResults/09dc1e1e-0b82-4dda-af8a-42e3a870c85b?api-version=REDACTED'
Request method: 'GET'
Request headers:
    'x-ms-client-request-id': '18a45c10-80f0-11f0-8a88-00155db623cc'
    'User-Agent': 'azsdk-python-ai-documentintelligence/1.0.2 Python/3.11.13 (Linux-6.6.87.2-microsoft-standard-WSL2-x86_64-with-glibc2.39)'
    'Ocp-Apim-Subscription-Key': 'REDACTED'
No body was attached to the request
INFO:azure.core.pipeline.polici

In [30]:
print(f"✅ Resume imported with candidate_id: {candidate_id}")

✅ Resume imported with candidate_id: 1


In [31]:
summary = resume_db.get_candidate_summary(candidate_id)
print(f"📊 Candidate Summary: {summary}")

📊 Candidate Summary: [(1, '', '', '', 'System Data Analyst', 'Company Name', None, None, '', 'PURDUE UNIVERSITY', '', 'big data,C,C++,charts,Circuit design,hardware,Data acquisition,data analyst,data collection,data mining,databases,database,dBm,DTI,design software,documentation,functional,GSM,innovation,Java,LabView,Team leader,Logic Analyzer,Mac,manufacturing processes,Matlab,Excel,Microsoft office,Office,Microwave,Radar,NCs,Network,dB,packaging,pivot tables,Programming,project design,proposals,Publication,Python,quality,requirement,research,SAS,self-starter,Spectrum analyzer,SPSS,SQL,SSL,statistics,surveys,system design,troubleshooting,validation,big data,C,C++,charts,Circuit design,hardware,Data acquisition,data analyst,data collection,data mining,databases,database,dBm,DTI,design software,documentation,functional,GSM,innovation,Java,LabView,Team leader,Logic Analyzer,Mac,manufacturing processes,Matlab,Excel,Microsoft office,Office,Microwave,Radar,NCs,Network,dB,packaging,pivot tab

In [34]:
stats = resume_db.get_database_stats()
print(f"📈 Database Stats: {stats}")

ConnectionException: Connection Error: Connection already closed!

In [33]:
resume_db.close()