In [6]:
import base64
import html
import json
import os
import random
import string

import requests
from azure.ai.formrecognizer import DocumentAnalysisClient
from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.indexes.models import ComplexField, SearchableField, SimpleField, SearchIndex, SemanticConfiguration, SemanticField, SemanticPrioritizedFields, SemanticSearch
from dotenv import load_dotenv
from openai import AzureOpenAI

# Run python-dotenv first so the lookups below can see values it provides.
load_dotenv()

# --- Azure AI Search ---------------------------------------------------------
AZURE_SEARCH_SERVICE_NAME = os.getenv('AZURE_SEARCH_SERVICE_NAME')
AZURE_SEARCH_API_KEY = os.getenv('AZURE_SEARCH_API_KEY')
INDEX_NAME = 'resume-index-test'

# --- Azure OpenAI ------------------------------------------------------------
AZURE_OPENAI_PREVIEW_API_VERSION = os.getenv("AZURE_OPENAI_PREVIEW_API_VERSION")
AZURE_OPENAI_API_KEY = os.getenv("AZURE_OPENAI_API_KEY")
AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT")
AZURE_OPENAI_MODEL = os.getenv("AZURE_OPENAI_MODEL")

# --- Azure Document Intelligence ---------------------------------------------
AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT = os.getenv("AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT")
AZURE_DOCUMENT_INTELLIGENCE_KEY = os.getenv("AZURE_DOCUMENT_INTELLIGENCE_KEY")

# Azure OpenAI SDK client configured from the settings above.
client = AzureOpenAI(
    azure_endpoint=AZURE_OPENAI_ENDPOINT,
    api_key=AZURE_OPENAI_API_KEY,
    api_version=AZURE_OPENAI_PREVIEW_API_VERSION,
)

#### Document indexing

In [2]:
def create_search_index(semantic_config):
    """Create or update the ``INDEX_NAME`` index on the Azure AI Search service.

    The index has a string key ``id``, simple string fields ``name`` and
    ``file_name``, and two searchable string fields ``content`` and ``summary``
    that both use the ``th.lucene`` analyzer.

    Args:
        semantic_config: A ``SemanticConfiguration`` that becomes the only
            configuration of the index's semantic search settings.

    Returns:
        None.
    """
    index_client = SearchIndexClient(
        endpoint=f"https://{AZURE_SEARCH_SERVICE_NAME}.search.windows.net",
        credential=AzureKeyCredential(AZURE_SEARCH_API_KEY)
    )

    fields = [
        SimpleField(name="id", type="Edm.String", key=True),
        SimpleField(name="name", type="Edm.String"),
        SearchableField(name="content", type="Edm.String", analyzer_name="th.lucene"),
        SearchableField(name="summary", type="Edm.String", analyzer_name="th.lucene"),
        SimpleField(name="file_name", type="Edm.String")
    ]
    index = SearchIndex(
        name=INDEX_NAME,
        fields=fields,
        semantic_search=SemanticSearch(configurations=[semantic_config]),
    )
    # create_index raises once the index already exists, which made this cell
    # fail on every re-run; create_or_update_index keeps it re-runnable.
    index_client.create_or_update_index(index)

In [3]:
# Semantic configuration handed to create_search_index: "content" is the
# prioritized content field and "summary" the prioritized keywords field.
semantic_config = SemanticConfiguration(
    name="my-semantic-config",
    prioritized_fields=SemanticPrioritizedFields(
        content_fields=[SemanticField(field_name="content")],
        keywords_fields=[SemanticField(field_name="summary")]
    )
)


In [4]:
# Build the search index with the semantic configuration defined in the previous cell.
create_search_index(semantic_config=semantic_config)

In [24]:
def send_openai_completion_request(full_text_extract, deployment="gpt-4o-vision", api_version="2024-02-15-preview", timeout=120):
    """Ask the Azure OpenAI chat-completions REST endpoint for resume keywords.

    Sends a fixed HR-assistant system prompt plus ``full_text_extract`` as the
    user message and returns the raw HTTP response.

    Args:
        full_text_extract: Text sent as the user message.
        deployment: Deployment name placed in the request URL.
        api_version: Value of the ``api-version`` query parameter.
        timeout: Timeout in seconds passed to ``requests.post``.

    Returns:
        The ``requests.Response`` of a successful call; the completion text is
        at ``response.json()['choices'][0]['message']['content']``.

    Raises:
        SystemExit: If the request times out, fails, or returns an error status.
    """
    system_prompt = """You are HR department assistant you job is to help summarize the data from job applicant
            the data you need to extract is the keyword of Job title and Skill only answer with the keyword
            in the form of plain text do not include any markdown
            """

    headers = {
        "Content-Type": "application/json",
        "api-key": AZURE_OPENAI_API_KEY,
    }

    # Construct payload
    payload = {
        "messages": [
            {
                "role": "system",
                "content": [
                    {
                        "type": "text",
                        "text": system_prompt
                    }
                ]
            },
            {
                "role": "user",
                "content": full_text_extract
            },
        ],
        "temperature": 0.7,
        "top_p": 0.95,
        "max_tokens": 1500
    }

    # Strip a trailing slash so an endpoint written as "https://host/" does not
    # yield "https://host//openai/...".
    url = (
        f"{AZURE_OPENAI_ENDPOINT.rstrip('/')}/openai/deployments/{deployment}"
        f"/chat/completions?api-version={api_version}"
    )

    try:
        response = requests.post(url, headers=headers, json=payload, timeout=timeout)
        response.raise_for_status()
    # Timeout is a subclass of RequestException, so it must be caught first;
    # in the opposite order this handler can never run.
    except requests.exceptions.Timeout as e:
        raise SystemExit(f"Request timeout: {e}") from e
    except requests.RequestException as e:
        raise SystemExit(f"Failed to make the request. Error: {e}") from e

    return response

In [7]:
# Maps a paragraph role to the HTML heading tag that extract_pdf_content
# wraps around text with that role.
PDF_HEADERS = {
    "title": "h1",
    "sectionHeading": "h2"
}

# Document Intelligence client used by the PDF-extraction cells below.
endpoint = AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT
credential = AzureKeyCredential(AZURE_DOCUMENT_INTELLIGENCE_KEY)
form_recognizer_client = DocumentAnalysisClient(endpoint, credential)
def table_to_html(table):
    """Render an analyzed table as an HTML ``<table>`` string.

    Args:
        table: Object with ``row_count`` and ``cells``; every cell provides
            ``row_index``, ``column_index``, ``kind``, ``column_span``,
            ``row_span`` and ``content``.

    Returns:
        The HTML markup as one string. Header cells (kind ``columnHeader`` or
        ``rowHeader``) become ``<th>``, all others ``<td>``; cell text is
        HTML-escaped and spans greater than 1 are emitted as
        ``colSpan``/``rowSpan`` attributes.
    """
    # Bucket cells by row in one pass instead of rescanning every cell per row.
    cells_by_row = {}
    for cell in table.cells:
        cells_by_row.setdefault(cell.row_index, []).append(cell)

    parts = ["<table>"]
    # Only rows 0..row_count-1 are rendered; cells outside that range are ignored.
    for row_index in range(table.row_count):
        parts.append("<tr>")
        row_cells = sorted(cells_by_row.get(row_index, []), key=lambda c: c.column_index)
        for cell in row_cells:
            tag = "th" if cell.kind in ("columnHeader", "rowHeader") else "td"
            cell_spans = ""
            if cell.column_span > 1:
                cell_spans += f" colSpan={cell.column_span}"
            if cell.row_span > 1:
                cell_spans += f" rowSpan={cell.row_span}"
            parts.append(f"<{tag}{cell_spans}>{html.escape(cell.content)}</{tag}>")
        parts.append("</tr>")
    parts.append("</table>")
    return "".join(parts)

def extract_pdf_content(file_path: str, form_recognizer_client, use_layout: bool = False) -> str:
    """Analyze a document and return its text with tables and headers as HTML.

    The file is sent to ``form_recognizer_client.begin_analyze_document`` with
    the ``prebuilt-layout`` model when ``use_layout`` is true and
    ``prebuilt-read`` otherwise. For each page, characters covered by a table
    span are replaced by that table's HTML (emitted once per table), and
    paragraphs whose role is a key of ``PDF_HEADERS`` are wrapped in the
    matching heading tag.

    Args:
        file_path: Path of the file to analyze; opened in binary mode.
        form_recognizer_client: Client exposing
            ``begin_analyze_document(model, document=...)``.
        use_layout: Selects the layout model instead of the read model.

    Returns:
        The text of all pages concatenated, each page followed by one space.
    """
    model = "prebuilt-layout" if use_layout else "prebuilt-read"
    with open(file_path, "rb") as f:
        poller = form_recognizer_client.begin_analyze_document(model, document=f)
    form_recognizer_results = poller.result()

    # (if using layout) mark all the positions of headers: content offsets at
    # which a role-bearing paragraph starts and ends (end is exclusive).
    roles_start = {}
    roles_end = {}
    for paragraph in form_recognizer_results.paragraphs:
        if paragraph.role is not None:
            first_span = paragraph.spans[0]
            roles_start[first_span.offset] = paragraph.role
            roles_end[first_span.offset + first_span.length] = paragraph.role

    page_texts = []
    for page_num, page in enumerate(form_recognizer_results.pages):
        # Tables are assigned to the page of their first bounding region (1-based).
        tables_on_page = [
            table for table in form_recognizer_results.tables
            if table.bounding_regions[0].page_number == page_num + 1
        ]

        # (if using layout) mark all positions of the table spans in the page:
        # table_chars[i] is the index of the table covering page character i,
        # or -1 when the character is ordinary text.
        page_offset = page.spans[0].offset
        page_length = page.spans[0].length
        table_chars = [-1] * page_length
        for table_id, table in enumerate(tables_on_page):
            for span in table.spans:
                # Clamp the span to this page so only in-page indices are visited.
                first = max(span.offset - page_offset, 0)
                last = min(span.offset - page_offset + span.length, page_length)
                for idx in range(first, last):
                    table_chars[idx] = table_id

        # Build the page text: table spans become table HTML, header
        # paragraphs get heading tags, everything else is copied verbatim.
        parts = []
        added_tables = set()
        for idx, table_id in enumerate(table_chars):
            if table_id == -1:
                position = page_offset + idx
                if position in roles_start:
                    role = roles_start[position]
                    if role in PDF_HEADERS:
                        parts.append(f"<{PDF_HEADERS[role]}>")
                if position in roles_end:
                    role = roles_end[position]
                    if role in PDF_HEADERS:
                        parts.append(f"</{PDF_HEADERS[role]}>")

                parts.append(form_recognizer_results.content[position])

            elif table_id not in added_tables:
                parts.append(table_to_html(tables_on_page[table_id]))
                added_tables.add(table_id)

        parts.append(" ")
        page_texts.append("".join(parts))

    return "".join(page_texts)

In [8]:
# Extract the text of one redacted resume PDF using the read model (use_layout=False).
extracted_text = extract_pdf_content('./OneDrive_1_10-4-2024/pid_redacted/356LW841M3P0.pdf', form_recognizer_client, False)

In [None]:
# Show the extracted text as this cell's output.
extracted_text

In [25]:
# Send the extracted resume text to the chat-completions endpoint for keyword extraction.
response = send_openai_completion_request(extracted_text)

In [28]:
# Take the message content of the first choice from the JSON response body.
summary_text = response.json()['choices'][0]['message']['content']

In [29]:
def index_pdf(content, keyword, doc_id="356LW841M3P0", name="Arthur Morgan",
              file_name='./OneDrive_1_10-4-2024/pid_redacted/356LW841M3P0.pdf'):
    """Upload one resume document to the ``INDEX_NAME`` search index.

    Args:
        content: Full resume text; stored as ``str(content)`` in ``content``.
        keyword: Keyword summary; stored as ``str(keyword)`` in ``summary``.
        doc_id: Value of the ``id`` key field.
        name: Value of the ``name`` field.
        file_name: Value of the ``file_name`` field.

    The three optional parameters default to the values that were previously
    hard-coded for a single resume, so existing two-argument calls behave as
    before while other documents can now be indexed with the same function.

    Returns:
        None. The upload result is printed.
    """
    # Initialize the search client
    search_client = SearchClient(
        endpoint=f"https://{AZURE_SEARCH_SERVICE_NAME}.search.windows.net",
        index_name=INDEX_NAME,
        credential=AzureKeyCredential(AZURE_SEARCH_API_KEY)
    )

    # Create a document to index (adjust fields as per your index definition)
    document = {
        "id": doc_id,
        "name": name,
        "content": str(content),
        "summary": str(keyword),
        "file_name": file_name
    }

    # Upload the document to Azure Search
    result = search_client.upload_documents(documents=[document])
    print(f"Uploaded document: {result}")


In [None]:
# Index the extracted resume text together with its keyword summary.
index_pdf(extracted_text, summary_text)

In [None]:
# Query the index and print the score, content and summary of the top 3 hits.
search_client = SearchClient(
    endpoint=f"https://{AZURE_SEARCH_SERVICE_NAME}.search.windows.net",
    index_name=INDEX_NAME,
    credential=AzureKeyCredential(AZURE_SEARCH_API_KEY)
)
results = search_client.search(search_text='มีประสบการณ์ทำ ERP', select=["content", "summary"], top=3)
for result in results:
    print(f"Score: {result['@search.score']}")
    print(f"Content: {result['content']}\n")
    # The summary field was previously printed under the label "Content:".
    print(f"Summary: {result['summary']}\n")
    print("###############################")

### -------------DEPRECATED-------------

In [25]:
# Prompt for the deprecated image-based flow: the index_pdf call further down
# passes it as system_prompt. It asks for a JSON reply with the keys
# name, content and summary.
system_prompt = '''Extract these field from a resume 

Name (name) : Name of the applicant

Contact Information (pid) : Name, phone number, email address, and sometimes a LinkedIn profile or portfolio link.

Summary or Objective (intro) : A brief statement about career goals or a summary of qualifications.

Work Experience (work) : Job titles, company names, dates of employment, employment duration (months or years calculate from date of employment), and key responsibilities or achievements.

Education (education) : Degrees, schools, graduation dates, and sometimes relevant coursework or academic achievements.

Skills (skills): A list of hard and soft skills relevant to the job.

Competition (compatition) (optional) : Any competition that applicant have participated in

Certifications (certifications) (optional) : Any certifications that are relevant to the job.

Projects (projects) (Optional): Significant personal or professional projects, especially if relevant to the role.

Awards & Honors (awards) (Optional): Any professional or academic awards.

Languages (languages) (Optional): Any additional languages spoken or written.

Reply with the json format with 3 keys 
1. name : the name of the applicant 
2. content : combine pid, summary, intro, work, education, skill, competition,certifications, project, award and language into a plain text data,
3. summary : summerize work experience how long (months or years) does this applicant work in each job position and skillset that this applicant have in plain text English

Do NOT include json markdown in the reply only json response'''


In [4]:
def respone_extraction(response: "requests.Response"):
    """Return the message content of the first choice in a chat-completions response.

    Args:
        response: Response object whose ``json()`` method returns the decoded
            body with a ``choices`` list.

    Returns:
        The value at ``["choices"][0]["message"]["content"]`` of the decoded body.

    Raises:
        KeyError, IndexError: If the body does not have that structure.
    """
    # response.json() already yields plain Python objects, so the former
    # json.dumps -> json.loads round trip was redundant work.
    return response.json()["choices"][0]["message"]["content"]

In [5]:
def data_extraction(filename: str, system_prompt: str, timeout: float = 120):
    """Send an image file to the Azure OpenAI chat-completions REST endpoint.

    The file is base64-encoded and embedded as a ``data:image/jpeg`` URL in the
    user message; ``system_prompt`` is sent as the system message.

    Args:
        filename: Path of the image file; read in binary mode.
        system_prompt: Text of the system message.
        timeout: Timeout in seconds passed to ``requests.post``.

    Returns:
        The ``requests.Response`` of a successful call.

    Raises:
        SystemExit: If the request fails, times out, or returns an error status.
    """
    # Read through a context manager so the file handle is closed right away
    # instead of being left open until garbage collection.
    with open(filename, 'rb') as image_file:
        image_b64 = base64.b64encode(image_file.read()).decode('ascii')

    headers = {
        "Content-Type": "application/json",
        "api-key": AZURE_OPENAI_API_KEY,
    }
    payload = {
        "messages": [
            {
                "role": "system",
                "content": [
                    {
                        "type": "text",
                        "text": system_prompt
                     }
                ]
            },
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/jpeg;base64,{image_b64}"
                        }
                    }
                ]
            }
        ],
        "temperature": 0.5,
        "top_p": 0.95,
        "max_tokens": 1500,
        "stream": False
    }

    # Strip a trailing slash so an endpoint written as "https://host/" does not
    # yield "https://host//openai/...".
    url = AZURE_OPENAI_ENDPOINT.rstrip('/') + '/openai/deployments/gpt-4o-vision/chat/completions?api-version=2024-02-15-preview'

    # Send request
    try:
        # Without a timeout a stalled connection would block the notebook
        # indefinitely; 120 s matches send_openai_completion_request.
        response = requests.post(url, headers=headers, json=payload, timeout=timeout)
        response.raise_for_status()  # Will raise an HTTPError if the HTTP request returned an unsuccessful status code
    except requests.RequestException as e:
        raise SystemExit(f"Failed to make the request. Error: {e}") from e

    # Handle the response as needed (e.g., print or process)
    return response

In [26]:
def index_pdf(file_path, system_prompt):
    """Extract resume fields from an image with the model and index them.

    The file is sent through ``data_extraction``; the reply is parsed as JSON
    (after stripping a Markdown ``json`` code fence if direct parsing fails)
    and uploaded to the ``INDEX_NAME`` index under a random 12-character id.

    Note: an earlier cell defines ``index_pdf(content, keyword)``; running this
    cell rebinds the name ``index_pdf`` to this version.

    Args:
        file_path: Path of the image file to process; its base name is stored
            in the ``file_name`` field.
        system_prompt: System prompt forwarded to ``data_extraction``.

    Returns:
        None. The parsed data and the upload result are printed.

    Raises:
        json.JSONDecodeError: If the reply is not valid JSON even after the
            code fence is stripped.
        KeyError: If the parsed reply lacks ``name``, ``content`` or ``summary``.
    """
    # Initialize the search client
    search_client = SearchClient(
        endpoint=f"https://{AZURE_SEARCH_SERVICE_NAME}.search.windows.net",
        index_name=INDEX_NAME,
        credential=AzureKeyCredential(AZURE_SEARCH_API_KEY)
    )

    # Extract the PDF file
    extracted_response = respone_extraction(data_extraction(filename=file_path, system_prompt=system_prompt))
    try:
        extracted_data = json.loads(extracted_response)
    # Only a parse failure should trigger the fence-stripping retry; a bare
    # except also swallowed KeyboardInterrupt and unrelated errors.
    except json.JSONDecodeError:
        extracted_data = json.loads(extracted_response.replace("```json\n", "").replace("\n```", ""))
    print(extracted_data)

    # Create a document to index (adjust fields as per your index definition)
    document = {
        "id": ''.join(random.choices(string.ascii_uppercase + string.digits, k=12)),
        "name": extracted_data['name'],
        "content": extracted_data['content'],
        "summary": extracted_data['summary'],
        "file_name": os.path.basename(file_path)
    }

    # Upload the document to Azure Search
    result = search_client.upload_documents(documents=[document])
    print(f"Uploaded document: {result}")


In [None]:
# Run the image-based flow on a sample resume image using the prompt defined above.
index_pdf(file_path=r'./uploads/phanxv/606faed39472eb276636b8f5_pdf-resume-template-format.jpg', system_prompt=system_prompt)

#### Document Search

In [None]:
# Print the score and content of each search hit.
# NOTE(review): `results` is not assigned in this cell; it is the object created
# by the earlier search cell, whose own loop has already iterated it — confirm
# whether a fresh search_client.search(...) call was intended here.
for result in results:  
    print(f"Score: {result['@search.score']}")    
    print(f"Content: {result['content']}\n")   
    print("###############################")