In [3]:
# Analysis configuration: the batch of candidates to process.
analysis_folder = "001_Analysis"

# Folder that receives the JSON files produced for this batch.
results_folder_path = f"./Candidates Analysis/{analysis_folder}/Results"

# Name of the OpenAI chat model used for the LLM requests.
gptModel = "gpt-4o"

### PDF Text Extraction Script

This script extracts text from all PDF files within a designated folder and saves the extracted content in a JSON file. It processes every PDF in the folder, stores the filename and corresponding text in a JSON structure, and ensures that the JSON file is overwritten with fresh data each time the script is executed. The output JSON file (`OCR_Results.json`) is saved in the analysis's `Results` folder, not next to the PDF files, so that all intermediate files of an analysis live in one place.

In [4]:
import fitz  # PyMuPDF
import os
import json



# Folder holding the candidate CVs (PDF files) for the selected analysis
folder_path = "./Candidates Analysis/" + analysis_folder + "/CV"

# One {"filename", "content"} record per PDF found in the folder
extracted_data = []

# Sorted so the output order is stable between runs; os.listdir() gives no
# ordering guarantee.
for filename in sorted(os.listdir(folder_path)):
    # Accept the ".pdf" extension in any letter case (e.g. "CV.PDF")
    if not filename.lower().endswith(".pdf"):
        continue

    file_path = os.path.join(folder_path, filename)

    # The context manager closes the document even if extraction raises,
    # so file handles are not leaked while looping over many CVs.
    with fitz.open(file_path) as doc:
        text = "".join(page.get_text() for page in doc)

    extracted_data.append({
        "filename": filename,
        "content": text
    })

# Convert the list to JSON format
extracted_json = json.dumps(extracted_data, indent=4)

# The JSON is written to the Results folder of the analysis (not next to the
# PDFs); create that folder on first use instead of failing on open().
os.makedirs(results_folder_path, exist_ok=True)
output_json_path = os.path.join(results_folder_path, "OCR_Results.json")

# Save the JSON data to a file (this will overwrite the file if it already exists)
with open(output_json_path, "w") as json_file:
    json_file.write(extracted_json)

print(f"JSON data has been successfully saved to {output_json_path}")


JSON data has been successfully saved to ./Candidates Analysis/001_Analysis/Results\OCR_Results.json


### JSON Structure for CV Data

This Python script defines a JSON structure for storing comprehensive CV (Curriculum Vitae) data. 

In [5]:
import json
def _build_cv_structure():
    """Assemble the empty CV template.

    Every leaf is an empty string or an empty list. The returned dict has a
    single top-level key, ``"content"``, holding the CV sections.
    """
    def empty_location():
        # A fresh dict per call so no two sections share the same object.
        return {"city": "", "countryISO": "", "countryName": ""}

    personal_information = {
        "name": "",
        "phone": "",
        "email": "",
        "location": empty_location(),
    }

    education_entry = {
        "degree": "",
        "institution": "",
        "start_date_YYYY": "",
        "end_date_YYYY": "",
        "sort_order_newest_to_oldest": "",
        "location": empty_location(),
    }

    work_entry = {
        "title": "",
        "company": "",
        "start_date_YYYY": "",
        "end_date_YYYY": "",
        "sort_order_newest_to_oldest": "",
        "location": empty_location(),
        "responsibilities": [],
    }

    language_entry = {"language": "", "proficiency": ""}

    # Insertion order below is the key order of the serialized template.
    sections = {}
    sections["personal_information"] = personal_information
    sections["CV summary"] = ""
    sections["education"] = [education_entry]
    sections["work_experience"] = [work_entry]
    sections["skills"] = []
    sections["certifications_and_courses"] = []
    sections["languages"] = [language_entry]
    sections["systems_knowledge"] = []

    return {"content": sections}


structure = _build_cv_structure()

# Serialized form of the template, kept as a string.
cvStructure = json.dumps(structure)

### Processing OCR Results and Summarizing with OpenAI

This script integrates OCR results with OpenAI's API to generate structured summaries of the content.

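Files that already have an entry in `LLM_Normalized_CV.json` are skipped, and the output file is rewritten after every completion, so each OCR result is only processed once and stored in a structured format.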


In [6]:
import json
from openai import OpenAI
import sys
import os

# Temporarily add the parent directory to the system path to import Constants.
# try/finally removes the path entry again even when the import fails.
sys.path.append("..")
try:
    import Constants
finally:
    sys.path.remove("..")

# Initialize OpenAI client with the API key from Constants
client = OpenAI(api_key=Constants.OpenAIKey)

# Define the paths for the JSON files
json_path_OCR_Result = results_folder_path + "/OCR_Results.json"
json_output_path = results_folder_path + "/LLM_Normalized_CV.json"

# Load the OCR results from the JSON file
with open(json_path_OCR_Result, 'r') as file:
    ocr_results = json.load(file)

# Load existing responses if the output file already exists
if os.path.exists(json_output_path):
    with open(json_output_path, 'r') as file:
        existing_responses = json.load(file)
else:
    existing_responses = []

# Filenames that already have a normalized CV; a set gives constant-time
# lookups instead of rescanning the whole list for every file.
processed_filenames = {response['filename'] for response in existing_responses}

# Send every not-yet-processed OCR record to the OpenAI API
for ocr_object in ocr_results:
    filename = ocr_object['filename']

    if filename in processed_filenames:
        print(f"Skipping {filename} as it already exists.")
        continue

    content = ocr_object['content']

    # The field names mentioned in the instructions must match the keys of
    # cvStructure exactly (systems_knowledge, work_experience); otherwise the
    # model is told about fields that do not exist in the template.
    completion = client.chat.completions.create(
        model=gptModel,
        response_format={"type": "json_object"},
        messages=[
            # ": " separates the instruction from the raw CV text
            {"role": "system", "content": "You are a helpful assistant whose job is to perfectly understand the following data: " + content},
            {"role": "user", "content": "You need to summarize the data in the following structured JSON format:" + cvStructure},
            {"role": "user", "content": "The fields skills, certifications_and_courses, and systems_knowledge should only contain a list of values and should not contain records"},
            {"role": "user", "content": "The fields skills refers to 'soft skills' and should contain values like 'teamwork', 'communication', 'leadership', etc."},
            {"role": "user", "content": "The fields systems_knowledge refers to knowledge of software, tools, or systems and should contain values like 'Microsoft Office', 'Python', 'Salesforce', etc."},
            {"role": "user", "content": "The field proficiency in languages should be set to one of the following values: basic, intermediate, advanced, full professional proficiency"},
            {"role": "user", "content": "education and work_experience field sort_order_newest_to_oldest should be set to 0 for the most recent record and should be incremented by 1 for each subsequent record"},
            {"role": "user", "content": "education and work_experience field format for start_date_YYYY and end_date_YYYY should be YYYY"}
        ]
    )

    # "cv" keeps the model's reply as the raw JSON string it returned
    response_object = {
        "filename": filename,
        "cv": completion.choices[0].message.content
    }

    existing_responses.append(response_object)
    processed_filenames.add(filename)

    # Save after each completion so finished work survives a later failure
    with open(json_output_path, 'w') as outfile:
        json.dump(existing_responses, outfile, indent=4)

    print(f"Processed {filename} and saved the response.")

print("Processing complete.")


Skipping Alex_Johnson_CV.pdf as it already exists.
Skipping Emma_Brown_CV.pdf as it already exists.
Skipping John_Carter_CV.pdf as it already exists.
Skipping Maria_Thompson_CV.pdf as it already exists.
Skipping Michael_Smith_CV.pdf as it already exists.
Processing complete.


### JSON Structure for Job Position Description

In [7]:
import json
# Empty template for a normalized job-position description. Every leaf is an
# empty string or an empty list.
structure = {
  "JobTitle": "",
  "Department": "",
  "ReportsTo": "",
  "JobPurpose": "",
  "KeyResponsibilities": [],
  "WorkModality": "",
  "AdditionalDetails": "",
  "Qualifications": {
    "education": [],
    "work_experience": [],
    "skills": [],
    "certifications_and_courses": [],
    "languages": [
        {
            "language": "",
            # Spelled "proficiency" (was "profieciency") so the key matches
            # the language entries of the CV template.
            "proficiency": ""
        }
    ],
    "systems_knowledge": []
  }
}

# Serialized form of the template, kept as a string.
positionDescriptionStructure = json.dumps(structure)

### Position description OCR

In [8]:
import fitz  # PyMuPDF
import os
import json


# Folder holding the position description (PDF) for the selected analysis
folder_path_position = "./Candidates Analysis/" + analysis_folder + "/Position"

# Stays None when the folder contains no PDF
extracted_data = None

# Only the first PDF is used. Sorting makes "first" deterministic, because
# os.listdir() gives no ordering guarantee.
for filename in sorted(os.listdir(folder_path_position)):
    # Accept the ".pdf" extension in any letter case (e.g. "Position.PDF")
    if not filename.lower().endswith(".pdf"):
        continue

    file_path = os.path.join(folder_path_position, filename)

    # The context manager closes the document even if extraction raises
    with fitz.open(file_path) as doc:
        text = "".join(page.get_text() for page in doc)

    # A single {"filename", "content"} record (a dict, not a list)
    extracted_data = {
        "filename": filename,
        "content": text
    }

    # Stop after the first PDF
    break

# If a file was processed, convert the dictionary to JSON format and save it
if extracted_data:
    extracted_json = json.dumps(extracted_data, indent=4)

    # The JSON is written to the Results folder of the analysis (not next to
    # the PDF); create that folder on first use instead of failing on open().
    os.makedirs(results_folder_path, exist_ok=True)
    output_json_path = results_folder_path + "/OCR_Position.json"

    # Save the JSON data to a file (this will overwrite the file if it already exists)
    with open(output_json_path, "w") as json_file:
        json_file.write(extracted_json)

    print(f"JSON data has been successfully saved to {output_json_path}")
else:
    print("No PDF files found in the specified folder.")

JSON data has been successfully saved to ./Candidates Analysis/001_Analysis/Results/OCR_Position.json


### Position description LLM normalization

In [9]:
import json
from openai import OpenAI
import sys


# Temporarily add the parent directory to the system path to import Constants.
# try/finally removes the path entry again even when the import fails.
sys.path.append("..")
try:
    import Constants
finally:
    sys.path.remove("..")

# Initialize OpenAI client with the API key from Constants
client = OpenAI(api_key=Constants.OpenAIKey)

# Define the paths for the JSON files
json_path_OCR_Position = results_folder_path + "/OCR_Position.json"
json_path_LLM_Position = results_folder_path + "/LLM_Position.json"

# Load the extracted position text from the JSON file
with open(json_path_OCR_Position, 'r') as file:
    ocr_results = json.load(file)

# Fail with a regular exception rather than sys.exit(): inside a notebook
# SystemExit is meant for terminating the interpreter, and a ValueError is
# how the later analysis cell reports missing position data as well.
if not ocr_results:
    raise ValueError("No data found in OCR results.")

# The file holds a single record (a dict with "filename" and "content")
filename = ocr_results['filename']
content = ocr_results['content']


# Ask the model to rewrite the position text into the position template
completion = client.chat.completions.create(
    model=gptModel,
    response_format={"type": "json_object"},
    messages=[
        {"role": "system", "content": "You are a helpful assistant whose job is to perfectly understand the following data:" + content},
        {"role": "user", "content": "You need to summarize the data in the following structured JSON format:" + positionDescriptionStructure}
    ]
)

# "position" keeps the model's reply as the raw JSON string it returned
response_object = {
    "filename": filename,
    "position": completion.choices[0].message.content
}

# Save the response to the output file (overwriting any existing file)
with open(json_path_LLM_Position, 'w') as outfile:
    json.dump(response_object, outfile, indent=4)

print(f"Processed {filename}")


Processed Position.pdf


### JSON Structure for Candidate Evaluation

This script defines a JSON structure designed for evaluating job candidates across several dimensions. Each dimension, such as Education, Work Experience, Skills, Certifications and Courses, Languages, Systems Knowledge, and General Assessment, is represented as an object within a "dimensions" list.

This structured format allows for systematic assessment of candidates, ensuring a comprehensive evaluation process. The structure is serialized into a JSON-formatted string using `json.dumps`.

In [10]:
import json
# Every dimension is scored on the same scale, so the scale text is spelled
# out once and reused.
_SCORE_SCALE = "Integer, from 0 (Not a match) to 100 (Complete match), using only multiples of 10 (0, 10, 20, ..., 100)"

# (name, description) of each evaluation dimension, in output order.
_DIMENSIONS = [
    (
        "education",
        "Assessment of the candidate's highest level of education, field of study, and relevance to the position.",
    ),
    (
        "work_experience",
        "Evaluation of previous job roles, duration of employment, responsibilities held, and achievements in those roles.",
    ),
    (
        "skills",
        "Identification of hard skills (technical abilities, certifications) and soft skills (communication, leadership) relevant to the job.",
    ),
    (
        "certifications_and_courses",
        "Listing of additional certifications or courses completed that are pertinent to the job role, indicating continuous learning.",
    ),
    (
        "languages",
        "The candidate posses the required languages for the job role",
    ),
    (
        "systems_knowledge",
        "Evaluation of the candidate's familiarity with specific systems or tools required for the job role.",
    ),
    (
        "general_assessment",
        "General assessment of the candidate's fit for the job role including a gap analysis of what was missing to be a perfect match.",
    ),
]

# Each dimension starts with a score of 0 and an empty reasoning text.
structure = {
    "dimensions": [
        {
            "name": dimension_name,
            "description": dimension_description,
            "accepted_values": _SCORE_SCALE,
            "value": 0,
            "reasoning": "",
        }
        for dimension_name, dimension_description in _DIMENSIONS
    ]
}

# Serialized form of the template, kept as a string.
resultStructure = json.dumps(structure)

In [11]:
import json
from openai import OpenAI
import sys
import os

# Temporarily add the parent directory to the system path to import Constants.
# try/finally removes the path entry again even when the import fails.
sys.path.append("..")
try:
    import Constants
finally:
    sys.path.remove("..")

# Initialize OpenAI client with the API key from Constants
client = OpenAI(api_key=Constants.OpenAIKey)

# Define the paths for the JSON files
json_path_LLM_Position = results_folder_path + "/LLM_Position.json"
json_path_LLM_Normalized_CV = results_folder_path + "/LLM_Normalized_CV.json"
json_path_LLM_Result =  results_folder_path + "/LLM_Analysis.json"

# Load the JSON data from the Position file
with open(json_path_LLM_Position, 'r') as file:
    position_data = json.load(file)

# Extract the "position" field (the normalized position as a JSON string)
position = position_data.get('position', None)

# Ensure the position data is available
if not position:
    raise ValueError("The 'position' field is missing in the Position JSON file.")

# Load the JSON data from the Normalized CV file
with open(json_path_LLM_Normalized_CV, 'r') as file:
    cv_data = json.load(file)

# Load existing results or initialize an empty list if the file doesn't exist
if os.path.exists(json_path_LLM_Result):
    with open(json_path_LLM_Result, 'r') as result_file:
        try:
            existing_results = json.load(result_file)
        except json.JSONDecodeError:
            # Start over, but say so: the unreadable file gets overwritten
            # as soon as the first new result is saved.
            print(f"Could not parse {json_path_LLM_Result}; starting with an empty result list.")
            existing_results = []
else:
    existing_results = []

# Filenames that already have a result; a set gives constant-time lookups
# instead of rescanning the whole list for every CV.
evaluated_filenames = {result['filename'] for result in existing_results}

# Results produced during this run (only used for the final status message)
new_responses = []

# Iterate through each CV object
for cv in cv_data:
    filename = cv.get('filename')

    # Check if the filename already exists in the results
    if filename in evaluated_filenames:
        print(f"Result for {filename} already exists in LLM_Analysis.json. Skipping...")
        continue

    # Prepare the CV data for the OpenAI request
    candidate_cv = cv.get('cv', None)
    if not candidate_cv:
        print(f"No CV data found for {filename}. Skipping...")
        continue

    # Make the request to OpenAI; a failure for one candidate is reported
    # and the loop moves on to the next one.
    try:
        completion = client.chat.completions.create(
            model=gptModel,
            response_format={"type": "json_object"},
            messages=[
                {"role": "system", "content": "You need to perfectly understand the following position description in JSON: " + position},
                {"role": "system", "content": "You are an amazing non-biased recruiter whose job is to perfectly understand the following candidate data in JSON: " + candidate_cv},
                {"role": "user", "content": "Based on the previous information, you need to assess a candidate's fit to the position and output the information in EXACTLY this JSON format:" + resultStructure},
                {"role": "user", "content": "The value field should be an integer, from 0 (Not a match) to 100 (Complete match), using only multiples of 10 (0, 10, 20, ..., 100)"},
                {"role": "user", "content": "The reasoning field should be a string, explaining why the candidate is a match or not including a gap analysis of what was missing to be a complete match"}
            ]
        )

        # "content" keeps the model's reply as the raw JSON string it returned
        new_result = {
            "filename": filename,
            "content": completion.choices[0].message.content
        }
    except Exception as e:
        print(f"An error occurred for {filename}: {str(e)}")
        continue

    existing_results.append(new_result)
    new_responses.append(new_result)
    evaluated_filenames.add(filename)

    # Save after every candidate (as the CV normalization step does) so that
    # an interruption later in the loop does not lose completed evaluations.
    with open(json_path_LLM_Result, "w") as json_file:
        json.dump(existing_results, json_file, indent=4)

    print(f"Result prepared for {filename}")

# Final status: the results themselves were already written inside the loop
if new_responses:
    print("All new results have been saved.")
else:
    print("No new results to save.")


Result for Alex_Johnson_CV.pdf already exists in LLM_Analysis.json. Skipping...
Result for Emma_Brown_CV.pdf already exists in LLM_Analysis.json. Skipping...
Result for John_Carter_CV.pdf already exists in LLM_Analysis.json. Skipping...
Result for Maria_Thompson_CV.pdf already exists in LLM_Analysis.json. Skipping...
Result for Michael_Smith_CV.pdf already exists in LLM_Analysis.json. Skipping...
No new results to save.
