In [None]:
# for pdf processing
import pymupdf
import os
import zipfile
import json
import magic
import csv
import mammoth
import uuid
# for gemini calls (gemini api and pydantic for structured output)
from google import genai
from google.genai import types
from pydantic import BaseModel, Field, ValidationError
from typing import List, Optional
# wrapper requirements
from functools import wraps
from typing import Callable, Any, Tuple

# step 1: regenerate the job posting
i have loaded it onto a single string, so you dont have to create multiple windows\
i have currently written a simple function for you to understand how it should work, the ui will do its implementation in the actual case

In [3]:
client = genai.Client(api_key=os.getenv('GEMINI_API_KEY'))

In [4]:
# Nested models and main model for JobPosting
class Qualifications(BaseModel):
    """Represents the qualifications required for the job."""
    education: str
    experience: str
    required_skills: List[str]
    nice_to_have_skills: Optional[List[str]] = None

class Compensation(BaseModel):
    """Represents the compensation details for the job."""
    base_salary: Optional[str]
    benefits: Optional[str]

class JobPosting(BaseModel):
    """Represents a job posting scraped or retrieved from a source."""
    job_title: str
    company_name: str
    location: Optional[str]
    posted_date: Optional[str] = None
    job_description: str
    responsibilities: List[str]
    qualifications: Qualifications
    compensation: Compensation   
    equal_opportunity_employer: Optional[str]

    class Config:
        str_strip_whitespace = True
        validate_by_name = True # Allows using snake_case or camelCase

In [5]:
def retry_decorator(max_attempts: int = 3):
    def decorator(func: Callable) -> Callable:
        @wraps(func)
        def wrapper(resume_text: str, links_str: str = '') -> Tuple[dict, str]:
            valid_output = False
            for attempt in range(max_attempts):
                try:
                    llm_json = func(resume_text, links_str)
                    _ = JobPosting.model_validate_json(llm_json)
                    valid_output = True
                    break
                except ValidationError as e:
                    print(f"Attempt {attempt+1} failed. Validation Error: {e}")
            
            if valid_output:
                return llm_json
            else:
                return 'Could not parse job description accurately.'
        
        return wrapper
    
    return decorator

In [6]:
@retry_decorator(max_attempts=3)
def jobPosting_pre_processing_llm(jobPosting_text, user_customization:str = '') -> str:
    prompt = """Given the Job Posting text, create a structured JSON representation following these guidelines:

    # CRITICAL INSTRUCTION FOR 
    - Keep all your results very grounded and based on the given information provided.
    - Prioritize client requirements if they dont align with contradicting instructions.

    # INSTRUCTIONS FOR ADDING FURTHER DETAILS
    - If the job posting has explicit mention of the fields use them directly.
    - If the responsibilites are missing, generate a reasonable inference based on the overall requirements mentioned.
    - If the educational qualifications are missing, include education requirements witha requirement for a minimum degree in a relvant field e.g. 'Bachelors in Computer Science or Data Science' or 'Minimum Masters in Accounting' or if the requirements are complex research require a PhD.
    - If there are no work experience requirement, add a work experience requirement relevant years of experience e.g. '4+ years of Java Experience', 'At least 2 years of Healthcare Reporting', KEEP the numbers small.  
    - If there are NO explicit details mentioned in the job posting, infer and generate a list of approximately SEVEN (7) skills that are highly relevant to the job title, description, responsibilities, and qualifications mentioned. 
    - Base these inferences on common industry knowledge for the role.
    - Keep the job description less than 4 lines to summarize what the role entails.

    # Formatting Guidelines
    - Convert any gendered pronouns to gender-neutral alternatives.
    - Format dates in YYYY-MM-DD format when possible.

    The given job description is:

    {jobPosting_text}

    The following are the client requirements. If the client has any requirements that clash with the previous instructions, follow the client instruction.
    {user_customization}
    """.format(jobPosting_text=jobPosting_text, user_customization=user_customization)
    
    response = client.models.generate_content(
        model="gemini-2.0-flash-lite",
        contents=prompt,
        config={
        'response_mime_type': 'application/json',
        'response_schema': JobPosting,
        })
    return response.text
    

In [20]:
def json_to_str(data:dict) -> str:
    converted_str = ''
    for i in data:
        value = data[i]
        val_type = type(value)
        if val_type==str or val_type == int:
            pass
        elif val_type==list:
            value = ','.join(value)
        elif val_type==dict:
            value = json_to_str(value)
        elif val_type is None:
            continue
        converted_str += f"{i} : {value} \n"
    return converted_str

In [27]:
def jobPosting_pre_processing(init_jobPosting):
    continue_preprocessing = True
    user_customization = ''
    jobPosting_text = init_jobPosting[:]
    while continue_preprocessing:
        jobPosting_json = jobPosting_pre_processing_llm(jobPosting_text, user_customization)
        json_data = json.loads(jobPosting_json)
        jobPosting_text = json_to_str(json_data)
        print(jobPosting_text)
        inp = input('Regenerate(y/n): ')
        if inp[0].lower() == 'y':
            user_customization = input('What changes do you want to make? :')
        else:
            continue_preprocessing = False
    return jobPosting_json

In [28]:
file_path = "D:\\resumatrix_custom\\data\\test1_jobDesc.txt" # Replace with your actual file path
try:
    with open(file_path, 'r') as file:
        init_jobPosting = file.read()
        
except FileNotFoundError:
    print(f"Error: The file '{file_path}' was not found.")
jobPosting_json = jobPosting_pre_processing(init_jobPosting)

job_title : Software Engineer 
company_name : AppFolio 
location : None 
posted_date : None 
job_description : Join a fast-growing team at AppFolio Investment Management to build software solutions for the private equity market. Collaborate with a passionate team, interact with customers, and contribute to the full lifecycle of product development. 
responsibilities : Build software solutions for private equity investments.,Collaborate with a team of engineers and product managers.,Interact with customers to understand challenges and gather feedback.,Participate in the full lifecycle of solution development, from discovery to production.,Write code that reflects technical values.,Collaborate in a team setting. 
qualifications : education : Bachelors in Computer Science or related field 
experience : 3+ years of experience building web-based products 
required_skills : Web Development,Cloud Software,Modern Web Frameworks,Collaboration,Software Design,Problem-solving,Agile Methodologies 

in the above execution, i asked it to take out the equal opportunity employer bit for regenerate\
jobPosting_pre_processing - this is the main function in this 

In [26]:
jobPosting_json

'{\n  "job_title": "Software Engineer",\n  "company_name": "AppFolio",\n  "location": null,\n  "posted_date": null,\n  "job_description": "Develop software solutions for the private equity market. Collaborate with engineers and product managers. Interact with customers and ship innovative solutions.",\n  "responsibilities": [\n    "Build software solutions for private equity investment.",\n    "Collaborate with engineering and product teams.",\n    "Interact with customers to understand challenges and gather feedback.",\n    "Operate within an autonomous team, managing the full lifecycle of solutions.",\n    "Write code that aligns with technical values (SMART).",\n    "Interact with colleagues in a manner that supports team values (HEART)."\n  ],\n  "qualifications": {\n    "education": "Bachelors in Computer Science or related field",\n    "experience": "3+ years of web-based product development",\n    "required_skills": [\n      "Web Development",\n      "Cloud Software",\n      "Mo

# step 2: convert the zip or folder and extract text out of pdfs (no llm ver)

In [31]:
def scanRecurse(baseDir):
    for entry in os.scandir(baseDir):
        if entry.is_file():
            yield os.path.join(baseDir, entry.name)
        else:
            yield from scanRecurse(entry.path)

In [55]:
def resume_pdf_processing(curr_resume_filepath):
    doc = pymupdf.open(curr_resume_filepath)
    page = doc[0]
    resume_text = page.get_text()
    resume_links = [i['uri'] for i in page.get_links()]
    links_str = 'Links: '+' , '.join(resume_links)
    resume_text += links_str
    if len(resume_text)<30:
        print(f'Parsing issue for {curr_resume_filepath}')
        return None
    return resume_text


In [56]:
def resume_doc_processing(docx_path):
    with open(docx_path, "rb") as docx_file:
        result = mammoth.extract_raw_text(docx_file)
        text = result.value  
        messages = result.messages  # Console log 
    return text

In [58]:
def resume_processing(file_path):
    _, file_extension = os.path.splitext(file_path)
    file_extension = file_extension.lower()
    if file_extension == '.docx':
        return resume_doc_processing(file_path)
    elif file_extension == '.pdf':
        return resume_pdf_processing(file_path)
    return 'Unsupported file type'

In [50]:
# Create a custom dialect
csv.register_dialect('custom', 
                    quoting=csv.QUOTE_ALL,
                    doublequote=True,
                    escapechar='\\')

In [62]:
### THE CURRENT PROCESS DOES NOT INCLUDE KEY DEMOGRAPHIC VARIABLES - VISA STATUS, DISABILITY AND VETERAN
### BUILD A RANDOM GENERATOR FOR THESE BOOLEAN TYPE VARIABLES VISA(0.5,0.5), DISABILITY(0.1,0.9), VETERAN(0.1,0.9)
### TAG VISA WITH EDUCATION OR WORK EXPERIENCE IF NEEDED BUT 
### ENFORCE VISA==(!VETERAN) cuz commonsense
### do we need to add this to the supervised model too?
def resume_pre_processing(job_id, file_path):
    ## need to change both base_path_csv and base_path_zip while integrating into the backend
    base_path_csv = "D:\\resumatrix_custom\\data" # to be changed
    csv_path = os.path.join(base_path_csv,job_id+'.csv')
    with open(csv_path, 'w', newline='') as csvfile:
        csv_writer = csv.writer(csvfile, dialect='custom')
        csv_writer.writerow(['header'])
        if os.path.isfile(file_path):
            base_path_zip = "D:\\resumatrix_custom\\data\\extracted_resumes" #this is the folder where the zip will be extracted
            # a new folder with name 'job_id' will be created
            #verify against file extension and mime type, extracting file extension
            _, file_extension = os.path.splitext(file_path)
            file_extension = file_extension.lower()

            mime_type = magic.from_file(file_path,mime=True)
            if file_extension == '.zip' and mime_type == 'application/zip':
                extracted_resume_path = os.path.join(base_path_zip,job_id)
                resume_zip = zipfile.ZipFile(file_path, 'r')
                if os.path.exists(extracted_resume_path):
                    os.rmdir(extracted_resume_path)
                    os.mkdir(extracted_resume_path)
                    print('Existing folder removed, recreated folder.')
                else:
                    os.mkdir(extracted_resume_path)
                    print(f'Created new folder at {extracted_resume_path}.')
                resume_zip.extractall(extracted_resume_path)
                for curr_resume in scanRecurse(extracted_resume_path):
                    resume_text = resume_processing(curr_resume)
                    csv_writer.writerow([resume_text])
        elif os.path.isdir(file_path):
            for curr_resume in scanRecurse(file_path):
                resume_text = resume_processing(curr_resume)
                csv_writer.writerow([resume_text])
        else:
            print(f"Unsupported file type {mime_type}. Only zip/docx/pdf files are supported.") #console log
    
    return f'Parsed resumes are stored in {csv_path}.'

In [67]:
#this is the code to generate the uuid, this will go into the db step, not required for now
job_id = uuid.uuid4().__str__()
file_path = 'D:\\resumatrix_custom\\data\\123'
status=resume_pre_processing(job_id, file_path)
print(status)

Parsed resumes are stored in D:\resumatrix_custom\data\525c9498-4ebf-4183-95a4-99476e0c2b6b.csv.
