In [4]:
# Hypothetical JSON Schema sent to a REST API endpoint
# Describes one structured resume. `main` passes this dict to
# `map_fields_to_schema`, which keeps only the keys declared under "properties".
resume_schema = {
  "type": "object",
  "properties": {
    # Applicant identity; middleName is the only optional field in this section.
    "personalInformation": {
      "type": "object",
      "properties": {
        "firstName": { "type": "string" },
        "lastName": { "type": "string" },
        "middleName": { "type": "string" },
        "dateOfBirth": { "type": "string", "format": "date" }
      },
      "required": ["firstName", "lastName", "dateOfBirth"]
    },
    # How to reach the applicant; every field, including the nested address, is required.
    "contactInformation": {
      "type": "object",
      "properties": {
        "email": { "type": "string", "format": "email" },
        "phone": { "type": "string" },
        "address": {
          "type": "object",
          "properties": {
            "street": { "type": "string" },
            "city": { "type": "string" },
            "state": { "type": "string" },
            "zip": { "type": "string" }
          },
          "required": ["street", "city", "state", "zip"]
        }
      },
      "required": ["email", "phone", "address"]
    },
    # One entry per degree; fieldOfStudy is optional.
    "education": {
      "type": "array",
      "items": {
        "type": "object",
        "properties": {
          "institution": { "type": "string" },
          "degree": { "type": "string" },
          "fieldOfStudy": { "type": "string" },
          "graduationDate": { "type": "string", "format": "date" }
        },
        "required": ["institution", "degree", "graduationDate"]
      }
    },
    # One entry per job; endDate and responsibilities are optional.
    "workExperience": {
      "type": "array",
      "items": {
        "type": "object",
        "properties": {
          "employer": { "type": "string" },
          "position": { "type": "string" },
          "startDate": { "type": "string", "format": "date" },
          "endDate": { "type": "string", "format": "date" },
          "responsibilities": { "type": "string" }
        },
        "required": ["employer", "position", "startDate"]
      }
    },
    # Free-form skill strings.
    "skills": {
      "type": "array",
      "items": { "type": "string" }
    },
    # The next two arrays mirror the fields marked "GenAI Field" in the
    # `submit_application` function definition.
    "skills_keywords": {
      "type": "array",
      "items": { "type": "string" }
    },
    "ai_generated_roles": {
      "type": "array",
      "items": { "type": "string" }
    },
    # One entry per referee; all fields of an entry are required.
    "references": {
      "type": "array",
      "items": {
        "type": "object",
        "properties": {
          "name": { "type": "string" },
          "relationship": { "type": "string" },
          "contact": {
            "type": "object",
            "properties": {
              "email": { "type": "string", "format": "email" },
              "phone": { "type": "string" }
            },
            "required": ["email", "phone"]
          }
        },
        "required": ["name", "relationship", "contact"]
      }
    }
  },
  # Only these four sections are mandatory; the skill, role and reference arrays are optional.
  "required": ["personalInformation", "contactInformation", "education", "workExperience"]
}

In [5]:
# Function for OpenAI to fill
# Single function definition handed to the chat-completions "functions" parameter.
# Its "parameters" object declares the same resume sections as `resume_schema`;
# the two fields marked "GenAI Field" are produced by the model rather than
# copied from the resume, so both are listed under "required".
functions = [
    {
        "name": "submit_application",
        "description": "Use to submit a job application. Fill with 'N/A' if information not found in Applicant Resume.",
        "parameters": {
            "type": "object",
            "properties": {
                "personalInformation": {
                    "type": "object",
                    "properties": {
                        "firstName": { "type": "string" },
                        "lastName": { "type": "string" },
                        "middleName": { "type": "string" },
                        "dateOfBirth": { "type": "string", "format": "date" }
                    },
                    "required": ["firstName", "lastName", "dateOfBirth"]
                },
                "contactInformation": {
                    "type": "object",
                    "properties": {
                        "email": { "type": "string", "format": "email" },
                        "phone": { "type": "string" },
                        "address": {
                            "type": "object",
                            "properties": {
                                "street": { "type": "string" },
                                "city": { "type": "string" },
                                "state": { "type": "string" },
                                "zip": { "type": "string" }
                            },
                            "required": ["street", "city", "state", "zip"]
                        }
                    },
                    "required": ["email", "phone", "address"]
                },
                "education": {
                    "type": "array",
                    "items": {
                        "type": "object",
                        "properties": {
                            "institution": { "type": "string" },
                            "degree": { "type": "string" },
                            "fieldOfStudy": { "type": "string" },
                            "graduationDate": { "type": "string", "format": "date" }
                        },
                        "required": ["institution", "degree", "graduationDate"]
                    }
                },
                "workExperience": {
                    "type": "array",
                    "items": {
                        "type": "object",
                        "properties": {
                            "employer": { "type": "string" },
                            "position": { "type": "string" },
                            "startDate": { "type": "string", "format": "date" },
                            "endDate": { "type": "string", "format": "date" },
                            "responsibilities": { "type": "string" }
                        },
                        "required": ["employer", "position", "startDate"]
                    }
                },
                "skills": {
                    "type": "array",
                    "items": { "type": "string" }
                },
                "skills_keywords": {
                    "description": "REQUIRED GenAI Field - if Skills not keywords, insert array of keyword skills separated by comma below", # GenAI Field
                    "type": "array",
                    "items": { "type": "string" }
                },
                "ai_generated_roles": {
                    "description": "REQUIRED GenAI Field - generate a list of 10 possible roles person could do based on experience and skills", # GenAI Field
                    "type": "array",
                    "items": { "type": "string" }
                },
                "references": {
                    "type": "array",
                    "items": {
                        "type": "object",
                        "properties": {
                            "name": { "type": "string" },
                            "relationship": { "type": "string" },
                            "contact": {
                                "type": "object",
                                "properties": {
                                    "email": { "type": "string", "format": "email" },
                                    "phone": { "type": "string" }
                                },
                                "required": ["email", "phone"]
                            }
                        },
                        "required": ["name", "relationship", "contact"]
                    }
                }
            },
            # Both GenAI fields are described as REQUIRED above, so the schema
            # enforces them too instead of relying on the description text alone.
            "required": [
                "personalInformation",
                "contactInformation",
                "education",
                "workExperience",
                "skills_keywords",
                "ai_generated_roles"
            ]
        }
    }
]

In [6]:
# Azure Search Class
# Required libraries and modules
from dotenv import load_dotenv
import os
from azure.core.exceptions import ResourceNotFoundError
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents import SearchClient
from azure.core.credentials import AzureKeyCredential
from azure.search.documents.models import Vector
from azure.search.documents.indexes.models import (
    SearchIndex,
    SearchField,
    SearchFieldDataType,
    SimpleField,
    SearchableField,
    VectorSearch,
    HnswVectorSearchAlgorithmConfiguration
)
import requests
from requests.auth import HTTPBasicAuth
import hashlib
import openai

class Config:
    """Settings holder filled from a dotenv file plus the process environment.

    Every attribute is the raw value of one environment variable, or ``None``
    when that variable is not set.
    """

    def __init__(self, path='./environment.env'):
        # Merge the variables declared in the dotenv file into the environment
        load_dotenv(dotenv_path=path)
        env = os.environ

        # Azure Search service
        self.endpoint = env.get("AZURE_ENDPOINT")
        self.api_key = env.get("AZURE_API_KEY")

        # Jira API
        self.jira_id = env.get("JIRA_ID")
        self.jira_key = env.get("JIRA_KEY")

        # OpenAI API
        self.open_ai_endpoint = env.get("OPENAI_ENDPOINT")
        self.openai_key = env.get("OPENAI_KEY")
        self.engine = env.get("ENGINE")

# Class to handle Azure Search operations
class AzureSearch:
    """Helper around one Azure Cognitive Search index that stores text plus embeddings.

    The instance keeps the search and OpenAI settings taken from ``config`` and a
    ``SearchClient`` bound to ``index_name``; that client is used by
    ``vector_search``.
    """

    def __init__(self, config, index_name):
        """Copy the connection settings from ``config`` and open a client for ``index_name``."""
        self.endpoint = config.endpoint
        self.api_key = config.api_key
        self.open_ai_endpoint = config.open_ai_endpoint
        self.openai_key = config.openai_key
        self.engine = config.engine
        self.index_name = index_name
        self.search_client = SearchClient(endpoint=self.endpoint, index_name=self.index_name, credential=AzureKeyCredential(self.api_key))

    # Method to generate embeddings for a given text using OpenAI
    def generate_embeddings(self, text):
        """Return the embedding of ``text`` produced by the configured engine.

        NOTE: this reconfigures the module-level ``openai`` settings (type, key,
        base URL and API version) on every call, so it overrides whatever
        ``initialize_openai`` set earlier in the same process.
        """
        openai.api_type = "azure"
        openai.api_key = self.openai_key
        openai.api_base = self.open_ai_endpoint
        openai.api_version = "2023-05-15"
        response = openai.Embedding.create(
            input=text,
            engine=self.engine
        )
        return response['data'][0]['embedding']

    # Method to create Azure Search index if it does not exist
    def create_index_if_not_exists(self, index_name, index_fields, vector_search=None):
        """Create ``index_name`` unless it already exists.

        Returns ``True`` when the index was created and ``False`` when it was
        already present. A message is printed in both cases.
        """
        client = SearchIndexClient(endpoint=self.endpoint, credential=AzureKeyCredential(self.api_key))
        try:
            # get_index raises ResourceNotFoundError for a missing index
            client.get_index(name=index_name)
            print(f"Index {index_name} already exists.")
            return False
        except ResourceNotFoundError:
            index = SearchIndex(name=index_name, fields=index_fields, vector_search=vector_search)
            client.create_index(index)
            print(f"Index {index_name} created.")
            return True

    # Method to perform a vector search on the Azure Cognitive Search index
    def vector_search(self, query, k=3, select_fields=None):
        """Embed ``query`` and search the ``descriptionVector`` field of the index.

        Args:
            query: text to embed and search for.
            k: forwarded as the ``k`` of the vector query.
            select_fields: fields to retrieve; every field ("*") when omitted.

        Returns:
            A list of dicts with the keys ``description``, ``Score`` and ``key``.
            ``description`` and ``key`` are ``None`` when the field was not
            selected, instead of raising ``KeyError``.
        """
        vector = Vector(value=self.generate_embeddings(query), k=k, fields="descriptionVector")

        # Set default fields if none are provided
        if select_fields is None:
            select_fields = "*"

        results = self.search_client.search(
            search_text=None,
            vectors=[vector],
            select=select_fields,
        )

        return [
            {
                "description": result.get('description'),
                "Score": result['@search.score'],
                "key": result.get('key'),
            }
            for result in results
        ]

    # Method to upload documents to the specified Azure Search index
    def push_to_index(self, json_data, index_name):
        """Embed and upload ``json_data`` to ``index_name``.

        Each document with a non-empty ``description`` gets a
        ``descriptionVector`` computed from it. The caller's dicts are left
        untouched: the vector is added to a shallow copy. An empty batch is
        reported as a success without contacting the service.

        Returns:
            A human-readable status string with the number of uploaded and
            failed documents.
        """
        documents = []
        for item in json_data:
            document = dict(item)  # shallow copy so the caller's dict is not mutated
            item_description = document.get('description', '')
            if item_description:
                # Generate embeddings for the document description
                document['descriptionVector'] = self.generate_embeddings(item_description)
            documents.append(document)

        if not documents:
            # Nothing to send; skip the round trip for an empty batch
            return f"Successfully added 0 documents to the index {index_name}."

        search_client = SearchClient(endpoint=self.endpoint, index_name=index_name, credential=AzureKeyCredential(self.api_key))
        # Upload documents to Azure Search
        result = search_client.upload_documents(documents=documents)
        failed_count = sum(1 for r in result if not r.succeeded)
        if failed_count == 0:
            return f"Successfully added {len(documents)} documents to the index {index_name}."
        return f"Added {len(documents) - failed_count} documents to the index {index_name}. {failed_count} documents failed."
        
# Build the configuration and the search helper once; both are reused below
# and by the cells that follow.
config = Config()
index_name = "resumes"
azure_search = AzureSearch(config, index_name)

# Fields of the index: the summary text, its key, the applicant name and the
# embedding of the summary.
index_fields = [
    SearchableField(name="description", type=SearchFieldDataType.String),
    SimpleField(name="key", type=SearchFieldDataType.String, key=True),
    SearchableField(name="name", type=SearchFieldDataType.String),  # New 'name' field
    # NOTE(review): 1536 must equal the length of the vectors returned by
    # AzureSearch.generate_embeddings - confirm against the ENGINE deployment.
    SearchField(name="descriptionVector", type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
                searchable=True, vector_search_dimensions=1536, vector_search_configuration="my-vector-config")
]

# Vector search configuration referenced by name from "descriptionVector" above
vector_search = VectorSearch(
    algorithm_configurations=[
        HnswVectorSearchAlgorithmConfiguration(
            name="my-vector-config",
            kind="hnsw",
            parameters={
                "m": 4,
                "efConstruction": 400,
                "efSearch": 500,
                "metric": "cosine"
            }
        )
    ]
)

# Creates the index on the first run; prints a notice and does nothing otherwise
azure_search.create_index_if_not_exists(index_name, index_fields, vector_search)

import openai
import json
import os
import re

def initialize_openai(api_key, api_version, api_type, api_base):
    """Store the given connection settings on the global ``openai`` module."""
    settings = (
        ("api_key", api_key),
        ("api_version", api_version),
        ("api_type", api_type),
        ("api_base", api_base),
    )
    for attribute, value in settings:
        setattr(openai, attribute, value)

def read_file(file_path):
    """Return the whole content of the UTF-8 text file at ``file_path``."""
    with open(file_path, mode='r', encoding="utf-8") as handle:
        content = handle.read()
    return content

def get_openai_response(messages, function_call_name=None, functions=None, engine="gpt35", temperature=.5):
    """Send ``messages`` to the chat-completions API and return the raw response.

    Args:
        messages: list of chat messages (dicts with "role" and "content").
        function_call_name: when given, the request asks for a call to the
            function with this name.
        functions: function definitions to include in the request, if any.
        engine: deployment name to use; defaults to the previously hard-coded "gpt35".
        temperature: sampling temperature; defaults to the previously hard-coded 0.5.

    Returns:
        Whatever ``openai.ChatCompletion.create`` returns.
    """
    request_payload = {
        "engine": engine,
        "messages": messages,
        "temperature": temperature
    }
    # Optional keys are only sent when the caller supplied them
    if function_call_name:
        request_payload["function_call"] = {"name": function_call_name}
    if functions:
        request_payload["functions"] = functions

    return openai.ChatCompletion.create(**request_payload)

def get_summary(mapped_fields, max_lenght = 250):
    """Ask the model for an extractive, neutrally worded summary of a resume.

    Args:
        mapped_fields: the resume content to summarize; it is interpolated into
            the prompt as-is.
        max_lenght: requested summary length in words. The parameter name keeps
            its original spelling so keyword callers continue to work.

    Returns:
        The text content of the first choice in the model response.
    """
    # The word "length" was misspelled in the prompt sent to the model
    summary_prompt = f"Summarize the following Resume using extractive summarization to remain unbiased. Use neutral pronouns, do not use padding language. The length must be of {max_lenght} words. Resume:\n{mapped_fields}"
    messages = [
        {"role": "user", "content": summary_prompt}
    ]
    response = get_openai_response(messages)
    return response["choices"][0]["message"]["content"]

def remove_pii(text):
    """Ask the model to strip personally identifiable information and gender pronouns.

    Args:
        text: the text to sanitize; sent as the user message.

    Returns:
        The text content of the first choice in the model response.
    """
    messages = [
        {
            "role": "system",
            # Fixed the typo "You are am expert" in the instruction
            "content": "You are an expert PII and Bias remover bot. Remove any personally identifiable information and gender pronouns from the following text. Adopt the [] bracket removal style."
        },
        {"role": "user", "content": text}
    ]
    response = get_openai_response(messages)
    return response["choices"][0]["message"]["content"]

def map_fields(src_dict, schema_dict, dst_dict):
    """Copy into ``dst_dict`` the entries of ``src_dict`` that the schema declares.

    Keys missing from ``schema_dict['properties']`` are dropped. Values whose
    schema type is "object" are filtered recursively against the nested
    schema; arrays and scalars are copied unchanged.

    An object-typed field that does not hold a dict (for example the 'N/A'
    placeholder the function description asks the model to use) is copied
    unchanged instead of raising ``AttributeError``. Schema nodes without
    "properties" or "type" are tolerated.

    Args:
        src_dict: parsed arguments to filter.
        schema_dict: JSON-Schema-like dict describing the allowed keys.
        dst_dict: dict that receives the filtered result (modified in place).
    """
    properties = schema_dict.get('properties', {})
    for key, value in src_dict.items():
        if key not in properties:
            continue
        field_schema = properties[key]
        if field_schema.get('type') == 'object' and isinstance(value, dict):
            dst_dict[key] = {}
            map_fields(value, field_schema, dst_dict[key])
        else:
            dst_dict[key] = value

def map_fields_to_schema(arguments_str, schema):
    """Parse a JSON string and return only the parts of it that ``schema`` declares."""
    parsed_arguments = json.loads(arguments_str)
    filtered = dict()
    # map_fields fills ``filtered`` in place
    map_fields(parsed_arguments, schema, filtered)
    return filtered

def remove_empty_lines(text):
    """Flatten ``text`` to one line: each newline becomes a space, then the ends are trimmed."""
    single_line = text.replace('\n', ' ')
    return single_line.strip()

def push_to_index(folder_path, azure_search, index_name='resumes'):
    """Upload every ``*_Summary.txt`` found in the applicant folders to the search index.

    ``folder_path`` is expected to contain one sub-folder per applicant. Entries
    that are not directories are skipped instead of crashing ``os.listdir``.
    Each summary becomes one document with:
      * ``name``: the part of the file name before the first underscore,
      * ``description``: the summary flattened to a single line,
      * ``key``: the MD5 hex digest of the file name.

    Args:
        folder_path: folder holding the applicant sub-folders.
        azure_search: object exposing ``push_to_index(documents, index_name)``.
        index_name: target index; defaults to the previously hard-coded 'resumes'.
    """
    for applicant_folder in os.listdir(folder_path):
        applicant_folder_path = os.path.join(folder_path, applicant_folder)
        if not os.path.isdir(applicant_folder_path):
            # Stray files next to the applicant folders are not resources
            continue
        for file_name in os.listdir(applicant_folder_path):
            if not file_name.endswith('_Summary.txt'):
                continue
            summary_path = os.path.join(applicant_folder_path, file_name)
            with open(summary_path, 'r') as txt_file:
                summary = txt_file.read()

            # Extracting the user's name from the file_name
            name = file_name.split('_')[0]

            doc = {
                "name": name,
                "description": remove_empty_lines(summary),
                "key": hashlib.md5(file_name.encode()).hexdigest()  # Create a hash of the file name as key
            }
            index_response = azure_search.push_to_index([doc], index_name)
            print(index_response)

def main():
    """Turn every resume file in ./Resumes into structured and summarized resources.

    For each file the function:
      1. asks the model to fill ``submit_application`` from the resume text,
      2. filters the returned arguments against ``resume_schema``,
      3. asks for a summary of the raw resume and for a sanitized version of it,
      4. writes ``<name>_BiasedSummary.txt``, ``<name>_JSONResume.json`` and
         ``<name>_Summary.txt`` into ``./Resources/<name>/``.

    ``<name>`` is the part of the file name before the first underscore.
    Entries of ./Resumes that are not regular files are skipped.
    """
    # Credentials come from the environment instead of being hard-coded.
    # NOTE(review): OPENAI_KEY / OPENAI_ENDPOINT are the variables Config reads
    # for the embeddings client - confirm the chat deployment uses the same resource.
    api_config = {
        "api_key": os.getenv("OPENAI_KEY", ""),
        "api_version": "2023-07-01-preview",
        "api_type": "azure",
        "api_base": os.getenv("OPENAI_ENDPOINT", "")
    }
    initialize_openai(**api_config)

    input_folder = './Resumes'  # Assume this is the folder containing the resumes
    output_folder = './Resources'
    os.makedirs(output_folder, exist_ok=True)  # Create Resources folder if it doesn't exist

    for file_name in os.listdir(input_folder):
        file_path = os.path.join(input_folder, file_name)
        if not os.path.isfile(file_path):
            # Sub-directories cannot be read as a resume
            continue
        resume = read_file(file_path)

        # Assume the resume file name is formatted as 'FirstName_LastName.txt'
        applicant_name = file_name.split('_')[0]
        applicant_folder = os.path.join(output_folder, applicant_name)
        os.makedirs(applicant_folder, exist_ok=True)  # Create applicant folder if it doesn't exist

        messages_extractor = [
            {
                "role": "system",
                "content": """
You are an AI NLP Resume Extractor to JSON. Your job is to fill the required fields on the function submit_application with information from provided Resume. Fields that require AI generation are indicated with GenAI and are REQUIRED. For example, you might need to extract skills as keywords based on the full Resume.
"""
            },
            {"role": "user", "content": f"\n\nResume:\n{resume}"}
        ]

        # `functions` and `resume_schema` are the module-level definitions from the first cells
        response = get_openai_response(messages_extractor, "submit_application", functions)
        arguments_str = response["choices"][0]["message"]["function_call"]["arguments"]
        mapped_fields = map_fields_to_schema(arguments_str, resume_schema)

        # The unsanitized summary is kept next to the sanitized one
        summary = get_summary(resume)
        biased_summary_path = os.path.join(applicant_folder, f'{applicant_name}_BiasedSummary.txt')
        with open(biased_summary_path, 'w') as txt_file:
            txt_file.write(summary)

        sanitized_summary = remove_pii(summary)

        # Save the JSON dump of mapped_fields and the summary to the applicant folder
        json_resume_path = os.path.join(applicant_folder, f'{applicant_name}_JSONResume.json')
        summary_path = os.path.join(applicant_folder, f'{applicant_name}_Summary.txt')

        with open(json_resume_path, 'w') as json_file:
            json.dump(mapped_fields, json_file, indent=2)

        with open(summary_path, 'w') as txt_file:
            txt_file.write(sanitized_summary)


if __name__ == "__main__":
    # Build the per-applicant resources, then index them
    main()
    # Push all of the created resources to Azure Cognitive search
    push_to_index('./Resources', azure_search)

# Perform vector search using the query and specifying fields to select
# One query text per role label
des = {
    "swe": "Software Eng description",
    "aide": "AI Eng description",
    "ds": "Data Scientist description",
}


for role, role_description in des.items():
    print(role)
    results = azure_search.vector_search(query=f"{role_description}", select_fields=["description", "key"])
    # Show each hit followed by its score on the next line
    for hit in results:
        score = hit.get("Score")
        print(f'{hit}\n{score}')

Index resumes already exists.
Successfully added 1 documents to the index resumes.
Successfully added 1 documents to the index resumes.
Successfully added 1 documents to the index resumes.
Successfully added 1 documents to the index resumes.
swe
{'description': '[Person] is a Software Engineer with a strong technical background and a passion for creating efficient software solutions. [They] have experience in various programming languages and tools and have a proven track record of delivering complex projects on time and exceeding client expectations. [Person] is skilled in problem-solving, collaborating with cross-functional teams, and staying updated with emerging technologies.  At Tech Innovations Inc., [Person] led the development of a real-time chat application using Node.js, WebSocket, and React, improving team communication and collaboration. [They] also designed and implemented RESTful APIs for a cloud-based inventory management system, resulting in a 20% increase in efficiency