In [1]:
import os
import logging
import llmParser
import pdfplumber
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer, util

Modelo de Ollama cargado correctamente.


In [2]:
# Requered directories
homePath = os.path.expanduser('~') # Path to the user's home directory
dataPath = os.path.join(homePath, 'NLP_p2025', 'CVBestFit') # Path to the working directory
cvsPath = os.path.join(dataPath, 'cvs') # Path where CVs are stored
jobDescriptionsPath = os.path.join(dataPath, 'jobDescriptions') # Path to the job descriptions directory
jobDescriptionFile = os.path.join(jobDescriptionsPath, 'Especialista en Instrumentacion y control.pdf') # Path to the job description file

In [3]:
def extractText(docPath):
    """ Extract the text from a PDF document using pdfplumber.

        Parameters:
            - docPath (str): Path to the PDF document.

        Returns:
            - str: Extracted text from the PDF document.
    """
    with pdfplumber.open(docPath) as pdf:
        return "\n".join([page.extract_text() or '' for page in pdf.pages])
    
def extractCVsInfo(cvPath):
    """ Extracts the information of the CVs in a given directory and stores it in a formatted dictionary.

        Parameters:
            - cvPath (str): Path to the directory containing the CVs in PDF format.

        Returns:
            - cvDict (dict): Dictionary containing the extracted information of each CV, indexed by an integer key.
    """

    ######################## Avoid unnecessary warnings from pdfplumber about cropbox ############################

    logging.getLogger("pdfminer").setLevel(logging.ERROR)
    logging.getLogger("pdfplumber").setLevel(logging.ERROR)

    parser = llmParser.LLMParser() # Load language model to parse and cast the CV information
    cvDict = {} # Dictionary to store the CV information
 
    for i, file in enumerate(os.listdir(cvPath)):
        if file.endswith('.pdf'):
            cPath = os.path.join(cvPath, file)
            text = extractText(cPath) # Extract the text from the current document
            cvInfo = parser.extract_CVInfo(text, True) # Extract the requested information in JSON format
            
        cvDict[i] = cvInfo # Store the CV information in the dictionary
        print(f"Processing: {cPath}")

    return cvDict

def extractJobDescriptionInfo(jobDescriptionFile):
    """ Extracts the information from a job description PDF file and returns it in a structured format.

        Parameters:
            - jobDescriptionFile (str): Path to the job description PDF file.
        
        Returns:
            - jobDescriptionInfo (dict): Dictionary containing the extracted information from the job description.    
    """

    ######################## Avoid unnecessary warnings from pdfplumber about cropbox ############################
    logging.getLogger("pdfminer").setLevel(logging.ERROR)
    logging.getLogger("pdfplumber").setLevel(logging.ERROR)

    parser = llmParser.LLMParser() # Load language model to parse and cast the CV information
    jobDescriptionText = extractText(jobDescriptionFile) # Extracts the text from the job description PDF file
    jobDescriptionInfo = parser.extract_jobDescriptionInfo(jobDescriptionText) # Extracts the requested information in JSON format

    return jobDescriptionInfo

def radarSkills(scoresPerSkill, jobInfo, candidates=None, mode="individual"):
    """
    Visualize the skills of candidates in a radar chart format.

        Parameters:
            - scoresPerSkill (dict): Dictionary containing the scores of each candidate for each skill.
            - jobInfo (dict): Dictionary containing the job information, including hard and soft skills.
            - candidates (list, optional): List of candidates to compare. If None, all candidates in scoresPerSkill are used.
            - mode (str): Mode of visualization. Options are 'individual', 'hard', 'soft', or 'compare'.

        Returns:
            - None: Displays the radar charts for the candidates' skills.
    """

    if mode not in ["individual", "hard", "soft", "compare"]:
        raise ValueError("Modo inválido. Usa: 'individual', 'hard', 'soft', o 'compare'.")

    hardSkills = jobInfo.hardSkills
    softSkills = jobInfo.softSkills

    if mode == "individual":
        for candidato, skills in scoresPerSkill.items():
            fig, axs = plt.subplots(1, 2, figsize=(12, 6), subplot_kw=dict(polar=True))
            fig.suptitle(f"Habilidades de {candidato}", fontsize=16)

            # Technical Skills
            values = [skills.get(skill, 0) for skill in hardSkills]
            values += values[:1]
            angles = np.linspace(0, 2 * np.pi, len(hardSkills), endpoint=False).tolist()
            angles += angles[:1]
            axs[0].plot(angles, values, linewidth=2)
            axs[0].fill(angles, values, alpha=0.25)
            axs[0].set_xticks(angles[:-1])
            axs[0].set_xticklabels(hardSkills, fontsize=9)
            axs[0].set_title("Técnicas", pad=15)

            # Soft Skills
            values = [skills.get(skill, 0) for skill in softSkills]
            values += values[:1]
            angles = np.linspace(0, 2 * np.pi, len(softSkills), endpoint=False).tolist()
            angles += angles[:1]
            axs[1].plot(angles, values, linewidth=2)
            axs[1].fill(angles, values, alpha=0.25)
            axs[1].set_xticks(angles[:-1])
            axs[1].set_xticklabels(softSkills, fontsize=9)
            axs[1].set_title("Blandas", pad=15)

            plt.tight_layout(rect=[0, 0, 1, 0.95])
            plt.show()

    elif mode == "hard":
        for candidato, skills in scoresPerSkill.items():
            values = [skills.get(skill, 0) for skill in hardSkills]
            values += values[:1]
            angles = np.linspace(0, 2 * np.pi, len(hardSkills), endpoint=False).tolist()
            angles += angles[:1]

            fig, ax = plt.subplots(figsize=(6, 6), subplot_kw=dict(polar=True))
            ax.plot(angles, values, linewidth=2)
            ax.fill(angles, values, alpha=0.25)
            ax.set_xticks(angles[:-1])
            ax.set_xticklabels(hardSkills, fontsize=9)
            ax.set_title(f"{candidato} - Habilidades Técnicas", pad=20)
            plt.tight_layout()
            plt.show()

    elif mode == "soft":
        for candidato, skills in scoresPerSkill.items():
            values = [skills.get(skill, 0) for skill in softSkills]
            values += values[:1]
            angles = np.linspace(0, 2 * np.pi, len(softSkills), endpoint=False).tolist()
            angles += angles[:1]

            fig, ax = plt.subplots(figsize=(6, 6), subplot_kw=dict(polar=True))
            ax.plot(angles, values, linewidth=2)
            ax.fill(angles, values, alpha=0.25)
            ax.set_xticks(angles[:-1])
            ax.set_xticklabels(softSkills, fontsize=9)
            ax.set_title(f"{candidato} - Habilidades Blandas", pad=20)
            plt.tight_layout()
            plt.show()

    elif mode == "compare":
        if not candidates or len(candidates) < 2:
            raise ValueError("Para 'compare', proporciona una lista de al menos dos candidatos.")

        allSkills = hardSkills + softSkills
        angles = np.linspace(0, 2 * np.pi, len(allSkills), endpoint=False).tolist()
        angles += angles[:1]

        fig, ax = plt.subplots(figsize=(14, 7), subplot_kw=dict(polar=True))
        ax.set_title("Comparación de Habilidades", pad=20)

        for cand in candidates:
            if cand not in scoresPerSkill:
                continue
            skills = scoresPerSkill[cand]
            values = [skills.get(skill, 0) for skill in allSkills]
            values += values[:1]
            ax.plot(angles, values, linewidth=2, label=cand)
            ax.fill(angles, values, alpha=0.1)

        ax.set_xticks(angles[:-1])
        ax.set_xticklabels(allSkills, fontsize=9)
        ax.legend(loc="upper right", bbox_to_anchor=(1.2, 1.1))
        plt.tight_layout()
        plt.show()

### Funciones para evaluar el perfil general del candidato contra la vacante:

In [4]:
def matchScore(model, cvText, jobText):
    """ Calculate the match score between a CV and a job description using Sentence Transformers.
        Parameters:
            - model: SentenceTransformer model to use for encoding.
            - jobText: Text of the job description to encode.
            - cvText: Text of the CV to encode.

        Returns:
            - A float representing the cosine similarity score between the CV and job description embeddings.
    """
    emb1 = model.encode(f"query: {jobText}",normalize_embeddings=True)
    emb2 = model.encode(f"passage: {cvText}", normalize_embeddings=True)

    return np.dot(emb1, emb2)

### Funciones para evaluar las habilidades de los candidatos contra la vacante

In [5]:
# Defined functions to calculate the heuristic score of similarity between skills
def evaluateSkill(model, jobSkills, cvSkills, weight, mandatorySkill=False, threshold=0.7):
    """ Evaluate the similarity score between job skills and CV skills using a SentenceTransformer model.
    
        Parameters:
            - model: SentenceTransformer model to use for encoding.
            - jobSkills (list): List of skills required for the job.
            - cvSkills (list): List of skills present in the CV.
            - weight (float): Weight assigned to the skills in the scoring system.
            - mandatorySkill (bool): Whether the skill is mandatory or not.
            - threshold (float): Threshold for the cosine similarity score to consider a skill as matching.
            
        Returns:
            - score (float): The total score for the skills based on their similarity.
            - totalScore (float): The total possible score based on the number of job skills and their weights.
            - skillDict (dict): Dictionary containing the maximum similarity score for each job skill.
        """	
    score = 0
    totalScore = 0

    skillDict = {} # Diccionario para almacenar las habilidades laborales y sus scores por candidato

    # jobEmbs = model.encode([f"query: {skill}" for skill in jobSkills], normalize_embeddings=True) # Extract the embeddings of the job skills using the SentenceTransformer model
    # cvEmbs = model.encode([f"passage: {skill}" for skill in cvSkills], normalize_embeddings=True) # Extract the embeddings of the CV skills using the SentenceTransformer model

    # Calculate the cosine similarity between the job skills and CV skills
    # score = util.cos_sim(jobEmbs, cvEmbs)

    for jobSkill in jobSkills:
        jobEmb = model.encode(f"query: {jobSkill}", normalize_embeddings=True) # Extract the embedding of the job skill using the SentenceTransformer model
        maxSim = 0

        for cvSkill in cvSkills:
            cvEmb = model.encode(f"query: {cvSkill}", normalize_embeddings=True) # Extract the embedding of the CV skill using the SentenceTransformer model

            sim = util.cos_sim(jobEmb, cvEmb) # Calculate the cosine similarity between the job skill and CV skill

            maxSim = max(sim, maxSim) # Save the maximum cosine similarity found (key in below radar charts)

        totalScore += weight

        if mandatorySkill: # if the skill is mandatory
            score = weight if sim >= threshold else -weight # Penalize if it does not meet the threshold

        else: # Si la habilidad no es obligatoria:
            score = weight if sim >= threshold else 0 # Do not penalize if it does not meet the threshold

        skillDict[jobSkill] = maxSim # Save the skill highest score in the dictionary
        

    return score, totalScore, skillDict

def skillScore(model, jobHardSkills, jobSoftSkills, cvHardSkills, cvSoftSkills, mJobHardSkills, mJobSoftSkills, mCvHardSkills, mCvSoftSkills):
    """ Calculate the heuristic score based on the skills in the job description and CVs.
        Parameters:
            - model: SentenceTransformer model to use for encoding.
            - jobHardSkills (list): Hard skills required for the job.
            - jobSoftSkills (list): Soft skills required for the job.
            - cvHardSkills (list): Hard skills present in the CV.
            - cvSoftSkills (list): Soft skills present in the CV.
            - mjobHardSkills (list): Mandatory hard skills required for the job.
            - mjobSoftSkills (list): Mandatory soft skills required for the job.
            - mCvHardSkills (list): Mandatory hard skills present in the CV.
            - mCvSoftSkills (list): Mandatory soft skills present in the CV.
            
        Returns:
            - heuristicScore (float): The heuristic score based on the skills.
            - mergedSkillDict (dict): Dictionary containing the maximum similarity score for each job skill.
    """

    weights = {
        "hardSkills": 5,
        "softSkills": 3,
        "mandatoryHardSkills": 10,
        "mandatorySoftSkills": 6
    }

    categories = [
        (model, jobHardSkills, cvHardSkills, weights["hardSkills"]),
        (model, jobSoftSkills, cvSoftSkills, weights["softSkills"]),
        (model, mJobHardSkills, mCvHardSkills, weights["hardSkills"], True), # Mandatory skills
        (model, mJobSoftSkills, mCvSoftSkills, weights["softSkills"], True)  # Mandatory skills
    ]

    score = 0
    totaPossibleScore = 0
    mergedSkillDict = {}  # To merge all results

    for cat in categories:
        # Support both with and without `mandatorySkill`
        if len(cat) == 4:
            modelST, jobSkills, cvSkills, weight = cat
            mandatory = False
        else:
            modelST, jobSkills, cvSkills, weight, mandatory = cat

        if cvSkills:
            subScore, subTotal, skillDict = evaluateSkill(modelST, jobSkills, cvSkills, weight, mandatory)
            score += subScore
            totaPossibleScore += subTotal
            mergedSkillDict.update(skillDict) # Combine dictionaries
        else:
            continue

    heuristicScore = max(score, 0) / totaPossibleScore if totaPossibleScore else 0

    return heuristicScore, mergedSkillDict

In [6]:
# Se extrae la información de los CVs y el puesto de trabajo:
cvsInfo = extractCVsInfo(cvsPath) # Extraer la información de los CVs
jobInfo = extractJobDescriptionInfo(jobDescriptionFile) # Extraer la información de la vacante

CV Information Response:
 {
  "name": "MANUEL",
  "email": "mljch21@gmail.com",
  "phone": "",
  "location": "Panamá",
  "profile": "Ingeniero Electromecánico con 3 años de experiencia en el diseño, implementación y optimización de sistemas mecánicos y eléctricos en diversos sectores industriales. Sólido conocimiento en el manejo de herramientas de ingeniería, análisis estructural, control de procesos y mantenimiento de maquinaria.",
  "experience": ["METALFER (AES) PLANTA HIDROELECTRICA BAYANO", "EMPRESAS MELO", "ACISA PANAMÁ"],
  "education": ["COLEGIO JOSÉ DANIEL CRESPO (2010-2016)", "UNIVERSIDAD TECNOLOGICA DE Panamá (2017-2023)"],
  "hardSkills": ["Excel Intermedio", "Autocad", "360 Office", "SAP", "SCADA ABB"],
  "softSkills": ["Trabajo en equipo", "Lectura de planos eléctricos y de control"]
}
Processing: C:\Users\estiv\NLP_p2025\CVBestFit\cvs\CurriculumProfesionalSinFotoSencilloBlancoyNegro.pdf
CV Information Response:
 {
    "name": "JUAN PABLO OCAMPO",
    "email": "juanp.oca

In [7]:
modelE5 = SentenceTransformer("intfloat/e5-large-v2") # Load the SentenceTransformer model for text embeddings

jobText = jobInfo.description + " ".join(jobInfo.responsibilities) # Extraer la descripción y responsabilidades del puesto de trabajo

profileScores = {} # Dictionary to store the match scores of each candidate based on their profiles
skillScores = {} # Dictionary to store the scores of each candidate based on their skills

for i in range(len(cvsInfo)):
    # Extract the information for the analysis
    cvName = cvsInfo[i].name # Name of the candidate
    cvText = cvsInfo[i].profile # Profile text of the candidate

    jobHardSkills = jobInfo.hardSkills # Hard skills required for the job
    jobSoftSkills = jobInfo.softSkills # Soft skills required for the job
    cvHardSkills = cvsInfo[i].hardSkills # Hard skills of the candidate
    cvSoftSkills = cvsInfo[i].softSkills # Soft skills of the candidate

    ##### To develop in the future: #####
    mjobHardSkills = [] # Mandatory hard skills required for the job
    mJobSoftSkills = [] # Mandatory soft skills required for the job
    mCvHardSkills = [] # Mandatory hard skills of the candidate
    mCvSoftSkills = [] # Mandatory soft skills of the candidate 

    profileScore = matchScore(modelE5, cvText.lower(), jobText.lower()) # Calculate the match score between the CV and job description

    profileScores[cvName.title()] = profileScore # Store the score in the dictionary with the candidate's name as the key
    skillScores[cvName.title()], scoresPerSkill = skillScore(modelE5, jobHardSkills, jobSoftSkills, cvHardSkills, cvSoftSkills, mjobHardSkills, mJobSoftSkills, mCvHardSkills, mCvSoftSkills) # Calculate the heuristic score based on the skills in the job description and CVs

profileScores = dict(sorted(profileScores.items(), key=lambda item: item[1], reverse=True)) # Sort the scores in descending order
skillScores = dict(sorted(skillScores.items(), key=lambda item: item[1], reverse=True)) # Sort the scores in descending order

In [8]:
profileScoresDF = pd.DataFrame(list(profileScores.items()), columns=["Candidato", "Puntuacion"])
display(profileScoresDF)

Unnamed: 0,Candidato,Puntuacion
0,Juan Carlo,0.825471
1,Juan Pablo Ocampo,0.823148
2,Manuel,0.820403
3,Kirvin Adriel González,0.811297
4,Miguel Gutiérrez,0.802656
5,Emanuel Valdés,0.799941
6,Harllyn Velazco,0.79822
7,Hipolito Rodriguez,0.79229
8,Abdiel Jimenez,0.788473
9,Estiven Angel Echeverria,0.780198


In [11]:
skillScoresDF = pd.DataFrame(list(skillScores.items()), columns=["Candidato", "Puntuacion"])
display(skillScoresDF)

Unnamed: 0,Candidato,Puntuacion
0,Manuel,0.333333
1,Juan Pablo Ocampo,0.333333
2,Carol B. Fruto,0.333333
3,Hipolito Rodriguez,0.333333
4,Kirvin Adriel González,0.333333
5,Abdiel Jimenez,0.333333
6,Emanuel Valdés,0.333333
7,Estiven Angel Echeverria,0.333333
8,Harllyn Velazco,0.333333
9,Juan Carlo,0.333333


In [10]:
# Se ordenan los scores de manera descendente:
scores = dict(sorted(scores.items(), key=lambda item: item[1], reverse=True))
heuristicScores = dict(sorted(heuristicScores.items(), key=lambda item: item[1], reverse=True))

combinedScores = {} # Se almacenan los scores combinados de cada CV

for candidate in scores:
    combinedScore = (0.8 * scores[candidate] + 0.2 * heuristicScores[candidate]) # Calcular el score combinado
    percentageScore = round(combinedScore * 100, 2) # Convertir a porcentaje
    combinedScores[candidate] = percentageScore # Almacenar el score combinado

combinedScores = dict(sorted(combinedScores.items(), key=lambda item: item[1], reverse=True)) # Ordenar los scores combinados de manera descendente

NameError: name 'scores' is not defined

In [None]:
# Crear un DataFrame con la información de la descripción del puesto de trabajo:
jobData = {key: value for key, value in jobInfo.model_dump().items()}  # Utiliza dict() para convertir el modelo a diccionario
dfJobDesc = pd.DataFrame([jobData])  # Convertir a DataFrame
dfJobDesc.columns = ["Titulo", "Descripción", "Responsabilidades", "Habilidades Técnicas", "Habilidades Blandas", "Ubicación"]  # Renombrar columnas

display(dfJobDesc) # Mostrar el DataFrame

In [None]:
# Crear un DataFrame con la información de los CVs:
cvData = {key: value.model_dump() for key, value in cvsInfo.items()} # Convertir a diccionario
dfCVs = pd.DataFrame(cvData).T # Convertir a DataFrame
dfCVs.columns = ["Nombre", "Correo", "Teléfono", "Perfil", "Ubicación", "Experiencia", "Educación", "Habilidades Técnicas", "Habilidades Blandas"] # Renombrar columnas
dfCVs.set_index("Nombre", inplace=True) # Establecer la columna "name" como índice
dfCVs = dfCVs[~dfCVs.index.duplicated(keep='first')] # Eliminar duplicados

In [None]:
# Crear un DataFrame con los scores:
scoresDF = pd.DataFrame(list(scores.items()), columns=['Candidato', 'Puntuación']) # Crear un DataFrame
display(scoresDF) # Mostrar el DataFrame

In [None]:
# Crear un DataFrame con los scores:
heuristicScoresDF = pd.DataFrame(list(heuristicScores.items()), columns=['Candidato', 'Puntuación Heurística']) # Crear un DataFrame
display(heuristicScoresDF) # Mostrar el DataFrame

In [None]:
combinedScoresDF = pd.DataFrame(list(combinedScores.items()), columns=['Candidato', 'Porcentaje de afinidad']) # Crear un DataFrame
display(combinedScoresDF) # Mostrar el DataFrame

In [None]:
# Agregar el score combinado al DataFrame de CVs:
dfCVs['Porcentaje de afinidad'] = pd.Series(combinedScores) # Agregar la columna de scores al DataFrame
dfCVs = dfCVs.sort_values(by='Porcentaje de afinidad', ascending=False) # Ordenar el DataFrame por el porcentaje de afinidad
display(dfCVs) # Mostrar el DataFrame

In [None]:
# Graficar los resultados de los porcentajes:
fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(x='Candidato', y='Porcentaje de afinidad', data=combinedScoresDF, hue='Candidato', palette='Spectral',ax=ax)
plt.title("Porcentaje de afinidad Candidato-Vacante")
plt.grid(axis='y', linestyle='--', alpha=0.5)
plt.xticks(rotation=90)
plt.tight_layout()

plt.show()

In [None]:
radarSkills(scoresPerSkill, jobInfo, mode="individual") # Graficar las habilidades de cada candidato

In [None]:
candidate1 = "Emanuel Valdés"
candidate2 = "Juan Pablo Ocampo"
radarSkills(scoresPerSkill, jobInfo, candidatos=[candidate1, candidate2], mode="compare")