In [19]:
!pip install PyPDF2
!pip install transformers
!pip install datasets
!pip install pdfminer
!pip install PyPDF2
!pip install torch torchvision torchaudio



In [32]:
import os
import re
import csv
import pandas as pd
import pdfminer
import random
import numpy as np
import torch
from transformers import DistilBertTokenizerFast, DistilBertModel
from sklearn.metrics.pairwise import cosine_similarity

from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.layout import LAParams, LTTextBox
from io import StringIO
import PyPDF2

In [33]:
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        One string made of each page's extracted text, with nothing inserted
        between consecutive pages.
    """
    with open(pdf_path, 'rb') as handle:
        reader = PyPDF2.PdfReader(handle)
        # Extraction must happen while the file is still open, because the
        # reader pulls page content from the underlying stream.
        page_texts = [page.extract_text() for page in reader.pages]

    return "".join(page_texts)

# Patterns that capture the text following a "Skills" / "Education" heading up
# to (but not including) the next recognised resume heading. They are matched
# case-insensitively and across line breaks.
_SECTION_FLAGS = re.DOTALL | re.IGNORECASE
_SKILLS_PATTERN = re.compile(
    r'Skills((?:(?!Education|Education and Training|Experience|Accomplishments|Work History|ProfessionalExperience|Languages|Additional Information|Highlights|Interests).)+)',
    _SECTION_FLAGS,
)
_EDUCATION_PATTERN = re.compile(
    r'Education((?:(?!Skills|Experience|Accomplishments|Work History|ProfessionalExperience|Languages|Additional Information|Highlights|Interests).)+)',
    _SECTION_FLAGS,
)


def _extract_section(pattern, text):
    """Return the stripped text captured by `pattern` in `text`, or None if it does not match."""
    match = pattern.search(text)
    return match.group(1).strip() if match else None


# Define the function to extract key details from a PDF resume
def extract_resume_details(pdf_file):
    """Extracts key details from a PDF resume.

    Args:
        pdf_file: The path to the PDF resume file.

    Returns:
        A dictionary with the keys, in this order:
            file_path: The path that was passed in.
            role: The first line of the extracted text.
            skills: The text following the first "Skills" heading, or None
                when no such section is found.
            education: The text following the first "Education" heading, or
                None when no such section is found.
    """

    # extract_text_from_pdf opens the file itself, so no handle is opened here.
    text = extract_text_from_pdf(pdf_file)

    # The first line of the text is taken to be the candidate's role.
    role = text.split("\n")[0]

    return {
        "file_path": pdf_file,
        "role": role,
        "skills": _extract_section(_SKILLS_PATTERN, text),
        "education": _extract_section(_EDUCATION_PATTERN, text),
    }

In [34]:
# Define the function to fetch job descriptions from the Hugging Face dataset
def fetch_job_descriptions(sample_size=15, seed=None):
    """Fetches a random sample of distinct job descriptions from the Hugging Face dataset.

    Args:
        sample_size: How many job descriptions to return. If the dataset holds
            fewer rows than this, all of them are returned (in random order).
        seed: Optional seed. When given, the same sample is returned for the
            same dataset contents; when None, the global `random` state is used.

    Returns:
        A list of job description strings drawn without replacement, so no
        dataset row is picked twice.
    """

    # Imported here so the datasets library is only required when job
    # descriptions are actually fetched.
    import datasets

    # Load the Job Descriptions dataset from Hugging Face
    dataset = datasets.load_dataset("jacob-hugging-face/job-descriptions")

    # Materialise the column as a plain list so it can be sampled as a sequence.
    descriptions = list(dataset["train"]["job_description"])

    # random.sample draws without replacement; random.choices would allow the
    # same row to be returned several times.
    rng = random.Random(seed) if seed is not None else random
    return rng.sample(descriptions, min(sample_size, len(descriptions)))

In [35]:
# Cosine similarity between a pair of embeddings
def calculate_cosine_similarity(embedding1, embedding2):
    """Return the cosine similarity of two embeddings as a single number.

    Args:
        embedding1: The first embedding (anything `np.array` can convert).
        embedding2: The second embedding (anything `np.array` can convert).

    Returns:
        The cosine similarity between the two embeddings, each of which is
        flattened into a single row vector before the comparison.
    """

    # cosine_similarity works on 2-D inputs, so each embedding becomes a
    # one-row matrix regardless of its original shape.
    row_a, row_b = (
        np.array(vector).reshape(1, -1) for vector in (embedding1, embedding2)
    )

    # The result is a 1x1 matrix; its only entry is the score.
    score_matrix = cosine_similarity(row_a, row_b)
    return score_matrix[0][0]

In [42]:
def _candidate_text(candidate):
    """Join a candidate's role, skills and education into one string, skipping missing fields."""
    parts = []
    for field in ("role", "skills", "education"):
        value = candidate[field]
        # A missing field can be None, or NaN once the records have been
        # loaded through pandas. Either would otherwise be embedded as the
        # literal word "None" / "nan". (NaN is the only float unequal to itself.)
        if value is None or (isinstance(value, float) and value != value):
            continue
        parts.append(str(value))
    return " ".join(parts)


# Define the function to match candidate resumes to job descriptions based on skills and education
def match_candidates_to_jobs(candidates, job_descriptions, threshold=0.3, top_k=5):
    """Matches candidate resumes to job descriptions based on role, skills and education.

    Args:
        candidates: An iterable of candidate dicts, each with the keys
            "role", "skills" and "education".
        job_descriptions: An iterable of job description strings.
        threshold: A candidate is kept for a job only when its cosine
            similarity to the job description is strictly greater than this.
        top_k: Maximum number of candidates kept per job description.

    Returns:
        A dictionary mapping each job description to a list of at most
        `top_k` `(candidate, similarity)` tuples, sorted by similarity in
        descending order.
    """

    candidates = list(candidates)

    # Candidate embeddings do not depend on the job, so compute each one once
    # instead of once per job description.
    candidate_embeddings = [
        get_embedding(_candidate_text(candidate)) for candidate in candidates
    ]

    # Create a dictionary to store the matching results
    matching_results = {}

    for job_description in job_descriptions:
        # A repeated description would produce the same entry again; skip the
        # redundant work.
        if job_description in matching_results:
            continue

        job_description_embedding = get_embedding(job_description)

        matching_candidates = []
        for candidate, candidate_embedding in zip(candidates, candidate_embeddings):
            similarity = calculate_cosine_similarity(
                job_description_embedding, candidate_embedding
            )
            if similarity > threshold:
                matching_candidates.append((candidate, similarity))

        # Best matches first, then keep only the strongest `top_k`.
        matching_candidates.sort(key=lambda pair: pair[1], reverse=True)
        matching_results[job_description] = matching_candidates[:top_k]

    return matching_results


In [37]:
# Tokenizer/model pair shared by every get_embedding call. It is filled on
# first use so the pretrained weights are loaded once rather than on every call.
_EMBEDDING_MODEL_NAME = "distilbert-base-uncased"
_EMBEDDING_COMPONENTS = {}


def _load_embedding_components():
    """Return the cached (tokenizer, model) pair, loading both on first use."""
    if "model" not in _EMBEDDING_COMPONENTS:
        # Load both before storing either, so a failed load leaves the cache empty.
        tokenizer = DistilBertTokenizerFast.from_pretrained(_EMBEDDING_MODEL_NAME)
        model = DistilBertModel.from_pretrained(_EMBEDDING_MODEL_NAME)
        _EMBEDDING_COMPONENTS["tokenizer"] = tokenizer
        _EMBEDDING_COMPONENTS["model"] = model
    return _EMBEDDING_COMPONENTS["tokenizer"], _EMBEDDING_COMPONENTS["model"]


# Define the function to get the embedding of a text sequence using a pre-trained model
def get_embedding(text_sequence):
    """Gets the embedding of a text sequence using pre-trained DistilBERT.

    Args:
        text_sequence: The text sequence to embed.

    Returns:
        A tensor holding the mean of the model's last hidden state over the
        token axis (dim=1), i.e. one vector per input sequence.
    """

    tokenizer, model = _load_embedding_components()

    # Tokenize the text sequence; over-long inputs are truncated by the tokenizer.
    tokenized_text = tokenizer(text_sequence, return_tensors="pt", padding=True, truncation=True)

    # Inference only, so gradient tracking is switched off.
    with torch.no_grad():
        embedding = torch.mean(model(**tokenized_text).last_hidden_state, dim=1)

    return embedding


In [None]:
# Extract the key details from the PDF resumes
CSV_COLUMNS = ["category", "file_path", "role", "skills", "education"]

# `with` closes the CSV even if a resume fails to parse. UTF-8 is set
# explicitly so non-ASCII resume text can be written on any platform.
with open('extracted_data.csv', 'w', newline='', encoding='utf-8') as extracted_data:
    csvwriter = csv.DictWriter(extracted_data, fieldnames=CSV_COLUMNS)
    csvwriter.writeheader()

    for root, dirs, files in os.walk("data"):
        # The category is the folder that contains the resume. `dirs` is the
        # list of sub-folders of `root`, not the folder's own name.
        # NOTE(review): assumes a data/<category>/<resume>.pdf layout - confirm.
        category = os.path.basename(root)

        for file in files:
            # Only PDFs can be parsed; skip any other file found in the tree.
            if not file.lower().endswith(".pdf"):
                continue

            pdf_file = os.path.join(root, file)
            candidate_details = {"category": category}
            candidate_details.update(extract_resume_details(pdf_file))
            # DictWriter places each value under its named column, independent
            # of the dictionary's key order.
            csvwriter.writerow(candidate_details)

In [None]:
# Fetch the job descriptions from the Hugging Face dataset
job_descriptions = fetch_job_descriptions()

# Load the previously extracted resume details, one dictionary per candidate
candidates = pd.read_csv('extracted_data.csv').to_dict(orient='records')
# Match the candidate resumes to the job descriptions
matching_results = match_candidates_to_jobs(candidates, job_descriptions)

# Write the matching results to a CSV file.
# UTF-8 is set explicitly so job descriptions containing non-ASCII characters
# can be written regardless of the platform's default encoding.
with open("matching_results.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["Job description", "Matching candidates"])
    for job_description, matching_candidates in matching_results.items():
        # Each entry is a (candidate, similarity) pair. str() guards against
        # pandas having parsed a category as a non-string value, which would
        # make join() raise.
        candidates_string = ",".join(
            str(candidate["category"]) for candidate, _similarity in matching_candidates
        )
        writer.writerow([job_description, candidates_string])