<a href="https://colab.research.google.com/github/SirArthur7/Resume-Matching-with-JD/blob/main/JD_CV_Matcher.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Entire Code for Resume parser and Resume-JD matcher**




---



# Resume extractor and parser

### Installing and importing required packages

In [None]:
!pip install PyPDF2 fuzzywuzzy unidecode

In [None]:
import PyPDF2
from fuzzywuzzy import fuzz
import pandas as pd
from unidecode import unidecode
import re
import os
from os.path import isfile, join



### Defining function for parsing the resume from the pdf files

 Function to extract and clean the text from the PDFs

In [None]:
def extract_text_from_pdf(pdf_path):
    text = ""
    with open(pdf_path, 'rb') as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        for page_num in range(len(pdf_reader.pages)):
            if page_num > 0:  # Add newline for page breaks (except the first page)
                text += "\n"
            text += pdf_reader.pages[page_num].extract_text()
    text = unidecode(text)
    return text

Function to parse and segregate the resume based on headings

In [None]:
def identify_headings(text):
    headings = []
    lines = text.split('\n')

    # Define synonyms and their corresponding fuzz ratio thresholds
    heading_synonyms = {
        "Education": ["Education", "Qualifications", "Educational Qualifications", "Academic Background", "Educational Details", "Education and Training"],
        "Skills": ["Skills", "Technical Skills", "Key Competencies", "Skill Highlights", "Primary Skills", "Specializations", "Areas of Expertise", "Expertise", "Programming Languages"],
        "Experience": ["Work Experience", "Professional Background", "Professional Experience", "Work History", "Teaching Experience", "Employment History"],
        "Achievements": ["Accomplishments", "Achievements", "Notable Projects", "Qualifications"],
        "Others": ["Awards", "Honors", "Recognition", "Publications", "Certifications", "Presentations", "Volunteer Experience", "Leadership Experience","Interests","Hobbies", "Languages", "Licenses"],
        "Summary": ["Career Overview", "Summary", "About Me", "Profile Summary", "Highlights", "Objective"]
    }

    for line in lines:
        line = line.strip()
        for heading, synonyms in heading_synonyms.items():
            for synonym in synonyms:
                if re.match(r'^\s*{}\s*'.format(synonym), line, re.IGNORECASE):
                    headings.append((line, heading))

    found = False
    for tup in headings:
      if tup[1] == "Experience":
          found = True

    if not found:
      for line in lines:
        line = line.strip()
        for heading, synonyms in heading_synonyms.items():
          if heading == "Experience":
            synonyms = ["Work Experience", "Professional Background", "Professional Experience", "Work History", "Teaching Experience", "Employment History", "Experience"]
            for synonym in synonyms:
                if re.match(r'^\s*{}\s*'.format(synonym), line, re.IGNORECASE):
                    headings.append((line, heading))
    return headings

def clean_text(text):
    # Replace multiple spaces with a single space
    text = re.sub(r'\s+', ' ', text)
    # Remove leading and trailing spaces
    text = text.strip()
    # Remove newline characters
    text = text.replace('\n', ' ')
    # Remove tab characters
    text = text.replace('\t', ' ')

    return text

Function to parse and segregate the resume based on headings

In [None]:
def parse_resume(pdf_path):
    text = extract_text_from_pdf(pdf_path)
    headings = identify_headings(text)

    resume_parts = {}
    current_heading = ""
    current_part = ""
    capturing_work_experience = False

    for line in text.split('\n'):
        line = line.strip()
        if (line, "Education") in headings:
            current_heading = "Education"
            current_part = ""
            capturing_work_experience = False
        elif (line, "Skills") in headings:
            current_heading = "Skills"
            current_part = ""
            capturing_work_experience = False
        elif (line, "Experience") in headings:
            current_heading = "Experience"
            current_part = ""
            capturing_work_experience = True
        elif (line, "Achievements") in headings:
            current_heading = "Achievements"
            current_part = ""
            capturing_work_experience = False
        elif (line, "Others") in headings:
            current_heading = "Others"
            current_part = ""
            capturing_work_experience = False
        elif (line, "Summary") in headings:
            current_heading = "Summary"
            current_part = ""
            capturing_work_experience = False
        else:
            if capturing_work_experience:
                current_part += line + "\n"
                resume_parts.setdefault(current_heading, "")  # Initialize the dictionary key if not present
                resume_parts[current_heading] += line + "\n"
            else:
               if current_heading:
                current_part += line + "\n"
                resume_parts[current_heading] = current_part

    # Create a DataFrame from the parsed resume parts
    fixed_columns = {
        'ResumeID': '',
        'Category':'',
        'Education': '',
        'Skills': '',
        'Experience': '',
        'Achievements': '',
        'Others':'',
        'Summary':''
    }

    # Update the fixed_columns dictionary with data from the input dictionary
    fixed_columns.update(resume_parts)

    # Create a DataFrame from the updated dictionary
    df = pd.DataFrame([fixed_columns])
    df = df.applymap(clean_text)
    return df

### Applying the functions on the entire Data directory

In [None]:
dfs = []
root_dir = "/content/Dataset/data/data"
for root, dirs, files in os.walk(root_dir):
    for file in files:
        if file.endswith(".pdf"):
            # Extract text from the PDF file
            pdf_path = os.path.join(root, file)
            parsed_resume_df = parse_resume(pdf_path)
            parsed_resume_df["ResumeID"] = file.replace(".pdf","")
            parsed_resume_df["Category"] = os.path.basename(root)
            dfs.append(parsed_resume_df)
dfs = pd.concat(dfs).reset_index(drop=True)

### Storing the extracted and parsed resumes into a .csv file for future use

In [None]:
dfs.to_csv("Resume_extracted.csv", index=False)

# JD Matching

### Installing and importing required packages

In [None]:
!pip install -qU datasets transformers sentence-transformers git+https://github.com/naver/splade.git
!pip install einops

In [None]:
from datasets import load_dataset
import pandas as pd
import ast
import torch
from splade.models.transformer_rep import Splade
from transformers import AutoTokenizer
device = 'cuda' if torch.cuda.is_available() else 'cpu'

Loading SPLADE tokenizer and embedding model

In [None]:
sparse_model_id = 'naver/splade-cocondenser-selfdistil'

sparse_model = Splade(sparse_model_id, agg='max')
sparse_model.to(device)  # move to GPU if possible
sparse_model.eval()
tokenizer = AutoTokenizer.from_pretrained(sparse_model_id)

Downloading:   0%|          | 0.00/670 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/418M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Loading Job Description Dataset from Huggingface

In [None]:
dataset = load_dataset("jacob-hugging-face/job-descriptions")
dataset = dataset['train']

Downloading readme:   0%|          | 0.00/24.0 [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/3.77M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [None]:
df = pd.DataFrame(data=dataset)
df = df[["job_description", "position_title", "model_response"]]

Loading the extracted Resumes from previous part

In [1]:
df_candidates = pd.read_csv("/content/Resume_extracted.csv")

In [None]:
df_candidates.isna().sum() #shows statistically how well the extractor has performed

ResumeID           0
Category           0
Education         22
Skills           105
Experience        12
Achievements    1656
Others          1597
Summary          935
dtype: int64

### Using only 15 job descriptions from the dataset

In [None]:
df['model_response'] = df['model_response'].apply(lambda x: ast.literal_eval(x))
df_jobdesc = df[df['model_response'].apply(lambda x: x.get('Educational Requirements') != 'N/A' and x.get('Required Skills') != 'N/A' and x.get('Preferred Qualifications') != 'N/A')]
df_jobdesc = df_jobdesc.sample(n=15, random_state=42).reset_index(drop=True)

In [None]:
df_jobdesc

Unnamed: 0,job_description,position_title,model_response
0,this position is located on rikers islandphysi...,Administrative Assistant,{'Core Responsibilities': 'Assist in the overa...
1,responsibilities\nthe vice president marketing...,"Vice President, Marketing, MSNBC",{'Core Responsibilities': 'Lead the developmen...
2,mathematica applies expertise at the intersect...,"Senior Vice President and Managing Director, I...",{'Core Responsibilities': 'Lead and grow the i...
3,the regional sales manager will report to the...,"Regional Sales Manager, Video Conferencing (East)",{'Core Responsibilities': 'Responsible for mee...
4,the role of the chief executive officer ceo is...,Chief Executive Officer,{'Core Responsibilities': 'Responsible for day...
5,at haley aldrich we pride ourselves on our sm...,Project Manager (CCR focused),{'Core Responsibilities': 'Understand client n...
6,spire learning united states remote\n\nspire i...,Vice President/Sr. Director Business Developme...,{'Core Responsibilities': 'Lead the creation a...
7,company description\nproject finds mission is ...,Director of Finance,{'Core Responsibilities': 'Responsible for the...
8,job description\nour vision\n\nin the new and ...,Software Engineer (JavaScript Backend),{'Core Responsibilities': 'Defining and implem...
9,senior financial analyst\n\ncompany highlights...,Senior Financial Analyst,{'Core Responsibilities': 'Maintain financial ...


### Defining functions for tokenizing and creating embeddings of education and skills of all JD and Resumes

In [None]:
def process_row1(row):
    text_data = str(row['Education'])
    input_ids = tokenizer(
        text_data, return_tensors='pt',
        padding=True, truncation=True
    )

    with torch.no_grad():
        text_embed = sparse_model(
            d_kwargs=input_ids.to(device)
        )['d_rep'].squeeze()
    return text_embed.cpu().detach().numpy()

def process_row2(row):
    text_data = str(row['Skills'])
    input_ids = tokenizer(
        text_data, return_tensors='pt',
        padding=True, truncation=True
    )

    with torch.no_grad():
        text_embed = sparse_model(
            d_kwargs=input_ids.to(device)
        )['d_rep'].squeeze()
    return text_embed.cpu().detach().numpy()

def process_row3(row):
    text_data = row['model_response'].get('Educational Requirements') + row['model_response'].get('Preferred Qualifications')
    input_ids = tokenizer(
        text_data, return_tensors='pt',
        padding=True, truncation=True
    )

    with torch.no_grad():
        text_embed = sparse_model(
            d_kwargs=input_ids.to(device)
        )['d_rep'].squeeze()
    return text_embed.cpu().detach().numpy()

def process_row4(row):
    text_data = row['model_response'].get('Required Skills')
    input_ids = tokenizer(
        text_data, return_tensors='pt',
        padding=True, truncation=True
    )

    with torch.no_grad():
        text_embed = sparse_model(
            d_kwargs=input_ids.to(device)
        )['d_rep'].squeeze()
    return text_embed.cpu().detach().numpy()

### Defining scoring functions based on cosine similarities between the JD and the entire Resume dataset to find the top 5 matches

In [None]:
from scipy.spatial import distance
import numpy as np

def score(a, b):
  return distance.cosine(np.array(a),np.array(b))

In [None]:
def find_min_match(row):
    row_score = df_candidates['edu_embed'].apply(lambda x: score(row['edu_embed'], x)) + df_candidates['skill_embed'].apply(lambda x: score(row['skill_embed'], x))
    min_score_indices = row_score.nsmallest(5).index.tolist()
    return df_candidates.loc[min_score_indices, 'ResumeID'].tolist()

### Applying the functions to the dataset

In [None]:
df_candidates['edu_embed'] = df_candidates.apply(process_row1, axis=1)
df_candidates['skill_embed'] = df_candidates.apply(process_row2, axis=1)
df_jobdesc['edu_embed'] = df_jobdesc.apply(process_row3, axis=1)
df_jobdesc['skill_embed'] = df_jobdesc.apply(process_row4, axis=1)

In [None]:
from tqdm import tqdm
tqdm.pandas()
df_jobdesc['qualified_candidate_ResumeID'] = df_jobdesc.progress_apply(find_min_match, axis=1)

100%|██████████| 15/15 [00:10<00:00,  1.50it/s]


Final Output of Top 5 Resume matches for all the 15 JDs

In [None]:
final_df = df_jobdesc[["position_title","job_description", "qualified_candidate_ResumeID"]]
final_df

Unnamed: 0,position_title,job_description,qualified_candidate_ResumeID
0,Administrative Assistant,this position is located on rikers islandphysi...,"[17539842, 14752209, 11995013, 14391434, 29926..."
1,"Vice President, Marketing, MSNBC",responsibilities\nthe vice president marketing...,"[12567516, 21297828, 12230301, 85766635, 19444..."
2,"Senior Vice President and Managing Director, I...",mathematica applies expertise at the intersect...,"[29926588, 20850529, 25482567, 11266906, 20279..."
3,"Regional Sales Manager, Video Conferencing (East)",the regional sales manager will report to the...,"[27024099, 36574147, 23246831, 28711616, 37521..."
4,Chief Executive Officer,the role of the chief executive officer ceo is...,"[34962725, 19926135, 29926588, 11266906, 65456..."
5,Project Manager (CCR focused),at haley aldrich we pride ourselves on our sm...,"[28815362, 24001783, 26921245, 17252448, 14900..."
6,Vice President/Sr. Director Business Developme...,spire learning united states remote\n\nspire i...,"[37521676, 34962725, 22391901, 25482567, 89508..."
7,Director of Finance,company description\nproject finds mission is ...,"[88038965, 84356308, 93653247, 95792386, 12802..."
8,Software Engineer (JavaScript Backend),job description\nour vision\n\nin the new and ...,"[23464505, 26069113, 98348532, 44115326, 60489..."
9,Senior Financial Analyst,senior financial analyst\n\ncompany highlights...,"[23387174, 14224370, 12780508, 20253563, 78229..."


## Demo of the performance of the JD-Resume matching algorithm

Job description

In [None]:
print(final_df.iloc[7]['job_description'])

company description
project finds mission is to provide low and moderateincome and homeless seniors with the services and support they need to enrich their lives and live independently
today project find operates three supportive housing residences that are home to about  people and four senior centers with over  members
our members and residents range from the healthy and active to the frail and homebound to the homeless
for all these individuals project find is a critical resource providing housing meals and programs that help individuals navigate the challenges of aging by encouraging community engagement and healthy living
please visit our website at
projectfindorg 
job overview
as the director of finance you will be responsible for building and enhancing the financial infrastructure of project find
the director of finance is also responsible for all aspects of the financial operations for project find
reporting directly to the executive director the director of finance provides ac

Corresponding top 5 Resume matches

In [None]:
dfz = df_candidates[df_candidates['ResumeID'].isin(final_df['qualified_candidate_ResumeID'].iloc[7])]
dfz = dfz[["ResumeID","Category", "Education", "Skills"]]
dfz

Unnamed: 0,ResumeID,Category,Education,Skills
9,12802330,ACCOUNTANT,"BS : Accounting Business Administration , 2005...","accounting, Accounting Systems, ad, analytical..."
1252,95792386,CONSULTANT,Master of Science : Accounting/Taxation San Fr...,"Accounting, accountant, Accounts Payable, Acco..."
1683,84356308,FINANCE,Bachelor of Science : Mathematics 1997 Univers...,"Accounting, Accounts Payable, Accounts Receiva..."
1687,88038965,FINANCE,Bachelor of Science : Accounting University of...,"Accounting, audit reports, billing, budgets, b..."
1690,93653247,FINANCE,"Master of Business Administration , Business E...","Budgets, budget, business analysis, contracts,..."
