# Resume Parsing

In [1]:
import fitz

PDF_PATH = 'resumes/latex/plushcv.pdf'

## Group Text into sections
To determine a section title, check if it fulfills all 3 following conditions:
1. It is the only text item in the line
2. It is bolded
3. Its letters are all UPPERCASE  
OR,
- Perform a keyword matching against a list of common resume section title keywords.

In [2]:

# Initialize an empty dictionary to hold the sections
sections = {}
current_section = None

# Define a dictionary of common resume section titles and their associated keywords
section_keywords = {
    "PROFILE": [
        "PROFILE",
        "SUMMARY",
        "ABOUT ME",
        "PERSONAL PROFILE",
        "PERSONAL SUMMARY",
    ],
    "EXPERIENCE": [
        "EXPERIENCE",
        "WORK EXPERIENCE",
        "PROFESSIONAL EXPERIENCE",
        "RELEVANT WORK EXPERIENCE",
        "JOB HISTORY",
        "EMPLOYMENT HISTORY",
    ],
    "EDUCATION": ["EDUCATION", "EDUCATIONAL BACKGROUND", "ACADEMIC HISTORY"],
    "SKILLS": [
        "SKILLS",
        "TECHNICAL SKILLS",
        "PROGRAMMING SKILLS",
        "ABILITIES",
        "COMPETENCIES",
        "EXPERTISE",
    ],
    "PROJECTS": ["PROJECTS", "PORTFOLIO"],
    "CERTIFICATIONS": ["CERTIFICATIONS", "CREDENTIALS", "ACCREDITATIONS"],
    "AWARDS": ["AWARDS", "HONORS", "ACHIEVEMENTS"],
    "INTERESTS": ["INTERESTS", "HOBBIES", "ACTIVITIES"],
}

# Create a list of all keywords from the section_keywords dictionary
keywords = [
    keyword for keyword_list in section_keywords.values() for keyword in keyword_list
]

potential_section_titles = []

def extract_sections_from_pdf(pdf_path, keywords):
    # Open the PDF file
    doc = fitz.open(pdf_path)

    # Initialize an empty dictionary to hold the sections
    sections = {}
    current_section = None

    # Loop over each page in the document
    for page in doc:
        # Extract the text as a dictionary
        blocks = page.get_text("dict")["blocks"]

        for b in blocks:  # iterate through the text blocks
            if "lines" in b:
                for l in b["lines"]:  # iterate through the text lines
                    for s in l["spans"]:  # iterate through the text spans
                        # check if the text is bold and uppercase
                        if s["flags"] == 20 and s["text"].isupper():
                            """
                            This is to identify different new keywords that could be used as section titles by various resumes
                            """
                            if s["text"] not in keywords:
                                potential_section_titles.append(s["text"])
                            # # This is a section title, so start a new section
                            # current_section = s["text"]
                            # sections[current_section] = ""                        

                        if any(keyword == s["text"].strip().upper() for keyword in keywords):
                            # The text contains a keyword, so start a new section
                            current_section = s["text"].strip().upper()
                            sections[current_section] = ""
                        elif current_section is not None:
                            # This is not a section title, so append it to the current section
                            sections[current_section] += s["text"] + " "

    return sections

sections = extract_sections_from_pdf(PDF_PATH, keywords)

print('Potential Section Titles \n', potential_section_titles)

Potential Section Titles 
 ['WEYLAND YUTANI INDUSTRIES', 'TESSIER‑ASHPOOLE S.A.', 'LEXCORP', 'CHESS ENGINE', 'SPEECH‑ENABLED CHATBOT', 'PROGRAMMING', 'LIBRARIES/FRAMEWORKS', 'TOOLS/PLATFORMS', 'UNIVERSITY OF UTAH', 'UNIVERSITY OF OREGON', 'REFERENCES']


**Display the extracted sections**

In [3]:
import textwrap

for section, content in sections.items():
    print(section)
    wrapped_content = textwrap.fill(content, width=100)  # wraps the text at 50 characters
    print(wrapped_content[:500])
    print("-"*100)  # prints a line for separation

EXPERIENCE
WEYLAND YUTANI INDUSTRIES  | PROGRAM MANAGER May 2021 – Current | Tokyo, Japan    Lorem ipsum dolor
sit amet, consectetur adipiscing elit. Proin ullamcorper venenatis nisi at suscipit. Vestibulum vel
odio in diam ultrices posuere. Cras suscipit faucibus ullamcorper.    Ut consectetur tempus
tincidunt. Curabitur in felis et leo elementum facilisis at non metus. Vestibulum et ullamcorper
augue, nec accumsan tellus.    Cras posuere in nunc vel sagittis. Aliquam aliquet non orci id
pellentesque. Nul
----------------------------------------------------------------------------------------------------
PROJECTS
CHESS ENGINE  | C++ 2018    Lorem ipsum dolor sit amet, consectetur adipiscing elit. Proin
ullamcorper venenatis nisi at suscipit. Vestibulum vel odio in diam ultrices posuere. Cras suscipit
faucibus ullamcorper.    Lorem ipsum dolor sit amet, consectetur adipiscing elit. Proin ullamcorper
venenatis nisi at suscipit. SPEECH‑ENABLED CHATBOT  | C#, MICROSOFT BOT FRAMEWORK 2018 

### NER

In [4]:
import spacy
from spacy import displacy

## Identify Degree and Major in Education

Our system uses spacy library to extract information (degree and major) from Education. We prepared a dictionary that has all education degrees and majors related to computer engineering field. We fed that dictionary to the Spacy rule-based EntityRuler in order to detect and recognize entities in our job description.

In [5]:
# Initialize the NLP pipeline and entity ruler
nlp = spacy.load("en_core_web_sm")
# Disable the default 'ner' component
nlp.disable_pipe('ner')
# Add the pattern to the matcher
PATTERN_PATH = 'degrees_majors.jsonl'
ruler = nlp.add_pipe("entity_ruler", before='ner')
# Load the degree and major patterns
ruler.from_disk(PATTERN_PATH)

<spacy.pipeline.entityruler.EntityRuler at 0x1a2eca85090>

In [6]:
# General Fields
general_fields = [
    "Computer Science",
    "Computer Engineering",
    "Information Technology (IT)",
    "Software Engineering",
    "Network Engineering",
    "Systems Engineering",
    "Information Systems",
]

# Specific Fields
specific_fields = [
    "Data Science",
    "Cybersecurity",
    "Artificial Intelligence",
    "Machine Learning",
    "Web Development",
    "Database Management",
    "Cloud Computing",
    "Mobile App Development",
    "Game Development",
    "Computer Graphics",
    "Robotics",
    "Internet of Things (IoT)",
    "Natural Language Processing",
    "Computer Vision",
    "Embedded Systems",
    "Quantum Computing",
    "Cryptography",
    "Bioinformatics",
    "Data Analytics"
]

In [7]:
def match_degrees_by_spacy(education_text, nlp):
    doc = nlp(education_text)
    degree_levels = []
    for ent in doc.ents:
        labels_parts = ent.label_.split('|')
        if labels_parts[0] == 'DEGREE':
            # print((ent.text, ent.label_))
            if labels_parts[1] not in degree_levels:
                degree_levels.append(labels_parts[1])
    return degree_levels

def match_majors_by_spacy(education_text, nlp):
    doc = nlp(education_text)
    acceptable_majors = []
    for ent in doc.ents:
        labels_parts = ent.label_.split('|')
        if labels_parts[0] == 'MAJOR':
            acceptable_majors.append(labels_parts[1:])
    return acceptable_majors

In [8]:
doc = nlp(' '.join(general_fields) + ' ' + ' '.join(specific_fields))
displacy.render(doc, style='ent')

In [9]:
# Function to extract degree level and field of study from education
def extract_education_info(df_resume, target_job):
    # Initialize the NLP pipeline and entity ruler
    nlp = spacy.load("en_core_web_sm")
    # Disable the default 'ner' component
    nlp.disable_pipe('ner')
    # Add the pattern to the matcher
    PATTERN_PATH = 'degrees_majors.jsonl'
    ruler = nlp.add_pipe("entity_ruler", before='ner')
    # Load the degree and major patterns
    ruler.from_disk(PATTERN_PATH)

    # Initialize lists to store the results
    resume_degrees = []
    resume_majors = []

    # Extract degree and major for each resume
    for education_text in df_resume["EDUCATION"]:
        degrees = match_degrees_by_spacy(education_text, nlp)
        majors = match_majors_by_spacy(education_text, nlp)
        resume_degrees.append(degrees)
        resume_majors.append(majors)

    # Extract degree and major for the target job
    job_degree = match_degrees_by_spacy(target_job["education"], nlp)
    job_major = match_majors_by_spacy(target_job["education"], nlp)

    (
        df_resume["education_degree"],
        df_resume["education_major"],
        target_job["education_degree"],
        target_job["education_major"],
    ) = (resume_degrees, resume_majors, job_degree, job_major)

    return resume_degrees, resume_majors, job_degree, job_major


"""
Example:
    job_major = ['CS', 'AI', 'machine-learning']
    resume_major = [['CS', 'CS', 'computer-science'], ['CS', 'AI', 'data-science']]
    # Output: Major Score: 0.6 (based on the partial match in the 2nd part i.e. 'AI')
"""
def calculate_education_major_similarity(resume_major, job_major):
    score = 0.0
    for major in resume_major:
        # Check for an exact match in the 3rd part
        if job_major[2] == major[2]:
            score = 1.0 # max_score reached
            break
        # Check for a match in the 2nd part
        elif job_major[1] == major[1]:
            score = max(score, 0.6)
        # Check for a match in the 1st part
        elif job_major[0] == major[0]:
            score = max(score, 0.3)
    return score


def get_education_major_score(resume_majors, job_major):
    field_scores = []

    for resume_major in resume_majors:
        field_score = 0
        similarities = [
            calculate_education_major_similarity(resume_major, required_major)
            for required_major in job_major
        ]
        # Find max similarity score
        if similarities:
            field_score = max(similarities)
        field_scores.append(field_score)

    return field_scores


"""
Function to calculate degree score:
- It maps degree levels to numerical values, finds the highest degree for each resume,
- Converts the job degree to numerical form, and calculates a degree score based on a scoring formula.
- If the highest degree in a resume is greater than or equal to the minimum job degree, the score is 1,
- Otherwise, it is calculated using a formula that considers the difference between the degrees.
"""

# Define a mapping for degree levels
degree_mapping = {
    'BACHELOR': 1,
    'MASTER': 2,
    'PHD': 3,
}

def get_education_degree_score(resume_degrees, job_degree):
    # Apply the mapping to the list of degrees
    numerical_degrees = [[degree_mapping.get(degree, 0) for degree in degrees] for degrees in resume_degrees]
    # Get the highest degree for each resume
    applicant_degrees = [max(degrees) if degrees else 0 for degrees in numerical_degrees]

    # Convert the job degree to numerical form
    numerical_job_degree = [degree_mapping.get(degree, 0) for degree in job_degree]
    # Get the minimum degree required for the job
    min_degree_required = min(numerical_job_degree)

    degree_scores = []
    for applicant_degree in applicant_degrees:
        degree_score = 0
        
        if applicant_degree >= min_degree_required:
            degree_score = 1
        else:
            degree_score = 1- (min_degree_required-applicant_degree)/max(min_degree_required, 1)

        degree_scores.append(degree_score)

    return degree_scores
        

def get_education_score(df_resume, target_job):
    resume_degrees, resume_majors, job_degree, job_major = extract_education_info(
        df_resume, target_job
    )

    degree_scores = get_education_degree_score(resume_degrees, job_degree)
    field_scores = get_education_major_score(resume_majors, job_major)

    # Set weights for degree and field_of_study
    degree_weight = 0.7
    field_weight = 0.3

    combined_scores = [degree_weight * degree_score + field_weight * field_score for degree_score, field_score in zip(degree_scores, field_scores)]

    # df_resume['degree_score'] = degree_scores
    # df_resume['major_score'] = field_scores
    df_resume['education_score'] = combined_scores

In [10]:
import pandas as pd
applicant_education = [
    {'name': 'Applicant 1', 'EDUCATION': 'Bachelor in Computer Engineering and Master in Artificial Intelligence'},
    {'name': 'Applicant 2', 'EDUCATION': 'Bachelor Degree in Machine Learning. Software Engineering'},
    {'name': 'Applicant 3', 'EDUCATION': 'Masters in IT'},
    {'name': 'Applicant 4', 'EDUCATION': 'PhD in Computer Science'},
    {'name': 'Applicant 5', 'EDUCATION': ' Computer Science, Data Science'},
    {'name': 'Applicant 5', 'EDUCATION': ' B.E in Electronics Engineering'},
]

target_job = {
    'education': 'phd in Data Science'
}

df_resume = pd.DataFrame(applicant_education)

get_education_score(df_resume, target_job)
df_resume

Unnamed: 0,name,EDUCATION,education_degree,education_major,education_score
0,Applicant 1,Bachelor in Computer Engineering and Master in...,"[BACHELOR, MASTER]","[[CS, CS, computer-engineering], [CS, AI, arti...",0.646667
1,Applicant 2,Bachelor Degree in Machine Learning. Software ...,[BACHELOR],"[[CS, AI, machine-learning], [CS, CS, software...",0.413333
2,Applicant 3,Masters in IT,[MASTER],"[[CS, CS, information-technology]]",0.556667
3,Applicant 4,PhD in Computer Science,[PHD],"[[CS, CS, computer-science]]",0.79
4,Applicant 5,"Computer Science, Data Science",[],"[[CS, CS, computer-science], [CS, AI, data-sci...",0.3
5,Applicant 5,B.E in Electronics Engineering,[BACHELOR],[],0.233333


## Extract Dates

We need to extract the dates under EXPERIENCE section. Then calculate the total years of experience from it.

In [11]:
text_with_dates = '''
I was employed at Company D from 05/2015 - 06/2017.
I worked on a project from 01/2021 - Present.
I worked on a project from Jan 2022 - Present.
Then, I joined Company B in March 2018 - Present.  
I was part of Company F from Feb 2010 - Jan 2013.
I worked at Company A from January 2011 - February 2012. 
I was part of a project from 10/2019 - 07/2021. 
I worked at Company A from January 2011 to February 2012. 
I was part of a project from 2021 - 2023. 
My time at Company E spanned from 2020 - current.
My time at Company E spanned from Jan 2020 - current. 

Jun 2018 – Present. (This is En-dash '–' not Hyphen '-')
Feb 2020 — Present. (Em-dash)

June 2018 - January 2020 / Pittsburgh
Web Developer Fanatics 2020 - current Jacksonville
Web Designer Magic Leap 2018 - 2020
Junior Web Developer HSN 2016 - 2018 Saint Petersburg

2020-2023
2017-2019
Jun 2016 – Sep 2016
Jun 2016 – Jan 2017
'''

**En-dash '–' and Hyphen '-'**
- In some resumes, date formats use En-dash '–', e.g. Jun 2018 – Present. But, space can't detect these as Date.
- So, replace En-dash '–' with Hyphen '-' before NER.

In [12]:
text_with_dates = text_with_dates.replace('–', '-')

In [13]:
import spacy
from spacy import displacy


def create_nlp_for_experience():
    nlp = spacy.load("en_core_web_sm")

    VALID_MONTH_NAMES = ["jan", "feb", "mar", "apr", "may", "jun", "jul", "aug", "sep", "oct", "nov", "dec", "january", "february", "march", "april", "may", "june", "july", "august", "september", "october", "november", "december"]
    
    # Most of the date patterns are detected by default DATE entity
    # Define the pattern for '05/2015 - 06/2017' and '10/2020 - Present'.
    patterns = [
        # 05/2015 - 06/2017
        {"label": "DATE", "pattern": [{"SHAPE": "dd/dddd"}, {"TEXT": "-"}, {"SHAPE": "dd/dddd"}]},

        # 10/2020 - Present
        {"label": "DATE", "pattern": [{"SHAPE": "dd/dddd"}, {"TEXT": "-"}, {"LOWER": "present"}]},
        {"label": "DATE", "pattern": [{"SHAPE": "dd/dddd"}, {"TEXT": "-"}, {"LOWER": "current"}]},

        # Jan 2020 - current, March 2018 - Present
        {"label": "DATE", "pattern": [{"LOWER": {"in": VALID_MONTH_NAMES}}, {"TEXT": {"REGEX": "^\d{4}$"}}, {"TEXT": "-"}, {"LOWER": {"in": ["current", "present"]}}]},

        # Jun 2016 - Sep 2016
        {"label": "DATE", "pattern": [{"LOWER": {"in": VALID_MONTH_NAMES}},
        {"TEXT": {"REGEX": "^\d{4}$"}}, {"TEXT": "-"}, {"LOWER": {"in": VALID_MONTH_NAMES}}, {"TEXT": {"REGEX": "^\d{4}$"}}]},

        # 2020 - current
        {"label": "DATE", "pattern": [{"SHAPE": "dddd"}, {"TEXT": "-"}, {"LOWER": {"in": ["current", "present"]}}]},
        ]

    ruler = nlp.add_pipe("entity_ruler", before="ner")
    # Add the pattern to the ruler
    ruler.add_patterns(patterns)

    return nlp

nlp = create_nlp_for_experience()

# Use the nlp object on your text
doc = nlp(text_with_dates)

# Test on experience section from resume
# doc = nlp(sections['EXPERIENCE'])

# Specify options for displacy
options = {'ents': ['DATE']}
# Use 'ent' as the style for entity visualization
displacy.render(doc, style='ent', options=options)


#### Calculate years of experience

In [14]:
# Extract the dates that are in the "start - end" format
dates = [ent.text for ent in doc.ents if ent.label_ == 'DATE' and ('-' in ent.text)]
print(dates)

['05/2015 - 06/2017', '01/2021 - Present', 'Jan 2022 - Present', 'March 2018 - Present', 'Feb 2010 - Jan 2013', 'January 2011 - February 2012', '10/2019 - 07/2021', '2021 - 2023', '2020 - current', 'Jan 2020 - current', 'Jun 2018 - Present', 'June 2018 - January 2020', '2020 - current', '2018 - 2020', '2020-2023', '2017-2019', 'Jun 2016 - Sep 2016', 'Jun 2016 - Jan 2017']


In [15]:
from dateutil import parser
from dateutil.relativedelta import relativedelta
from datetime import datetime

def calculate_years(dates):
    year_diffs = []
    for date in dates:
        # Check if the date is in the "start - end" format
        if '-' not in date:
            continue
        
        try:
            # Split the date range into start and end dates
            start_date, end_date = date.split('-')
            
            # Replace 'Present' or 'current' with today's date
            if 'present' in end_date.lower() or 'current' in end_date.lower():
                end_date = datetime.today().strftime('%m/%Y')
            
            # Parse the dates
            start_date = parser.parse(start_date)
            end_date = parser.parse(end_date)
            
            # Calculate the difference in years (considering months)
            diff = relativedelta(end_date, start_date)
            years = diff.years + diff.months / 12
            years = round(years, 2)
            year_diffs.append(years)
            print(f'{date} \t {years} yr')
        except ValueError as e:
            print(f"Error parsing dates: {e}. Skipping this entry.")

    return year_diffs

print(calculate_years(dates))

05/2015 - 06/2017 	 2.08 yr
01/2021 - Present 	 3.0 yr
Jan 2022 - Present 	 2.0 yr
March 2018 - Present 	 5.83 yr
Feb 2010 - Jan 2013 	 2.92 yr
January 2011 - February 2012 	 1.08 yr
10/2019 - 07/2021 	 1.75 yr
2021 - 2023 	 2.0 yr
2020 - current 	 4.0 yr
Jan 2020 - current 	 4.0 yr
Jun 2018 - Present 	 5.58 yr
June 2018 - January 2020 	 1.58 yr
2020 - current 	 4.0 yr
2018 - 2020 	 2.0 yr
2020-2023 	 3.0 yr
2017-2019 	 2.0 yr
Jun 2016 - Sep 2016 	 0.25 yr
Jun 2016 - Jan 2017 	 0.58 yr
[2.08, 3.0, 2.0, 5.83, 2.92, 1.08, 1.75, 2.0, 4.0, 4.0, 5.58, 1.58, 4.0, 2.0, 3.0, 2.0, 0.25, 0.58]


Job Experience Requirements

In [16]:
import re

def extract_min_experience(text):
    # Define the regular expression patterns
    pattern_months = r'(\d+)(?=\s*month)'
    pattern_years = r'(\d+)(?=\s*year)'
    
    # Use search to get the first match
    match_years = re.search(pattern_years, text)
    match_months = re.search(pattern_months, text)
    
    # If a match was found, convert it to an integer and return
    if match_years:
        return int(match_years.group())
    elif match_months:
        return int(match_months.group()) / 12  # Convert months to years
    
    # If no match was found, return None
    return 0

# Test the function
job_exp = [
'Minimum 6 months of professional front-end development experience.',
'More than 1 year.',
'A minimum of 2 years of professional experience',
'No experience'
]
min_yoe = [extract_min_experience(job) for job in job_exp]
min_yoe

[0.5, 1, 2, 0]

Find experience_score

In [17]:
applicant_years_list = [2.08, 2.92, 1.92, 2.92, 2.08, 1.08, 5.75, 1.75, 2.0, 3.0, 3.92]
target_job_experience = 2
for applicant_years in applicant_years_list:
    if target_job_experience == 0: # no experience required
        experience_score = 1.0
    else:
        experience_score = min(applicant_years / target_job_experience, 1.0)
    print(f'{applicant_years} \t {experience_score}')

2.08 	 1.0
2.92 	 1.0
1.92 	 0.96
2.92 	 1.0
2.08 	 1.0
1.08 	 0.54
5.75 	 1.0
1.75 	 0.875
2.0 	 1.0
3.0 	 1.0
3.92 	 1.0
