# Resume Parsing

In [59]:
import fitz

# Open the PDF file
doc = fitz.open('resumes/sw2.pdf')

## Group Text into sections
To determine a section title, check if it fulfills all 3 following conditions:
1. It is the only text item in the line
2. It is bolded
3. Its letters are all UPPERCASE  
OR,
- Perform a keyword matching against a list of common resume section title keywords.

In [60]:

# Initialize an empty dictionary to hold the sections
sections = {}
current_section = None

# Define a dictionary of common resume section titles and their associated keywords
section_keywords = {
    "PROFILE": [
        "PROFILE",
        "SUMMARY",
        "ABOUT ME",
        "PERSONAL PROFILE",
        "PERSONAL SUMMARY",
    ],
    "EXPERIENCE": [
        "EXPERIENCE",
        "WORK EXPERIENCE",
        "PROFESSIONAL EXPERIENCE",
        "JOB HISTORY",
        "EMPLOYMENT HISTORY",
    ],
    "EDUCATION": ["EDUCATION", "EDUCATIONAL BACKGROUND", "ACADEMIC HISTORY"],
    "SKILLS": [
        "SKILLS",
        "PROGRAMMING SKILLS",
        "ABILITIES",
        "COMPETENCIES",
        "EXPERTISE",
    ],
    "PROJECTS": ["PROJECTS", "PORTFOLIO"],
    "CERTIFICATIONS": ["CERTIFICATIONS", "CREDENTIALS", "ACCREDITATIONS"],
    "AWARDS": ["AWARDS", "HONORS", "ACHIEVEMENTS"],
    "INTERESTS": ["INTERESTS", "HOBBIES", "ACTIVITIES"],
}

# Create a list of all keywords from the section_keywords dictionary
keywords = [
    keyword for keyword_list in section_keywords.values() for keyword in keyword_list
]

# Loop over each page in the document
for page in doc:
    # Extract the text as a dictionary
    blocks = page.get_text("dict")["blocks"]
    
    for b in blocks:  # iterate through the text blocks
        if "lines" in b:
            for l in b["lines"]:  # iterate through the text lines
                for s in l["spans"]:  # iterate through the text spans

                    # if s["flags"] == 20 and s["text"].isupper():  # check if the text is bold and uppercase
                    #     # This is a section title, so start a new section
                    #     current_section = s["text"]
                    #     sections[current_section] = ""
                    
                    if any(keyword == s["text"].upper() for keyword in keywords):
                        # The text contains a keyword, so start a new section
                        current_section = s["text"].upper()
                        sections[current_section] = ""
                    elif current_section is not None:
                        # This is not a section title, so append it to the current section
                        sections[current_section] += s["text"] + " "

sections

{'PROFESSIONAL EXPERIENCE': "Resume Worded , New York, NY Jun 2018 – Present Software Developer ● Created a new system that tracked and examined customer data, increasing sales by 23% within 32 days of implementation. ● Conceived and introduced an automated email response system, which reduced response time to 3K email messages by 71%. ● Saved 13 person-hours daily by writing scripts to automate 25 tasks like database backups, restores, and server provisioning. ● Developed a user interface as a single-page application using React and MobX, which increased the productivity of 10K users by 87%. Growthsi , New York, NY Jan 2015 – May 2018 Software Tester ● Introduced the idea of bug fixing during graveyard shifts, which led to a quarterly savings of $950K in labor costs. ● Championed a major decision to abandon 17 projects by discovering critical defects in a high-level application worth $2M ● Completed and achieved the launch of 15 products within 35 days with a moderate error rate of 1%

In [61]:
import textwrap

for section, content in sections.items():
    print(section)
    wrapped_content = textwrap.fill(content, width=100)  # wraps the text at 50 characters
    print(wrapped_content)
    print("-"*100)  # prints a line for separation

PROFESSIONAL EXPERIENCE
Resume Worded , New York, NY Jun 2018 – Present Software Developer ● Created a new system that
tracked and examined customer data, increasing sales by 23% within 32 days of implementation. ●
Conceived and introduced an automated email response system, which reduced response time to 3K email
messages by 71%. ● Saved 13 person-hours daily by writing scripts to automate 25 tasks like database
backups, restores, and server provisioning. ● Developed a user interface as a single-page
application using React and MobX, which increased the productivity of 10K users by 87%. Growthsi ,
New York, NY Jan 2015 – May 2018 Software Tester ● Introduced the idea of bug fixing during
graveyard shifts, which led to a quarterly savings of $950K in labor costs. ● Championed a major
decision to abandon 17 projects by discovering critical defects in a high-level application worth
$2M ● Completed and achieved the launch of 15 products within 35 days with a moderate error rate of
1%, an 

### NER

In [62]:
import spacy
from spacy import displacy

nlp = spacy.load("en_core_web_sm")
doc = nlp(sections['EDUCATION'])

# Use 'ent' as the style for entity visualization
displacy.render(doc, style='ent')

## Identify Degree and Major in Education

Our system uses spacy library to extract information (degree and major) from Education. We prepared a dictionary that has all education degrees and majors related to computer engineering field. We fed that dictionary to the Spacy rule-based EntityRuler in order to detect and recognize entities in our job description.

In [63]:
import spacy

In [64]:
def match_degrees_by_spacy(nlp):
    doc = nlp(sections['EDUCATION'])
    degree_levels = []
    for ent in doc.ents:
        labels_parts = ent.label_.split('|')
        if labels_parts[0] == 'DEGREE':
            print((ent.text, ent.label_))
            if labels_parts[1] not in degree_levels:
                degree_levels.append(labels_parts[1])
    return degree_levels


def match_majors_by_spacy(nlp):
    doc = nlp(sections['EDUCATION'])

    displacy.render(doc, style='ent')

    acceptable_majors = []
    for ent in doc.ents:
        labels_parts = ent.label_.split('|')
        if labels_parts[0] == 'MAJOR':
            if labels_parts[2].replace('-', ' ') not in acceptable_majors:
                acceptable_majors.append(labels_parts[2].replace('-', ' '))
            if labels_parts[2].replace('-', ' ') not in acceptable_majors:
                acceptable_majors.append(labels_parts[2].replace('-', ' '))
    return acceptable_majors

In [65]:
nlp = spacy.load("en_core_web_sm")
# Add the pattern to the matcher
patterns_path = 'degrees_majors.jsonl'
ruler = nlp.add_pipe("entity_ruler", before='ner')
ruler.from_disk(patterns_path)

<spacy.pipeline.entityruler.EntityRuler at 0x1f70e4d5390>

In [66]:
match_degrees_by_spacy(nlp)

('University', 'DEGREE|BACHELOR')
('Master', 'DEGREE|MASTER')


['BACHELOR', 'MASTER']

In [67]:
match_majors_by_spacy(nlp)

['information technology']

## Extract Dates

We need to extract the dates under EXPERIENCE section. Then calculate the total years of experience from it.

In [68]:
text_with_dates = '''
I was employed at Company D from 05/2015 - 06/2017.
I worked on a project from 01/2021 - Present.
I worked on a project from Jan 2022 - Present.
I was part of Company F from Feb 2010 - Jan 2013.
My tenure at Company G was from 07/2014 - 08/2016.
I worked at Company A from January 2011 - February 2012. 
Then, I joined Company B in March 2018 - Present. 
I was part of a project from 10/2019 - 07/2021. 
I worked at Company A from January 2011 to February 2012. 
I was part of a project from 2019 - 2021. 
I was part of a project from 2021 - 2023. 
My time at Company E spanned from 2020 - current.
My time at Company E spanned from Jan 2020 - current. 
Jun 2018 – Present. (This is En-dash '–' not Hyphen '-'. So, spacy can't detect this as Date)
'''

In [74]:
import spacy
from spacy import displacy

nlp = spacy.load("en_core_web_sm")

# Most of the date patterns are detected by default DATE entity
# Define the pattern for '05/2015 - 06/2017' and '10/2020 - Present'.
patterns = [
    # 05/2015 - 06/2017
    {"label": "DATE", "pattern": [{"SHAPE": "dd/dddd"}, {"TEXT": "-"}, {"SHAPE": "dd/dddd"}]},

    # 10/2020 - Present
    {"label": "DATE", "pattern": [{"SHAPE": "dd/dddd"}, {"TEXT": "-"}, {"LOWER": "present"}]},
    {"label": "DATE", "pattern": [{"SHAPE": "dd/dddd"}, {"TEXT": "-"}, {"LOWER": "current"}]},

    # Jan 2020 - current, March 2018 - Present
    {"label": "DATE", "pattern": [{"LOWER": {"in": ["jan", "feb", "mar", "apr", "may", "jun", "jul", "aug", "sep", "oct", "nov", "dec", "january", "february", "march", "april", "may", "june", "july", "august", "september", "october", "november", "december"]}}, {"TEXT": {"REGEX": "^\d{4}$"}}, {"TEXT": "-"}, {"LOWER": {"in": ["current", "present"]}}]},

    # 2020 - current
    {"label": "DATE", "pattern": [{"SHAPE": "dddd"}, {"TEXT": "-"}, {"LOWER": {"in": ["current", "present"]}}]},
    ]

ruler = nlp.add_pipe("entity_ruler", before='ner')
# Add the pattern to the ruler
ruler.add_patterns(patterns)

# Use the nlp object on your text
doc = nlp(text_with_dates)

# Specify options for displacy
options = {'ents': ['DATE']}
# Use 'ent' as the style for entity visualization
displacy.render(doc, style='ent', options=options)


#### Calculate years of experience

In [70]:
# Extract the dates
dates = [ent.text for ent in doc.ents if ent.label_ == 'DATE']
print(dates)

['05/2015 - 06/2017', '01/2021 - Present', 'Jan 2022 - Present', 'Feb 2010 - Jan 2013', '07/2014 - 08/2016', 'January 2011 - February 2012', 'March 2018 - Present', '10/2019 - 07/2021', 'January 2011 to February 2012', '2019 - 2021', '2021 - 2023', '2020', 'Jan 2020 - current']


In [71]:
from dateutil import parser
from dateutil.relativedelta import relativedelta
from datetime import datetime

def calculate_years(dates):
    year_diffs = []
    for date in dates:
        # Check if the date is in the "start - end" format
        if ' - ' not in date:
            continue
        
        # Split the date range into start and end dates
        start_date, end_date = date.split(' - ')
        
        # Replace 'Present' or 'current' with today's date
        if 'present' in end_date.lower() or 'current' in end_date.lower():
            end_date = datetime.today().strftime('%m/%Y')
        
        # Parse the dates
        start_date = parser.parse(start_date)
        end_date = parser.parse(end_date)
        
        # Calculate the difference in years (considering months)
        diff = relativedelta(end_date, start_date)
        years = diff.years + diff.months / 12
        years = round(years, 2)
        
        year_diffs.append(years)
        print(f'{date} \t {years} yr')
    return year_diffs

print(calculate_years(dates))

05/2015 - 06/2017 	 2.08 yr
01/2021 - Present 	 2.92 yr
Jan 2022 - Present 	 1.92 yr
Feb 2010 - Jan 2013 	 2.92 yr
07/2014 - 08/2016 	 2.08 yr
January 2011 - February 2012 	 1.08 yr
March 2018 - Present 	 5.75 yr
10/2019 - 07/2021 	 1.75 yr
2019 - 2021 	 2.0 yr
2021 - 2023 	 2.0 yr
Jan 2020 - current 	 3.92 yr
[2.08, 2.92, 1.92, 2.92, 2.08, 1.08, 5.75, 1.75, 2.0, 2.0, 3.92]


Job Experience Requirements

In [72]:
import re

def extract_min_experience(text):
    # Define the regular expression patterns
    pattern_months = r'(\d+)(?=\s*month)'
    pattern_years = r'(\d+)(?=\s*year)'
    
    # Use search to get the first match
    match_years = re.search(pattern_years, text)
    match_months = re.search(pattern_months, text)
    
    # If a match was found, convert it to an integer and return
    if match_years:
        return int(match_years.group())
    elif match_months:
        return int(match_months.group()) / 12  # Convert months to years
    
    # If no match was found, return None
    return 0

# Test the function
job_exp = [
'Minimum 6 months of professional front-end development experience.',
'More than 1 year.',
'A minimum of 2 years of professional experience',
'No experience'
]
min_yoe = [extract_min_experience(job) for job in job_exp]
min_yoe

[0.5, 1, 2, 0]

Find experience_score

In [73]:
applicant_years_list = [2.08, 2.92, 1.92, 2.92, 2.08, 1.08, 5.75, 1.75, 2.0, 3.0, 3.92]
target_job_experience = 2
for applicant_years in applicant_years_list:
    if target_job_experience == 0:
        experience_score = 1.0
    else:
        experience_score = min(applicant_years / target_job_experience, 1.0)
    print(f'{applicant_years} \t {experience_score}')

2.08 	 1.0
2.92 	 1.0
1.92 	 0.96
2.92 	 1.0
2.08 	 1.0
1.08 	 0.54
5.75 	 1.0
1.75 	 0.875
2.0 	 1.0
3.0 	 1.0
3.92 	 1.0
