# Resume Parsing

In [1]:
import fitz

# Open the PDF file
doc = fitz.open('resumes/react.pdf')

## Group Text into sections
To determine a section title, check if it fulfills all 3 following conditions:
1. It is the only text item in the line
2. It is bolded
3. Its letters are all UPPERCASE  
OR,
- Perform a keyword matching against a list of common resume section title keywords.

In [2]:

# Initialize an empty dictionary to hold the sections
sections = {}
current_section = None

# Define a list of common resume section titles
keywords = [
    "PROFILE", "SUMMARY", "ABOUT ME", "PERSONAL PROFILE", "PERSONAL SUMMARY",
    "WORK EXPERIENCE", "EXPERIENCE", "JOB HISTORY", "EMPLOYMENT HISTORY",
    "EDUCATION", "EDUCATIONAL BACKGROUND", "ACADEMIC HISTORY",
    "SKILLS", "ABILITIES", "COMPETENCIES", "EXPERTISE",
    "PROJECTS", "PORTFOLIO",
    "CERTIFICATIONS", "CREDENTIALS", "ACCREDITATIONS",
    "AWARDS", "HONORS", "ACHIEVEMENTS",
    "INTERESTS", "HOBBIES", "ACTIVITIES",
]

# Loop over each page in the document
for page in doc:
    # Extract the text as a dictionary
    blocks = page.get_text("dict")["blocks"]
    
    for b in blocks:  # iterate through the text blocks
        if "lines" in b:
            for l in b["lines"]:  # iterate through the text lines
                for s in l["spans"]:  # iterate through the text spans
                    if s["flags"] == 20 and s["text"].isupper():  # check if the text is bold and uppercase
                        # This is a section title, so start a new section
                        current_section = s["text"]
                        sections[current_section] = ""
                    elif any(keyword == s["text"].upper() for keyword in keywords):
                        # The text contains a keyword, so start a new section
                        current_section = s["text"].upper()
                        sections[current_section] = ""
                    elif current_section is not None:
                        # This is not a section title, so append it to the current section
                        sections[current_section] += s["text"] + " "

sections

{'WORK EXPERIENCE': '_______________________________________________________________________________________________________________ Resume Worded, New York, NY 09/2015 – Present Education technology startup with 50+ employees and $100m+ annual revenue React Developer ● Decreased load times by 58% using isomorphic React and Node.js for 13 web applications one month after taking over the project. ● Partnered with a team of 5 developers to create 14 e-commerce websites using React and NodeJS in the ﬁrst month on the job. ● Reduced the time spent on development by 73% by creating maintainable reusable components. ● Created an interactive and user-friendly website experience by working with a team of 25 developers for one month. Polyhire, London, United Kingdom 10/2012 – 08/2015 NYSE-listed recruitment and employer branding company Mobile Application Developer ● Created a web service used by 200K clients and generates $100K monthly within 20 days after inception. ● Devised a system to host

In [3]:
import textwrap

for section, content in sections.items():
    print(section)
    wrapped_content = textwrap.fill(content, width=100)  # wraps the text at 50 characters
    print(wrapped_content)
    print("-"*100)  # prints a line for separation

WORK EXPERIENCE
____________________________________________________________________________________________________
___________ Resume Worded, New York, NY 09/2015 – Present Education technology startup with 50+
employees and $100m+ annual revenue React Developer ● Decreased load times by 58% using isomorphic
React and Node.js for 13 web applications one month after taking over the project. ● Partnered with
a team of 5 developers to create 14 e-commerce websites using React and NodeJS in the ﬁrst month on
the job. ● Reduced the time spent on development by 73% by creating maintainable reusable
components. ● Created an interactive and user-friendly website experience by working with a team of
25 developers for one month. Polyhire, London, United Kingdom 10/2012 – 08/2015 NYSE-listed
recruitment and employer branding company Mobile Application Developer ● Created a web service used
by 200K clients and generates $100K monthly within 20 days after inception. ● Devised a system to
host mob

### NER

In [2]:
import spacy
from spacy import displacy

nlp = spacy.load("en_core_web_sm")
doc = nlp(sections['EDUCATION'])

# Use 'ent' as the style for entity visualization
displacy.render(doc, style='ent')

NameError: name 'sections' is not defined

## Identify Degree and Major in Education

Our system uses spacy library to extract information (degree and major) from Education. We prepared a dictionary that has all education degrees and majors related to computer engineering field. We fed that dictionary to the Spacy rule-based EntityRuler in order to detect and recognize entities in our job description.

In [1]:
import spacy

In [7]:
def match_degrees_by_spacy():
    nlp = spacy.load("en_core_web_sm")
    # Add the pattern to the matcher
    patterns_path = 'degrees.jsonl'
    ruler = nlp.add_pipe("entity_ruler", before='ner')
    ruler.from_disk(patterns_path)
    # Process some text
    doc1 = nlp(sections['EDUCATION'])
    degree_levels = []
    for ent in doc1.ents:
        labels_parts = ent.label_.split('|')
        if labels_parts[0] == 'DEGREE':
            print((ent.text, ent.label_))
            if labels_parts[1] not in degree_levels:
                degree_levels.append(labels_parts[1])
    return degree_levels


def match_majors_by_spacy():
    nlp = spacy.load("en_core_web_sm")
    # Add the pattern to the matcher
    patterns_path = 'majors.jsonl'
    ruler = nlp.add_pipe("entity_ruler", before='ner')
    ruler.from_disk(patterns_path)
    # Process some text
    doc1 = nlp(sections['EDUCATION'])

    displacy.render(doc1, style='ent')

    acceptable_majors = []
    for ent in doc1.ents:
        labels_parts = ent.label_.split('|')
        if labels_parts[0] == 'MAJOR':
            if labels_parts[2].replace('-', ' ') not in acceptable_majors:
                acceptable_majors.append(labels_parts[2].replace('-', ' '))
            if labels_parts[2].replace('-', ' ') not in acceptable_majors:
                acceptable_majors.append(labels_parts[2].replace('-', ' '))
    return acceptable_majors

In [8]:
match_degrees_by_spacy()

('University', 'DEGREE|BACHELOR')
('Bachelor', 'DEGREE|BACHELOR')


['BACHELOR']

In [9]:
match_majors_by_spacy()

['computer science']

## Extract Dates

We need to extract the dates under EXPERIENCE section. Then calculate the total years of experience from it.

In [10]:
text_with_dates = '''
I was employed at Company D from 05/2015 - 06/2017.
I worked on a project from 01/2021 - Present.
I worked on a project from Jan 2022 - Present.
I was part of Company F from Feb 2010 - Jan 2013.
My tenure at Company G was from 07/2014 - 08/2016.
I worked at Company A from January 2011 - February 2012. 
Then, I joined Company B in March 2018 - Present. 
I was part of a project from 10/2019 - 07/2021. 
I worked at Company A from January 2011 to February 2012. 
I was part of a project from 10/2019 to 07/2021. 
My tenure at Company C spanned from Jan '15 - Dec '18.
My time at Company E spanned from March '18 - April '20.
'''

In [11]:
import spacy
from spacy import displacy

nlp = spacy.load("en_core_web_sm")

# Most of the date patterns are detected by default DATE entity
# Define the pattern for '05/2015 - 06/2017' and '10/2020 - Present'.
patterns = [{"label": "DATE", "pattern": [{"SHAPE": "dd/dddd"}, {"TEXT": "-"}, {"SHAPE": "dd/dddd"}]},
             {"label": "DATE", "pattern": [{"SHAPE": "dd/dddd"}, {"TEXT": "-"}, {"LOWER": "present"}]}]

ruler = nlp.add_pipe("entity_ruler", before='ner')
# Add the pattern to the ruler
ruler.add_patterns(patterns)

# Use the nlp object on your text
doc = nlp(text_with_dates)

# Specify options for displacy
options = {'ents': ['DATE']}
# Use 'ent' as the style for entity visualization
displacy.render(doc, style='ent', options=options)


#### Calculate years of experience

In [12]:
# Extract the dates
dates = [ent.text for ent in doc.ents if ent.label_ == 'DATE']
print(dates)

['05/2015 - 06/2017', '01/2021 - Present', 'Jan 2022 - Present', 'Feb 2010 - Jan 2013', '07/2014 - 08/2016', 'January 2011 - February 2012', 'March 2018 - Present', '10/2019 - 07/2021', 'January 2011 to February 2012', "Jan '", "'18", 'March', "18 - April '20"]


In [17]:
from dateutil import parser
from dateutil.relativedelta import relativedelta
from datetime import datetime

def calculate_years(dates):
    year_diffs = []
    for date in dates:
        # Check if the date is in the "start - end" format
        if ' - ' not in date:
            continue
        
        # Split the date range into start and end dates
        start_date, end_date = date.split(' - ')
        
        # Replace 'Present' with today's date
        if 'Present' in end_date:
            end_date = datetime.today().strftime('%m/%Y')
        
        # Parse the dates
        start_date = parser.parse(start_date)
        end_date = parser.parse(end_date)
        
        # Calculate the difference in years (considering months)
        diff = relativedelta(end_date, start_date)
        years = diff.years + diff.months / 12
        years = round(years, 2)
        
        year_diffs.append(years)
        print(f'{date} \t {years} yr')
    return year_diffs

print(calculate_years(dates))

05/2015 - 06/2017 	 2.08 yr
01/2021 - Present 	 2.75 yr
Jan 2022 - Present 	 1.75 yr
Feb 2010 - Jan 2013 	 2.92 yr
07/2014 - 08/2016 	 2.08 yr
January 2011 - February 2012 	 1.08 yr
March 2018 - Present 	 5.58 yr
10/2019 - 07/2021 	 1.75 yr
18 - April '20 	 -0.42 yr
[2.08, 2.75, 1.75, 2.92, 2.08, 1.08, 5.58, 1.75, -0.42]


Job Experience Requirements

In [24]:
import re

def extract_min_experience(text):
    # Define the regular expression patterns
    pattern_months = r'(\d+)(?=\s*month)'
    pattern_years = r'(\d+)(?=\s*year)'
    
    # Use search to get the first match
    match_years = re.search(pattern_years, text)
    match_months = re.search(pattern_months, text)
    
    # If a match was found, convert it to an integer and return
    if match_years:
        return int(match_years.group())
    elif match_months:
        return int(match_months.group()) / 12  # Convert months to years
    
    # If no match was found, return None
    return 0

# Test the function
job_exp = [
'Minimum 6 months of professional front-end development experience.',
'More than 1 year.',
'A minimum of 2 years of professional experience',
'No experience'
]
min_yoe = [extract_min_experience(job) for job in job_exp]
min_yoe

[0.5, 1, 2, 0]