# Resume Parsing

In [39]:
import fitz

# Open the resume PDF with PyMuPDF (imported as `fitz`); `doc` is iterated page by page further down
doc = fitz.open('resumes/node.pdf')

## Group Text into sections
A text item is treated as a section title if it fulfills all three of the following conditions:
1. It is the only text item in its line
2. It is bold
3. Its letters are all UPPERCASE  
OR,
- It matches an entry in a list of common resume section title keywords.

In [40]:

# Map each detected section title to the text that follows it in the document
sections = {}
current_section = None

# Common resume section titles, used for the keyword-matching fallback
keywords = [
    "PROFILE", "SUMMARY", "ABOUT ME", "PERSONAL PROFILE", "PERSONAL SUMMARY",
    "WORK EXPERIENCE", "EXPERIENCE", "JOB HISTORY", "EMPLOYMENT HISTORY",
    "EDUCATION", "EDUCATIONAL BACKGROUND", "ACADEMIC HISTORY",
    "SKILLS", "ABILITIES", "COMPETENCIES", "EXPERTISE",
    "PROJECTS", "PORTFOLIO",
    "CERTIFICATIONS", "CREDENTIALS", "ACCREDITATIONS",
    "AWARDS", "HONORS", "ACHIEVEMENTS",
    "INTERESTS", "HOBBIES", "ACTIVITIES",
]

# PyMuPDF stores font properties as bit flags in span["flags"]; bit 4 (value 16) means bold.
# Testing the bit (instead of comparing the whole value with 20) also recognises bold text
# in fonts whose other flags (serif, italic, monospace, ...) differ.
BOLD_FLAG = 16

# Loop over each page in the document
for page in doc:
    # Extract the text as a dictionary
    blocks = page.get_text("dict")["blocks"]

    for b in blocks:  # iterate through the blocks of the page
        if "lines" not in b:
            # Blocks without a "lines" entry carry no text to process
            continue

        for line in b["lines"]:  # iterate through the text lines
            spans = line["spans"]
            # Number of spans on this line that contain visible text; a title must be
            # the only text item on its line (condition 1 of the heuristic).
            text_span_count = sum(1 for sp in spans if sp["text"].strip())

            for s in spans:  # iterate through the text spans
                text = s["text"]
                # Surrounding whitespace must not affect title detection or the dict key
                title = text.strip()
                is_bold = bool(s["flags"] & BOLD_FLAG)

                if text_span_count == 1 and is_bold and title.isupper():
                    # Alone on its line, bold and uppercase: start (or resume) a section
                    current_section = title
                    # setdefault keeps text already collected if the title shows up again
                    sections.setdefault(current_section, "")
                elif title.upper() in keywords:
                    # The text equals a known section keyword: start (or resume) a section
                    current_section = title.upper()
                    sections.setdefault(current_section, "")
                elif current_section is not None:
                    # Not a section title, so append it to the current section.
                    # Text seen before the first title is dropped.
                    sections[current_section] += text + " "

sections

{'WORK EXPERIENCE': "______________________________________________________________________ Resume Worded,  London, United Kingdom Education technology startup with 50+ employees and $100m+ annual revenue Node.js Developer 08/2021 – Present ● Developed a chatbot for RW's regional branch oﬃces, allowing sales teams to sell popular services to 1500+ customers. ● Built a high-performance, scalable, and effortless solution for web, N-tier, and 20+ distributed systems. ● Created the technical ﬂow, selection of tools and strategy for over 1200+ bots in the ﬁrst month of employment. ● Integrated 40+ user-facing elements created by front-end developers with server-side logic. Polyhire,  London, United Kingdom NYSE-listed recruitment and employer branding company VMware Administrator 10/2019 – 07/2021 ● Delivered OS support for 60+ non-virtualized servers using Microsoft Windows Group Policy Objects (GPO) and other software tools. ● Established VPN connectivity for remote access to 30+ internal

In [41]:
import textwrap

# For every section: print its title, then its text wrapped to 100 columns,
# then a rule of 100 dashes to separate it from the next section.
for section, content in sections.items():
    wrapped_content = textwrap.fill(content, width=100)
    print(section, wrapped_content, "-"*100, sep="\n")

WORK EXPERIENCE
______________________________________________________________________ Resume Worded,  London,
United Kingdom Education technology startup with 50+ employees and $100m+ annual revenue Node.js
Developer 08/2021 – Present ● Developed a chatbot for RW's regional branch oﬃces, allowing sales
teams to sell popular services to 1500+ customers. ● Built a high-performance, scalable, and
effortless solution for web, N-tier, and 20+ distributed systems. ● Created the technical ﬂow,
selection of tools and strategy for over 1200+ bots in the ﬁrst month of employment. ● Integrated
40+ user-facing elements created by front-end developers with server-side logic. Polyhire,  London,
United Kingdom NYSE-listed recruitment and employer branding company VMware Administrator 10/2019 –
07/2021 ● Delivered OS support for 60+ non-virtualized servers using Microsoft Windows Group Policy
Objects (GPO) and other software tools. ● Established VPN connectivity for remote access to 30+
internal reso

### NER

In [42]:
import spacy
from spacy import displacy

# Load the small English pipeline; `nlp` is reused by later cells
nlp = spacy.load("en_core_web_sm")

# Store the spaCy result under its own name. Rebinding `doc` here would shadow the PDF
# document opened with fitz, and re-running the section-grouping cell would then fail.
education_doc = nlp(sections['EDUCATION'])

# Use 'ent' as the style for entity visualization
displacy.render(education_doc, style='ent')

## Identify Degree and Major in Education

Our system uses the spaCy library to extract information (degree and major) from the Education section. We prepared a dictionary containing the education degrees and majors related to the computer engineering field, and fed that dictionary to spaCy's rule-based EntityRuler in order to detect and recognize these entities in the resume's Education section.

In [43]:
import spacy

In [44]:
def match_degrees_by_spacy():
    """Return the distinct degree levels found in the resume's EDUCATION section.

    Loads the ``en_core_web_sm`` pipeline, inserts an entity ruler before the
    ``ner`` component with the patterns stored in ``degrees.jsonl``, and runs it
    on the global ``sections['EDUCATION']`` text. Entities whose label starts
    with ``DEGREE`` are printed as ``(text, label)`` tuples; the second
    ``|``-separated field of the label is taken as the degree level.

    Returns:
        list: degree levels in order of first appearance, without duplicates.
    """
    pipeline = spacy.load("en_core_web_sm")

    # The rule-based matcher runs before the statistical NER component
    degree_ruler = pipeline.add_pipe("entity_ruler", before='ner')
    degree_ruler.from_disk('degrees.jsonl')

    education_doc = pipeline(sections['EDUCATION'])

    found_levels = []
    for entity in education_doc.ents:
        label_fields = entity.label_.split('|')
        if label_fields[0] != 'DEGREE':
            continue
        print((entity.text, entity.label_))
        level = label_fields[1]
        if level not in found_levels:
            found_levels.append(level)
    return found_levels


def match_majors_by_spacy():
    """Return the distinct majors found in the resume's EDUCATION section.

    Loads the ``en_core_web_sm`` pipeline, inserts an entity ruler before the
    ``ner`` component with the patterns stored in ``majors.jsonl``, runs it on
    the global ``sections['EDUCATION']`` text, and renders the recognised
    entities with displacy. For entities whose label starts with ``MAJOR``, the
    third ``|``-separated field of the label is used as the major name, with
    hyphens replaced by spaces.

    Returns:
        list: major names in order of first appearance, without duplicates.

    Raises:
        IndexError: if a ``MAJOR`` label has fewer than three ``|``-separated fields.
    """
    nlp = spacy.load("en_core_web_sm")
    # Add the pattern to the matcher
    patterns_path = 'majors.jsonl'
    ruler = nlp.add_pipe("entity_ruler", before='ner')
    ruler.from_disk(patterns_path)
    # Process some text
    doc1 = nlp(sections['EDUCATION'])

    displacy.render(doc1, style='ent')

    acceptable_majors = []
    for ent in doc1.ents:
        labels_parts = ent.label_.split('|')
        if labels_parts[0] != 'MAJOR':
            continue
        # Normalise once; the former second, identical membership check could never fire
        major = labels_parts[2].replace('-', ' ')
        if major not in acceptable_majors:
            acceptable_majors.append(major)
    return acceptable_majors

In [45]:
# Extract the degree levels from the EDUCATION section; each matched entity is printed as well
match_degrees_by_spacy()

('University', 'DEGREE|BACHELOR')


['BACHELOR']

In [46]:
# Extract the majors from the EDUCATION section; the call also renders the entity visualization
match_majors_by_spacy()

['computer programming']

## Extract Dates

We need to extract the dates under the EXPERIENCE section and then calculate the total years of experience from them.

In [48]:
# Sample sentences with employment date ranges written in several formats (MM/YYYY,
# full and abbreviated month names, two-digit years, "Present", "-" and "to" separators),
# used as input for the date-entity experiment in the next cell.
text_with_dates = '''
I was employed at Company D from 05/2015 - 06/2017.
My time at Company E spanned from March '18 - April '20.
I worked on a project from 01/2021 - Present.
I was part of Company F from Feb 2010 - Jan 2013.
My tenure at Company G was from 07/2014 - 08/2016.
I worked at Company A from January 2011 - February 2012. 
Then, I joined Company B in March 2012 - Present. 
I was part of a project from 10/2019 - 07/2021. 
My tenure at Company C spanned from Jan '15 - Dec '18.
I worked at Company A from January 2011 to February 2012. 
Then, I joined Company B in March 2012 and have been working there till present. 
I was part of a project from 10/2019 to 07/2021. 
'''

In [51]:
# Restrict the visualization to the listed entity labels.
# NOTE(review): spaCy spells its built-in label 'CARDINAL'; confirm that displacy
# matches this filter case-insensitively.
options = dict(ents=['DATE', 'Cardinal'])

# Run the pipeline loaded in the NER cell over the sample sentences
doc = nlp(text_with_dates)

# Render with the 'ent' (entity) style, applying the label filter above
displacy.render(doc, options=options, style='ent')