# Reference Used

https://omkarpathak.in/2018/12/18/writing-your-own-resume-parser/

In [1]:
import numpy as np
import pandas as pd
import spacy
import re
import io

In [2]:
from pdfminer.converter import TextConverter
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage

def extract_text_from_pdf(pdf_path):
    with open(pdf_path, 'rb') as fh:
        
        # iterate over all pages of PDF document
        for page in PDFPage.get_pages(fh, caching=True, check_extractable=True):
            
            # creating a resoure manager
            resource_manager = PDFResourceManager()
            
            # create a file handle
            fake_file_handle = io.StringIO()
            
            # creating a text converter object
            converter = TextConverter(
                                resource_manager, 
                                fake_file_handle, 
                                codec='utf-8', 
                                laparams=LAParams()
                        )

            # creating a page interpreter
            page_interpreter = PDFPageInterpreter(
                                resource_manager, 
                                converter
                            )

            # process current page
            page_interpreter.process_page(page)
            
            # extract text
            text = fake_file_handle.getvalue()
            yield text

            # close open handles
            converter.close()
            fake_file_handle.close()

# calling above function and extracting text
text = ''
for page in extract_text_from_pdf('Whitmore-resume.pdf'):
    text += ' ' + page


In [3]:
text

' Jonathan Whitmore\nPhD, Data Scientist\n\nExperience\n\nMountain View, CA\n+1 650-943-3715\n(cid:66) JBWhit@gmail.com\n(cid:205) JonathanWhitmore.com\nJBWhit\nJonathanBWhitmore\n\n2014-\nPresent\n\nData Scientist, Silicon Valley Data Science, Mountain View, CA, USA.\n(cid:123) Consulting as a member of several small data science/data engineering teams at multiple companies.\n(cid:123) Creating output to explain data analysis, data visualization, and statistical modeling results to managers.\n(cid:123) Modeling survey data responses with ordinal logistic regression in R.\n(cid:123) Analyzing and visualizing user behavior migration.\n\n2014 Insight Fellow, Insight Data Science, Palo Alto, CA, USA.\n\n(cid:123) Created a Data Science project to predict the auction sale price of Abstract Expressionist art.\n\n2011–2014 Postdoctoral Research Associate, Swinburne University, Melbourne, AUS.\n\n(cid:123) Cleaned noisy and inhomogeneous astronomical data taken over four years by diﬀerent obs

# Extracting Name 

In [4]:
from spacy.matcher import Matcher

# load pre-trained model
nlp = spacy.load('en_core_web_sm')

# initialize matcher with a vocab
matcher = Matcher(nlp.vocab)

def extract_name(resume_text):
    nlp_text = nlp(resume_text)
    
    # First name and Last name are always Proper Nouns
    pattern = [{'POS': 'PROPN'}, {'POS': 'PROPN'}]
    
    matcher.add('NAME', None, pattern)
    
    matches = matcher(nlp_text)
    
    for match_id, start, end in matches:
        span = nlp_text[start:end]
        return span.text

In [5]:
name = extract_name(text)
print(name)

Jonathan Whitmore


# Extracting Phone Numbers

In [6]:
def extract_mobile_number(text):
    phone = re.findall(re.compile(r'(?:(?:\+?([1-9]|[0-9][0-9]|[0-9][0-9][0-9])\s*(?:[.-]\s*)?)?(?:\(\s*([2-9]1[02-9]|[2-9][02-8]1|[2-9][02-8][02-9])\s*\)|([0-9][1-9]|[0-9]1[02-9]|[2-9][02-8]1|[2-9][02-8][02-9]))\s*(?:[.-]\s*)?)?([2-9]1[02-9]|[2-9][02-9]1|[2-9][02-9]{2})\s*(?:[.-]\s*)?([0-9]{4})(?:\s*(?:#|x\.?|ext\.?|extension)\s*(\d+))?'), text)
    
    if phone:
        number = ''.join(phone[0])
        if len(number) > 10:
            return '+' + number
        else:
            return number

In [7]:
mobile_number = extract_mobile_number(text)
print(mobile_number)

+16509433715


# Extracting Email

In [8]:
def extract_email(email):
    email = re.findall("([^@|\s]+@[^@]+\.[^@|\s]+)", email)
    if email:
        try:
            return email[0].split()[0].strip(';')
        except IndexError:
            return None

In [9]:
email = extract_email(text)
print(email)

JBWhit@gmail.com


# Extracting Skills
First we created a skills.csv file containing some skills

In [10]:
# load pre-trained model
nlp = spacy.load('en_core_web_sm')


def extract_skills(resume_text):
    nlp_text = nlp(resume_text)

    # removing stop words and implementing word tokenization
    tokens = [token.text for token in nlp_text if not token.is_stop]
    
    # reading the csv file
    data = pd.read_csv("new_skills.csv", encoding='latin-1') 
    
    # extract values
    skills = list(data['skills'])
    
    skillset = []
    
    # check for one-grams (example: python)
    for token in tokens:
        if token.lower() in skills:
            skillset.append(token)
    
    # check for bi-grams and tri-grams (example: machine learning)
    for token in nlp_text.noun_chunks:
        token = token.text.lower().strip()
        if token in skills:
            skillset.append(token)
    
    return [i.capitalize() for i in set([i.lower() for i in skillset])]


In [11]:
skills = extract_skills(text)
print(skills)

['Stochastic methods', 'Astronomy', 'Building', 'Silicon', 'Testing', 'Publishing', 'Star', 'Mathematics', 'Git', 'Visualization', 'Physics', 'Allocations', 'Output', 'Alto', 'Research', 'Narrator', 'Data analysis', 'Survey', 'Web', 'Python', 'Sql', 'Scipy', 'Pandas', 'Mastering', 'Modeling', '3d', 'Latex', 'Sensitivity analysis', 'Model building', 'Css', 'Groups', 'Teams', 'Html', 'Numpy', 'Art', 'Philosophy', 'Data visualization', 'Hive', 'Languages', 'Range', 'R', 'Led', 'Repository', 'Project', 'Sensitivity', 'Access', 'Computational physics', 'Matplotlib']


In [12]:
import nltk

In [13]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [14]:
from nltk.corpus import stopwords

# load pre-trained model
nlp = spacy.load('en_core_web_sm')

# Grad all general stop words
STOPWORDS = set(stopwords.words('english'))

# Education Degrees
EDUCATION = [
            'BE','B.E.', 'B.E', 'BS', 'B.S', 
            'ME', 'M.E', 'M.E.', 'MS', 'M.S', 
            'BTECH', 'B.TECH', 'M.TECH', 'MTECH', 
            'SSC', 'HSC', 'CBSE', 'ICSE', 'X', 'XII'
        ]

def extract_education(resume_text):
    nlp_text = nlp(resume_text)

    # Sentence Tokenizer
    nlp_text = [sent.string.strip() for sent in nlp_text.sents]

    edu = {}
    
    # Extract education degree
    for index, text in enumerate(nlp_text):
        for tex in text.split():
            # Replace all special symbols
            tex = re.sub(r'[?|$|.|!|,]', r'', tex)
            if tex.upper() in EDUCATION and tex not in STOPWORDS:
                edu[tex] = text + nlp_text[index + 1]

    # Extract year
    education = []
    for key in edu.keys():
        year = re.search(re.compile(r'(((20|19)(\d{2})))'), edu[key])
        if year:
            education.append((key, ''.join(year[0])))
        else:
            education.append(key)
    return education

In [15]:
education = extract_education(text)
print(education)

[('MS', '2005')]


In [16]:
result = {'name': name, 'education': education,'email': email, 'mobile_number': mobile_number, 'skills': skills}
print(result)

{'name': 'Jonathan Whitmore', 'education': [('MS', '2005')], 'email': 'JBWhit@gmail.com', 'mobile_number': '+16509433715', 'skills': ['Stochastic methods', 'Astronomy', 'Building', 'Silicon', 'Testing', 'Publishing', 'Star', 'Mathematics', 'Git', 'Visualization', 'Physics', 'Allocations', 'Output', 'Alto', 'Research', 'Narrator', 'Data analysis', 'Survey', 'Web', 'Python', 'Sql', 'Scipy', 'Pandas', 'Mastering', 'Modeling', '3d', 'Latex', 'Sensitivity analysis', 'Model building', 'Css', 'Groups', 'Teams', 'Html', 'Numpy', 'Art', 'Philosophy', 'Data visualization', 'Hive', 'Languages', 'Range', 'R', 'Led', 'Repository', 'Project', 'Sensitivity', 'Access', 'Computational physics', 'Matplotlib']}


In [17]:
import csv
results = []
results.append(result)

filename = 'Jonathan_Whitmore_Resume.csv'
with open(filename,'w',newline='') as f:
    w = csv.DictWriter(f,list(result.keys()))
    w.writeheader()
    for r in results:
        w.writerow(r)

In [19]:
dataset = pd.read_csv('Jonathan_Whitmore_Resume.csv')
dataset.head()

Unnamed: 0,name,education,email,mobile_number,skills
0,Jonathan Whitmore,"[('MS', '2005')]",JBWhit@gmail.com,16509433715,"['Stochastic methods', 'Astronomy', 'Building'..."
