In [8]:
# https://www.omkarpathak.in/2018/12/18/writing-your-own-resume-parser/
import PyPDF2
import spacy
import io
from spacy.matcher import Matcher
import re
from nltk.corpus import stopwords
import pandas as pd
import string as str

### Text Extraction from PDF

In [45]:
from pdfminer.converter import TextConverter
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage

def text_extraction(file_path):
    """Yield the text of a PDF file one page at a time.

    Args:
        file_path: Path of the PDF file to read.

    Yields:
        The extracted text of one page, in the order the pages are read.
    """
    # A single resource manager serves every page: it exists to reuse shared
    # resources such as fonts and images so large objects are not allocated
    # multiple times, which only pays off when it outlives a single page.
    resource_manager = PDFResourceManager()
    layout_params = LAParams()

    with open(file_path, 'rb') as resume:
        for page in PDFPage.get_pages(resume, check_extractable=True, caching=True):
            # Fresh buffer per page so each yielded string holds that page only.
            file_handle = io.StringIO()
            converter = TextConverter(resource_manager, file_handle, codec='utf-8', laparams=layout_params)
            try:
                interpreter = PDFPageInterpreter(resource_manager, converter)
                interpreter.process_page(page)
                page_text = file_handle.getvalue()
            finally:
                # Release the converter and buffer before handing control back,
                # so they are closed even if the caller stops iterating early
                # or processing the page raises.
                converter.close()
                file_handle.close()
            yield page_text
# Run the extractor over the resume and glue the pages into one string,
# with a single space in front of every page.
file = "Vismayak\'s Resume.pdf"
text = ''.join(' ' + page for page in text_extraction(file))

## Get Name

In [46]:
# Load the pre-trained model and build the token matcher. Both are module-level
# objects; get_name and get_email below read `nlp` and add rules to `matcher`.
nlp = spacy.load('en_core_web_sm')    # Load English tokenizer, tagger, parser, NER and word vectors
matcher = Matcher(nlp.vocab, validate=True)          # Match sequences of tokens, based on pattern rules
def get_name(resume_text):
    """Return the first pair of adjacent proper nouns found in the text.

    Uses the module-level `nlp` pipeline and the shared module-level `matcher`.

    Args:
        resume_text: The resume as a single string.

    Returns:
        The text of the first two-proper-noun sequence, or None when the
        text contains no such sequence.
    """
    nlp_text = nlp(resume_text)       # Tag text accordingly

    # Register the rule only once: adding it again on every call would pile up
    # duplicate patterns on the shared matcher.
    if 'NAME' not in matcher:
        pattern = [{'POS': 'PROPN'}, {'POS': 'PROPN'}]    # two proper nouns in a row
        matcher.add('NAME', None, pattern)

    # The matcher is shared with other extractors, so keep only hits that
    # belong to the NAME rule.
    name_id = nlp.vocab.strings['NAME']
    for match_id, start, end in matcher(nlp_text):
        if match_id == name_id:
            return nlp_text[start:end].text
    return None
    
# Pull the candidate's name out of the extracted resume text and show it
name = get_name(text)
print(name)

Vismayak Mohanarajan


## Get Email ID

In [47]:
# Drop the NAME rule from the shared matcher, but only when it is registered:
# removing a missing rule raises, which would break re-running this cell.
if 'NAME' in matcher:
    matcher.remove('NAME')
def get_email(resume_text):
    """Return the first token in the text that looks like an e-mail address.

    Uses the module-level `nlp` pipeline and the shared module-level `matcher`.

    Args:
        resume_text: The resume as a single string.

    Returns:
        The text of the first e-mail-like token, or None when there is none.
    """
    nlp_text = nlp(resume_text)       # Tag text accordingly

    # Register the rule only once: adding it again on every call would pile up
    # duplicate patterns on the shared matcher.
    if 'EMAIL' not in matcher:
        pattern = [{'LIKE_EMAIL': True}]    # a single token flagged as e-mail-like
        matcher.add('EMAIL', None, pattern)

    # The matcher is shared with other extractors, so keep only hits that
    # belong to the EMAIL rule.
    email_id = nlp.vocab.strings['EMAIL']
    for match_id, start, end in matcher(nlp_text):
        if match_id == email_id:
            return nlp_text[start:end].text
    return None
# Pull the e-mail address out of the extracted resume text and show it
email = get_email(resume_text=text)
print(email)

mohanar2@illinois.edu


### Get Phone Number

In [48]:
# matcher.remove('NUMBER')
def get_num(resume_text):
    """Return the first phone-number-like substring of the text.

    The search runs directly on the raw string. Mapping the regex hit back to
    a token span is unnecessary and fails whenever the hit does not line up
    with token boundaries.

    Args:
        resume_text: The resume as a single string.

    Returns:
        The matched phone number exactly as it appears in the text (including
        any extension suffix the pattern accepts), or None when nothing matches.
    """
    phone_expression = r'(?:(?:\+?([1-9]|[0-9][0-9]|[0-9][0-9][0-9])\s*(?:[.-]\s*)?)?(?:\(\s*([2-9]1[02-9]|[2-9][02-8]1|[2-9][02-8][02-9])\s*\)|([0-9][1-9]|[0-9]1[02-9]|[2-9][02-8]1|[2-9][02-8][02-9]))\s*(?:[.-]\s*)?)?([2-9]1[02-9]|[2-9][02-9]1|[2-9][02-9]{2})\s*(?:[.-]\s*)?([0-9]{4})(?:\s*(?:#|x\.?|ext\.?|extension)\s*(\d+))?'
    match = re.search(phone_expression, resume_text)
    if match is None:
        return None
    return match.group(0)
# Pull the phone number out of the extracted resume text and show it
number = get_num(resume_text=text)
print(number)

217-974-5947


### Get Education Variables (Degrees, Subject, Year)

In [49]:
# Distinct values of the "Major" column of the majors list
majors = set(pd.read_csv("majors-list.csv")["Major"])

In [50]:
nlp = spacy.load('en_core_web_sm')

# Grab all general English stop words
STOPWORDS = set(stopwords.words('english'))

# Degree keywords that get_education looks for (compared against upper-cased words)
EDUCATION = [
    'BE', 'B.E.', 'B.E', 'BS', 'B.S',
    'ME', 'M.E', 'M.E.', 'MS', 'M.S',
    'BTECH', 'B.TECH', 'M.TECH', 'MTECH',
    'BACHELOR', 'MASTER', 'MINOR',
]

def get_education(resume_text):
    """Extract degree keywords, majors/minors and years from the resume text.

    Relies on the module-level `nlp`, `EDUCATION`, `STOPWORDS` and `majors`.

    Args:
        resume_text: The resume as a single string.

    Returns:
        A tuple `(education_qual, majors_or_minors, years)`:
        education_qual is the list of degree keywords found (as written in the
        text, punctuation removed); majors_or_minors is the list of entries of
        `majors` that occur in the upper-cased text around a degree keyword;
        years is the list of the first 19xx/20xx year found around each
        keyword (keywords with no year contribute nothing).
    """
    doc = nlp(resume_text)

    # Sentence tokenizer. `.text` gives the same stripped sentence as the
    # older `.string` attribute.
    sentences = [sent.text.strip() for sent in doc.sents]

    # Map each degree keyword to the sentence it occurs in plus the next one.
    edu = {}
    for index, sentence in enumerate(sentences):
        for word in sentence.split():
            # Remove special symbols before comparing against the keyword list
            word = re.sub(r'[?|$|.|!|,]', r'', word)
            if word.upper() in EDUCATION and word not in STOPWORDS:
                # The keyword may sit in the last sentence, in which case
                # there is no following sentence to append.
                has_next = index + 1 < len(sentences)
                following = sentences[index + 1] if has_next else ''
                edu[word] = sentence + following

    education_qual = list(edu)

    # Majors: any known major that appears in the upper-cased context of a degree
    contexts_upper = [context.upper() for context in edu.values()]
    majors_or_minors = list({
        major
        for major in majors
        if any(major in context for context in contexts_upper)
    })

    # Years: first 19xx / 20xx occurrence in each degree's context
    years = []
    for context in edu.values():
        year = re.search(r'(((20|19)(\d{2})))', context)
        if year:
            years.append(year.group(0))

    return education_qual, majors_or_minors, years

# The function defined in this notebook is get_education; calling the old
# name extract_education raises NameError on a fresh kernel.
print(get_education(text))

(['Bachelor', 'Minor'], ['COMPUTER ENGINEERING', 'STATISTICS'], [])


### Skill Extraction

In [None]:
    # Read the hand-made skills table and flatten it, column by column,
    # into one list of cell values.
    data = pd.read_csv("skills.csv")

    skills = []
    for column in data.columns:
        skills.extend(data[column])
    
#     The one from github 
    
   

In [80]:
import pandas as pd
import spacy

# load pre-trained model (rebinds the module-level `nlp` that earlier cells also assign)
nlp = spacy.load('en_core_web_sm')

def get_skills(resume_text):
    """Count how often each known skill is mentioned in the resume text.

    Relies on the module-level `nlp` pipeline and the module-level `skills`
    collection of skill names.

    Args:
        resume_text: The resume as a single string.

    Returns:
        A dict mapping each matched skill (lower-cased) to its number of
        occurrences.
    """
    nlp_text = nlp(resume_text)

    # Hash-based lookup instead of scanning the whole skills list per token.
    known_skills = set(skills)

    skillset = {}

    # One-grams (example: python): every non-stop-word token
    for token in nlp_text:
        if token.is_stop:
            continue
        word = token.text.lower()
        if word in known_skills:
            skillset[word] = skillset.get(word, 0) + 1

    # Bi-grams and tri-grams (example: machine learning): noun chunks
    for chunk in nlp_text.noun_chunks:
        # A one-word chunk whose token is not a stop word was already counted
        # by the one-gram pass; counting it again would double its total.
        if len(chunk) == 1 and not chunk[0].is_stop:
            continue
        phrase = chunk.text.lower().strip()
        if phrase in known_skills:
            skillset[phrase] = skillset.get(phrase, 0) + 1

    return skillset

In [81]:
# Skill counts for the extracted resume text (shown as the cell's output)
get_skills(resume_text=text)