In [20]:
# https://www.omkarpathak.in/2018/12/18/writing-your-own-resume-parser/
import PyPDF2
import spacy
import io
from spacy.matcher import Matcher
import re
from nltk.corpus import stopwords
import pandas as pd
import string as str
import glob
import pickle

### Text Extraction from PDF

In [21]:
from pdfminer.converter import TextConverter
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage

def text_extraction(file_path):
    
#   Step 1- Open the file
    with open(file_path,'rb') as resume:
#       Step 2- Read the pages of the pdfpages
        for page in PDFPage.get_pages(resume,check_extractable=True,caching=True):
#             Step 3- Create a ResouceManagager (ResourceManager facilitates reuse of shared resources 
#               such as fonts and images so that large objects are not allocated multiple times.)
            resource_manager= PDFResourceManager()
#             Step 4- Create a filehandle that would collect the extracted text
            file_handle= io.StringIO()
#             Step 5- Create a converter that uses the filehandle and the resource manager and needs to provided with the encoding
            converter = TextConverter(resource_manager,file_handle,codec='utf-8',laparams=LAParams())
#             Step 6- Create an interpreter that uses the converter and resource manager to extract text from pdf
            interpreter = PDFPageInterpreter(resource_manager,converter)
            interpreter.process_page(page)
#             Step 7- Get text from file handle
            text = file_handle.getvalue()
            yield text
            converter.close()
            file_handle.close()


def pdf_extraction(file):
    text= ''
    for page in text_extraction(file):
        text += ' ' + page
    return text
# print(text)
# pattern = [{'POS': 'PROPN','LIKE_EMAIL':False,'LIKE_URL':False,'LIKE_NUM':False}, {'POS': 'PROPN','LIKE_EMAIL':False,'LIKE_URL':False,'LIKE_NUM':False}]    

## Get Name

In [22]:
# load pre-trained model
nlp = spacy.load('en_core_web_sm')    # Load English tokenizer, tagger, parser, NER and word vectors
def get_name(resume_text):
    matcher = Matcher(nlp.vocab)          # Match sequences of tokens, based on pattern rules
    nlp_text = nlp(resume_text)       # Tag text accordingly
    pattern = [[{'POS': 'PROPN'}, {'POS': 'PROPN'}]]   #looks for a POS pattern of two proper nouns together 
    matcher.add('NAME', None, *pattern)
    matches = matcher(nlp_text)
    for match_id, start, end in matches:
        span = nlp_text[start:end]
        if 'name' not in span.text.lower():
            return span.text
    
# name= get_name(resume_text=text)
# print(name)

## Get Email ID

In [23]:
def get_email(resume_text):
    matcher = Matcher(nlp.vocab)  
    nlp_text = nlp(resume_text)       # Tag text accordingly
    pattern = [{'LIKE_EMAIL': True}]    #looks for a POS pattern of two proper nouns together 
    matcher.add('EMAIL', None, pattern)
    matches = matcher(nlp_text)
    for match_id, start, end in matches:
        span = nlp_text[start:end]
        return span.text

### Get Number

In [24]:
# matcher.remove('NUMBER')
def get_num(resume_text):
    nlp_text = nlp(resume_text) 
    phone_expression = r'(?:(?:\+?([1-9]|[0-9][0-9]|[0-9][0-9][0-9])\s*(?:[.-]\s*)?)?(?:\(\s*([2-9]1[02-9]|[2-9][02-8]1|[2-9][02-8][02-9])\s*\)|([0-9][1-9]|[0-9]1[02-9]|[2-9][02-8]1|[2-9][02-8][02-9]))\s*(?:[.-]\s*)?)?([2-9]1[02-9]|[2-9][02-9]1|[2-9][02-9]{2})\s*(?:[.-]\s*)?([0-9]{4})(?:\s*(?:#|x\.?|ext\.?|extension)\s*(\d+))?'
    for match in re.finditer(phone_expression, nlp_text.text):
        start, end = match.span()
        span = nlp_text.char_span(start, end) 
        if span and len(span.text)>9:
            return span.text
# number = get_num(text)
# print(number)

### Get Education Variables (Degrees, Subject, Year)

In [25]:
majors = pd.read_csv("majors-list.csv")
majors = set(majors["Major"])

In [26]:
nlp = spacy.load('en_core_web_sm')

# Grad all general stop words
STOPWORDS = set(stopwords.words('english'))

# Education Degrees
EDUCATION = [
            'BE','B.E.', 'B.E', 'BS', 'B.S', 
            'ME', 'M.E', 'M.E.', 'MS', 'M.S', 
            'BTECH', 'B.TECH', 'M.TECH', 'MTECH','BACHELOR','MASTER','MINOR'
        ]

def get_education(resume_text):
    nlp_text = nlp(resume_text)

    # Sentence Tokenizer
    nlp_text = [sent.string.strip() for sent in nlp_text.sents]

    edu = {}
    # Extract education degree
    for index, text in enumerate(nlp_text):
        for tex in text.split():
            # Replace all special symbols
            tex = re.sub(r'[?|$|.|!|,]', r'', tex)
            if tex.upper() in EDUCATION and tex not in STOPWORDS and index+1 < len(nlp_text):
                edu[tex] = text + nlp_text[index + 1] 
    
    education_qual= list(edu.keys())
    
#     Extract Major
    matches = {x for x in majors for j in edu.keys() if x in edu[j].upper() }
    majors_or_minors = list(matches)
    # Extract year
    years = []
    for key in edu.keys():
        year = re.search(re.compile(r'(((20|19)(\d{2})))'), edu[key])
        if year:
            years.append(year[0])
        
    return education_qual,majors_or_minors,years

# print(get_education(text))

### Skill Extraction

In [27]:
    # reading the csv file I created
    data = pd.read_csv("skills.csv") 
    
    skills = []
    skills =[i for x in data.columns for i in data[x] ]
    skills = list(skills)
    
#     The one from github 
    



In [28]:
# https://www.geeksforgeeks.org/python-intersection-two-lists/
def intersection(a, b): 
    
#     b = set(b) 
#     print(a,b) 
    c = set(b).intersection(a) 
#     print(c)
    return len(c) 

In [29]:
import pandas as pd
import spacy

# load pre-trained model
nlp = spacy.load('en_core_web_sm')

def get_skills(resume_text):
    nlp_text = nlp(resume_text)
    noun_chunks = nlp_text.noun_chunks
    # removing stop words and implementing word tokenization
    tokens = [token.text for token in nlp_text if not token.is_stop]
    
    skillset = {}
    
    # check for one-grams (example: python)
    for token in tokens:
        if token.lower() in skills:
            if token.lower() in skillset:
                skillset[token.lower()] += 1
            else:
                skillset[token.lower()] = 1
            
    
    # check |for bi-grams and tri-grams (example: machine learning)
    for token in noun_chunks:
        token = token.text.lower().strip()
        if token in skills:
            if token in skillset:
                skillset[token] += 1
            else:
                skillset[token] = 1
    
    skill_score = {}
    for i in data.columns:
#         print(type(skillset.keys),type(data[i]))
        skill_score[i]= intersection(list(skillset.keys()),list(data[i]))
        
    
    return (skillset,skill_score)

 ## Multiple Resumes

In [30]:
resumes_dict = {}
for path in glob.iglob("Multiple_Resumes/" + "*.pdf"):
    print(path)
    resume_text= pdf_extraction(path)
    curr_email = get_email(resume_text)
    resumes_dict[curr_email]= {}
    resumes_dict[curr_email]["NAME"]= get_name(resume_text)
    resumes_dict[curr_email]["NUMBER"] = get_num(resume_text)
    resumes_dict[curr_email]["EDUCATION"] = get_education(resume_text)
    resumes_dict[curr_email]["SKILLS"],resumes_dict[curr_email]["SCORE"] = get_skills(resume_text)

Multiple_Resumes/agrawal_saumya_resume_14323135.pdf
Multiple_Resumes/mehta_jinal_resume_14066856.pdf
Multiple_Resumes/jiang_jingcheng_resume_14792170.pdf
Multiple_Resumes/gupta_anish_resume_14706803.pdf
Multiple_Resumes/shetty_siddharth_kishor_resume_14778403.pdf
Multiple_Resumes/qin_xuefeng_resume_14823891.pdf
Multiple_Resumes/kim_andrew_resume_14937724.pdf
Multiple_Resumes/obetta_ebubechukwu_resume_13415673.pdf
Multiple_Resumes/varma_ayush_resume_12465308.pdf
Multiple_Resumes/kras_michelle_resume_14992930.pdf
Multiple_Resumes/wang_justin_resume_13153799.pdf
Multiple_Resumes/lou_robert_resume_14892216.pdf
Multiple_Resumes/gupta_tanya_resume_14340664.pdf
Multiple_Resumes/pachori_kunal_resume_14723849.pdf
Multiple_Resumes/mehta_chaitya_sunil_resume_14875421.pdf
Multiple_Resumes/yuan_hao_resume_14053396.pdf
Multiple_Resumes/joshi_jay_resume_13020321.pdf
Multiple_Resumes/mehta_monil_resume_14867543.pdf
Multiple_Resumes/parhi_sanjay_resume_14998184.pdf
Multiple_Resumes/putta_sarvani_resume

KeyboardInterrupt: 

In [None]:
f = open("resumes_dict.pkl","wb")
pickle.dump(resumes_dict,f)
f.close()

In [31]:
print(resumes_dict)

{'saumyaa2@illinois.edu': {'NAME': 'Computer Science', 'NUMBER': '217-819-0855', 'EDUCATION': (['Bachelor'], ['ECONOMICS', 'COMPUTER SCIENCE', 'STATISTICS'], []), 'SKILLS': {'statistics': 4, 'python': 4, 'analysis': 3, 'research': 2, 'datasets': 1, 'audio': 1, 'hardware': 1, 'c++': 2, 'android': 2, 'mobile': 1, 'policies': 1, 'ui': 1, 'interactive': 1, 'excel': 1, 'technical': 1, 'git': 2, 'java': 2, 'r': 2, 'html': 1, 'css': 1, 'sql': 1, 'writing': 1, 'design': 1, 'economics': 1, 'calculus': 1, 'computer science': 3, 'microsoft excel': 1, 'technical skills': 1, 'statistical analysis': 1}, 'SCORE': {'Statistics': 1, 'Machine Learning': 0, 'Deep Learning': 0, 'R language': 0, 'Python Language': 1, 'NLP': 0, 'Data Engineering': 1, 'Other Softwares and skills': 4}}, 'jinalm2@illinois.edu': {'NAME': 'Jinal Jayesh', 'NUMBER': None, 'EDUCATION': (['Master', 'Bachelor'], [], ['2017']), 'SKILLS': {'engineering': 8, 'electronics': 2, 'system': 2, 'training': 1, 'consulting': 2, 'database': 1, '