In [1]:
import numpy as np 
import pandas as pd 

from pdfminer.pdfpage import PDFPage
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams 
import io 

### Reading the resume pdf pages 

In [2]:
input_file = open('data/data/CONSTRUCTION/10100240.pdf', 'rb')

resMgr = PDFResourceManager()
retData = io.StringIO()
TxtConverter = TextConverter(resMgr, retData, laparams = LAParams())
interpreter = PDFPageInterpreter(resMgr, TxtConverter)

for page in PDFPage.get_pages(input_file): 
    interpreter.process_page(page)
    txt = retData.getvalue()
    print(txt)

CARPENTER APPRENTICE
Professional Summary

Former construction and carpentry worker committed to high-quality workmanship and safe environments. Over 4 years of hands-on experience
working with remodeling projects in residential and commercial construction. Dedicated work ethic and exceptional attendance record. I am
motivated to learn new trades and skills. Interested in growing career and fostering leadership capabilities. Professional Construction Laborer with
excellent interpersonal skills. Works in timely and efficient manner to see tough jobs through to completion.

Skills

Cabinetry
Material prepping
Construction drawing interpretation
Carpentry techniques
Oral and written communication

Materials handling
New construction and renovation
Drywalling
Safety and compliance

Work History
Carpenter Apprentice , 10/2017 to 09/2019 
Company Name â€“ City , State 

Assisted with movement of materials and equipment to job sites
Worked with master carpenters to install decks for both resi

In [4]:
import re
# replace the new line to space 
txt = txt.replace('\n', " ")  

txt = txt.replace('[^a-zA-Z0-9]', " ")
re.sub('\W+','',txt)
print(txt)

CARPENTER APPRENTICE Professional Summary  Former construction and carpentry worker committed to high-quality workmanship and safe environments. Over 4 years of hands-on experience working with remodeling projects in residential and commercial construction. Dedicated work ethic and exceptional attendance record. I am motivated to learn new trades and skills. Interested in growing career and fostering leadership capabilities. Professional Construction Laborer with excellent interpersonal skills. Works in timely and efficient manner to see tough jobs through to completion.  Skills  Cabinetry Material prepping Construction drawing interpretation Carpentry techniques Oral and written communication  Materials handling New construction and renovation Drywalling Safety and compliance  Work History Carpenter Apprentice , 10/2017 to 09/2019  Company Name â€“ City , State   Assisted with movement of materials and equipment to job sites Worked with master carpenters to install decks for both resi

#### Extracting Name 

In [5]:
import spacy
from spacy.matcher import Matcher 

# Load the pre-trained model, this understands the English text 
nlp = spacy.load('en_core_web_sm')

# creating a matcher object to define and find patterns 
matcher = Matcher(nlp.vocab)

def extract_name(resume_text):
    nlp_text = nlp(resume_text)
    
    # two proper noun in a row
    patterns = [{'POS':"PROPN"},{"POS":'PROPN'}]
    matcher.add('NAME', [patterns], on_match=None)
    matches = matcher(nlp_text)
    
    
    for match_id, start, end in matches: 
        span = nlp_text[start: end]
        return span.text
    
name = extract_name(txt)

print(name)
    

CARPENTER APPRENTICE


#### Extract Mobile number

In [6]:
def extract_mobile(string): 
    pattern = r'(\d{3}[-\.\s]??\d{3}[-\.\s]??\d{4}|\(\d{3}\)\s*\d{3}[-\.\s]??\d{4}|\d{3}[-\.\s]??\d{4})'
    r = re.compile(pattern)
    mobile = r.findall(string)
    return [re.sub(r'\D','', num) for num in mobile]

mobile_number = extract_mobile(txt)
print(mobile_number)

[]


#### Extract email 


In [7]:
def extract_email(string): 
    pattern = r'[\w\.-]+\s*@\s*[\w\.-]+'
    r = re.compile(pattern)
    return r.findall(string)

email = extract_email(txt)
print(email)

[]


#### Extract Education 

In [67]:
def extract_edu(text): 
    nlp_text = nlp(text)
    
    tokens = [token.text for token in nlp_text if not token.is_stop]
    
    # skills database 
    edu = ['m.c.a','masters in computer application', 'mca', 'mtech','b.tech',
          'b.e','b.s','bs','be','btech','bsc','b.sc','bachelor of science',
          'bachelor of engineering','bachelor of technology', 'bachelor of',
          'mba','m.b.a','m.com','mcom','b.com','bcom','ma','m.a','b.a','ba','masters of computer application',
          'bachelors of computer application']
    
    extractedEdu = []
    
    # one-gram
    for i in tokens: 
        if i.lower() in edu: 
            extractedEdu.append(i)
            
    # bi-grams or tri-grams 
    for i in nlp_text.noun_chunks: 
        i = i.text.lower().strip()
        if i in edu: 
            extractedEdu.append(i)
        
    # capitalize and remove duplicates 
    
    return [word.capitalize() for word in set([word.lower() for word in extractedEdu])]

education = extract_edu(txt)
print(education)

[]


#### Extracting skills using linkedin skill data 

In [68]:
linkedin_skill = []
with open('linkedin skill', encoding='utf-8') as file: 
    skills = list(file)

for skill in skills: 
    linkedin_skill.append(skill.strip().lower())

def extract_skills(text): 
    nlp_text = nlp(text)
    tokens = [token.text for token in nlp_text if not token.is_stop]
    
    skillset = []
    
    for i in tokens: 
        if i.lower() in linkedin_skill: 
            skillset.append(i)
            
    for i in nlp_text.noun_chunks: 
        i = i.text.lower().strip()
        if i in skills: 
            skillset.append(i)
            
    return [word.capitalize() for word in set([word.lower() for word in skillset])]


skills = extract_skills(txt)

print(skills)

['Microsoft', 'C++', 'Visualization', 'Iit', 'Public', 'Prediction', 'Numpy', 'Analytics', 'Computing', 'Api', 'Models', 'Pandas', 'Uav', 'Project', 'Linkedin', 'Reinforcement', 'Platforms', 'Java', 'Excel', 'Python', 'Android', 'Mobile', 'Transparency', 'Algorithms', 'Scipy', 'Cyber', 'Application', 'Edge', 'Online', 'Web', 'Classification', 'Learning', 'Graphs', 'Technology', 'Sql']


#### Extract Resume details 

In [69]:
def resume_file(file_name):
    file = open(file_name,'rb')
    resMgr = PDFResourceManager()
    retData = io.StringIO()
    TxtConverter = TextConverter(resMgr, retData, laparams = LAParams())
    interpreter = PDFPageInterpreter(resMgr, TxtConverter)
    
    for page in PDFPage.get_pages(file): 
        interpreter.process_page(page)
        txt = retData.getvalue()
        
        return txt
    

In [70]:
def resume_details(text): 
    name = extract_name(txt)
    mobile = extract_mobile(txt)
    email = extract_email(txt)
    edu = extract_edu(txt)
    skills = extract_skills(txt)
    
    details = {}
    details['Name'] = name 
    details['Mobile no.'] = mobile
    details['Email'] = email
    details['Education'] = edu 
    details['Skills'] = skills
    
    
    return details

#### Example 1 

In [71]:
resume_details(txt)

{'Name': 'Vellore Institute',
 'Mobile no.': ['8307509992'],
 'Email': ['psharma0880@gmail.com'],
 'Education': [],
 'Skills': ['Microsoft',
  'C++',
  'Visualization',
  'Iit',
  'Public',
  'Prediction',
  'Numpy',
  'Analytics',
  'Computing',
  'Api',
  'Models',
  'Pandas',
  'Uav',
  'Project',
  'Linkedin',
  'Reinforcement',
  'Platforms',
  'Java',
  'Excel',
  'Python',
  'Android',
  'Mobile',
  'Transparency',
  'Algorithms',
  'Scipy',
  'Cyber',
  'Application',
  'Edge',
  'Online',
  'Web',
  'Classification',
  'Learning',
  'Graphs',
  'Technology',
  'Sql']}

#### Example 2 

In [72]:
txt = resume_file('Resume_Pooja.pdf') 
resume_details(txt)

{'Name': 'Vellore Institute',
 'Mobile no.': ['8307509992'],
 'Email': ['psharma0880@gmail.com'],
 'Education': [],
 'Skills': ['Microsoft',
  'C++',
  'Visualization',
  'Iit',
  'Public',
  'Prediction',
  'Numpy',
  'Analytics',
  'Computing',
  'Api',
  'Models',
  'Pandas',
  'Uav',
  'Project',
  'Linkedin',
  'Reinforcement',
  'Platforms',
  'Java',
  'Excel',
  'Python',
  'Android',
  'Mobile',
  'Transparency',
  'Algorithms',
  'Scipy',
  'Cyber',
  'Application',
  'Edge',
  'Online',
  'Web',
  'Classification',
  'Learning',
  'Graphs',
  'Technology',
  'Sql']}

In [73]:
txt = resume_file('Zoya Lala Resume.pdf')
resume_details(txt)

{'Name': 'Ms. Zoya',
 'Mobile no.': ['4292812'],
 'Email': ['zoya.lala45@gmail.com'],
 'Education': [],
 'Skills': ['C',
  'Oracle',
  'C++',
  'Database',
  'Pwa',
  'App',
  'Chat',
  'Cisco',
  'Typing',
  'Wordpress',
  'Php',
  'Design',
  'Features',
  'Foundations',
  'Socket.io',
  'Project',
  'Java',
  'Languages',
  'Html',
  'Authentication',
  'Python',
  'English',
  'Hindi',
  'Gold',
  'Ux',
  'Javascript',
  'Application',
  'Web',
  'Football',
  'Swimming',
  'Learning',
  'Mysql',
  'Otp',
  'Figma',
  'Technology',
  'Form',
  'Css',
  'Punjabi',
  'Reason']}

In [74]:
txt = resume_file('UtkarshChaturvedi_Resume.pdf')
resume_details(txt)

{'Name': 'Utkarsh Chaturvedi',
 'Mobile no.': ['6265437545'],
 'Email': ['utkarshchaturvedi908@gmail.com'],
 'Education': [],
 'Skills': ['Aws',
  'C++',
  'Database',
  'App',
  'Context',
  'Functionality',
  'Storage',
  'Api',
  'Protocol',
  'Design',
  'Basic',
  'Platforms',
  'Java',
  'Drag',
  'Html',
  'Languages',
  'Python',
  'Amazon',
  'Mobile',
  'Email',
  'Live',
  'Tamil',
  'Facebook',
  'Javascript',
  'Application',
  'Web',
  'Mongodb',
  'Mysql',
  'Lex',
  'Technology',
  'Form',
  'Css']}