In [1]:
import spacy

### Load english tokenizer, tagger, parser, NER & word vectors

In [2]:
nlp=spacy.load("en_core_web_sm")

### Sample text

In [6]:
# process whole documents
text="Google was founded on September 4, 1998, by computer scientists Larry Page and Sergey Brin while they were PhD students at Stanford University in California. Together they own about 14% of its publicly listed shares and control 56% of its stockholder voting power through super-voting stock. The company went public via an initial public offering (IPO) in 2004. In 2015, Google was reorganized as a wholly owned subsidiary of Alphabet Inc. Google is Alphabet's largest subsidiary and is a holding company for Alphabet's internet properties and interests. Sundar Pichai was appointed CEO of Google on October 24, 2015, replacing Larry Page, who became the CEO of Alphabet. On December 3, 2019, Pichai also became the CEO of Alphabet."
text

"Google was founded on September 4, 1998, by computer scientists Larry Page and Sergey Brin while they were PhD students at Stanford University in California. Together they own about 14% of its publicly listed shares and control 56% of its stockholder voting power through super-voting stock. The company went public via an initial public offering (IPO) in 2004. In 2015, Google was reorganized as a wholly owned subsidiary of Alphabet Inc. Google is Alphabet's largest subsidiary and is a holding company for Alphabet's internet properties and interests. Sundar Pichai was appointed CEO of Google on October 24, 2015, replacing Larry Page, who became the CEO of Alphabet. On December 3, 2019, Pichai also became the CEO of Alphabet."

### NLP at Work

In [7]:
doc=nlp(text)

### Tokenization

In [8]:
for token in doc:
    print(token)

Google
was
founded
on
September
4
,
1998
,
by
computer
scientists
Larry
Page
and
Sergey
Brin
while
they
were
PhD
students
at
Stanford
University
in
California
.
Together
they
own
about
14
%
of
its
publicly
listed
shares
and
control
56
%
of
its
stockholder
voting
power
through
super
-
voting
stock
.
The
company
went
public
via
an
initial
public
offering
(
IPO
)
in
2004
.
In
2015
,
Google
was
reorganized
as
a
wholly
owned
subsidiary
of
Alphabet
Inc.
Google
is
Alphabet
's
largest
subsidiary
and
is
a
holding
company
for
Alphabet
's
internet
properties
and
interests
.
Sundar
Pichai
was
appointed
CEO
of
Google
on
October
24
,
2015
,
replacing
Larry
Page
,
who
became
the
CEO
of
Alphabet
.
On
December
3
,
2019
,
Pichai
also
became
the
CEO
of
Alphabet
.


#### only noun

In [9]:
for token in doc:
    if token.pos_=="NOUN":
        print(token)
    

computer
scientists
PhD
students
%
shares
control
%
stockholder
voting
power
stock
company
offering
subsidiary
subsidiary
company
internet
properties
interests
CEO
CEO
CEO


### Named entity recognition

In [11]:
for entity in doc.ents:
    print(entity.text,entity.label_)

Google ORG
September 4, 1998 DATE
Larry Page PERSON
Sergey Brin PERSON
PhD WORK_OF_ART
Stanford University ORG
California GPE
about 14% PERCENT
56% PERCENT
IPO ORG
2004 DATE
2015 DATE
Google ORG
Alphabet Inc. ORG
Alphabet GPE
Alphabet GPE
Sundar Pichai PERSON
Google ORG
October 24, 2015 DATE
Larry Page PERSON
Alphabet GPE
December 3, 2019 DATE
Pichai PERSON
Alphabet GPE


# Resume parsing

In [136]:
import spacy # nature language processing
import pdfminer # pdf to txt
import re # regurlar expression
import os # file manipulation
import pandas as pd # csv-tablular format


In [137]:
from pdfminer.high_level import extract_text

### Create output file/folder

In [138]:
def convert_pdf(input_file):
    # create output directory if it does not exist
    output_dir = r"F:\Auto_pyhton\output"
    os.makedirs(output_dir, exist_ok=True)
    
    # construct output filepath and save text file
    output_filename = os.path.basename(input_file).replace('.pdf', '.txt') # create output filename
    output_filepath = os.path.join(output_dir, output_filename) # create output filepath (location to save file)
    with open(output_filepath, "w", encoding='utf-8') as f_out: #open file located at outputfilepath &intention of writing to it
        f_out.write(extract_text(input_file))  #writes the extracted text from the input PDF file to the output file
    print(f"{output_filepath} saved successfully!!!") 
    return output_filepath

In [139]:
#load language model
nlp=spacy.load("en_core_web_sm")


In [140]:
#create output file structure
result_dict = {'name':[], 'phone':[], 'email':[], 'skills':[]}

### Create content needed 

In [141]:
## create some regular expressions(reg)
def parse_content(text):
    skillset = re.compile("python|java|sql|hadoop|tableau")
    phone_num = re.compile("(\d{3}[-\.\s]??\d{3}[-\.\s]??\d{4}|\(\d{3}\)\s*\d{3}[-\.\s]??\d{4}|\d{3}[-\.\s]??\d{4})")
    # nlp at work & extract content by named recognition + reg
    doc = nlp(text)
    name = [entity.text for entity in doc.ents if entity.label_ == "PERSON"][0]
    email = [word.text for word in doc if word.like_email][0]
    phone = re.findall(phone_num, text.lower())[0]
    skill_list = re.findall(skillset, text.lower())
    unique_skills_list = list(set(skill_list))

    # place into placeholder
    result_dict['name'].append(name)
    result_dict['phone'].append(phone)
    result_dict['email'].append(email)
    result_dict['skills'].append(unique_skills_list)
    print("Extraction completed successfully!!!")


### Check all input PDF file, convert to text & extract infor neeeded

In [142]:
# loop through all PDF files in the resumes directory
resumes_dir = r"F:\Auto_pyhton\resumes"
for file in os.listdir(resumes_dir):
    if file.endswith('.pdf'):
        print(f"Reading {file}...")
        pdf_file = os.path.join(resumes_dir, file) 
        txt_file = convert_pdf(pdf_file) # convert pdf to text
        with open(txt_file, "r", encoding='utf-8') as f_in:
            text = f_in.read()
            parse_content(text)  # extract info from file above
        


Reading Vu Manh Tien _CV.pdf...
F:\Auto_pyhton\output\Vu Manh Tien _CV.txt saved successfully!!!
Extraction completed successfully!!!
Reading VuManhTien_CV.pdf...
F:\Auto_pyhton\output\VuManhTien_CV.txt saved successfully!!!
Extraction completed successfully!!!


In [143]:
# convert result_dict to a pandas dataframe and save it as a CSV file
df = pd.DataFrame(result_dict)
df.to_csv(r"F:\Auto_pyhton\output\resume_info.csv", index=False)
print("Resume information saved to CSV file.")

Resume information saved to CSV file.


In [144]:
df

Unnamed: 0,name,phone,email,skills
0,https://www.linkedin.com/in/tienvu1995/,035.911.0632,vumanhtien1995@gmail.com,"[sql, python, tableau]"
1,", Ba Đình",035.911.0632,vumanhtien1995@gmail.com,"[sql, python, tableau]"
