## PDF Parser

In [1]:
import pandas as pd
import numpy as np 
import nltk as nltk
import os
import re
import fitz
import spacy
from spacy import displacy




In [12]:
def load_cvs(cdir):
    '''
        Function to load all the CVs from a folder.

        The folder is expected to contain one sub-folder per category, each
        holding the PDF files of that category.

        Parameters
        ----------
        cdir - String indicating the directory where CVs are present
               (resolved relative to the current working directory)

        Returns
        ----------
        cv_s - dict mapping each sub-folder name to a list of document
               objects opened with PyMuPDF, one per PDF in that sub-folder
    '''
    # os.path.join keeps this portable; a hard-coded "\\" only works on Windows.
    path = os.path.join(os.getcwd(), cdir)
    cv_s = {}
    # Sorted so that list positions are stable across runs and platforms
    # (later cells pick CVs by index).
    for folder in sorted(os.listdir(path)):
        folder_path = os.path.join(path, folder)
        if not os.path.isdir(folder_path):
            # Stray files next to the category folders are not CV folders.
            continue
        cv_s[folder] = [
            fitz.open(os.path.join(folder_path, fname))
            for fname in sorted(os.listdir(folder_path))
            # Compare the real extension: the regex ".pdf" matched
            # "<any character>pdf" anywhere in the file name.
            if os.path.splitext(fname)[1].lower() == ".pdf"
        ]
    return cv_s

def process_text(up_text):
    '''
        Function to clean up text extracted from a CV.

        The substitutions below are applied in order, each one deleting
        whatever it matches:
          1. a newline followed by a space, unless "=" plus a word
             character comes next
          2. a newline, a single digit and a space, unless "=" plus a
             digit comes next
          3. underscores
          4. bullet characters

        Parameters
        ----------
        up_text - unprocessed text

        Returns
        ----------
        processed_text -  processed text
    '''
    # NOTE(review): "(?!=\w)" is a negative lookahead for a literal "=";
    # "(?=\w)" or "(?!\w)" may have been intended -- confirm before changing.
    patterns = (
        r'\n (?!=\w)',
        r'\n\d{1} (?!=\d)',
        r'[_]',
        r'[•]',
    )
    processed_text = up_text
    for pattern in patterns:
        processed_text = re.sub(pattern, "", processed_text)
    return processed_text
    
    

def extract_text(document):
    '''
        Function to extract and process text from a document's pages.

        The text of every page is prefixed with a single space, the pages
        are concatenated in order, and the result is cleaned once with
        process_text.

        Parameters
        ----------
        document - A document object containing a CV; it must expose
                   page_count and integer page indexing

        Returns
        ----------
        txt - processed text extracted from the document
              (an empty string when the document has no pages)
    '''
    # flags=1 is passed through unchanged.
    # NOTE(review): presumably PyMuPDF's TEXT_PRESERVE_LIGATURES -- confirm.
    page_texts = [
        document[i].get_text("text", flags=1)
        for i in range(document.page_count)
    ]
    # Clean the joined text once. Re-cleaning the growing string after every
    # page made the work quadratic and processed early pages more often than
    # later ones.
    txt = "".join(" " + page_text for page_text in page_texts)
    return process_text(txt)
    
    

def doc_to_txt(CV_objects):
    '''
        Function to turn the loaded document objects into processed plain text.

        Parameters
        ----------
        CV_objects - dict mapping a key to a list of document objects
                     containing CVs

        Returns
        ----------
        text_objs - dict with the same keys, each mapped to a list holding
                    the extracted text (one string per document, same order)
    '''
    return {
        folder: [extract_text(cv) for cv in documents]
        for folder, documents in CV_objects.items()
    }
    
    

In [4]:
# Open every PDF found under the "Public_CVs" directory, grouped by sub-folder.
cv_objs =  load_cvs("Public_CVs")

In [13]:
# Extract and clean the text of every loaded CV (same keys as cv_objs).
txt_objs = doc_to_txt(cv_objs)

In [14]:
# Inspect the extracted text of the CV at index 1 under the "Biology" key.
print(txt_objs["Biology"][1])

  
Ashley Elias 
Assistant Professor of Biology 
Department of Biology 
office phone: 816.271.4381 
Missouri Western State University 
email: AshleyElias@mail.com 
Saint Joseph, MO 64507 
website: www.AshleyElias.science        
Education 
PURDUE UNIVERSITY, West Lafayette, Indiana 
PhD Ecology and Evolutionary Biology, December 2014 
Advisor: Dr. Krista M. Nichols 
UNIVERSITY OF FLORIDA, Gainesville, Florida 
BS Integrative Biology (minors: Anthropology and French), May 2008 
Professional Experience 
2019-present 
Assistant Professor of Biology, Department of Biology, Missouri 
Western State University, Saint Joseph, MO. 
2018-2019 
Assistant Professor of Biology, College of Science and Health, Avila 
University, Kansas City, MO. 
2016-2018 
NSF Postdoctoral Research Fellow, Department of Biological 
Sciences, North Carolina State University, Raleigh, NC; Sponsoring 
Scientist: Dr. R. B. Roberts. 
2015-2016 
Postdoctoral Research Scholar, Department of Biological Sciences, 
North Caro

In [15]:
# Sample CV text (index 1 under "Biology") used by the cells below.
a = txt_objs["Biology"][1]

In [7]:
# Write the sample CV text to data.txt; the with-block closes the file
# even if the write raises.
with open("data.txt", "w", encoding="utf-8") as text_file:
    text_file.write(a)

In [17]:
# Load the spaCy "en_core_web_trf" pipeline and run it on the sample CV text.
nlp = spacy.load("en_core_web_trf")
doc = nlp(a)
# Render the document with displaCy's "ent" style (entity highlighting).
displacy.render(doc, style="ent")

In [18]:
# Empty containers for the entity texts and their labels taken from doc.ents.
entities, labels = [], []

In [19]:
# Rebuild both lists from doc.ents on every run, so re-executing this cell
# does not append the same entities a second time.
entities = [ent.text for ent in doc.ents]
labels = [ent.label_ for ent in doc.ents]

In [20]:
# Two-column table: one row per entity, holding its text and its label.
entity_label_pairs = np.column_stack((entities, labels))
df = pd.DataFrame(entity_label_pairs, columns=["Entity", "Label"])

In [21]:
# Display the entity/label table.
df

Unnamed: 0,Entity,Label
0,Ashley Elias,PERSON
1,Missouri Western State University,ORG
2,Saint Joseph,GPE
3,MO,GPE
4,PURDUE UNIVERSITY,ORG
...,...,...
508,Sept 2008,DATE
509,Society for the Study of Evolution,ORG
510,Animal Behavior Society,ORG
511,International Association for Great Lakes Rese...,ORG


In [75]:
# Entity texts of every row labelled "MONEY".
is_money = df["Label"] == "MONEY"
awards = df.loc[is_money, "Entity"]

In [81]:
# Drop the comma separators, convert each amount to int, then total them.
award_amounts = awards.apply(lambda amount: int(re.sub(r'[,]', "", amount)))
award_amounts.sum()

1356262

In [22]:
import stanza

In [23]:
# Install the CoreNLP package (the log below shows it going into the user's
# stanza_corenlp directory).
# NOTE(review): no cell visible here uses CoreNLP -- confirm this is needed.
stanza.install_corenlp()

2022-03-01 13:36:37 INFO: Installing CoreNLP package into C:\Users\Pavan Balaji Kumar\stanza_corenlp...


HBox(children=(HTML(value='Downloading https://huggingface.co/stanfordnlp/CoreNLP/resolve/main/stanford-corenl…




In [25]:
# Download the English "OntoNotes" package; per the log below this fetches the
# ontonotes NER model and the 1billion forward/backward character LMs.
stanza.download(lang="en",package="OntoNotes")

HBox(children=(HTML(value='Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/res…

2022-03-01 14:00:15 INFO: Downloading these customized packages for language: en (English)...
| Processor       | Package   |
-------------------------------
| ner             | ontonotes |
| backward_charlm | 1billion  |
| forward_charlm  | 1billion  |






2022-03-01 14:00:15 INFO: File exists: C:\Users\Pavan Balaji Kumar\stanza_resources\en\ner\ontonotes.pt.
2022-03-01 14:00:15 INFO: File exists: C:\Users\Pavan Balaji Kumar\stanza_resources\en\backward_charlm\1billion.pt.
2022-03-01 14:00:15 INFO: File exists: C:\Users\Pavan Balaji Kumar\stanza_resources\en\forward_charlm\1billion.pt.
2022-03-01 14:00:15 INFO: Finished downloading models and saved to C:\Users\Pavan Balaji Kumar\stanza_resources.


In [34]:
# Build an English stanza pipeline restricted to the tokenize and ner processors.
snlp = stanza.Pipeline(lang='en',package="Default",processors='tokenize,ner')

2022-03-01 14:17:24 INFO: Loading these models for language: en (English):
| Processor | Package   |
-------------------------
| tokenize  | combined  |
| ner       | ontonotes |

2022-03-01 14:17:24 INFO: Use device: gpu
2022-03-01 14:17:24 INFO: Loading: tokenize
2022-03-01 14:17:24 INFO: Loading: ner
2022-03-01 14:17:25 INFO: Done loading processors!


In [35]:
# Run the stanza pipeline on the sample CV text and print each entity with its type.
# NOTE(review): this rebinds `doc`, which earlier held the spaCy result; re-running
# the spaCy entity-extraction cell after this one would read the stanza document.
doc = snlp(a)
print(*[f'entity: {ent.text}\ttype: {ent.type}' for ent in doc.ents], sep='\n')

entity: Ashley Elias	type: PERSON
entity: Biology 
Department of Biology	type: ORG
entity: 816.271.4381	type: CARDINAL
entity: Missouri Western State University	type: ORG
entity: Saint Joseph	type: PERSON
entity: MO 64507	type: ORG
entity: www.AshleyElias.science        
Education 
PURDUE UNIVERSITY	type: ORG
entity: West Lafayette	type: GPE
entity: Indiana 
PhD Ecology	type: ORG
entity: Evolutionary Biology	type: ORG
entity: December 2014	type: DATE
entity: Krista M. Nichols	type: PERSON
entity: FLORIDA	type: GPE
entity: Gainesville	type: GPE
entity: Florida	type: GPE
entity: BS Integrative Biology	type: ORG
entity: French	type: LANGUAGE
entity: May 2008	type: DATE
entity: 2019	type: DATE
entity: Department of Biology	type: ORG
entity: Missouri 
Western State University	type: ORG
entity: Saint Joseph	type: PERSON
entity: MO.	type: GPE
entity: 2018-2019	type: DATE
entity: College of Science and Health	type: ORG
entity: Avila 
University	type: ORG
entity: Kansas City	type: GPE
entity: M