## PDF Parser

In [1]:
import pandas as pd
import numpy as np 
import nltk as nltk
import os
import re
import fitz
import spacy
from spacy import displacy
import torch
import stanza

In [14]:
def text_to_number(text):
    text = re.sub(r'[^0-9]',"",text)
    if len(text) > 0:
        return int(text)
    else:
        return 0
def load_cvs(cdir):
    '''
        Function to load all the cvs from a folder.
        
        Parameters
        ----------
        cdir - String indicating the directory where Cvs are present
        
        Returns
        ----------
        cv_s - list of document objects by PyMuPDF
    '''
    path = os.getcwd() + "\\" + cdir
    CV_folders = os.listdir(path)
    cv_s = {}
    for i in CV_folders:
        folder_path = path + "\\" + i
        files = os.listdir(folder_path)
        for j in files:
            if re.search(".pdf",j) is not None:
                name = re.sub(".pdf","",j)
                file = fitz.open(folder_path+"\\"+j)
                cv = doc_to_txt(file)
                cv_s[name]={"CV":cv,"Department": i,"n_pages":file.page_count}
                file.close()
            
    return cv_s

def process_text(up_text):
    '''
        Function to process the extracted text.
        
        Parameters
        ----------
        up_text - unprocessed text
        
        Returns
        ----------
        processed_text -  processed text
    '''
    processed_text = re.sub(r'\n (?!=\w)',"",up_text)
    processed_text = re.sub(r'\n\d{1} (?!=\d)',"",processed_text)
    processed_text = re.sub(r'\n\d{1} (?!=\d)',"",processed_text)
#     processed_text = re.sub(r'\d{1} (?!=\w)',"",processed_text)
    processed_text = re.sub(r'[_]',"",processed_text)
    processed_text = re.sub(r'[•]',"",processed_text)
    return processed_text
    

def extract_text(document):
    '''
        Function to extract and process text from documents pages.
        
        Parameters
        ----------
        document - A documents object containing CV
        
        Returns
        ----------
        txt - text extracted from the documents object
    '''
    n_pages = document.page_count
    txt = ""
    for i in range(n_pages):
        txt = txt+" "+document[i].get_text("text",flags = 1)
        txt = process_text(txt)
    return txt
    

def doc_to_txt(CV_objects):
    '''
        Function to convert the PyMuPDF document objects to processed plain text data.
        
        Parameters
        ----------
        CV_objects - Document objects containing CVs
        
        Returns
        ----------
        CV_objects- returns the object but each CV as a string  
    '''
    
    CV_objects = extract_text(CV_objects)
    return CV_objects

def extract_req_data(original_data):
    req_data = original_data.copy()
    awards = list()
    snlp = stanza.Pipeline(lang='en',package="Default",processors='tokenize,ner')
    n = 0
    for i in req_data["CV"]:
        doc = snlp(i)
        entities = list()
        labels = list()
        for ent in doc.ents:
            entities.append(ent.text)
            labels.append(ent.type)
        temp = pd.DataFrame(np.c_[entities,labels],columns=["Entity","Label"])
        print(req_data["Name"][n])
        sum1 = temp[temp["Label"] == "MONEY"]["Entity"].apply(text_to_number).sum()
        print(sum1)
        awards.append(sum1)
        n = n + 1
        del doc 
        torch.cuda.empty_cache() 
    
    data["Awards"] = awards
    return req_data
    

In [8]:
cv_objs =  load_cvs("Public_CVs")

In [9]:
data = pd.DataFrame(cv_objs,index=None).T
data.reset_index(inplace=True)
data.columns = ["Name","CV","Department","PageCount"]

In [41]:
data.to_pickle()

In [36]:
extract_req_data(data)

2022-04-04 17:58:19 INFO: Loading these models for language: en (English):
| Processor | Package   |
-------------------------
| tokenize  | combined  |
| ner       | ontonotes |

2022-04-04 17:58:19 INFO: Use device: gpu
2022-04-04 17:58:19 INFO: Loading: tokenize
2022-04-04 17:58:19 INFO: Loading: ner
2022-04-04 17:58:20 INFO: Done loading processors!


Alexandra Martynova-Van Kley
672908
Ashley Elias
1356262
BARRY G. ROBINSON
174435
BRIAN BECKAGE 
62311469
CARMEN G. MONTAÑA-SCHALK
782753
Caroline (Lina) Lund Dahlberg
0
CHRISTOPHER LEONARD BRETT
3488191
DANIEL J. BENNETT
28931
DENIS A. LAROCHELLE
959307
DENNIS A. GRAVATT
146118080
Dior R. Kelley
69617
Easton R. White
2221001
Eric Stabb
3656681
Heidi J Gill Super
1268320
JASON N. BRUCK
1456259
JENNIFER M. BHATNAGAR
5939536
John J. Ewel
0
John M. Schmitt
0
JOHN SAKULICH
20268
JOSEPH PETER MONTOYA
2110742
Kimberly L. Mowry
0
LAURA J. OLSEN
1133488
Lindsay M. Porter
73975
Matthew A. Kwiatkowsk
581592
Michael E. Burns
1310264
Michael I. Coates
0
Nathan J Sanders
9042869
STEPHANIE J.B. FRETHAM
102426
Stephen Wagner
811052
WILLIAM I. LUTTERSCHMIDT
106451613
William R. McCleary
1001500
ALEXANDROS MAKRIYANNIS
0
Alison Flynn
0
ANDREI TOKMAKOFF
0
André M. Beauchemin
0
Arthur Bragg
0
Bing Xu
0
C. Dale Keefe
1420719
Carlos Velazquez-Martinez
480000
Cathleen Crudden
27666586
Cecilia I. Zurita Lopez

Unnamed: 0,Name,CV,Department,PageCount
0,Alexandra Martynova-Van Kley,\nAlexandra Martynova-Van Kley \nPROFESSIONA...,Biology,7
1,Ashley Elias,\nAshley Elias \nAssistant Professor of Biol...,Biology,9
2,BARRY G. ROBINSON,CURRICULUM VITAE \nBARRY G. ROBINSON \nPhD St...,Biology,5
3,BRIAN BECKAGE,BRIAN BECKAGE \nCurriculum Vitae \nCURRENT PO...,Biology,9
4,CARMEN G. MONTAÑA-SCHALK,2020 \nCURRICULUM VITAE \nCARMEN G. MONTAÑA-...,Biology,13
...,...,...,...,...
111,Manjusri Misra,\nUniversity of Guelph \nManjusri MISRA \nCV...,Engineering,5
112,umberger_vita,Page 1 of 20 \nCurriculum Vitae \nBrian R. Um...,Kinesiology,20
113,Ben Webster,Curriculum Vitae\nBEN WEBSTER\nOﬃce Address:\...,Mathematics,6
114,ADINA LUICAN-MAYER,Curriculum Vitae – last update July 2020 | Ad...,Physics,7


In [45]:
b = pd.read_json("data.json")

TypeError: read_json() got an unexpected keyword argument 'index'

In [44]:
b["Name"].to_csv("Name.csv")

In [25]:
a = data[data["Name"] == 'Curtis Berlinguette']["CV"]

In [16]:
import stanza

In [26]:
snlp = stanza.Pipeline(lang='en',package="Default",processors='tokenize,ner')

2022-04-04 17:55:34 INFO: Loading these models for language: en (English):
| Processor | Package   |
-------------------------
| tokenize  | combined  |
| ner       | ontonotes |

2022-04-04 17:55:34 INFO: Use device: gpu
2022-04-04 17:55:34 INFO: Loading: tokenize
2022-04-04 17:55:34 INFO: Loading: ner
2022-04-04 17:55:34 INFO: Done loading processors!


In [31]:
doc = snlp(a[45])
# print(*[f'entity: {ent.text}\ttype: {ent.type}' for ent in doc.ents if ent.type == "MONEY"], sep='\n')

In [22]:
del snlp

In [28]:
print(torch.cuda.caching_allocator_delete())

245366784


In [35]:
for ent in doc.ents:
    entities.append(ent.text)
    labels.append(ent.type)
temp = pd.DataFrame(np.c_[entities,labels],columns=["Entity","Label"])
sum1 = temp[temp["Label"] == "MONEY"]["Entity"].apply(text_to_number).sum()
print(sum1)

0
