## PDF Parser

In [1]:
import pandas as pd
import numpy as np 
import nltk as nltk
import os
import re
import fitz
import spacy
from spacy import displacy

In [2]:
def load_cvs(cdir):
    '''
        Function to load all the cvs from a folder.
        
        Parameters
        ----------
        cdir - String indicating the directory where Cvs are present
        
        Returns
        ----------
        cv_s - list of document objects by PyMuPDF
    '''
    path = os.getcwd() + "\\" + cdir
    CV_folders = os.listdir(path)
    cv_s = {}
    for i in CV_folders:
        folder_path = path + "\\" + i
        files = os.listdir(folder_path)
        cv_s[i] = list()
        for j in files:
            if re.search(".pdf",j) is not None:
                cv_s[i].append(fitz.open(folder_path+"\\"+j))
            
    return cv_s

def process_text(up_text):
    '''
        Function to process the extracted text.
        
        Parameters
        ----------
        up_text - unprocessed text
        
        Returns
        ----------
        processed_text -  processed text
    '''
    processed_text = re.sub(r'\n (?!=\w)',"",up_text)
    processed_text = re.sub(r'\n\d{1} (?!=\d)',"",processed_text)
    
    return processed_text
    

def extract_text(document):
    '''
        Function to extract and process text from documents pages.
        
        Parameters
        ----------
        document - A documents object containing CV
        
        Returns
        ----------
        txt - text extracted from the documents object
    '''
    n_pages = document.page_count
    txt = ""
    for i in range(n_pages):
        txt = txt+" "+document[i].get_text("text",flags = 1)
    return txt
    

def doc_to_txt(CV_objects):
    '''
        Function to convert the PyMuPDF document objects to processed plain text data.
        
        Parameters
        ----------
        CV_objects - Document objects containing CVs
        
        Returns
        ----------
        text_obj - dict containing each CV as a string  
    '''
    return text_obj
    

In [3]:
cv_objs =  load_cvs("Public_CVs")

In [4]:
unprocessd_text = extract_text(cv_objs["Biology"][0])

In [5]:
print(unprocessd_text)

  
1 
Alexandra Martynova-Van Kley 
PROFESSIONAL PREPARATION 
B.S. Biology, Biochemistry, & Pedagogy of Chemistry & Biology  
 
 
 
1984 
Bashkir State University Ufa, Russia 
Ph.D. in Plant Physiology 
 
 
 
 
 
 
 
 
1990 
Institute of Experimental Botany, Belorussian Academy of Science Minsk, Belorussia 
Received tenure 
 
 
 
 
 
 
 
 
 
2008 
Stephen F. Austin State University Nacogdoches, TX 
PROFESSIONAL EXPERIENCE 
2013-present 
Professor, Department of Biology 
 
 
 
Stephen F. Austin State University, Nacogdoches, TX 
2011-2013 
Associate Professor, Department of Biology 
 
 
 
Stephen F. Austin State University, Nacogdoches, TX 
2008-2011 
Associate Professor, Division of Biotechnology 
 
 
 
Stephen F. Austin State University, Nacogdoches, TX 
May 2004 
Visiting Professor, Biochemistry Department, 
 
 
 
Bashkir State University Ufa, Russia 
2002-2008 
Assistant Professor, Division of Biotechnology 
 
 
 
Stephen F. Austin State University, Nacogdoches, TX 
Spring 2002 
Adj

In [6]:
a = process_text(unprocessd_text)
print(a)

  
Alexandra Martynova-Van Kley 
PROFESSIONAL PREPARATION 
B.S. Biology, Biochemistry, & Pedagogy of Chemistry & Biology  
1984 
Bashkir State University Ufa, Russia 
Ph.D. in Plant Physiology 
1990 
Institute of Experimental Botany, Belorussian Academy of Science Minsk, Belorussia 
Received tenure 
2008 
Stephen F. Austin State University Nacogdoches, TX 
PROFESSIONAL EXPERIENCE 
2013-present 
Professor, Department of Biology 
Stephen F. Austin State University, Nacogdoches, TX 
2011-2013 
Associate Professor, Department of Biology 
Stephen F. Austin State University, Nacogdoches, TX 
2008-2011 
Associate Professor, Division of Biotechnology 
Stephen F. Austin State University, Nacogdoches, TX 
May 2004 
Visiting Professor, Biochemistry Department, 
Bashkir State University Ufa, Russia 
2002-2008 
Assistant Professor, Division of Biotechnology 
Stephen F. Austin State University, Nacogdoches, TX 
Spring 2002 
Adjunct Professor, Department of Biology 
Stephen F. Austin State University

In [7]:
text_file = open("data.txt", "w",encoding="utf-8")
 
#write string to file
text_file.write(a)
 
#close file
text_file.close()

In [8]:
nlp = spacy.load("en_core_web_trf")
doc = nlp(a)
displacy.serve(doc, style="ent")

OSError: [E050] Can't find model 'en_core_web_trf'. It doesn't seem to be a Python package or a valid path to a data directory.

In [17]:
text_file.close()

In [9]:
import torch

In [14]:
torch.cuda

AssertionError: Torch not compiled with CUDA enabled