## PDF Parser

In [8]:
import pandas as pd
import numpy as np 
import nltk as nltk
import os
import re
import fitz
import spacy
from spacy import displacy
import torch
import stanza

In [14]:
def text_to_number(text):
    '''
        Convert a monetary string such as "$1,234.56" to a whole number.

        A fractional part (a dot followed by one or two digits) is dropped
        first and the remaining digits are then joined, so "$1,234.56" gives
        1234 rather than 123456. Magnitude words such as "million" are not
        interpreted.

        Parameters
        ----------
        text - String that may contain digits mixed with other characters

        Returns
        ----------
        number - int built from the digits of text, or 0 if there are none
    '''
    # Remove the cents before stripping punctuation; otherwise their digits
    # would be glued onto the whole amount and inflate it.
    whole_part = re.sub(r'(?<=\d)\.\d{1,2}(?!\d)', "", text)
    digits = re.sub(r'[^0-9]', "", whole_part)
    return int(digits) if digits else 0
def load_cvs(cdir):
    '''
        Function to load all the cvs from a folder.

        The folder is read as one sub-folder per department, each holding
        the PDF CVs of that department. Entries that are not folders and
        files without a ".pdf" extension are skipped.

        Parameters
        ----------
        cdir - String indicating the directory where Cvs are present; a
               relative path is resolved against the current working directory

        Returns
        ----------
        cv_s - dict mapping the CV file name (without extension) to a dict
               with the keys "CV" (extracted text), "Department" (name of the
               sub-folder) and "n_pages" (page count of the PDF)
    '''
    # os.path.join keeps the paths portable across operating systems and
    # leaves an absolute cdir untouched.
    path = os.path.join(os.getcwd(), cdir)
    cv_s = {}
    for department in os.listdir(path):
        folder_path = os.path.join(path, department)
        # Stray files sitting next to the department folders cannot be listed.
        if not os.path.isdir(folder_path):
            continue
        for file_name in os.listdir(folder_path):
            name, extension = os.path.splitext(file_name)
            # Check the actual extension instead of looking for "pdf" anywhere
            # in the name, so files like "notes_pdf_.txt" are not opened.
            if extension.lower() != ".pdf":
                continue
            file = fitz.open(os.path.join(folder_path, file_name))
            try:
                cv = doc_to_txt(file)
                cv_s[name] = {"CV": cv, "Department": department, "n_pages": file.page_count}
            finally:
                # Release the document handle even when extraction fails.
                file.close()

    return cv_s

def process_text(up_text):
    '''
        Strip layout residue from text extracted out of a PDF.

        The substitutions run in a fixed order and each one deletes its
        matches: newline + space, newline + single digit + space (applied
        twice), underscores, and bullet characters.

        Parameters
        ----------
        up_text - unprocessed text

        Returns
        ----------
        processed_text -  processed text
    '''
    # NOTE(review): "(?!=\w)" and "(?!=\d)" are negative lookaheads for a
    # literal "=" followed by a word character / digit, so they only block a
    # match directly before text such as "=a". Possibly "(?=...)" or "(?!...)"
    # was intended - confirm before changing.
    removal_patterns = (
        r'\n (?!=\w)',
        r'\n\d{1} (?!=\d)',
        # Same pattern again: deleting one match can join up a new one.
        r'\n\d{1} (?!=\d)',
        r'[_]',
        r'[•]',
    )
    processed_text = up_text
    for pattern in removal_patterns:
        processed_text = re.sub(pattern, "", processed_text)
    return processed_text
    

def extract_text(document):
    '''
        Collect the text of every page of a document into one cleaned string.

        Parameters
        ----------
        document - object exposing page_count and page access by index, where
                   each page provides get_text (e.g. a PyMuPDF document)

        Returns
        ----------
        txt - the page texts joined by single spaces, passed through
              process_text after every page is appended
    '''
    collected = ""
    for page_index in range(document.page_count):
        # NOTE(review): flags=1 is forwarded to get_text unchanged; presumably
        # PyMuPDF's ligature-preserving flag - confirm against its docs.
        page_text = document[page_index].get_text("text", flags=1)
        # Cleaning runs on the whole accumulated string, so earlier pages are
        # processed again each time a page is added.
        collected = process_text(collected + " " + page_text)
    return collected
    

def doc_to_txt(CV_objects):
    '''
        Convert one document object to processed plain text.

        Thin wrapper that hands the document to extract_text.

        Parameters
        ----------
        CV_objects - Document object containing a CV

        Returns
        ----------
        text - the value returned by extract_text for that document
    '''
    return extract_text(CV_objects)

def extract_req_data(original_data):
    '''
        Derive award and event figures from the text of each CV.

        Every entry of the "CV" column is run through the spaCy
        "en_core_web_trf" pipeline, which is loaded on each call. Entities
        labelled MONEY are converted with text_to_number and summed; entities
        labelled EVENT are counted.

        Parameters
        ----------
        original_data - DataFrame with a "CV" column holding the CV texts

        Returns
        ----------
        req_data - copy of original_data with the added columns "Awards" and
                   "No of Event and Conferences"
    '''
    req_data = original_data.copy()
    award_totals = []
    event_counts = []
    pipeline = spacy.load("en_core_web_trf")
    for cv_text in req_data["CV"]:
        parsed = pipeline(cv_text)
        ent_texts = [ent.text for ent in parsed.ents]
        ent_labels = [ent.label_ for ent in parsed.ents]
        ent_table = pd.DataFrame(np.c_[ent_texts, ent_labels], columns=["Entity", "Label"])

        money_rows = ent_table[ent_table["Label"] == "MONEY"]
        award_totals.append(money_rows["Entity"].apply(text_to_number).sum())

        # value_counts has no "EVENT" key when no such entity was found.
        has_event = "EVENT" in ent_table["Label"].unique()
        event_counts.append(ent_table["Label"].value_counts()["EVENT"] if has_event else 0)

        # Drop the parsed document and ask PyTorch to release cached GPU
        # memory before the next CV is processed.
        del parsed
        torch.cuda.empty_cache()

    req_data["Awards"] = award_totals
    req_data["No of Event and Conferences"] = event_counts
    return req_data
    

In [15]:
# Load every CV found under the "Public_CVs" folder; load_cvs returns a dict
# keyed by CV file name.
cv_objs =  load_cvs("Public_CVs")

In [16]:
# Transpose so that each CV becomes one row, then move the CV name out of the
# index into a regular column before naming the columns.
data = pd.DataFrame(cv_objs, index=None).T.reset_index()
data.columns = ["Name", "CV", "Department", "PageCount"]

In [48]:
# data.to_json("data.json")

In [17]:
# Add the "Awards" and "No of Event and Conferences" columns computed from
# each CV's text.
req_data = extract_req_data(data)

Token indices sequence length is longer than the specified maximum sequence length for this model (661 > 512). Running this sequence through the model will result in indexing errors


In [23]:
# Keep only the summary columns; the raw CV text and page count are dropped.
ret_data = req_data.drop(columns=["CV","PageCount"])

In [25]:
# Write the summary table to Resume_data.csv in the current working directory.
ret_data.to_csv("Resume_data.csv")