In [33]:
from docx import Document
import docxpy
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from io import StringIO
import nltk
import spacy
import pickle
import re
import glob
import pandas as pd
from nltk.corpus import stopwords
from spacy.matcher import Matcher
from datetime import date 
import datefinder
from dateutil.parser import parse
import wikipedia
import pgeocode


class exportToCSV:
    """Extract fields from resume files (.doc/.docx/.pdf) and write them to a CSV.

    The individual ``get*`` / ``experience_*`` methods each take the plain text
    of one resume and return one extracted field; ``readResumes`` drives them
    over a folder of files and writes the resulting table with pandas.
    """
    
    # Section-heading keywords. Within this class they are only upper-cased
    # (by readResumes); no visible method consumes them further.
    section_keywords = ['education', 'academic', 'academia', 
                        'work', 'experience', 'job', 'career', 
                        'skill', 'strength', 'publication', 'conference', 
                        'curricular', 'declaration', 'languages', 'personal', 'birth', 
                        'hobbies']
    
    # Keywords naming an education section; likewise only upper-cased by
    # readResumes in the visible code.
    keywords_acad = ['academic',
                        'academics',
                        'academia',
                        'education', 
                        'educational', 
                        'qualification', 
                        'qualifications', 
                        'credential', 
                        'credentials']
    
    # Keywords used by experience_role / get_exp_role_try to find where the
    # work-experience part of a resume starts (case-insensitive search).
    keywords_experience = ['work experience', 'experience', 'work', 'job', 'career']
    
    # spaCy pipeline, loaded once when the class body executes and shared by
    # every instance (used by getName, get_location and get_exp_role_try).
    nlp = spacy.load('en_core_web_lg')
    # Normalised degree names; populated by readResumes from 'degrees.data'.
    degree_cloud = None
    
    def convertDocxToText(self, path):
        """Return whatever ``docxpy.process`` extracts from the document at *path*."""
        extracted_text = docxpy.process(path)
        return extracted_text
    
    
    
    
    def convert_pdf_to_txt(self, path):
        """Extract the text of every page of the PDF at *path*.

        Args:
            path: filesystem path of the PDF file.

        Returns:
            The concatenated text of all pages as a single string.

        Raises:
            Whatever ``open`` or pdfminer raise for unreadable / invalid files.
            The file handle, converter device and text buffer are released
            even when extraction fails part-way.
        """
        rsrcmgr = PDFResourceManager()
        retstr = StringIO()
        device = TextConverter(rsrcmgr, retstr, laparams=LAParams())
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            with open(path, 'rb') as fp:
                # Empty page set + maxpages=0 means "all pages"; empty password.
                pages = PDFPage.get_pages(fp, set(), maxpages=0, password="",
                                          caching=True, check_extractable=True)
                for page in pages:
                    interpreter.process_page(page)
            # Read the buffer before it is closed in the ``finally`` below.
            return retstr.getvalue()
        finally:
            device.close()
            retstr.close()
    
    
    def getEmail(self, inputString):
        """Return the set of e-mail-like tokens found in *inputString*.

        A token is any run of non-whitespace characters containing an ``@``
        with at least one character on each side (surrounding punctuation is
        not trimmed).

        Args:
            inputString: resume text. Anything that is not a ``str`` yields an
                empty set (previously this ended in ``set(None)`` and raised
                ``TypeError``).

        Returns:
            A ``set`` of matched strings; empty when nothing matches.
        """
        if not isinstance(inputString, str):
            return set()
        return set(re.findall(r'\S+@\S+', inputString))
    
    
    def getPhone(self, inputString):
        """Return the phone-number-like strings found in *inputString*.

        Candidates are digit groups (optionally prefixed by ``+`` or ``(`` and
        separated by spaces, dashes or parentheses) containing more than 6 and
        at most 15 digits. Candidates that look like dates or year ranges are
        discarded, and parentheses are stripped from the survivors.

        Args:
            inputString: resume text. Anything that is not a ``str`` yields an
                empty list (previously the method crashed iterating ``None``).

        Returns:
            A list of cleaned number strings in order of appearance.
        """
        if not isinstance(inputString, str):
            return []

        pattern = re.compile(r'([+(]?\d+[)\-]?[ \t\r\f\v]*[(]?\d{2,}[()\-]?[ \t\r\f\v]*\d{2,}[()\-]?[ \t\r\f\v]*\d*[ \t\r\f\v]*\d*[ \t\r\f\v]*)')
        candidates = pattern.findall(inputString)
        # Keep candidates with more than 6 characters once separators are removed.
        candidates = [re.sub(r'[,.]', '', el) for el in candidates
                      if len(re.sub(r'[()\-.,\s+]', '', el)) > 6]
        # Drop one trailing separator captured by the pattern, then trim.
        candidates = [re.sub(r'\D$', '', el).strip() for el in candidates]
        # More than 15 digits cannot be a phone number.
        candidates = [el for el in candidates if len(re.sub(r'\D', '', el)) <= 15]

        def ends_with_year(part):
            # True when the last (up to) four characters form a number in 1900..2099.
            tail = part.strip()[-4:]
            if not tail.isdigit():
                return False
            try:
                return 1900 <= int(tail) < 2100
            except ValueError:
                return False

        numbers = []
        for el in candidates:
            parts = el.split('-')
            # A candidate with at most three dash-separated parts, one of which
            # ends in a year, is treated as a date / year range ("2010-2012").
            # NOTE(review): this also drops genuine numbers whose last four
            # digits fall in 1900..2099; behaviour kept as-is.
            if len(parts) <= 3 and any(ends_with_year(part) for part in parts):
                continue
            numbers.append(el.replace(")", "").replace("(", ""))
        return numbers
    
    
    
    def getName(self, text):
        """Return the first pair of adjacent proper nouns in *text*, upper-cased.

        The text is run through the shared spaCy pipeline and a Matcher looks
        for two consecutive tokens tagged ``PROPN``. Returns ``None`` when no
        such pair exists.
        """
        # NOTE(review): the three-argument Matcher.add(key, on_match, pattern)
        # call looks like the spaCy 2.x signature; confirm the installed version.
        name_matcher = Matcher(self.nlp.vocab)
        doc = self.nlp(text)
        name_matcher.add('NAME', None, [{'POS': 'PROPN'}, {'POS': 'PROPN'}])

        hits = name_matcher(doc)
        if not hits:
            return None

        _match_id, first, last = hits[0]
        return doc[first:last].text.upper()
    
    
            
    
    
    def readFile(self, fileName):
        
        extension = fileName.split(".")[-1]
        
        if extension == "doc" or extension == 'docx':
            return self.convertDocxToText(fileName)
            
        elif extension == "pdf":
            return self.convert_pdf_to_txt(fileName)

        else:
            print('Unsupported format')
            return None
    
    
    
    def getMarital(self, text):
        """Classify marital status mentioned in *text* as "YES", "NO" or "NA".

        If the label word "marital" occurs, only the text from its first
        occurrence onward is inspected; otherwise the whole text is. Keywords
        are matched case-insensitively as whole words, so "yes" no longer
        fires inside "eyes" and "married" does not fire inside "unmarried".

        The previous anchoring code computed ``min(0, pos)``, which is always
        0, so it never narrowed the text. The anchor is deliberately limited
        to the label "marital": anchoring on the value word "married" would
        cut off a preceding "un" / "not " and flip the answer.

        Args:
            text: resume text.

        Returns:
            "NO" if an unmarried/single keyword is present, else "YES" if a
            married keyword is present, else "NA".
        """
        label = re.search('marital', text, re.IGNORECASE)
        if label:
            text = text[label.start():]

        def mentions(keywords):
            # Whole-word, case-insensitive search for any of the keywords.
            return any(
                re.search(r'\b' + re.escape(key) + r'\b', text, re.IGNORECASE)
                for key in keywords
            )

        is_married = "NA"
        if mentions(('married', 'yes', 'engaged')):
            is_married = "YES"
        # Negative keywords win over positive ones ("not married").
        if mentions(('unmarried', 'not married', 'single')):
            is_married = "NO"
        return is_married
    
    
    def getGender(self, text):
        """Classify the gender mentioned in *text* as "MALE", "FEMALE" or "NA".

        If a label word ("sex" or "gender") occurs, only the text from the
        first such label onward is inspected; otherwise the whole text is.
        Keywords are matched case-insensitively as whole words, so "man" no
        longer fires inside "management" and "male" does not fire inside
        "female".

        The previous anchoring code computed ``min(0, pos)``, which is always
        0, so it never narrowed the text.

        Args:
            text: resume text.

        Returns:
            "FEMALE" if a female keyword is present, else "MALE" if a male
            keyword is present, else "NA".
        """
        label = re.search(r'\b(?:sex|gender)\b', text, re.IGNORECASE)
        if label:
            text = text[label.start():]

        # Female keywords take precedence, as in the original override order.
        if re.search(r'\b(?:female|woman|girl)\b', text, re.IGNORECASE):
            return "FEMALE"
        if re.search(r'\b(?:male|man|boy)\b', text, re.IGNORECASE):
            return "MALE"
        return "NA"
    
    
    def get_languages(self, text):
        """Return the set of known language names mentioned in *text*, upper-cased.

        Known names are read from ``languages_new.txt`` (one per line,
        compared case-insensitively). If the word "language" occurs, the text
        from its first occurrence onward is scanned first; when that yields
        nothing, or there is no such word, the whole text is scanned.

        The previous anchoring code computed ``min(0, pos)``, which is always
        0, so it never narrowed the text. The file handle was also never
        closed.

        Args:
            text: resume text.

        Returns:
            A ``set`` of upper-cased language names; empty when none is found.

        Raises:
            OSError: if ``languages_new.txt`` cannot be read.
        """
        with open('languages_new.txt', 'r') as handle:
            # A set gives O(1) membership tests; blank lines are ignored.
            known = {name.strip() for name in handle.read().lower().split("\n")}
        known.discard("")

        def scan(fragment):
            # Tokenise each non-empty line and collect tokens that are known languages.
            hits = set()
            for raw_line in fragment.split("\n"):
                if not raw_line:
                    continue
                for word in nltk.word_tokenize(raw_line.strip()):
                    if word.lower() in known:
                        hits.add(word.upper())
            return hits

        label = re.search('language', text, re.IGNORECASE)
        if label:
            anchored = scan(text[label.start():])
            if anchored:
                return anchored
        return scan(text)
    
    
    
        
        
    def get_location(self, text):
        """Guess the candidate's location from *text*.

        First, six-digit numbers shaped like Indian PIN codes are looked up
        with pgeocode; the state name of the first code that resolves is
        returned. Otherwise the text following the later of the first
        "address" / first "location" occurrence is passed to the spaCy
        pipeline and the ``GPE`` entities found are returned.

        Fixes over the previous version: an unknown PIN code (for which
        pgeocode yields NaN rather than raising) no longer becomes the
        answer; every candidate code is tried, not only the first;
        whitespace inside a code ("560 001") is removed before lookup;
        building the ``Nominatim`` object is inside the ``try`` so a failure
        there falls back to the NER path; a discarded ``re.sub`` call and
        unused locals were removed.

        Args:
            text: resume text.

        Returns:
            A state name (``str``) when a PIN code resolves, otherwise a
            ``set`` of place-name strings (possibly empty).
        """
        pin_pattern = re.compile(r'[^0-9]+([1-9]{1}[0-9]{2}\s{0,1}[0-9]{3})[^0-9]+')
        pin_codes = pin_pattern.findall(text)

        if pin_codes:
            try:
                nomi = pgeocode.Nominatim('in')
                for code in pin_codes:
                    state = nomi.query_postal_code(re.sub(r'\s', '', code))['state_name']
                    # Unresolved codes come back as NaN (a float), not a string.
                    if isinstance(state, str) and state:
                        return state
            except Exception as e:
                # Best-effort lookup: report and fall back to entity recognition.
                print(e)

        upper_text = text.upper()
        # find() returns -1 for a missing keyword, so 0 wins when neither is present.
        start = max([0] + [upper_text.find(key.upper()) for key in ('address', 'location')])
        text = text[start:]

        # Separators become spaces; stray symbols are dropped.
        text = text.translate(str.maketrans("-|,", "   ", "!@#$;*%&^~"))

        return {ent.text for ent in self.nlp(text).ents if ent.label_ == 'GPE'}

        
    
    
    def get_degree(self, text):
        
        EDUCATION = [
            'BBM', 'PGDM', 'B.A', 'BA' 'MA',
            'B.SC', 'B. SC', 'B.Sc', 'B. Sc',
            'BE', 'B.E', 
            'BS', 'B.S',
            'C.A','c.a.', 'CA' 
            'B.COM', 'B. COM', 'B.Com','B. Com', 'B.com',
            'M. Com', 'M.Com','M. COM', 'M.COM'
            'ME', 'M.E', 'MS', 'M.S',
            'BTECH', 'B.TECH', 'B. TECH', 'B TECH', 'M.TECH', 'M. TECH', 'MTECH' 'M TECH',
            'PHD', 'phd', 'ph.d', 'Ph.D.',
            'MBA', 'M.B.A',
            'graduate', 'post-graduate', 'Master', 'Bachelor', 'MASTER', 'BACHELOR'
        ]
        
        degrees = []
        for degree in EDUCATION:
            if text.find(degree) != -1:
                degrees.append(degree)
                
        return degrees
        
    
    
    
    
    def experience_years(self, text):
        """Return the first "<number> year(s)" figure mentioned in *text*.

        The search is case-insensitive and accepts decimals ("3.5 years"), a
        trailing plus sign ("5+ years"), missing whitespace ("10years") and
        the abbreviation "yr(s)". The pattern is a raw string; the previous
        non-raw ``"\\d"`` was an invalid escape sequence.

        Args:
            text: resume text.

        Returns:
            The number as a ``float``, or ``0`` when no such figure is found.
        """
        found = re.search(r'(\d+(?:\.\d+)?)\s*\+?\s*(?:years?|yrs?)', text.lower())
        if found is None:
            return 0
        return float(found.group(1))
        
    

    
    def experience_role(self, text):
        """Return noun-phrase lines from the experience part of *text*.

        The text is cut to start at the latest first-occurrence of any
        ``keywords_experience`` entry (the last 10 characters are dropped, as
        before). Each line is tokenised and POS-tagged with NLTK, and runs of
        3-5 matching tags whose first word is not lower-case are collected.
        Candidates that look like education entries are then discarded.

        Fixes over the previous version: missing commas in the exclusion list
        fused literals ('BA' 'MA', 'CA' 'B.COM', 'M.COM' 'ME', 'MTECH'
        'M TECH'); degree abbreviations are now matched as whole tokens
        instead of substrings, so 'ms' no longer rejects "Systems" and 'mba'
        no longer rejects "Mumbai".

        Args:
            text: resume text.

        Returns:
            A list of candidate strings in order of appearance.
        """
        upper_text = text.upper()
        # find() returns -1 for a missing keyword, so 0 wins when none is present.
        start = max([0] + [upper_text.find(key.upper()) for key in self.keywords_experience])
        text = text[start:-10]

        tagged_lines = [
            nltk.pos_tag(nltk.word_tokenize(raw_line.strip().replace("\t", " ")))
            for raw_line in text.split("\n") if len(raw_line) > 0
        ]

        # NOTE(review): "[NNP|NN]" is a character class (N, P or "|"), so this
        # accepts any tag starting with N or P; confirm that is intended.
        chunk_parser = nltk.RegexpParser("""mychunk:{<[NNP|NN].*>{3,5}}""")

        candidates = []
        for tagged in tagged_lines:
            for subtree in chunk_parser.parse(tagged).subtrees():
                leaves = subtree.leaves()
                if subtree.label() == 'mychunk' and leaves and not leaves[0][0].islower():
                    candidates.append(' '.join([word for word, _tag in leaves]))

        # Words that mark an education entry wherever they occur in a line.
        avoid_substrings = ['@', 'secondary', 'board', 'institute', 'institution', 'institutes',
                            'university', 'college', 'i.c.a.i', 'school', 'vishvavidyalaya',
                            'mahavidyalaya', 'vidyalaya', 'academy', 'shiksha', 'niketan',
                            'diploma', 'graduate', 'post-graduate', 'master', 'bachelor']
        # Degree abbreviations: only rejected when they stand alone as a token.
        avoid_degrees = ['BBM', 'PGDM', 'B.A', 'BA', 'MA',
                         'B.SC', 'B. SC',
                         'BE', 'B.E',
                         'BS', 'B.S',
                         'C.A', 'c.a.', 'CA',
                         'B.COM', 'B. COM',
                         'M. COM', 'M.COM',
                         'ME', 'M.E', 'MS', 'M.S',
                         'BTECH', 'B.TECH', 'B. TECH', 'B TECH',
                         'M.TECH', 'M. TECH', 'MTECH', 'M TECH',
                         'PHD', 'ph.d', 'Ph.D.',
                         'MBA', 'M.B.A']
        degree_token = re.compile(
            r'(?<![a-z0-9])(?:'
            + '|'.join(re.escape(degree.lower()) for degree in avoid_degrees)
            + r')(?![a-z0-9])'
        )

        new_final = []
        for line in candidates:
            lowered = line.lower()
            if any(word in lowered for word in avoid_substrings):
                continue
            if degree_token.search(lowered):
                continue
            new_final.append(line)

        return new_final
    
    
    def get_exp_role_try(self, text):
        
        end = 0
        for key in self.keywords_experience:
            pos = text.upper().find(key.upper())
            if pos != -1:
                end = max(end, pos)
        
        text = text[end: -10]
        
        ans = []
        for X in self.nlp(text).doc.ents:
            if X.label_ == 'ORG':
                ans.append(X.text)
        
        avoid = ['@', 'secondary', 'board', 'institute','institution','institutes','university',
                 'college','I.C.A.I','school','vishvavidyalaya','mahavidyalaya','vidyalaya',
                 'academy','shiksha','niketan',
                'BBM', 'PGDM', 'B.A', 'BA' 'MA',
                'B.SC', 'B. SC', 'B.Sc', 'B. Sc',
                'BE', 'B.E', 
                'BS', 'B.S',
                'C.A','c.a.', 'CA' 
                'B.COM', 'B. COM', 'B.Com','B. Com',
                'M. Com', 'M.Com','M. COM', 'M.COM'
                'ME', 'M.E', 'MS', 'M.S',
                'BTECH', 'B.TECH', 'B. TECH', 'B TECH', 'M.TECH', 'M. TECH', 'MTECH' 'M TECH',
                'PHD', 'phd', 'ph.d', 'Ph.D.',
                 'diploma',
                'MBA', 'M.B.A','mba',
                'graduate', 'post-graduate', 'Master', 'Bachelor', 'MASTER', 'BACHELOR']
        
        new_final = []
        for line in ans:
            found = False
            for word in avoid:
                if line.lower().find(word.lower()) != -1:
                    found = True
                    break
            if not found:
                new_final.append(line)
                
        return new_final
        
        
        
    
    
    def getAge(self, text):
        """Estimate the candidate's age in whole years from a birth date in *text*.

        The text is cut to start at the later of the first "birth" and the
        first stand-alone "dob" (case-insensitive); with neither present the
        whole text is used. Lines are then fed one by one to dateutil's fuzzy
        parser and the first line that yields a date is taken as the date of
        birth.

        Fixes over the previous version: "dob" is matched as a whole word so
        that "Adobe" no longer moves the anchor; the unused NLTK POS tagging
        step was dropped (only the tokens were ever used); only the parser's
        failure exceptions are swallowed.

        Args:
            text: resume text.

        Returns:
            The age in years as an ``int``; ``0`` when no line parses as a
            date (the birth date then defaults to today).
        """
        anchors = [
            found.start()
            for found in (re.search('birth', text, re.IGNORECASE),
                          re.search(r'\bdob\b', text, re.IGNORECASE))
            if found
        ]
        text = text[max(anchors) if anchors else 0:]

        born = date.today()
        for raw_line in text.split("\n"):
            if not raw_line:
                continue
            tokens = nltk.word_tokenize(raw_line.strip().replace("\t", " "))
            try:
                born = parse(" ".join(tokens), fuzzy=True)
                break
            except (ValueError, OverflowError, TypeError):
                # Line holds no parseable date; try the next one.
                continue

        today = date.today()
        # Subtract one year if this year's birthday has not happened yet.
        return today.year - born.year - ((today.month, today.day) < (born.month, born.day))
        
    
    
    
    def readResumes(self, input_dir="sample_cvs/insurance", output_csv='results-insurance.csv'):
        """Extract fields from every resume in *input_dir* and write them to *output_csv*.

        Files ending in ``.doc``, ``.docx`` or ``.pdf`` are processed in
        sorted order so the CSV is reproducible. A file that cannot be read
        or parsed is reported and skipped instead of aborting the whole run.

        Side effects: sets ``self.degree_cloud`` from ``degrees.data`` plus a
        few extra names, and replaces ``self.section_keywords`` /
        ``self.keywords_acad`` with upper-cased copies.

        Args:
            input_dir: folder containing the resumes (defaults to the
                previously hard-coded location).
            output_csv: path of the CSV to write (defaults to the previously
                hard-coded name).

        Raises:
            OSError: if ``degrees.data`` cannot be opened or the CSV cannot
                be written.
        """
        files = set()
        for extension in ("doc", "docx", "pdf"):
            files.update(glob.glob("%s/*.%s" % (input_dir, extension)))
        files = sorted(files)
        print ("%d files identified" %len(files))

        # NOTE: pickle.load can execute arbitrary code; only use a
        # 'degrees.data' file that comes from a trusted source.
        with open('degrees.data', 'rb') as filehandle:
            degrees = pickle.load(filehandle)

        extra = ['CA', 'ICWA', 'Diploma', 'Bachelor', 'Master', 'Bachelors', 'Masters',
                 'Doctor', 'Doctors', 'BBM', 'SSLC', 'PUC']
        # "," "(" ")" "-" become spaces; "." and "'" are removed.
        cleanup = str.maketrans(",()-", "    ", ".'")
        self.degree_cloud = [x.upper().translate(cleanup) for x in list(degrees) + extra]
        self.section_keywords = [x.upper() for x in self.section_keywords]
        self.keywords_acad = [x.upper() for x in self.keywords_acad]

        data = []
        for f in files:
            try:
                text = self.readFile(f)
                if text is None:
                    continue
                # Drop every non-ASCII character before extraction.
                text = text.encode().decode('ascii', 'ignore')
                email = self.getEmail(text)
                phone = self.getPhone(text)
                name = self.getName(text)
                married = self.getMarital(text)
                gender = self.getGender(text)
                languages = self.get_languages(text)
                degree = self.get_degree(text)
                experience_years = self.experience_years(text)
                experience_role = self.experience_role(text)
                location = self.get_location(text)
                age = self.getAge(text)
                exp_try = self.get_exp_role_try(text)
            except Exception as e:
                # Batch boundary: one bad file must not lose the other results.
                print("Skipping %s: %s" % (f, e))
                continue
            data.append([f, email, phone, name, married, gender, languages, age, location,
                         degree, experience_years, experience_role, exp_try])

        df = pd.DataFrame(data, columns = ['FileName', 'Email', 'Phone', 'Name', 'Married', 'Gender', 'Languages', 'Age', 'Location', 'Degree', 'Experience 1', 'Experience 2', 'Exp_Try'])
        df.to_csv(output_csv)
                

In [34]:
# Create the extractor. The spaCy model is loaded when the class body runs,
# not by this call (the class defines no __init__).
a = exportToCSV()

In [35]:
# Process the resumes under sample_cvs/insurance/ and write results-insurance.csv.
a.readResumes()

16 files identified


