In [76]:
!pip install pdfminer



In [77]:
pip install docx2txt



In [78]:
import pdfminer
from io import StringIO
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage

In [79]:
import docx2txt
import os
import regex as re

In [111]:
rootdir = '/content/drive/MyDrive/Resumes'
files = os.listdir(rootdir)
file_names = []
for file in os.listdir(rootdir):
    file_names.append(os.path.join('/content/drive/MyDrive/Resumes', file))
file_names

['/content/drive/MyDrive/Resumes/Sajan One Page.pdf',
 '/content/drive/MyDrive/Resumes/PON SUDHIR SAJAN S.S. Resume.pdf',
 '/content/drive/MyDrive/Resumes/PS_sajan_resume.pdf',
 '/content/drive/MyDrive/Resumes/Samuel - Resume (1).pdf',
 '/content/drive/MyDrive/Resumes/Sajan New Two Page.doc']

In [112]:
def open_pdf_file(file_name):
    output = StringIO()
    manager = PDFResourceManager() 
    converter = TextConverter(manager, output, laparams=LAParams())
    interpreter = PDFPageInterpreter(manager, converter)

    pagenums = set()
    infile = open(file_name, 'rb')
    for page in PDFPage.get_pages(infile, pagenums):
        interpreter.process_page(page)
    infile.close()
    converter.close()
    text = output.getvalue()
    output.close()

    result = []

    for line in text.split('\n'):
        line2 = line.strip()
        if line2 != '':
            result.append(line2)
    return (result)

In [113]:

def open_docx_file(file_name):
    temp = docx2txt.process(file_names[0])
    text = [line.replace('\t', ' ') for line in temp.split('\n') if line]
    text = [t for t in text if len(t) > 1]
    return (text)

In [114]:

def remove_punctuations(line):
    return re.sub(r'(\.|\,)', '', line)

def preprocess_document(document):
    for index, line in enumerate(document):
        line = line.lower()
        line = remove_punctuations(line)
        
        line = line.split(' ')
        while '' in line:
            line.remove('')
            
        while ' '  in line:
            line.remove(' ')
            
            
        document[index] = ' '.join(line)
    return (document)

In [115]:
def get_experience(document):
    pattern1 = re.compile(r'(jan(uary)?|feb(ruary)?|mar(ch)?|apr(il)?|may|jun(e)?|jul(y)?|aug(ust)?|sep(tember)?|oct(ober)?|nov(ember)?|dec(ember)?)(\s|\S)(\d{2,4}).*(jan(uary)?|feb(ruary)?|mar(ch)?|apr(il)?|may|jun(e)?|jul(y)?|aug(ust)?|sep(tember)?|oct(ober)?|nov(ember)?|dec(ember)?)(\s|\S)(\d{2,4})')
    pattern2 = re.compile(r'(\d{2}(.|..)\d{4}).{1,4}(\d{2}(.|..)\d{4})')
    pattern3 = re.compile(r'(\d{2}(.|..)\d{4}).{1,4}(present)')
    pattern4 = re.compile(r'(jan(uary)?|feb(ruary)?|mar(ch)?|apr(il)?|may|jun(e)?|jul(y)?|aug(ust)?|sep(tember)?|oct(ober)?|nov(ember)?|dec(ember)?)(\s|\S)(\d{2,4}).*(present)')
    patterns = [pattern1, pattern2, pattern3, pattern4]
    experience = []
    for index, line in enumerate(document):
        for pattern in patterns:
            exp = pattern.findall(line)
            if len(exp) > 0:
                experience.append(document[index:index+4])
                
    return (experience)

In [145]:
def get_project(document):
  pattern1 = re.compile(r'(project(summary)?|project(management)?|duration|start(date)?|end(date)?|about(project)?|tool|plan|executive(summary)?|project(overview)?|project(management)?|project(outline)?|project(objective)?|project(information)?|management(teams)?|budget|timeline|schedule|milestone|deliverables|conclusion|project(proposal)?|research(documents)?|project(roadmap)?)(\s|\S)(\d{2,4}).*(project(summary)?|project(management)?|duration|start(date)?|end(date)?|about(project)?|tool|plan|executive(summary)?|project(overview)?|project(management)?|project(outline)?|project(objective)?|project(information)?|management(teams)?|budget|timeline|schedule|milestone|deliverables|conclusion|project(proposal)?|research(documents)?|project(roadmap)?)(\s|\S)(\d{2,4})')
  pattern2 = re.compile(r'(\d{2}(.|..)\d{4}).{1,4}(\d{2}(.|..)\d{4})')
  pattern3 = re.compile(r'(\d{2}(.|..)\d{4}).{1,4}(present)')
  pattern4 = re.compile(r'(jan(uary)?|feb(ruary)?|mar(ch)?|apr(il)?|may|jun(e)?|jul(y)?|aug(ust)?|sep(tember)?|oct(ober)?|nov(ember)?|dec(ember)?)(\s|\S)(\d{2,4}).*(present)')
  patterns = [pattern1, pattern2, pattern3, pattern4]
  project = []
  for index, line in enumerate(document):
    for pattern in patterns:
      prg = pattern.findall(line)
      if len(prg) > 0:
        project.append(document[index:index+4])
                
  return (project)


In [146]:

experience_1 = []
experience_2 = []
project_1=[]
project_2=[]

for file_name in file_names:
    if file_name.endswith('.pdf'):
        document = open_pdf_file(file_name)
    elif file_name.endswith('.docx'):
        document = open_docx_file(file_name)
    
    
    document = preprocess_document(document)
    
    experience = get_experience(document)

    if len(experience) > 1:
        experience_1.append(experience[0])
        experience_2.append(experience[1])
    elif len(experience) == 1:
        experience_1.append(experience[0])
        experience_2.append('')
    elif len(experience) == 0:
        experience_1.append('')
        experience_2.append('')

    project = get_project(document)

    if len(project) > 1:
        project_1.append(project[0])
        project_2.append(project[1])
    elif len(project) == 1:
        project_1.append(project[0])
        project_2.append('')
    elif len(project) == 0:
        project_1.append('')
        project_2.append('')


In [147]:
import pandas as pd
df = pd.DataFrame({'experience 1':experience_1, 'experiece 2':experience_2, 'project 1':project_1, 'project 2':project_2})

In [148]:
df

Unnamed: 0,experience 1,experiece 2,project 1,project 2
0,[noorul islam centre for higher education / au...,,[noorul islam centre for higher education / au...,
1,"[aug 2014 to present, 1 imparting, technical a...","[july 2019 to present, 1 using deep learning –...","[aug 2014 to present, 1 imparting, technical a...","[july 2019 to present, 1 using deep learning –..."
2,"[(cid:17) aug2014–present, • impartingtechnica...","[(cid:17) jan2019–present, • guidingandimparti...","[(cid:17) aug2014–present, • impartingtechnica...","[(cid:17) jan2019–present, • guidingandimparti..."
3,"[(cid:17) july2010–april2012, be electronicsan...","[(cid:17) july2004–april2008, skills, python, ...","[1349031813429639, professional, reference, ms...",
4,"[(cid:17) july2010–april2012, be electronicsan...","[(cid:17) july2004–april2008, skills, python, ...","[1349031813429639, professional, reference, ms...",


In [110]:
df['experiece 2'][1]

['july 2019 to present',
 '1 using deep learning – natural language processing statistical',
 'modeling techniques to develop and evaluate algorithms to improve',
 'performance quality data management and accuracy']