### Display the PDF

In [1]:
from docx2pdf import convert
from IPython.display import display, IFrame
import os
import PyPDF2 as pdf
from pyresparser import ResumeParser
import spacy
import tempfile
from docx2pdf import convert
import re
import nltk
from spacy.matcher import Matcher
import constants as cs
from datetime import datetime
from dateutil.relativedelta import relativedelta
import pandas as pd
from nltk.stem import WordNetLemmatizer


In [2]:
def display_pdf(pdf_path):
    # Convert absolute path to relative path
    notebook_dir = os.getcwd()  # Get the current working directory where the notebook is located
    relative_path = os.path.relpath(pdf_path, notebook_dir)
    # Display the PDF using an iframe
    display(IFrame(relative_path, width=800, height=600))

Parsing Structure: Identifies key sections of the resume like contact information, education, work experience, skills, etc., using techniques like keyword matching, regular expressions, or machine learning models.

Data Extraction: Extracts specific information within each section:

Contact: Name, email address, phone number, location (optional).

Education: School name, degree name, graduation year, location (optional), relevant coursework (optional).

Work Experience: Company name, job title, employment dates, location (optional), key responsibilities and achievements (bullet points).

Skills: Technical skills, soft skills, keywords.
Additional Sections: Certifications, awards, projects (optional), depending on the parser's capabilities.

In [3]:
def read_text_from_file(file_path):
    _, file_extension = os.path.splitext(file_path)
    if file_extension.lower() == '.pdf':
        display_pdf(file_path)
        return read_text_from_pdf(file_path)
    elif file_extension.lower() == '.docx':
        pdf_path = convert_docx_to_pdf(file_path)
        if pdf_path:
            display_pdf(pdf_path)
            return read_text_from_pdf(pdf_path)
        else:
            return None
    else:
        print("Unsupported file format. Supported formats are PDF and DOCX.")
        return None

def read_text_from_pdf(pdf_path):
    try:
        with open(pdf_path, 'rb') as f:
            reader = pdf.PdfFileReader(f)
            text = ""
            for page_num in range(reader.numPages):
                text += reader.getPage(page_num).extractText()
        return text
    except Exception as e:
        print(f"Error reading text from PDF: {e}")
        return None

def convert_docx_to_pdf(docx_path):
    try:
        # Convert DOCX to PDF
        pdf_path = tempfile.mktemp(suffix='.pdf')
        convert(docx_path, pdf_path)
        return pdf_path
    except Exception as e:
        print(f"Error converting DOCX to PDF: {e}")
        return None

# Example usage
# file_path = "Resume Examples\Creative teaching resume.docx" # Replace with the path to your file
file_path = r"C:\Users\hotpr\OneDrive - Norwich University\Apprenticeship\New Apprenticeship\Resume.pdf"
text = read_text_from_file(file_path)
if text:
    print("Text extracted from the file:")
    print(text)
else:
    print("Failed to extract text from the file.")



Text extracted from the file:
TOLUWANI C. OLUKANNI  
Northfield, VT 05663  | +1 (267) 423 -3529  | tolukann5778@gmail.com  | linkedin.com/in/toluwani -olukanni  | https://github.com/Toluwani5778  
Education  
Norwich University, Northfield, VT                May 2025  
Bachelor of Science in Electrical and Computer Engineering  
Minors: Mathematics and Computer Science                         GPA 3.9 6 
Relevant courses and topics: Embedded Systems, Circuit Design and Analysis, Electronics and  Electrical 
Components, Programming for Microcontrollers, Signal Processing, Fundamentals of Digital Design  
 
Relevant Experience  
Information Operations /Cybersecurity  Subject Matter Expert  
Norwich University Applied Research Institute             January  2024 – Present  
Norwich University, Northfield, VT  
• Collaborate with the Software Development Team to integrate ML models seamlessly into the CI/CD 
pipeline.  
• Assist in the execution of assigned duties and tasks to support the d

In [4]:
nlp = spacy.load("en_core_web_sm")

def extract_email(text):
    '''
    Helper function to extract email id from text

    :param text: plain text extracted from resume file
    '''
    email = re.findall(r"([^@|\s]+@[^@]+\.[^@|\s]+)", text)
    if email:
        try:
            return email[0].split()[0].strip(';')
        except IndexError:
            return None
        
print(extract_email(text))


tolukann5778@gmail.com


In [22]:
import re

def extract_name(full_name):
    # Regular expression patterns for different name formats
    patterns = [
        # First name, middle initial, last name
        r'^([A-Z][a-z]+)\s+([A-Z])\.\s+([A-Z][a-z]+)$',
        # First name, last name
        r'^([A-Z][a-z]+)\s+([A-Z][a-z]+)$',
        # First name
        r'^([A-Z][a-z]+)$'
    ]
    
    # Iterate through each pattern and attempt to match
    for pattern in patterns:
        match = re.match(pattern, full_name)
        if match:
            # Extract the parts based on the matched pattern
            if len(match.groups()) == 3:
                return match.group(1), match.group(2), match.group(3)
            elif len(match.groups()) == 2:
                return match.group(1), None, match.group(2)
            else:
                return match.group(1), None, None
    
    # If no match found, return None
    return None, None, None

# Example usage
name_formats = [
    "John D. Doe",
    "John Doe",
    "John"
]

for name in name_formats:
    print(extract_name(name)[1])
    print("Full Name:", name)
    print("First Name:", first_name)
    print("Middle Initial:", middle_initial)
    print("Last Name:", last_name)
    print()


D
Full Name: John D. Doe
First Name: John
Middle Initial: None
Last Name: None

None
Full Name: John Doe
First Name: John
Middle Initial: None
Last Name: None

None
Full Name: John
First Name: John
Middle Initial: None
Last Name: None



In [5]:
# def extract_education_from_resume(text):
#     education = []

#     # List of education keywords to match against
#     education_keywords = ['Bsc', 'B. Pharmacy', 'B Pharmacy', 'Msc', 'M. Pharmacy', 'Ph.D', 'Bachelor', 'Master']

#     for keyword in education_keywords:
#         pattern = r"(?i)\b{}\b".format(re.escape(keyword))
#         match = re.search(pattern, text)
#         if match:
#             education.append(match.group())

#     return education

def extract_education_from_resume(text):
    education = []

    # Use regex pattern to find education information
    pattern = r"(?i)(?:Bsc|\bB\.\w+|\bM\.\w+|\bPh\.D\.\w+|\bBachelor(?:'s)?|\bMaster(?:'s)?|\bPh\.D)\s(?:\w+\s)*\w+"
    matches = re.findall(pattern, text)
    for match in matches:
        education.append(match.strip())

    return education

extract_education_from_resume(text)

['Bachelor of Science in Electrical and Computer Engineering']

In [6]:

def remove_newlines(text):
  """Removes all newline characters (`\n`) from a string."""
  return text.replace('\n', '')

def extract_phone_numbers(text):
    '''
    Function to extract phone numbers from text

    :param text: plain text containing phone numbers
    :return: list of phone numbers found in the text
    '''
    pattern = re.compile(r'(?<!\n)(\+?\d{0,3}\s?[-\.\(\)]?\s?\(?\d{3}\)?\s?[-\.\(\)]?\s?\d{3}\s?[-\.\(\)]?\s?\d{4})')

    # Find all matches of the pattern in the text
    matches = re.findall(pattern, text)
    clean_matches = [remove_newlines(match).lstrip() for match in matches]
    # Return the list of phone numbers found
    return clean_matches


print(extract_phone_numbers(text))


['+1 (267) 423 -3529']


In [7]:

nlp_text = nlp(text)
def extract_name(nlp_text):
    '''
    Helper function to extract name from spacy nlp text

    :param nlp_text: object of `spacy.tokens.doc.Doc`
    :param matcher: object of `spacy.matcher.Matcher`
    :return: string of full name
    '''
    matcher = Matcher(nlp.vocab)
    for pattern in cs.NAME_PATTERN:
        matcher.add('NAME', [pattern])
    matches = matcher(nlp_text)

    for _, start, end in matches:
        span = nlp_text[start:end]
        if 'name' not in span.text.lower():
            return span.text
        
print(extract_name(nlp_text))

TOLUWANI C.


In [8]:
from datetime import datetime

def detect_date_format(date_str):
    '''
    Detects the format of the input date string.

    :param date_str: Input date string
    :return: Detected date format
    '''
    formats = [
        ('%b %Y', 'Month YYYY'),
        ('%Y-%m-%d', 'YYYY-MM-DD'),  # ISO 8601 format
        ('%m/%d/%Y', 'MM/DD/YYYY'),  # US format
        ('%d-%m-%Y', 'DD-MM-YYYY'),  # European/African format
        ('%B %dst, %Y', 'Month DDst, YYYY'),  # Full month name with ordinal day
        ('%B %dnd, %Y', 'Month DDnd, YYYY'),  # Full month name with ordinal day
        ('%B %drd, %Y', 'Month DDrd, YYYY'),  # Full month name with ordinal day
        ('%B %dth, %Y', 'Month DDth, YYYY')   # Full month name with ordinal day
    ]

    for date_format, format_name in formats:
        try:
            datetime.strptime(date_str, date_format)
            return format_name
        except ValueError:
            continue
    
    return 'Unknown'

In [9]:
def get_total_experience(experience_list):
    '''
    Wrapper function to extract total months of experience from a resume

    :param experience_list: list of experience text extracted
    :return: total months of experience
    '''
    exp_ = []
    for line in experience_list:
        experience = re.search(
            r'(?P<fmonth>\w+.\d+)\s*(\D|to)\s*(?P<smonth>\w+.\d+|present)',
            line,
            re.I
        )
        if experience:
            exp_.append(experience.groups())
    total_exp = sum(
        [get_number_of_months_from_dates(i[0], i[2]) for i in exp_]
    )
    total_experience_in_months = total_exp
    return total_experience_in_months

In [10]:
def get_number_of_months_from_dates(date1, date2):
    '''
    Helper function to extract total months of experience from a resume

    :param date1: Starting date
    :param date2: Ending date
    :return: months of experience from date1 to date2
    '''
    months_of_experience = 0  # Default value
    
    if detect_date_format(date1) == 'YYYY-MM-DD':
        if date2.lower() == 'present':
            date2 = datetime.now().strftime('%Y-%m-%d')
        try:
            start_date = datetime.strptime(str(date1), '%Y-%m-%d')
            end_date = datetime.strptime(str(date2), '%Y-%m-%d')
            months_of_experience = (end_date.year - start_date.year) * 12 + (
                end_date.month - start_date.month)
        except ValueError:
            return 0

    elif detect_date_format(date1) == 'MM/DD/YYYY':
        if date2.lower() == 'present':
            date2 = datetime.now().strftime('%m/%d/%Y')
        try:
            start_date = datetime.strptime(str(date1), '%m/%d/%Y')
            end_date = datetime.strptime(str(date2), '%m/%d/%Y')
            months_of_experience = (end_date.year - start_date.year) * 12 + (
                end_date.month - start_date.month)
        except ValueError:
            return 0

    elif detect_date_format(date1) == 'DD-MM-YYYY':
        if date2.lower() == 'present':
            date2 = datetime.now().strftime('%d-%m-%Y')
        try:
            start_date = datetime.strptime(str(date1), '%d-%m-%Y')
            end_date = datetime.strptime(str(date2), '%d-%m-%Y')
            months_of_experience = (end_date.year - start_date.year) * 12 + (
                end_date.month - start_date.month)
        except ValueError:
            return 0

    elif detect_date_format(date1) == 'Month DDst, YYYY':
        if date2.lower() == 'present':
            date2 = datetime.now().strftime('%B %dst, %Y')
        try:
            start_date = datetime.strptime(str(date1), '%B %dst, %Y')
            end_date = datetime.strptime(str(date2), '%B %dst, %Y')
            months_of_experience = (end_date.year - start_date.year) * 12 + (
                end_date.month - start_date.month)
        except ValueError:
            return 0
    elif detect_date_format(date1) == 'Month DDnd, YYYY':
        if date2.lower() == 'present':
            date2 = datetime.now().strftime('%B %dnd, %Y')
        try:
            start_date = datetime.strptime(str(date1), '%B %dnd, %Y')
            end_date = datetime.strptime(str(date2), '%B %dnd, %Y')
            months_of_experience = (end_date.year - start_date.year) * 12 + (
                end_date.month - start_date.month)
        except ValueError:
            return 0
    elif detect_date_format(date1) == 'Month DDrd, YYYY':
        if date2.lower() == 'present':
            date2 = datetime.now().strftime('%B %drd, %Y')
        try:
            start_date = datetime.strptime(str(date1), '%B %drd, %Y')
            end_date = datetime.strptime(str(date2), '%B %drd, %Y')
            months_of_experience = (end_date.year - start_date.year) * 12 + (
                end_date.month - start_date.month)
        except ValueError:
            return 0
    elif detect_date_format(date1) == 'Month DDth, YYYY':
        if date2.lower() == 'present':
            date2 = datetime.now().strftime('%B %dth, %Y')
        try:
            start_date = datetime.strptime(str(date1), '%B %dth, %Y')
            end_date = datetime.strptime(str(date2), '%B %dth, %Y')
            months_of_experience = (end_date.year - start_date.year) * 12 + (
                end_date.month - start_date.month)
        except ValueError:
            return 0
    elif detect_date_format(date1) == 'Month YYYY':
        
        if date2.lower() == 'present':
            date2 = datetime.now().strftime('%b %Y')
        try:
            if len(date1.split()[0]) > 3:
                date1 = date1.split()
                date1 = date1[0][:3] + ' ' + date1[1]
            if len(date2.split()[0]) > 3:
                date2 = date2.split()
                date2 = date2[0][:3] + ' ' + date2[1]
        except IndexError:
            return 0
        try:
            date1 = datetime.strptime(str(date1), '%b %Y')
            date2 = datetime.strptime(str(date2), '%b %Y')
            months_of_experience = relativedelta(date2, date1)
            months_of_experience = (months_of_experience.years * 12 +
                                    months_of_experience.months)
        except ValueError:
            return 0
    else:
        try:
            pass
        except Exception as e:
            return "Can't calculate"

    return months_of_experience


date1 = 'Jan 2019'  # Starting date
date2 = 'Dec 2021'  # Ending date
months_of_experience = get_number_of_months_from_dates(date1, date2)
print("Months of experience:", months_of_experience)

Months of experience: 35


In [11]:
def extract_skills(nlp_text, skills_file=None):
    '''
    Helper function to extract skills from spacy nlp text

    :param nlp_text: object of `spacy.tokens.doc.Doc`
    :param noun_chunks: noun chunks extracted from nlp text
    :return: list of skills extracted
    '''
    tokens = [token.text for token in nlp_text if not token.is_stop]
    noun_chunks = nlp_text.noun_chunks
    if not skills_file:
        data = pd.read_csv(
            os.path.join(os.getcwd(), 'skills.csv')
        )
    else:
        data = pd.read_csv(skills_file)
    skills = list(data.columns.values)
    skillset = []
    # check for one-grams
    for token in tokens:
        if token.lower() in skills:
            skillset.append(token)

    # check for bi-grams and tri-grams
    for token in noun_chunks:
        token = token.text.lower().strip()
        if token in skills:
            skillset.append(token)
    return [i.capitalize() for i in set([i.lower() for i in skillset])]

print(extract_skills(nlp_text))

## Edit the skills csv to make sure no rubbish skills is considered 
# investigate if time management is there

['Analysis', 'C++', 'Adobe', 'Algorithms', 'Excel', 'Collaborative', 'Design', 'Adobe photoshop', 'C', 'Matrix', 'Training', 'Programming', 'Modeling', 'Cloud', 'Mathematics', 'Python', 'International', 'Assembly', 'Pandas', 'Science', 'Research projects', 'Data', 'Distribution', 'Solidworks', 'Electrical', 'Troubleshooting', 'Software', 'Medical', 'Circuits', 'Matlab', 'Interactive', 'Machine learning', 'Technology', 'Architecture', 'Deep learning', 'Publishing', 'Accuracy', 'Electronics', 'Experimental', 'Photoshop', 'Research', 'Powerpoint', 'Word', 'Technical', 'Testing', 'Computer', 'Ms excel', 'Ai', 'Engineering', 'Corel draw', 'Certification', 'Operations', 'Languages']


In [12]:
def extract_education(nlp_text):
    '''
    Helper function to extract education from spacy nlp text

    :param nlp_text: object of `spacy.tokens.doc.Doc`
    :return: tuple of education degree and year if year if found
             else only returns education degree
    '''
    edu = {}
    # Extract education degree
    try:
        for token in nlp_text:
            token_text = token.text.strip()
            token_text = re.sub(r'[?|$|.|!|,]', r'', token_text)
            if token_text.upper() in cs.EDUCATION and token_text not in cs.STOPWORDS:
                # Concatenate current token with next token if available
                next_token = nlp_text[token.i + 1].text if token.i + 1 < len(nlp_text) else ''
                edu[token_text] = token.text + next_token
    except IndexError:
        pass

    # Extract year
    education = []
    for key in edu.keys():
        year = re.search(re.compile(cs.YEAR), edu[key])
        if year:
            education.append((key, ''.join(year.group(0))))
        else:
            education.append(key)
    return education

print(extract_education(nlp_text))


['Bachelor', 'MS']


In [13]:
def extract_experience(resume_text):
    '''
    Helper function to extract experience from resume text

    :param resume_text: Plain resume text
    :return: list of experience
    '''
    wordnet_lemmatizer = WordNetLemmatizer()

    # word tokenization
    word_tokens = nltk.word_tokenize(resume_text)

    # remove stop words and lemmatize
    filtered_sentence = [
            w for w in word_tokens if w not
            in cs.STOPWORDS and wordnet_lemmatizer.lemmatize(w)
            not in cs.STOPWORDS
        ]
    sent = nltk.pos_tag(filtered_sentence)

    # parse regex
    cp = nltk.RegexpParser('P: {<NNP>+}')
    cse = cp.parse(sent)

    # for i in cs.subtrees(filter=lambda x: x.label() == 'P'):
    #     print(i)

    test = []

    for vp in list(
        cse.subtrees(filter=lambda x: x.label() == 'P')
    ):
        test.append(" ".join([
            i[0] for i in vp.leaves()
            if len(vp.leaves()) >= 2])
        )

    # Search the word 'experience' in the chunk and
    # then print out the text after it
    x = [
        x[x.lower().index('experience') + 10:]
        for i, x in enumerate(test)
        if x and 'experience' in x.lower()
    ]
    return x

print(extract_experience(text))

[' Information Operations']


In [14]:
def extract_entity_sections_professional(text):
    '''
    Helper function to extract all the raw text from sections of
    resume specifically for professionals

    :param text: Raw text of resume
    :return: dictionary of entities
    '''
    text_split = [i.strip() for i in text.split('\n')]
    entities = {}
    key = False
    for phrase in text_split:
        if len(phrase) == 1:
            p_key = phrase
        else:
            p_key = set(phrase.lower().split()) \
                    & set(cs.RESUME_SECTIONS_PROFESSIONAL)
        try:
            p_key = list(p_key)[0]
        except IndexError:
            pass
        if p_key in cs.RESUME_SECTIONS_PROFESSIONAL:
            entities[p_key] = []
            key = p_key
        elif key and phrase.strip():
            entities[key].append(phrase)
    return entities

extract_entity_sections_professional(text)

{'education': ['Norwich University, Northfield, VT                May 2025',
  'Bachelor of Science in Electrical and Computer Engineering',
  'Minors: Mathematics and Computer Science                         GPA 3.9 6',
  'Relevant courses and topics: Embedded Systems, Circuit Design and Analysis, Electronics and  Electrical',
  'Components, Programming for Microcontrollers, Signal Processing, Fundamentals of Digital Design'],
 'experience': ['surfaces, utilizing various tools such as files, Dremel tools, and buffing wheels to achieve a high -quality',
  'finish.',
  'Projects & Work Portfolio',
  '• Inductive Cross -Sectional Area Sensor for Bore Tube Measurement: A collaborative project with',
  'peers involving the principle of mutual induction between a sensing coil and a virtual coil to measure',
  'the distance between them.',
  'Expected Completion:  December 2025',
  '• Search and Rescue AI -enabled drones: A collaborative project utilizing Python to develop intelligent',
  'd

In [15]:
def extract_entity_sections(text):
    '''
    Helper function to extract all the raw text from sections of
    resume specifically for graduates and undergraduates

    :param text: Raw text of resume
    :return: dictionary of entities
    '''
    text_split = [i.strip() for i in text.split('\n')]
    entities = {}
    key = False
    for phrase in text_split:
        if len(phrase) == 1:
            p_key = phrase
        else:
            p_key = set(phrase.lower().split()) \
                    & set(cs.RESUME_SECTIONS)
        try:
            p_key = list(p_key)[0]
        except IndexError:
            pass
        if p_key in cs.RESUME_SECTIONS:
            entities[p_key] = []
            key = p_key
        elif key and phrase.strip():
            entities[key].append(phrase)

    return entities

extract_entity_sections(text)

{'education': ['Norwich University, Northfield, VT                May 2025',
  'Bachelor of Science in Electrical and Computer Engineering',
  'Minors: Mathematics and Computer Science                         GPA 3.9 6',
  'Relevant courses and topics: Embedded Systems, Circuit Design and Analysis, Electronics and  Electrical',
  'Components, Programming for Microcontrollers, Signal Processing, Fundamentals of Digital Design'],
 'experience': ['surfaces, utilizing various tools such as files, Dremel tools, and buffing wheels to achieve a high -quality',
  'finish.'],
 'projects': ['• Inductive Cross -Sectional Area Sensor for Bore Tube Measurement: A collaborative project with',
  'peers involving the principle of mutual induction between a sensing coil and a virtual coil to measure',
  'the distance between them.',
  'Expected Completion:  December 2025',
  '• Search and Rescue AI -enabled drones: A collaborative project utilizing Python to develop intelligent',
  'drones with enhance

In [16]:
sections = extract_entity_sections(text)
if "projects" in sections:
    print(sections["projects"])

['• Inductive Cross -Sectional Area Sensor for Bore Tube Measurement: A collaborative project with', 'peers involving the principle of mutual induction between a sensing coil and a virtual coil to measure', 'the distance between them.', 'Expected Completion:  December 2025', '• Search and Rescue AI -enabled drones: A collaborative project utilizing Python to develop intelligent', 'drones with enhanced search and rescue capabilities, actively participating in integrating AI algorithms', 'and drone technology to address autonomous navigation, obstacle avoidance, and real -time data', 'analysis challenges.', '• Enhancing Lung Cancer Diagnosis with Regularized CNNs for Histopathological Image', 'Classification: The purpose of this project was to explore multiple methods of regularization to', 'minimize overfitting in Convolutional Neural Networks and increase validation accuracy and', 'generalization of the data.', '• 8x8 LED Matrix MAX7219 Display controlled by Arduino via Bluetooth: An i