# Resume Parsing

In [1]:
import fitz  # PyMuPDF
import spacy
from spacy.matcher import PhraseMatcher

1. Using PhraseMatcher

In [2]:
def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page of a PDF.

    Args:
        pdf_path: Path of the PDF file; passed straight to ``fitz.open``.

    Returns:
        The text of all pages joined in page order, or ``None`` when the
        document could not be opened or read (the error is printed).
    """
    try:
        # The context manager closes the document even when a page fails to
        # extract, so the underlying file handle is never leaked.
        with fitz.open(pdf_path) as doc:
            return "".join(page.get_text() for page in doc)
    except Exception as e:
        # Best-effort by design: callers test the result for truthiness.
        print(f"Error opening or reading PDF: {e}")
        return None

def extract_skills_and_points(text):
    """Find known skill phrases and root verbs in resume text.

    NOTE: a later cell in this notebook defines another function with the
    same name; once that cell runs, it replaces this one.

    Args:
        text: Raw resume text.

    Returns:
        A ``(skills, key_points)`` tuple. ``skills`` holds one lower-cased
        entry per phrase occurrence found by the matcher (duplicates are
        kept). ``key_points`` holds the text of every token tagged as a verb
        that is the root of its dependency parse.
    """
    nlp = spacy.load('en_core_web_sm')
    doc = nlp(text)

    # Define some common skills to search for
    skill_phrases = [
        "machine learning", "deep learning", "data analysis", "python", "java",
        "project management", "communication", "teamwork", "sql", "excel"
    ]

    # Match on the lower-cased token text. The default attribute compares the
    # exact token text, so lowercase patterns would never match "Python",
    # "SQL" or "Machine Learning" as they are usually written in a resume.
    matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
    # make_doc only tokenizes, which is all the matcher needs for patterns.
    patterns = [nlp.make_doc(skill) for skill in skill_phrases]
    matcher.add("SKILLS", patterns)

    # Lower-case the matched span so results use the same casing as the
    # skill list regardless of how the resume capitalizes them.
    skills = [doc[start:end].text.lower() for _match_id, start, end in matcher(doc)]

    # Extract key points: verbs that are the root of a dependency parse.
    key_points = [
        token.text
        for token in doc
        if token.pos_ == 'VERB' and token.dep_ == 'ROOT'
    ]

    return skills, key_points

if __name__ == "__main__":
    # Location of the resume to parse; point this at your own PDF file.
    pdf_path = '/content/resume.pdf'
    resume_text = extract_text_from_pdf(pdf_path)

    # Nothing more is printed when extraction failed or yielded no text.
    if resume_text:
        skills, key_points = extract_skills_and_points(resume_text)

        # Print each heading followed by its items, one per line.
        for heading, items in (("Skills:", skills), ("\nKey Points:", key_points)):
            print(heading)
            for item in items:
                print(item)

Error opening or reading PDF: no such file: '/content/resume.pdf'


2. Using a Trie (prefix tree)

In [3]:
class TrieNode:
    """One node of the trie: outgoing edges plus an end-of-phrase marker."""

    def __init__(self):
        # Maps a single character to the child node reached through it.
        self.children = {}
        # True when the path from the root to this node spells a full phrase.
        self.is_end_of_phrase = False


class Trie:
    """Character-level trie used to find stored phrases inside a text."""

    def __init__(self):
        self.root = TrieNode()

    def insert(self, phrase):
        """Store ``phrase`` in the trie, one character per level."""
        node = self.root
        for char in phrase:
            node = node.children.setdefault(char, TrieNode())
        node.is_end_of_phrase = True

    def search(self, text):
        """Return every stored phrase that occurs in ``text``.

        Matching is plain substring matching: a phrase is reported wherever
        its characters appear consecutively, with no word-boundary check.

        Args:
            text: The string to scan.

        Returns:
            A list with one entry per occurrence, ordered by start position
            and, for the same start, by increasing phrase length.
        """
        matched_phrases = []
        text_len = len(text)
        # Try every start position. The previous implementation resumed two
        # characters after a start that matched at least one character, so
        # occurrences beginning right after it (e.g. "sql" in "ssql") were
        # silently missed.
        for start in range(text_len):
            node = self.root
            for end in range(start, text_len):
                node = node.children.get(text[end])
                if node is None:
                    break
                if node.is_end_of_phrase:
                    matched_phrases.append(text[start:end + 1])
        return matched_phrases

In [4]:

# Usage of Trie in the main script
def extract_skills_and_points(text):
    """Find known skill phrases (via a trie) and root verbs in resume text.

    Args:
        text: Raw resume text.

    Returns:
        A ``(skills, key_points)`` tuple. ``skills`` is whatever
        ``Trie.search`` reports for the lower-cased text. ``key_points`` holds
        the text of every token tagged as a verb that is the root of its
        dependency parse.
    """
    pipeline = spacy.load('en_core_web_sm')
    parsed = pipeline(text)

    # Skill phrases to look for.
    known_skills = (
        "machine learning", "deep learning", "data analysis", "python", "java",
        "project management", "communication", "teamwork", "sql", "excel",
    )

    # Load the phrases into a trie, lower-cased so matching ignores case.
    skill_trie = Trie()
    for entry in known_skills:
        skill_trie.insert(entry.lower())

    # Scan the lower-cased resume text for the stored phrases.
    found_skills = skill_trie.search(text.lower())

    # Key points are the verbs that head a dependency parse.
    root_verbs = [
        tok.text
        for tok in parsed
        if tok.pos_ == 'VERB' and tok.dep_ == 'ROOT'
    ]

    return found_skills, root_verbs

if __name__ == "__main__":
    # Location of the resume to parse; point this at your own PDF file.
    pdf_path = '/content/resume.pdf'
    resume_text = extract_text_from_pdf(pdf_path)

    # Nothing more is printed when extraction failed or yielded no text.
    if resume_text:
        skills, key_points = extract_skills_and_points(resume_text)

        # Print each heading followed by its items, one per line.
        for heading, items in (("Skills:", skills), ("\nKey Points:", key_points)):
            print(heading)
            for item in items:
                print(item)


Error opening or reading PDF: no such file: '/content/resume.pdf'
