In [12]:
# %pip (unlike !pip) installs into the environment of the running kernel,
# so the `import fitz` below sees the package that was just installed.
%pip install PyMuPDF

Collecting PyMuPDF
  Downloading pymupdf-1.25.5-cp39-abi3-win_amd64.whl.metadata (3.4 kB)
Downloading pymupdf-1.25.5-cp39-abi3-win_amd64.whl (16.6 MB)
   ---------------------------------------- 0.0/16.6 MB ? eta -:--:--
    --------------------------------------- 0.3/16.6 MB ? eta -:--:--
   -- ------------------------------------- 1.0/16.6 MB 3.6 MB/s eta 0:00:05
   ---- ----------------------------------- 1.8/16.6 MB 3.7 MB/s eta 0:00:04
   ------ --------------------------------- 2.6/16.6 MB 3.7 MB/s eta 0:00:04
   ------- -------------------------------- 3.1/16.6 MB 3.5 MB/s eta 0:00:04
   --------- ------------------------------ 3.9/16.6 MB 3.3 MB/s eta 0:00:04
   ---------- ----------------------------- 4.5/16.6 MB 3.4 MB/s eta 0:00:04
   ------------- -------------------------- 5.5/16.6 MB 3.6 MB/s eta 0:00:04
   --------------- ------------------------ 6.3/16.6 MB 3.5 MB/s eta 0:00:03
   ----------------- ---------------------- 7.1/16.6 MB 3.5 MB/s eta 0:00:03
   -------------

In [None]:
# Final version of the code


import fitz  # PyMuPDF
import json
import re

# STEP 1: Extract raw text from PDF
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Path of the PDF file; passed unchanged to ``fitz.open``.

    Returns:
        str: The result of ``page.get_text()`` for each page, joined with no
        separator. A document without pages yields an empty string.
    """
    # Using the document as a context manager closes it (releasing the file
    # handle) even when text extraction raises part-way through.
    with fitz.open(pdf_path) as doc:
        # join() builds the result once instead of re-copying it per page.
        return "".join(page.get_text() for page in doc)

# STEP 2: Normalize text (remove extra whitespace, replace | with space)
def normalize_text(text):
    """Canonicalize text for matching.

    Pipe characters are treated as separators (turned into spaces), every run
    of whitespace is collapsed to a single space, and the result is trimmed
    and lower-cased.
    """
    without_pipes = text.replace('|', ' ')
    single_spaced = re.sub(r'\s+', ' ', without_pipes)
    trimmed = single_spaced.strip()
    return trimmed.lower()

def normalize_skill(skill):
    """Canonicalize a skill name: collapse whitespace runs, trim, lower-case."""
    single_spaced = re.sub(r'\s+', ' ', skill)
    trimmed = single_spaced.strip()
    return trimmed.lower()

# STEP 3: Load skills from a JSON file
def load_skills_from_json(json_path):
    """Read the skill list stored in a JSON file.

    The file is read as UTF-8 and the value under its top-level ``'skills'``
    key is returned as a set (so duplicates disappear); a missing key gives an
    empty set.
    """
    with open(json_path, mode='r', encoding='utf-8') as handle:
        payload = json.loads(handle.read())
    listed_skills = payload.get('skills', [])
    return set(listed_skills)

# STEP 4: Match each normalized skill against the normalized text
def extract_skills_with_exact_match(text, skill_set):
    """Find which of the given skills occur as whole terms in ``text``.

    The text is normalized with ``normalize_text`` and every skill with
    ``normalize_skill`` before comparing, so matching ignores case and
    differences in whitespace. A skill is reported only when it is not
    embedded in a longer word: "r programming" is not found in
    "other programming". Skills that begin or end with punctuation
    ("C++", "C#", ".NET") are matched as well.

    Note: a bare "C" is still found inside "C++", because "+" is not a word
    character.

    Args:
        text: Raw text to search.
        skill_set: Iterable of skill strings. Skills that are empty after
            normalization are ignored.

    Returns:
        dict: ``{'skills': [...]}`` holding the matched skills in their
        original spelling, sorted.
    """
    normalized_text = normalize_text(text)
    found_skills = set()
    for skill in skill_set:
        normalized_skill = normalize_skill(skill)
        if not normalized_skill:
            # An empty pattern matches any text; never report a blank skill.
            continue
        # Lookarounds rather than \b: \b needs a word character on one side,
        # so it cannot match right after "c++" or right before ".net". The
        # same pattern is used for multi-word skills, where a plain substring
        # test would also accept hits that start or end inside another word.
        pattern = r'(?<!\w)' + re.escape(normalized_skill) + r'(?!\w)'
        if re.search(pattern, normalized_text):
            found_skills.add(skill)
    return {'skills': sorted(found_skills)}


# Example usage: run the whole pipeline on one PDF and print the matched skills.
pdf_path = "Profile_2.pdf"  # Replace with your actual file
skills_file = "skills.json"  # JSON file whose 'skills' key holds the candidate skills

# Raw PDF text -> candidate skills -> the candidates that occur in the text.
text = extract_text_from_pdf(pdf_path)
skill_set = load_skills_from_json(skills_file)
extracted_skills = extract_skills_with_exact_match(text, skill_set)

# Prints a dict of the form {'skills': [...]}.
print(extracted_skills)

{'skills': ['Communication', 'Data Analysis', 'Data Science', 'Data Visualization', 'Excel', 'HTML', 'Machine Learning', 'Mathematics', 'Problem-Solving', 'Teamwork', 'Time Management']}
