In [1]:
import re
import spacy
from pdfminer.high_level import extract_text

In [2]:
# Load pre-trained SpaCy model for NER
nlp = spacy.load("en_core_web_sm")

In [3]:
def extract_text_from_pdf(file_path):
    """Extract text from a PDF file."""
    try:
        text = extract_text(file_path)
        return text
    except Exception as e:
        print(f"Error extracting text: {e}")
        return None

In [4]:
def extract_entities(text):
    """Extract named entities from the text using SpaCy."""
    doc = nlp(text)
    entities = {
        "PERSON": [],
        "ORG": [],
        "DATE": [],
        "GPE": [],
        "SKILLS": []
    }
    for ent in doc.ents:
        if ent.label_ in entities:
            entities[ent.label_].append(ent.text)
    return entities

In [5]:
def extract_skills(text):
    """Extract skills using keyword matching."""
    skills_list = ["Python", "Java", "C++", "Machine Learning", "Data Analysis", "Project Management", "SQL", "React", "Node.js"]
    found_skills = []
    for skill in skills_list:
        if re.search(rf"\\b{re.escape(skill)}\\b", text, re.IGNORECASE):
            found_skills.append(skill)
    return found_skills

In [6]:
def structure_data(entities, skills):
    """Organize extracted data into a structured format."""
    structured_data = {
        "Name": entities.get("PERSON", [None])[0],
        "Organizations": entities.get("ORG", []),
        "Dates": entities.get("DATE", []),
        "Locations": entities.get("GPE", []),
        "Skills": skills
    }
    return structured_data

In [20]:
if __name__ == "__main__":
    # Example file path for a resume PDF
    file_path = r"D:\certificate\Rahulduttaresume.pdf"

    # Step 1: Extract text from PDF
    resume_text = extract_text_from_pdf(r"D:\certificate\Rahulduttaresume.pdf")

    if resume_text:
        # Step 2: Extract entities using SpaCy
        entities = extract_entities(resume_text)

        # Step 3: Extract skills
        skills = extract_skills(resume_text)

        # Step 4: Structure the data
        structured_resume = structure_data(entities, skills)

        # Output structured data
        print("Extracted and Organized Resume Information:")
        print(structured_resume)

Extracted and Organized Resume Information:
{'Name': 'Html', 'Organizations': ['|rahuldutta1237@gmail.com', 'Artificial Intelligence and Machine Learning', 'St. Xavier’s College\n JAC Board', 'Holy Cross School', 'CBSE Board', 'CSS', 'CSS', 'HTML', 'CSS', 'HTML', 'CSS', 'HTML', 'CSS', 'Firebase \n\n• Developed a Ecommerce', 'SkillCraft Technology', 'Pinnacle Labs (', 'C/C++', 'SQL', 'JavaScript', 'HTML/CSS\nDeveloper Tools', 'Team Management\n\nCertifications', 'Hadoop Training\n\nWeb Development\n\nData Structure', 'Python\n\nPython Fundamentals\n\nAchievements\n\nMedal of', '2nd Semester\n\nCertificate Of Participation(Hackathon', 'Science Olympiad Foundation(SOF'], 'Dates': ['2022 - 2026', '2020 - 2022', '2019 - 2020', '2024 - Dec 2024'], 'Locations': ['Bengaluru', 'Karnataka', 'Ranchi', 'Jharkhand', 'Jharkhand', 'NumPy', 'Merit', 'Banglore'], 'Skills': []}
