In [None]:
# 🧾 Resume Parser using NLP (spaCy)

This project uses Natural Language Processing to extract structured information from unstructured resume text files.

## 🎯 Goals

- Parse resume files using spaCy and regex
- Extract name, email, phone, education, and skills
- Format results into structured output


In [None]:
## 📂 Load Resume & spaCy Pipeline


In [None]:
## 🔍 Extract Key Information
- Name (first PERSON entity)
- Email (regex)
- Phone number (regex)
- Education (keyword scan)
- Skills (section-based pattern)


In [None]:
## 🧹 Cleaned & Structured Output
Resume summary is stored as a Python dictionary and saved to JSON/CSV.


In [1]:
# Resume Parser Starter – Load Text + spaCy Setup

import spacy
import re

# Load spaCy model
nlp = spacy.load("en_core_web_sm")

# Load sample resume text
with open("Sample Resumes/resume1.txt", "r", encoding="utf-8") as file:
    resume_text = file.read()

# Run through spaCy pipeline
doc = nlp(resume_text)

# Preview first few named entities
for ent in doc.ents[:10]:
    print(f"{ent.text} — {ent.label_}")


Jeff Lebowski
Anytown — PERSON
888 — CARDINAL
Professional Summary
Learning & Development Specialist — ORG
Organizational Strategist — ORG
25+ years — DATE
L&D — ORG
first — ORDINAL
AI — GPE
Core Competencies
Learning Strategy & Curriculum Design • Organizational Development • Change Management — ORG
Science & Technology Corporation — ORG


In [2]:
# Extract name: first PERSON entity
name = None
for ent in doc.ents:
    if ent.label_ == "PERSON":
        name = ent.text
        break

print("Name:", name)


Name: Jeff Lebowski
Anytown


In [4]:
# Extract email using regex
email = re.findall(r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+", resume_text)
print("Email:", email[0] if email else "Not found")


Email: thedude@compuserve.com


In [5]:
# Extract phone number (basic pattern match)
phone = re.findall(r"\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}", resume_text)
print("Phone:", phone[0] if phone else "Not found")


Phone: 888-555-1212


In [6]:
# Education keywords to scan for
education_keywords = ["B.A.", "B.S.", "M.A.", "M.S.", "MBA", "PhD", "Bachelor", "Master", "Doctorate", "Associate", "High School"]

education_found = []
for line in resume_text.split('\n'):
    for keyword in education_keywords:
        if keyword.lower() in line.lower():
            education_found.append(line.strip())

print("Education Found:", education_found)


Education Found: ['Bachelor of Applied Science, Human Resource Management — Pensacola State College', 'Bachelor of Applied Science, Business Administration — Pensacola State College']


In [9]:
# Clean skill lines (remove bullets, tabs, etc.)
cleaned_skills = []
for skill in skills_section:
    # Remove leading bullets, dashes, or tabs
    skill = skill.lstrip("•-–\t ").strip()
    cleaned_skills.append(skill)

print("Cleaned Skills:", cleaned_skills)


Cleaned Skills: ['Learning Strategy & Curriculum Design • Organizational Development • Change Management', 'Training Needs Analysis • Instructional Design • AI-Integrated Training Programs', 'Leadership Coaching • Process Improvement • Cross-Functional Collaboration', 'Learning Management Systems (LMS) • Performance Enablement • Compliance Training', '________________________________________', 'Professional Experience', 'TISTA Science & Technology Corporation — Remote, FL', 'Learning & Development Specialist | Nov 2024 – Present', 'Designed and piloted AI/Data Analytics curriculum projected to improve workforce training effectiveness by 30%', 'Conducted organization-wide learning needs assessments, mapping results to strategic workforce planning', 'Partnered with HR and Compliance teams to align L&D initiatives with policy, DEI, and culture goals', 'Integrated AI tools into course delivery and analysis to monitor training efficacy in real-time', 'Strategic Initiatives Consultant | Apr 

In [10]:
# Final resume summary
resume_summary = {
    "Name": name,
    "Email": email[0] if email else None,
    "Phone": phone[0] if phone else None,
    "Education": education_found,
    "Skills": cleaned_skills
}

from pprint import pprint
pprint(resume_summary)


{'Education': ['Bachelor of Applied Science, Human Resource Management — '
               'Pensacola State College',
               'Bachelor of Applied Science, Business Administration — '
               'Pensacola State College'],
 'Email': 'thedude@compuserve.com',
 'Name': 'Jeff Lebowski\nAnytown',
 'Phone': '888-555-1212',
 'Skills': ['Learning Strategy & Curriculum Design • Organizational '
            'Development • Change Management',
            'Training Needs Analysis • Instructional Design • AI-Integrated '
            'Training Programs',
            'Leadership Coaching • Process Improvement • Cross-Functional '
            'Collaboration',
            'Learning Management Systems (LMS) • Performance Enablement • '
            'Compliance Training',
            '________________________________________',
            'Professional Experience',
            'TISTA Science & Technology Corporation — Remote, FL',
            'Learning & Development Specialist | Nov 2024 – Pres

import json

with open("parsed_resume.json", "w", encoding="utf-8") as f:
    json.dump(resume_summary, f, indent=4)

print("Saved as parsed_resume.json")


In [12]:
import pandas as pd

# Flatten single-row summary to DataFrame
df_summary = pd.DataFrame([resume_summary])
df_summary.to_csv("parsed_resume.csv", index=False)

print("Saved as parsed_resume.csv")


Saved as parsed_resume.csv


In [None]:
## ✅ Summary

This project demonstrates a practical application of NLP using spaCy and regex to extract structured data from real-world resume documents.

It lays the foundation for building more advanced parsing tools, such as:
- Support for PDF files
- Enhanced skill classification
- Resume-job description matching
