In [1]:
# ✅ Here I am installing all the necessary Python packages needed for this project
# pdfplumber and PyMuPDF are for reading PDF files
# python-docx is for reading Word documents
# spacy and nltk are for natural language processing (like extracting names, skills)
# phonenumbers and email-validator are for validating contact info
!pip install pdfplumber PyMuPDF python-docx spacy nltk phonenumbers email-validator

# ✅ Downloading the English small model for spaCy
# This is used for entity recognition like names, organizations, etc.
!python -m spacy download en_core_web_sm

print("✅ Installation complete!")


Collecting pdfplumber
  Downloading pdfplumber-0.11.7-py3-none-any.whl.metadata (42 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting PyMuPDF
  Downloading pymupdf-1.26.4-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Collecting python-docx
  Downloading python_docx-1.2.0-py3-none-any.whl.metadata (2.0 kB)
Collecting phonenumbers
  Downloading phonenumbers-9.0.13-py2.py3-none-any.whl.metadata (11 kB)
Collecting email-validator
  Downloading email_validator-2.3.0-py3-none-any.whl.metadata (26 kB)
Collecting pdfminer.six==20250506 (from pdfplumber)
  Downloading pdfminer_six-20250506-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90

In [2]:
# ✅ Importing all libraries used in the project

import os
import re
import json
from typing import Dict, List
import warnings

# Ignore warnings to keep output clean
warnings.filterwarnings('ignore')

# PDF and Word document processing
import pdfplumber  # Extract text from PDFs
import fitz  # PyMuPDF for PDF extraction
import docx  # For extracting text from DOCX files

# NLP libraries
import spacy  # For extracting entities like names
import nltk  # Tokenizing text, removing stopwords

# Validation for emails
from email_validator import validate_email, EmailNotValidError

# Download basic NLTK data
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)

# Load spaCy English model for entity recognition
nlp = spacy.load('en_core_web_sm')

print("✅ Libraries loaded successfully!")


✅ Libraries loaded successfully!


In [3]:
# ✅ Here I have created a class called ResumeParser
# This class contains all the functions needed to parse a resume
# I am handling PDF, DOCX, extracting name, email, phone, skills, education, experience

class ResumeParser:
    def __init__(self):
        # Load spaCy NLP model
        self.nlp = nlp

        # List of common technical skills to search in resumes
        self.tech_skills = [
            'python', 'java', 'javascript', 'c++', 'c#', 'php', 'ruby', 'go', 'swift',
            'html', 'css', 'react', 'angular', 'vue', 'node.js', 'django', 'flask',
            'sql', 'mysql', 'postgresql', 'mongodb', 'redis', 'aws', 'azure', 'docker',
            'kubernetes', 'git', 'machine learning', 'tensorflow', 'pytorch', 'pandas'
        ]

        # List of common soft skills
        self.soft_skills = [
            'leadership', 'communication', 'teamwork', 'problem solving', 'project management',
            'analytical', 'creative', 'adaptable', 'detail-oriented', 'organized'
        ]

    # ================= PDF and DOCX extraction =================
    def extract_text_from_pdf(self, file_path: str) -> str:
        """Extract text from PDF file using pdfplumber first, then fallback to PyMuPDF"""
        try:
            with pdfplumber.open(file_path) as pdf:
                text = ""
                for page in pdf.pages:
                    page_text = page.extract_text()
                    if page_text:
                        text += page_text + "\n"
                if text.strip():
                    return self.clean_text(text)

            # Fallback using PyMuPDF
            doc = fitz.open(file_path)
            text = ""
            for page_num in range(len(doc)):
                page = doc.load_page(page_num)
                text += page.get_text() + "\n"
            doc.close()
            return self.clean_text(text)

        except Exception as e:
            raise Exception(f"Failed to extract PDF text: {e}")

    def extract_text_from_docx(self, file_path: str) -> str:
        """Extract text from DOCX file"""
        try:
            doc = docx.Document(file_path)
            text = [p.text for p in doc.paragraphs if p.text.strip()]
            return '\n'.join(text)
        except Exception as e:
            raise Exception(f"Failed to extract DOCX text: {e}")

    # ================= Cleaning Text =================
    def clean_text(self, text: str) -> str:
        """Clean text by removing extra spaces and newlines"""
        text = re.sub(r'\n\s*\n', '\n\n', text)  # Remove multiple blank lines
        text = re.sub(r'\s+', ' ', text)  # Replace multiple spaces with single space
        return text.strip()

    # ================= Contact Info =================
    def extract_email(self, text: str) -> str:
        """Find first valid email in text"""
        email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b'
        emails = re.findall(email_pattern, text)
        for email in emails:
            try:
                validate_email(email)  # Validate email format
                return email
            except:
                continue
        return ""

    def extract_phone(self, text: str) -> str:
        """Find first phone number in text"""
        phone_patterns = [
            r'\+?1?[-.\s]?\(?([0-9]{3})\)?[-.\s]?([0-9]{3})[-.\s]?([0-9]{4})',
            r'\b\d{3}[-.]?\d{3}[-.]?\d{4}\b'
        ]
        for pattern in phone_patterns:
            matches = re.findall(pattern, text)
            if matches:
                if isinstance(matches[0], tuple):
                    return ''.join(matches[0])
                else:
                    return matches[0]
        return ""

    # ================= Name Extraction =================
    def extract_name(self, text: str) -> str:
        """Extract candidate name using NLP or fallback pattern matching"""
        doc = self.nlp(text[:500])  # Only check first 500 characters
        for ent in doc.ents:
            if ent.label_ == "PERSON" and len(ent.text.split()) >= 2:
                return ent.text

        # Fallback: check first 5 lines for Name pattern
        lines = text.split('\n')[:5]
        for line in lines:
            if re.match(r'^[A-Z][a-z]+ [A-Z][a-z]+', line.strip()):
                words = line.strip().split()[:2]
                if len(words) == 2:
                    return ' '.join(words)
        return ""

    # ================= Skills Extraction =================
    def extract_skills(self, text: str) -> Dict[str, List[str]]:
        """Find both technical and soft skills mentioned in text"""
        text_lower = text.lower()
        found_tech = [s.title() for s in self.tech_skills if s in text_lower]
        found_soft = [s.title() for s in self.soft_skills if s in text_lower]
        return {'technical': found_tech, 'soft': found_soft}

    # ================= Education =================
    def extract_education(self, text: str) -> Dict:
        """Extract education details like degree, university, field, year"""
        education = {'degree': '', 'university': '', 'field': '', 'year': ''}

        # Degree pattern
        degree_patterns = [r'(bachelor|master|phd|b\.s|m\.s|b\.a|m\.a|b\.tech|m\.tech)', r'(diploma|certificate)']
        for pattern in degree_patterns:
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                education['degree'] = match.group(0)
                break

        # University keywords
        uni_keywords = ['university', 'college', 'institute', 'iit', 'nit']
        for line in text.split('\n'):
            for keyword in uni_keywords:
                if keyword.lower() in line.lower():
                    education['university'] = line.strip()
                    break

        # Graduation year
        years = re.findall(r'\b(19|20)\d{2}\b', text)
        if years:
            education['year'] = max(years)

        return education

    # ================= Work Experience =================
    def extract_experience(self, text: str) -> List[Dict]:
        """Extract work experience from resume text"""
        experiences = []
        lines = text.split('\n')
        in_experience = False
        current_exp = {}

        for line in lines:
            line_lower = line.lower().strip()

            # Check if experience section starts
            if re.search(r'\b(experience|employment|work)\b', line_lower) and len(line) < 30:
                in_experience = True
                continue

            # Stop if experience section ends
            if in_experience and re.search(r'\b(education|skills|projects)\b', line_lower) and len(line) < 30:
                break

            # Record experience
            if in_experience and line.strip():
                if re.search(r'\b(company|corp|inc|ltd|pvt|llc)\b', line_lower):
                    if current_exp:
                        experiences.append(current_exp)
                    current_exp = {'company': line.strip(), 'position': '', 'duration': ''}
                elif current_exp and not current_exp.get('position'):
                    current_exp['position'] = line.strip()

        if current_exp:
            experiences.append(current_exp)

        return experiences

    # ================= Main Parsing Function =================
    def parse_resume(self, file_path: str) -> Dict:
        """Parse resume file and return structured data"""
        try:
            if file_path.lower().endswith('.pdf'):
                text = self.extract_text_from_pdf(file_path)
            elif file_path.lower().endswith(('.docx', '.doc')):
                text = self.extract_text_from_docx(file_path)
            else:
                raise ValueError("Unsupported file format. Use PDF or DOCX.")

            result = {
                'filename': os.path.basename(file_path),
                'raw_text': text,
                'personal_info': {
                    'name': self.extract_name(text),
                    'email': self.extract_email(text),
                    'phone': self.extract_phone(text)
                },
                'skills': self.extract_skills(text),
                'education': self.extract_education(text),
                'experience': self.extract_experience(text),
                'parsing_success': True
            }
            return result

        except Exception as e:
            return {'filename': os.path.basename(file_path) if file_path else 'unknown', 'error': str(e), 'parsing_success': False}

# Initialize parser instance
parser = ResumeParser()
print("✅ Resume Parser initialized and ready!")


✅ Resume Parser initialized and ready!


In [4]:
# ✅ Here I am allowing the user to upload a resume file (PDF or DOCX)
# This can be replaced with an input path if running locally
from google.colab import files  # Only for Google Colab

# Prompt user to upload a file
uploaded_files = files.upload()

# Save uploaded file to local
for filename in uploaded_files.keys():
    with open(filename, 'wb') as f:
        f.write(uploaded_files[filename])
    print(f"✅ File '{filename}' uploaded successfully!")


Saving Prabhakar_Rayal_Resume.pdf to Prabhakar_Rayal_Resume.pdf
✅ File 'Prabhakar_Rayal_Resume.pdf' uploaded successfully!


In [5]:
# ✅ Here I am using the ResumeParser instance to parse the uploaded file
# The parser will extract name, email, phone, skills, education, and experience
for filename in uploaded_files.keys():
    result = parser.parse_resume(filename)

    # Save parsed result as JSON (optional)
    json_filename = filename.split('.')[0] + "_parsed.json"
    with open(json_filename, 'w') as f:
        import json
        json.dump(result, f, indent=4)

    print(f"✅ Parsing complete for '{filename}'. Results saved as '{json_filename}'")


✅ Parsing complete for 'Prabhakar_Rayal_Resume.pdf'. Results saved as 'Prabhakar_Rayal_Resume_parsed.json'


In [6]:
# ✅ Here I am displaying parsed data in a readable format
# This is helpful to quickly check all extracted details

import pprint  # Pretty print dictionary

for filename in uploaded_files.keys():
    print(f"\n📄 Parsed Data for '{filename}':")
    pprint.pprint(result)



📄 Parsed Data for 'Prabhakar_Rayal_Resume.pdf':
{'education': {'degree': 'B.Tech',
               'field': '',
               'university': 'Prabhakar Rayal Email: '
                             'prabhakarrayalarcy@gmail.com • Phone: 90 XXXX XXXX '
                             'LinkedIn: '
                             'https://www.linkedin.com/in/prabhakar-rayal-663968259/ '
                             '• GitHub: https://github.com/Prabhakarrayal '
                             'Professional Summary Proactive Computer Science '
                             'graduate aspiring for roles as a Software '
                             'Engineer, AI/ML Engineer, or Web Developer. '
                             'Skilled in building scalable applications and '
                             'intelligent systems with expertise in Python, '
                             'Flask, JavaScript, and cloud technologies. '
                             'Completed hands-on projects in AI, machine '
         

In [10]:
# ✅ Here I am demonstrating how to search the parsed data for specific info
# Example: Find if the candidate knows 'Python' or has 'Machine Learning' experience

search_skill = 'Python'

# Convert both sides to lowercase for consistent matching
technical_skills = [s.lower() for s in result['skills']['technical']]
if search_skill.lower() in technical_skills:
    print(f"✅ Candidate has technical skill: {search_skill}")
else:
    print(f"❌ Candidate does not have technical skill: {search_skill}")

# 🔎 Search for experience in a specific company
company_search = 'Google'
companies = [exp.get('company','').lower() for exp in result['experience']]
if company_search.lower() in companies:
    print(f"✅ Candidate has worked at {company_search}")
else:
    print(f"❌ Candidate has not worked at {company_search}")


✅ Candidate has technical skill: Python
❌ Candidate has not worked at Google


In [11]:
# ✅ Here I am saving all parsed resume data into a JSON file
# This can be later used for batch processing or building a database

all_results = []  # List to store multiple resume results
all_results.append(result)

# Save results in a single JSON file
with open("all_parsed_resumes.json", "w") as f:
    json.dump(all_results, f, indent=4)

print("✅ All parsed resumes saved in 'all_parsed_resumes.json'")


✅ All parsed resumes saved in 'all_parsed_resumes.json'


In [12]:
# ✅ Here I am showing simple statistics from parsed resumes
# For example, total resumes parsed, common skills found, etc.

print(f"Total resumes processed: {len(all_results)}")

# Count of each technical skill
skill_count = {}
for res in all_results:
    for skill in res['skills']['technical']:
        skill_count[skill] = skill_count.get(skill, 0) + 1

print("📊 Technical Skills Frequency:")
for skill, count in skill_count.items():
    print(f"{skill}: {count}")


Total resumes processed: 1
📊 Technical Skills Frequency:
Python: 1
Java: 1
Javascript: 1
C++: 1
Php: 1
Go: 1
Html: 1
Css: 1
Flask: 1
Sql: 1
Mysql: 1
Aws: 1
Git: 1
Machine Learning: 1
Pandas: 1
