In [1]:
import os
from pathlib import Path
import fitz  # PyMuPDF
import pandas as pd

In [2]:
# Root folder of the dataset: one sub-folder per sector, each containing resumes.
data_path = Path("../data/data/data")

# Sector names are the sub-folder names. iterdir() yields entries in arbitrary
# order, so sort them to keep every downstream loop and printed listing stable
# across runs and machines.
sectors = sorted(folder.name for folder in data_path.iterdir() if folder.is_dir())
print("Sectors found:", sectors)

Sectors found: ['ACCOUNTANT', 'ADVOCATE', 'AGRICULTURE', 'APPAREL', 'ARTS', 'AUTOMOBILE', 'AVIATION', 'BANKING', 'BPO', 'BUSINESS-DEVELOPMENT', 'CHEF', 'CONSTRUCTION', 'CONSULTANT', 'DESIGNER', 'DIGITAL-MEDIA', 'ENGINEERING', 'FINANCE', 'FITNESS', 'HEALTHCARE', 'HR', 'INFORMATION-TECHNOLOGY', 'PUBLIC-RELATIONS', 'SALES', 'TEACHER']


In [3]:
# Tally how many PDF resumes each sector folder holds.
resume_count = {
    sector: len(list((data_path / sector).glob("*.pdf")))
    for sector in sectors
}

print("Resumes per sector:")
for sector, count in resume_count.items():
    print(f"{sector}: {count} resumes")


Resumes per sector:
ACCOUNTANT: 118 resumes
ADVOCATE: 118 resumes
AGRICULTURE: 63 resumes
APPAREL: 97 resumes
ARTS: 104 resumes
AUTOMOBILE: 36 resumes
AVIATION: 117 resumes
BANKING: 115 resumes
BPO: 22 resumes
BUSINESS-DEVELOPMENT: 120 resumes
CHEF: 118 resumes
CONSTRUCTION: 112 resumes
CONSULTANT: 115 resumes
DESIGNER: 107 resumes
DIGITAL-MEDIA: 96 resumes
ENGINEERING: 118 resumes
FINANCE: 118 resumes
FITNESS: 117 resumes
HEALTHCARE: 115 resumes
HR: 110 resumes
INFORMATION-TECHNOLOGY: 120 resumes
PUBLIC-RELATIONS: 111 resumes
SALES: 116 resumes
TEACHER: 102 resumes


In [4]:
import random
from random import sample

# A test to make sure we can access our sample files!
# The RNG is seeded and each listing is sorted so that re-running the cell
# prints the same file names every time.
random.seed(42)
for sector in sectors:
    resume_files = sorted((data_path / sector).glob("*.pdf"))
    print(f"\n{sector} - Sample files:")
    # min() keeps sample() from raising when a folder holds fewer than 2 PDFs.
    for f in sample(resume_files, min(2, len(resume_files))):
        print(" * ", f.name)



ACCOUNTANT - Sample files:
 *  31602598.pdf
 *  25749150.pdf

ADVOCATE - Sample files:
 *  18997135.pdf
 *  13342150.pdf

AGRICULTURE - Sample files:
 *  28247753.pdf
 *  69336473.pdf

APPAREL - Sample files:
 *  25142074.pdf
 *  13764840.pdf

ARTS - Sample files:
 *  13272204.pdf
 *  14150896.pdf

AUTOMOBILE - Sample files:
 *  24592627.pdf
 *  23522150.pdf

AVIATION - Sample files:
 *  24773845.pdf
 *  16850314.pdf

BANKING - Sample files:
 *  11065180.pdf
 *  18645964.pdf

BPO - Sample files:
 *  16492045.pdf
 *  26829350.pdf

BUSINESS-DEVELOPMENT - Sample files:
 *  16519708.pdf
 *  11088337.pdf

CHEF - Sample files:
 *  11432686.pdf
 *  92985983.pdf

CONSTRUCTION - Sample files:
 *  16378091.pdf
 *  12839152.pdf

CONSULTANT - Sample files:
 *  19936735.pdf
 *  22259768.pdf

DESIGNER - Sample files:
 *  37263609.pdf
 *  25949631.pdf

DIGITAL-MEDIA - Sample files:
 *  15484097.pdf
 *  16509761.pdf

ENGINEERING - Sample files:
 *  28630325.pdf
 *  13149176.pdf

FINANCE - Sample file

In [5]:
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of the PDF at `pdf_path`, concatenated in page order."""
    with fitz.open(pdf_path) as document:
        page_texts = [page.get_text() for page in document]
    return "".join(page_texts)

In [6]:
# Example: take one PDF from the ENGINEERING sector.
# The listing is sorted so that "the first file" is the same file on every run;
# the order in which glob() yields paths depends on the filesystem.
sample_pdf = sorted((data_path / "ENGINEERING").glob("*.pdf"))[0]

# Extract text
sample_text = extract_text_from_pdf(sample_pdf)

# Show part of it
print(sample_text[:1000])  # print the first 1000 characters


ENGINEERING LAB TECHNICIAN
Career Focus
My main objective in seeking employment with Triumph Actuation Systems Inc. is to work in a professional atmosphere where I can utilize my
skills and continue to gain experience in the aerospace industry to advance in my career.
Professional Experience
Engineering Lab Technician Oct 2016 to Current 
Company Name ï¼​ City , State
Responsible for testing various seat structures to meet specific certification requirements. Â 
Maintain and calibrate test instruments to ensure testing capabilities are maintained.
Ensure data is captured and recorded correctly for certification test reports.
Duties also dynamic test set-up and static suite testing. 
Engineering Lab Technician, Sr. Specialist Apr 2012 to Oct 2016 
Company Name ï¼​ City , State
Utilized skills learned from LabView Course 1 training to construct and maintain LabView VI programs.
Responsible for fabricating and maintaining hydraulic/electrical test equipment to complete development and qua

In [7]:
# One record per resume PDF: the sector it was filed under, its file name,
# and the raw text extracted from it.
resumes_data = [
    {
        "sector": sector,
        "filename": pdf_file.name,
        "text": extract_text_from_pdf(pdf_file),
    }
    for sector in sectors
    for pdf_file in (data_path / sector).glob("*.pdf")
]

print(f" Parsed {len(resumes_data)} resumes in total.")


 Parsed 2485 resumes in total.


In [8]:
import os
import re
import fitz  # PyMuPDF
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
import spacy
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

class ResumeRanker:
    """Parse resumes and rank them against a free-text job description.

    Each resume gets four sub-scores (TF-IDF text similarity, skill-keyword
    overlap, years of experience relative to the posting, highest degree
    keyword found) which are blended into one `combined_score` using
    `SCORE_WEIGHTS`.
    """

    # Relative weight of each sub-score in the combined score.
    SCORE_WEIGHTS = {
        'text_similarity': 0.4,
        'skill': 0.3,
        'experience': 0.2,
        'education': 0.1,
    }

    # Degree keyword -> ordinal level (4 = doctorate, 3 = master's, 2 = bachelor's).
    DEGREE_SCORES = {
        'phd': 4, 'ph.d': 4, 'doctorate': 4,
        'masters': 3, 'master': 3, 'mba': 3, 'ms': 3, 'ma': 3,
        'bachelor': 2, 'bachelors': 2, 'bs': 2, 'ba': 2,
    }

    def __init__(self):
        """Initialize the Resume Ranking System"""
        try:
            # Load spaCy model (download with: python -m spacy download en_core_web_sm)
            # NOTE: no method of this class currently uses self.nlp.
            self.nlp = spacy.load('en_core_web_sm')
        except OSError:
            print("Warning: spaCy model not found. Install with: python -m spacy download en_core_web_sm")
            self.nlp = None

        self.vectorizer = TfidfVectorizer(
            max_features=5000,
            stop_words='english',
            ngram_range=(1, 2),
            lowercase=True
        )

        # Common skill keywords for different categories
        self.skill_categories = {
            'programming': ['python', 'java', 'javascript', 'c++', 'c#', 'php', 'ruby', 'go', 'rust', 'scala', 'kotlin'],
            'web_development': ['html', 'css', 'react', 'angular', 'vue', 'node', 'express', 'django', 'flask'],
            'data_science': ['pandas', 'numpy', 'matplotlib', 'seaborn', 'scikit-learn', 'tensorflow', 'pytorch', 'r'],
            'databases': ['sql', 'mysql', 'postgresql', 'mongodb', 'redis', 'elasticsearch', 'oracle'],
            'cloud': ['aws', 'azure', 'gcp', 'docker', 'kubernetes', 'terraform', 'jenkins'],
            'soft_skills': ['leadership', 'communication', 'teamwork', 'problem-solving', 'analytical', 'creative']
        }

    @staticmethod
    def _contains_term(term, text_lower):
        """Return True if `term` occurs in `text_lower` as a whole token.

        A plain substring test makes short keywords such as 'r', 'go' or 'ms'
        match inside unrelated words ('organizer', 'items'). The lookarounds
        require that the term is not glued to a word character on either
        side; they are used instead of ``\\b`` because ``\\b`` does not behave
        as a boundary next to the punctuation in 'c++' or 'c#'.
        """
        pattern = r'(?<!\w)' + re.escape(term) + r'(?!\w)'
        return re.search(pattern, text_lower) is not None

    def extract_text_from_pdf(self, pdf_path):
        """Extract text from PDF file.

        Returns the stripped text of all pages, or "" if anything goes wrong
        while opening or reading the file (the error is printed, not raised).
        """
        try:
            with fitz.open(pdf_path) as doc:
                text = "".join(page.get_text() for page in doc)
            return text.strip()
        except Exception as e:
            print(f"Error extracting text from {pdf_path}: {e}")
            return ""

    def clean_text(self, text):
        """Clean and preprocess text.

        Collapses whitespace runs to a single space, drops every character
        that is not a word character, whitespace or one of ``- . , ( )``,
        then lowercases and strips. Falsy input yields "".
        """
        if not text:
            return ""

        # Remove extra whitespace and normalize
        text = re.sub(r'\s+', ' ', text)
        text = re.sub(r'[^\w\s\-\.\,\(\)]', '', text)
        return text.lower().strip()

    def extract_skills(self, text):
        """Extract skills from text based on predefined categories.

        Returns a dict with one entry per key of `self.skill_categories`,
        each mapping to the list of that category's keywords that occur in
        `text` as whole tokens (case-insensitive), in the category's order.
        """
        text_lower = text.lower()
        return {
            category: [skill for skill in skills if self._contains_term(skill, text_lower)]
            for category, skills in self.skill_categories.items()
        }

    def extract_experience_years(self, text):
        """Extract years of experience from text.

        Returns the largest number captured by any of the "N years ..."
        patterns below, or 0 when none of them matches.
        """
        patterns = [
            r'(\d+)\+?\s*years?\s*(?:of\s*)?experience',
            r'(\d+)\+?\s*years?\s*in',
            r'experience\s*(?:of\s*)?(\d+)\+?\s*years?',
            r'(\d+)\+?\s*yrs?\s*(?:of\s*)?experience'
        ]

        text_lower = text.lower()
        years = [
            int(match)
            for pattern in patterns
            for match in re.findall(pattern, text_lower)
        ]

        return max(years) if years else 0

    def extract_education(self, text):
        """Extract education information.

        Returns the highest level in `DEGREE_SCORES` whose keyword occurs in
        `text` as a whole token (case-insensitive), or 1 when no degree
        keyword is found.
        """
        text_lower = text.lower()
        levels = [
            level
            for degree, level in self.DEGREE_SCORES.items()
            if self._contains_term(degree, text_lower)
        ]
        return max(levels) if levels else 1

    def parse_resume(self, resume_text, filename):
        """Parse resume and extract structured information.

        Skills, experience and education are extracted from the raw text
        (before cleaning strips characters such as '+' and '#'); the stored
        'text' and the word count in 'text_length' use the cleaned text.
        """
        cleaned_text = self.clean_text(resume_text)

        resume_data = {
            'filename': filename,
            'text': cleaned_text,
            'skills': self.extract_skills(resume_text),
            'experience_years': self.extract_experience_years(resume_text),
            'education_score': self.extract_education(resume_text),
            'text_length': len(cleaned_text.split())
        }

        return resume_data

    def calculate_text_similarity(self, job_description, resume_texts):
        """Calculate TF-IDF similarity between job description and resumes.

        The vectorizer is re-fitted on the job description plus all resumes
        on every call. Returns a 1-D array with one cosine similarity per
        resume; on any failure the error is printed and zeros are returned.
        """
        if not resume_texts:
            # Nothing to compare against; avoid going through the error path.
            return np.zeros(0)

        all_texts = [job_description] + resume_texts

        try:
            tfidf_matrix = self.vectorizer.fit_transform(all_texts)
            job_vector = tfidf_matrix[0:1]
            resume_vectors = tfidf_matrix[1:]

            similarities = cosine_similarity(job_vector, resume_vectors).flatten()
            return similarities
        except Exception as e:
            print(f"Error calculating text similarity: {e}")
            return np.zeros(len(resume_texts))

    def calculate_skill_match(self, job_skills, resume_skills):
        """Calculate skill matching score.

        For every category in which the job lists at least one skill, take
        the fraction of those skills the resume also has; return the mean of
        these fractions, or 0 if the job lists no skills at all.
        """
        category_scores = []

        for category in self.skill_categories.keys():
            wanted = set(job_skills.get(category, []))
            if not wanted:
                # Categories the job does not ask for do not affect the score.
                continue
            offered = set(resume_skills.get(category, []))
            category_scores.append(len(wanted & offered) / len(wanted))

        return sum(category_scores) / len(category_scores) if category_scores else 0

    def rank_resumes(self, job_description, resumes_data):
        """Rank resumes based on job description.

        `resumes_data` is a list of dicts as produced by `parse_resume`.
        Returns one result dict per resume, sorted by 'combined_score' in
        descending order.
        """
        job_skills = self.extract_skills(job_description)
        job_exp_years = self.extract_experience_years(job_description)
        cleaned_job_desc = self.clean_text(job_description)

        # Calculate text similarities
        resume_texts = [resume['text'] for resume in resumes_data]
        text_similarities = self.calculate_text_similarity(cleaned_job_desc, resume_texts)

        weights = self.SCORE_WEIGHTS
        results = []

        for i, resume in enumerate(resumes_data):
            # Calculate various scores
            skill_score = self.calculate_skill_match(job_skills, resume['skills'])
            text_similarity = text_similarities[i]

            # Experience score: fraction of the requested years, capped at 1.
            # When the posting states no requirement, every resume gets a neutral 0.5.
            if job_exp_years > 0:
                exp_score = min(resume['experience_years'] / job_exp_years, 1.0)
            else:
                exp_score = 0.5

            # Education score (normalized by the top degree level, 4)
            edu_score = min(resume['education_score'] / 4.0, 1.0)

            # Combined score with weights
            combined_score = (
                weights['text_similarity'] * text_similarity +
                weights['skill'] * skill_score +
                weights['experience'] * exp_score +
                weights['education'] * edu_score
            )

            results.append({
                'filename': resume['filename'],
                'combined_score': combined_score,
                'text_similarity': text_similarity,
                'skill_score': skill_score,
                'experience_score': exp_score,
                'education_score': edu_score,
                'experience_years': resume['experience_years'],
                'skills_found': resume['skills']
            })

        # Sort by combined score (descending)
        results.sort(key=lambda x: x['combined_score'], reverse=True)

        return results

    def process_resume_folder(self, data_path, verbose=True):
        """Process all resumes in the data folder.

        Walks every sub-folder of `data_path` (the sub-folder name is stored
        as the resume's 'sector') and parses each ``*.pdf`` inside it.
        Folders and files are visited in sorted order so results are
        reproducible. PDFs that yield no text or raise are reported and
        skipped.

        `verbose` controls only the per-file "Processed" lines; warnings,
        errors and the per-sector headers are always printed.
        """
        resumes_data = []

        for sector_folder in sorted(Path(data_path).iterdir()):
            if not sector_folder.is_dir():
                continue
            print(f"Processing {sector_folder.name}...")

            for pdf_file in sorted(sector_folder.glob("*.pdf")):
                try:
                    text = self.extract_text_from_pdf(pdf_file)
                    if text:
                        resume_data = self.parse_resume(text, pdf_file.name)
                        resume_data['sector'] = sector_folder.name
                        resumes_data.append(resume_data)
                        if verbose:
                            print(f"  Processed: {pdf_file.name}")
                    else:
                        print(f"  Warning: No text extracted from {pdf_file.name}")
                except Exception as e:
                    print(f"  Error processing {pdf_file.name}: {e}")

        return resumes_data

    def display_results(self, ranked_results, top_n=10):
        """Display ranking results in a formatted way.

        Prints the first `top_n` entries of `ranked_results` (as returned by
        `rank_resumes`) with their sub-scores and up to three skills from up
        to three categories.
        """
        print(f"\n{'='*80}")
        print(f"TOP {min(top_n, len(ranked_results))} RANKED RESUMES")
        print(f"{'='*80}")

        for i, result in enumerate(ranked_results[:top_n], 1):
            print(f"\n{i}. {result['filename']}")
            print(f"   Combined Score: {result['combined_score']:.3f}")
            print(f"   Text Similarity: {result['text_similarity']:.3f}")
            print(f"   Skill Match: {result['skill_score']:.3f}")
            print(f"   Experience Score: {result['experience_score']:.3f} ({result['experience_years']} years)")
            print(f"   Education Score: {result['education_score']:.3f}")

            # Display found skills
            skills_summary = [
                f"{category}: {', '.join(skills[:3])}"
                for category, skills in result['skills_found'].items()
                if skills
            ]

            if skills_summary:
                print(f"   Key Skills: {' | '.join(skills_summary[:3])}")

            print("-" * 80)


def main(data_path=None, output_csv='resume_ranking_results.csv', top_n=10):
    """Main function to demonstrate the resume ranking system.

    Parses every resume under `data_path`, ranks them against a built-in
    example job description, prints the `top_n` best and writes the full
    ranking to `output_csv`.

    Args:
        data_path: Folder with one sub-folder of PDFs per sector. Defaults to
            "../data/data/data".
        output_csv: Path of the CSV file the ranking is written to.
        top_n: Number of top-ranked resumes to print.

    Returns:
        None. Returns early, without writing a CSV, when `data_path` does not
        exist or no resume could be parsed.
    """
    # Initialize the ranker
    ranker = ResumeRanker()

    # Path to your data folder (adjust as needed)
    data_path = Path("../data/data/data") if data_path is None else Path(data_path)

    if not data_path.exists():
        print(f"Data path {data_path} does not exist. Please check the path.")
        return

    # Process all resumes
    print("Processing resumes...")
    resumes_data = ranker.process_resume_folder(data_path)
    print(f"\nTotal resumes processed: {len(resumes_data)}")

    if not resumes_data:
        # Ranking an empty list would only produce an empty table and CSV.
        print("No resumes were parsed; nothing to rank.")
        return

    # Example job description
    job_description = """
    We are looking for a Senior Software Engineer with 5+ years of experience in Python and web development.
    The ideal candidate should have:
    - Strong experience with Python, Django, and REST APIs
    - Experience with JavaScript, React, and modern web technologies
    - Knowledge of SQL databases and cloud platforms (AWS preferred)
    - Bachelor's degree in Computer Science or related field
    - Excellent problem-solving and communication skills
    - Experience with agile development methodologies
    """

    print("\nJob Description:")
    print("-" * 50)
    print(job_description)

    # Rank resumes
    print("\nRanking resumes...")
    ranked_results = ranker.rank_resumes(job_description, resumes_data)

    # Display results
    ranker.display_results(ranked_results, top_n=top_n)

    # Save results to CSV
    df = pd.DataFrame(ranked_results)
    df.to_csv(output_csv, index=False)
    print(f"\nResults saved to '{output_csv}'")


# Run the demo when this code is executed as a script.
if __name__ == "__main__":
    main()


Processing resumes...
Processing ACCOUNTANT...
  Processed: 10554236.pdf
  Processed: 10674770.pdf
  Processed: 11163645.pdf
  Processed: 11759079.pdf
  Processed: 12065211.pdf
  Processed: 12202337.pdf
  Processed: 12338274.pdf
  Processed: 12442909.pdf
  Processed: 12780508.pdf
  Processed: 12802330.pdf
  Processed: 13072019.pdf
  Processed: 13130984.pdf
  Processed: 13294301.pdf
  Processed: 13491889.pdf
  Processed: 13701259.pdf
  Processed: 14055988.pdf
  Processed: 14126433.pdf
  Processed: 14224370.pdf
  Processed: 14449423.pdf
  Processed: 14470533.pdf
  Processed: 14491649.pdf
  Processed: 14496667.pdf
  Processed: 15289348.pdf
  Processed: 15363277.pdf
  Processed: 15592167.pdf
  Processed: 15821633.pdf
  Processed: 15906625.pdf
  Processed: 16237710.pdf
  Processed: 17306905.pdf
  Processed: 17407184.pdf
  Processed: 17556527.pdf
  Processed: 18132924.pdf
  Processed: 18365791.pdf
  Processed: 18569929.pdf
  Processed: 18635654.pdf
  Processed: 18669563.pdf
  Processed: 1944