In [1]:
import os
import re
import fitz  # PyMuPDF
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
import spacy
from collections import Counter
import warnings
warnings.filterwarnings('ignore')


In [2]:
# Root folder of the resume dataset: one sub-folder per sector, each holding PDF resumes.
data_path = Path("../data/data/data")

# Path.iterdir() yields entries in arbitrary, filesystem-dependent order, so sort the
# folder names to keep `sectors` (and everything derived from it) stable across runs.
sectors = sorted(folder.name for folder in data_path.iterdir() if folder.is_dir())
print("Sectors found:", sectors)

Sectors found: ['ACCOUNTANT', 'ADVOCATE', 'AGRICULTURE', 'APPAREL', 'ARTS', 'AUTOMOBILE', 'AVIATION', 'BANKING', 'BPO', 'BUSINESS-DEVELOPMENT', 'CHEF', 'CONSTRUCTION', 'CONSULTANT', 'DESIGNER', 'DIGITAL-MEDIA', 'ENGINEERING', 'FINANCE', 'FITNESS', 'HEALTHCARE', 'HR', 'INFORMATION-TECHNOLOGY', 'PUBLIC-RELATIONS', 'SALES', 'TEACHER']


In [3]:
# Count the PDF resumes stored in each sector folder.
resume_count = {
    sector: len(list((data_path / sector).glob("*.pdf")))
    for sector in sectors
}

# Print the tally under a single header, one sector per line.
report_lines = [f"{sector}: {count} resumes" for sector, count in resume_count.items()]
print("Resumes per sector:", *report_lines, sep="\n")


Resumes per sector:
ACCOUNTANT: 118 resumes
ADVOCATE: 118 resumes
AGRICULTURE: 63 resumes
APPAREL: 97 resumes
ARTS: 104 resumes
AUTOMOBILE: 36 resumes
AVIATION: 117 resumes
BANKING: 115 resumes
BPO: 22 resumes
BUSINESS-DEVELOPMENT: 120 resumes
CHEF: 118 resumes
CONSTRUCTION: 112 resumes
CONSULTANT: 115 resumes
DESIGNER: 107 resumes
DIGITAL-MEDIA: 96 resumes
ENGINEERING: 118 resumes
FINANCE: 118 resumes
FITNESS: 117 resumes
HEALTHCARE: 115 resumes
HR: 110 resumes
INFORMATION-TECHNOLOGY: 120 resumes
PUBLIC-RELATIONS: 111 resumes
SALES: 116 resumes
TEACHER: 102 resumes


In [4]:
from random import Random, sample  # `sample` stays imported for any cell that already uses it

# Sanity check: confirm each sector's PDFs are reachable by listing up to two of them.
# glob() order is filesystem-dependent and the module-level sample() is unseeded, so the
# files are sorted and drawn from a fixed-seed generator to make this output reproducible
# without touching the global random state.
SAMPLE_SEED = 42
sample_rng = Random(SAMPLE_SEED)
for sector in sectors:
    resume_files = sorted((data_path / sector).glob("*.pdf"))
    print(f"\n{sector} - Sample files:")
    for f in sample_rng.sample(resume_files, min(2, len(resume_files))):
        print(" * ", f.name)



ACCOUNTANT - Sample files:
 *  25462793.pdf
 *  23734441.pdf

ADVOCATE - Sample files:
 *  91051945.pdf
 *  15313140.pdf

AGRICULTURE - Sample files:
 *  28247753.pdf
 *  24001783.pdf

APPAREL - Sample files:
 *  13386301.pdf
 *  35121930.pdf

ARTS - Sample files:
 *  43622023.pdf
 *  73497035.pdf

AUTOMOBILE - Sample files:
 *  25047127.pdf
 *  14455622.pdf

AVIATION - Sample files:
 *  11137306.pdf
 *  94137171.pdf

BANKING - Sample files:
 *  25624652.pdf
 *  29093426.pdf

BPO - Sample files:
 *  69097572.pdf
 *  13964744.pdf

BUSINESS-DEVELOPMENT - Sample files:
 *  20317319.pdf
 *  24647386.pdf

CHEF - Sample files:
 *  12155206.pdf
 *  25924968.pdf

CONSTRUCTION - Sample files:
 *  17342969.pdf
 *  21782152.pdf

CONSULTANT - Sample files:
 *  39441617.pdf
 *  20176584.pdf

DESIGNER - Sample files:
 *  93301686.pdf
 *  18979238.pdf

DIGITAL-MEDIA - Sample files:
 *  20490741.pdf
 *  11270462.pdf

ENGINEERING - Sample files:
 *  19124258.pdf
 *  12472574.pdf

FINANCE - Sample file

In [5]:
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of the PDF at ``pdf_path``, joined in page order.

    The document is opened with ``fitz.open`` and closed again before returning.
    Errors raised while opening or reading the file are not caught here.
    """
    with fitz.open(pdf_path) as doc:
        page_texts = [page.get_text() for page in doc]
    return "".join(page_texts)

In [6]:
# Example: take one PDF from the ENGINEERING sector folder.
# The files are sorted so the same resume is picked on every machine, and an empty
# folder is reported explicitly instead of surfacing as a bare IndexError.
engineering_pdfs = sorted((data_path / "ENGINEERING").glob("*.pdf"))
if not engineering_pdfs:
    raise FileNotFoundError(f"No PDF resumes found in {data_path / 'ENGINEERING'}")
sample_pdf = engineering_pdfs[0]

# Extract text
sample_text = extract_text_from_pdf(sample_pdf)

# Show part of it
print(sample_text[:1000])  # print the first 1000 characters


ENGINEERING LAB TECHNICIAN
Career Focus
My main objective in seeking employment with Triumph Actuation Systems Inc. is to work in a professional atmosphere where I can utilize my
skills and continue to gain experience in the aerospace industry to advance in my career.
Professional Experience
Engineering Lab Technician Oct 2016 to Current 
Company Name ï¼​ City , State
Responsible for testing various seat structures to meet specific certification requirements. Â 
Maintain and calibrate test instruments to ensure testing capabilities are maintained.
Ensure data is captured and recorded correctly for certification test reports.
Duties also dynamic test set-up and static suite testing. 
Engineering Lab Technician, Sr. Specialist Apr 2012 to Oct 2016 
Company Name ï¼​ City , State
Utilized skills learned from LabView Course 1 training to construct and maintain LabView VI programs.
Responsible for fabricating and maintaining hydraulic/electrical test equipment to complete development and qua

In [7]:
# Read every PDF of every sector into one flat list of records
# (sector name, file name, raw extracted text).
resumes_data = [
    {
        "sector": sector,
        "filename": pdf_file.name,
        "text": extract_text_from_pdf(pdf_file),
    }
    for sector in sectors
    for pdf_file in (data_path / sector).glob("*.pdf")
]

print(f" Parsed {len(resumes_data)} resumes in total.")


 Parsed 2485 resumes in total.


In [16]:
class ResumeRanker:
    """Score and rank resumes against a job description.

    Each resume receives a combined score blended from four signals: TF-IDF
    cosine similarity to the job text, keyword skill overlap, stated years of
    experience and the highest degree keyword found.
    """

    # Weight of each signal in the combined score.
    SCORE_WEIGHTS = {
        'text_similarity': 0.4,
        'skill_score': 0.3,
        'experience_score': 0.2,
        'education_score': 0.1,
    }

    # Degree keyword -> education level (4 doctorate, 3 master's, 2 bachelor's).
    DEGREE_SCORES = {
        'phd': 4, 'ph.d': 4, 'doctorate': 4,
        'masters': 3, 'master': 3, 'mba': 3, 'ms': 3, 'ma': 3,
        'bachelor': 2, 'bachelors': 2, 'bs': 2, 'ba': 2,
    }
    # Level returned when no degree keyword is present.
    DEFAULT_EDUCATION_SCORE = 1
    # Largest level in DEGREE_SCORES; dividing by it normalises the level to [0, 1].
    MAX_EDUCATION_SCORE = 4.0

    # Regexes (applied to lower-cased text) whose single group is a year count.
    EXPERIENCE_PATTERNS = (
        r'(\d+)\+?\s*years?\s*(?:of\s*)?experience',
        r'(\d+)\+?\s*years?\s*in',
        r'experience\s*(?:of\s*)?(\d+)\+?\s*years?',
        r'(\d+)\+?\s*yrs?\s*(?:of\s*)?experience',
    )

    def __init__(self):
        """Initialize the Resume Ranking System.

        Loads the spaCy ``en_core_web_sm`` model (``self.nlp`` is ``None`` when
        it is not installed), builds the TF-IDF vectorizer and defines the
        keyword lists used for skill extraction.
        """
        try:
            # NOTE: no method of this class reads self.nlp; it is kept so the
            # attribute remains available to existing callers.
            self.nlp = spacy.load('en_core_web_sm')
        except OSError:
            print("Warning: spaCy model not found. Install with: python -m spacy download en_core_web_sm")
            self.nlp = None

        self.vectorizer = TfidfVectorizer(
            max_features=5000,
            stop_words='english',
            ngram_range=(1, 2),
            lowercase=True
        )

        self.skill_categories = {
            'programming': ['python', 'java', 'javascript', 'c++', 'c#', 'php', 'ruby', 'go', 'rust', 'scala', 'kotlin'],
            'web_development': ['html', 'css', 'react', 'angular', 'vue', 'node', 'express', 'django', 'flask'],
            'data_science': ['pandas', 'numpy', 'matplotlib', 'seaborn', 'scikit-learn', 'tensorflow', 'pytorch', 'r'],
            'databases': ['sql', 'mysql', 'postgresql', 'mongodb', 'redis', 'elasticsearch', 'oracle'],
            'cloud': ['aws', 'azure', 'gcp', 'docker', 'kubernetes', 'terraform', 'jenkins'],
            'soft_skills': ['leadership', 'communication', 'teamwork', 'problem-solving', 'analytical', 'creative']
        }

    @staticmethod
    def _contains_term(term, text):
        """Return True if ``term`` occurs in ``text`` as a whole token.

        A plain ``term in text`` test also fires inside longer words ('r' in
        'programmer', 'go' in 'google', 'ms' in 'systems'), so the term must
        not be directly preceded or followed by a word character. Both
        arguments are expected to be lower-cased already.
        """
        return re.search(r'(?<!\w)' + re.escape(term) + r'(?!\w)', text) is not None

    def extract_text_from_pdf(self, pdf_path):
        """Extract text from PDF file.

        Returns the stripped text of all pages in page order, or "" (after
        printing the error) when the file cannot be opened or read.
        """
        try:
            with fitz.open(pdf_path) as doc:
                return "".join(page.get_text() for page in doc).strip()
        except Exception as e:
            print(f"Error extracting text from {pdf_path}: {e}")
            return ""

    def clean_text(self, text):
        """Clean and preprocess text.

        Collapses whitespace runs to a single space, drops every character
        other than word characters, whitespace and ``- . , ( )``, then
        lower-cases and strips. Falsy input yields "".
        """
        if not text:
            return ""
        text = re.sub(r'\s+', ' ', text)
        text = re.sub(r'[^\w\s\-\.\,\(\)]', '', text)
        return text.lower().strip()

    def extract_skills(self, text):
        """Extract skills from text based on predefined categories.

        Returns ``{category: [skills found]}`` with one entry per category in
        ``self.skill_categories`` (empty list when nothing matched). Skills are
        matched case-insensitively as whole tokens.
        """
        text_lower = (text or "").lower()
        return {
            category: [skill for skill in skills if self._contains_term(skill, text_lower)]
            for category, skills in self.skill_categories.items()
        }

    def extract_experience_years(self, text):
        """Extract years of experience from text.

        Returns the largest year count captured by EXPERIENCE_PATTERNS, or 0
        when no pattern matches.
        """
        text_lower = (text or "").lower()  # lower-case once, not once per pattern
        years = [
            int(match)
            for pattern in self.EXPERIENCE_PATTERNS
            for match in re.findall(pattern, text_lower)
        ]
        return max(years, default=0)

    def extract_education(self, text):
        """Extract education information.

        Returns the highest level in DEGREE_SCORES whose keyword appears in the
        text as a whole token, or DEFAULT_EDUCATION_SCORE when none does.
        Standalone tokens that merely look like degrees (e.g. "MS" in
        "MS Office", "MA" as a state abbreviation) are still counted.
        """
        text_lower = (text or "").lower()
        return max(
            (score for degree, score in self.DEGREE_SCORES.items()
             if self._contains_term(degree, text_lower)),
            default=self.DEFAULT_EDUCATION_SCORE,
        )

    def parse_resume(self, resume_text, filename):
        """Parse resume and extract structured information.

        Skills, experience and education are extracted from the raw text (so
        tokens such as 'c++' survive); ``text`` holds the cleaned version and
        ``text_length`` its word count.
        """
        cleaned_text = self.clean_text(resume_text)
        return {
            'filename': filename,
            'text': cleaned_text,
            'skills': self.extract_skills(resume_text),
            'experience_years': self.extract_experience_years(resume_text),
            'education_score': self.extract_education(resume_text),
            'text_length': len(cleaned_text.split())
        }

    def calculate_text_similarity(self, job_description, resume_texts):
        """Calculate TF-IDF similarity between job description and resumes.

        The vectorizer is re-fitted on the job description plus all resumes on
        every call. Returns one cosine similarity per resume; on any
        vectorisation error the error is printed and zeros are returned.
        """
        if not resume_texts:
            # Nothing to compare against; avoid fitting on the job text alone.
            return np.zeros(0)
        all_texts = [job_description] + resume_texts
        try:
            tfidf_matrix = self.vectorizer.fit_transform(all_texts)
            job_vector = tfidf_matrix[0:1]
            resume_vectors = tfidf_matrix[1:]
            return cosine_similarity(job_vector, resume_vectors).flatten()
        except Exception as e:
            print(f"Error calculating text similarity: {e}")
            return np.zeros(len(resume_texts))

    def calculate_skill_match(self, job_skills, resume_skills):
        """Calculate skill matching score.

        For every category in which the job lists at least one skill, take the
        fraction of those skills the resume also has; return the mean of these
        fractions, or 0 when the job lists no skills at all.
        """
        coverage = []
        for category in self.skill_categories:
            job_set = set(job_skills.get(category, []))
            if not job_set:
                continue
            resume_set = set(resume_skills.get(category, []))
            coverage.append(len(job_set & resume_set) / len(job_set))
        return sum(coverage) / len(coverage) if coverage else 0

    def rank_resumes(self, job_description, resumes_data):
        """Rank resumes based on job description.

        ``resumes_data`` must contain records shaped like ``parse_resume``
        output (keys 'filename', 'text', 'skills', 'experience_years',
        'education_score'). Returns one result dict per resume, sorted by
        ``combined_score`` in descending order.
        """
        job_skills = self.extract_skills(job_description)
        job_exp_years = self.extract_experience_years(job_description)
        cleaned_job_desc = self.clean_text(job_description)
        resume_texts = [resume['text'] for resume in resumes_data]
        text_similarities = self.calculate_text_similarity(cleaned_job_desc, resume_texts)
        weights = self.SCORE_WEIGHTS

        results = []
        for i, resume in enumerate(resumes_data):
            skill_score = self.calculate_skill_match(job_skills, resume['skills'])
            text_similarity = text_similarities[i]
            # max(..., 1) guards the division when the job states no requirement;
            # the ratio is capped so extra years do not score above 1.0.
            exp_score = min(resume['experience_years'] / max(job_exp_years, 1), 1.0)
            edu_score = min(resume['education_score'] / self.MAX_EDUCATION_SCORE, 1.0)
            combined_score = (
                weights['text_similarity'] * text_similarity +
                weights['skill_score'] * skill_score +
                weights['experience_score'] * exp_score +
                weights['education_score'] * edu_score
            )
            results.append({
                'filename': resume['filename'],
                'combined_score': combined_score,
                'text_similarity': text_similarity,
                'skill_score': skill_score,
                'experience_score': exp_score,
                'education_score': edu_score,
                'experience_years': resume['experience_years'],
                'skills_found': resume['skills']
            })

        return sorted(results, key=lambda x: x['combined_score'], reverse=True)

    def process_resume_folder(self, data_path):
        """Process all resumes in the data folder.

        Expects one sub-folder per sector containing ``*.pdf`` files. Folders
        and files are visited in sorted order so the result (and the order of
        tied scores later on) is reproducible. Files that yield no text or
        raise while being parsed are reported and skipped.
        """
        resumes_data = []
        for sector_folder in sorted(Path(data_path).iterdir()):
            if not sector_folder.is_dir():
                continue
            print(f"Processing {sector_folder.name}...")
            for pdf_file in sorted(sector_folder.glob("*.pdf")):
                try:
                    text = self.extract_text_from_pdf(pdf_file)
                    if text:
                        resume_data = self.parse_resume(text, pdf_file.name)
                        resume_data['sector'] = sector_folder.name
                        resumes_data.append(resume_data)
                    else:
                        print(f"  Warning: No text extracted from {pdf_file.name}")
                except Exception as e:
                    print(f"  Error processing {pdf_file.name}: {e}")
        return resumes_data

    def display_results(self, ranked_results, top_n=10):
        """Display ranking results in a formatted way.

        Prints the first ``top_n`` entries of ``ranked_results`` with their
        individual scores and up to three skills from up to three categories.
        """
        print(f"\n{'='*80}")
        print(f"TOP {min(top_n, len(ranked_results))} RANKED RESUMES")
        print(f"{'='*80}")
        for i, result in enumerate(ranked_results[:top_n], 1):
            print(f"\n{i}. {result['filename']}")
            print(f"   Combined Score: {result['combined_score']:.3f}")
            print(f"   Text Similarity: {result['text_similarity']:.3f}")
            print(f"   Skill Match: {result['skill_score']:.3f}")
            print(f"   Experience Score: {result['experience_score']:.3f} ({result['experience_years']} years)")
            print(f"   Education Score: {result['education_score']:.3f}")
            skills_summary = [f"{cat}: {', '.join(skills[:3])}" for cat, skills in result['skills_found'].items() if skills]
            if skills_summary:
                print(f"   Key Skills: {' | '.join(skills_summary[:3])}")
            print("-" * 80)


In [20]:
def main(job_description=None, data_path="../data/data/data", top_n=10,
         output_csv='resume_ranking_results.csv'):
    """Main function to demonstrate the resume ranking system.

    Parses every resume under ``data_path``, ranks them against the job
    description, prints the top results and writes the full ranking to CSV.
    Called without arguments it behaves exactly like the original demo.

    Args:
        job_description: Text of the job posting. ``None`` selects the built-in
            Senior M&E Specialist posting below.
        data_path: Folder holding one sub-folder of PDF resumes per sector.
        top_n: Number of ranked resumes to print.
        output_csv: Path of the CSV file the full ranking is written to.

    Returns:
        None. Returns early, after printing a message, when ``data_path`` does
        not exist.
    """
    ranker = ResumeRanker()
    data_path = Path(data_path)
    if not data_path.exists():
        print(f"Data path {data_path} does not exist. Please check the path.")
        return
    print("Processing resumes...")
    resumes_data = ranker.process_resume_folder(data_path)
    print(f"\nTotal resumes processed: {len(resumes_data)}")
    if job_description is None:
        job_description = """Ministry of Finance

Rural Financial Inclusion Project in Palestine (RUFIPP)

Terms of Reference

for Hiring a Full-Time Senior Monitoring and Evaluation (M&E) Specialist

Job Announcement: Senior Monitoring and Evaluation (M&E) Specialist
Project: Rural Financial Inclusion Project in Palestine (RUFIPP)
Lead Executing Agency (EA): Ministry of Finance (MOF)
Implementing Agencies (IA): Ministry of Finance (MOF) and Ministry of Agriculture (MOA)
Location: Ramallah, with possible travel in West Bank areas.
Contract Duration: One-year contract with possibility of extension based on satisfactory performance. A three-month probationary period applies.

Background:

The Palestinian Authority has received financing from the International Fund for Agricultural Development (IFAD), the European Union (EU), and the Spanish Agency for International Development Cooperation (AECID) towards the cost of the Rural Financial Inclusion Project in Palestine (RUFIPP). 

The Ministry of Finance is seeking a highly qualified Senior Monitoring and Evaluation (M&E) Specialist to join the RUFIPP Project Management Unit (PMU) in Ramallah.
The project aims to enhance resilient economic growth and incomes of smallholder farmers and rural enterprises by increasing access to affordable financial products, raising financial literacy, and mainstreaming social inclusion. Specifically, RUFIPP focuses on reaching marginalized and vulnerable groups, including women and youth. 

RUFIPP will be implemented over a four-year period starting in the second semester of 2025. Project activities are organized into three major components: (i) Inclusive rural finance literacy and awareness raising; (ii) Providing inclusive and green rural finance products and services; (iii) Capacity development and strengthening of rural finance institutions.

Job Purpose:

The Senior M&E Specialist will report directly to the Project Director. S/he will ensure that the project has a robust, effective, and manageable M & E system in place. The Specialist will also ensure that project component leads, key implementing partners, ministries, and agencies are capable of, and timely respond to planning, monitoring, and evaluation needs with respect to tracking and reporting against targets and agreed results-based indicators as per the project log frame (LF). The Senior M&E Specialist will also ensure quality assurance, integrity, and accountability across the M&E function and information. S/he will ensure the M & E system supports the Knowledge Management, Learning and Communication system of the project.

Key Duties and Responsibilities:

Study the Theory of Change, Logical Framework, and project components to conceptualize the M&E system.
Develop and implement a cost-effective monitoring and evaluation strategy to track all project inputs, outputs, outcomes, and impacts.
Develop data collection systems for beneficiaries and financing institutions.
Create participatory M&E tools and processes to enhance project effectiveness and accountability.
Conduct data analysis and prepare regular progress and results reports (quarterly, annual, and thematic).
Train Project Implementation Teams in M&E system use and tools.
Establish a reporting framework for all project beneficiaries, financial institutions, and enterprises.
Support studies, annual outcome surveys, mid-term review (MTR), and end-of-project evaluations, including preparation of terms of reference.
Ensure quality control of data collection, analysis, and reporting inline with IFAD, AECID and EU reporting guidelines.
Maintain internal communication on M&E findings and impacts.
Integrate feedback from beneficiaries and partners into project implementation.
Support the development and management of the project’s Management Information System (MIS). Ensure alignment with IFAD, AECID and EU reporting guidelines. 
Facilitate the design and implementation of studies, annual outcome surveys, mid-term review and end of Project and impact evaluations as required including the preparation of terms of references ensuring adherence to the IFAD “Core Outcome Indicator” survey guidelines;
Facilitate supervision missions by providing timely M&E and progress information.
Perform any other duties assigned by the Project director.
Job Performance Indicators:

M&E system/Strategy for RUFIPP developed and implemented.
Project Implementation Teams trained on M&E.
Data collection, reporting, and learning activities coordinated effectively.
Timely preparation of quarterly, biannual, annual, and thematic reports.
Ongoing monitoring of project activities against targets and schedules.
Effective design and implementation of studies, MTR, and end-of-project evaluations.

Qualifications and Experience Required:

Postgraduate degree in Monitoring and Evaluation, Demography, Population Studies, Statistics, Economics, Rural Development, or a related field.
At least 8 years of experience in M&E roles in rural development and donor-funded projects.
Strong experience in designing and implementing M&E systems and large-scale survey data management.
Demonstrated use of statistical and data analysis software at least 3 years (e.g., SAS, SPSS, Advanced Excel).
Direct experience in donor-funded projects (experience with IFAD-funded projects or relevant experiences is an advantage).
Experience with research studies, full evaluation cycles, and impact assessments.
Strong report writing, analytical, negotiation, and communication skills.
Excellent command of written and spoken English.
High integrity and ability to work under pressure and tight deadlines.
Experience in using IT solutions for M&E is an added advantage.

"""
    print("\nJob Description:")
    print("-" * 50)
    print(job_description)
    print("\nRanking resumes...")
    ranked_results = ranker.rank_resumes(job_description, resumes_data)
    ranker.display_results(ranked_results, top_n=top_n)
    pd.DataFrame(ranked_results).to_csv(output_csv, index=False)
    print(f"\nResults saved to '{output_csv}'")


In [24]:
import os
import re
import fitz  # PyMuPDF
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
import spacy
from collections import Counter
import warnings
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.chunk import ne_chunk
from nltk.tag import pos_tag
warnings.filterwarnings('ignore')

# Download required NLTK data (run once).
# Each resource is looked up on its own so that only the ones actually missing are
# fetched; a single shared try/except would re-request all five packages as soon as
# any one lookup failed.
NLTK_RESOURCES = {
    # download package name -> path checked with nltk.data.find
    'punkt': 'tokenizers/punkt',
    'stopwords': 'corpora/stopwords',
    'averaged_perceptron_tagger': 'taggers/averaged_perceptron_tagger',
    'maxent_ne_chunker': 'chunkers/maxent_ne_chunker',
    'words': 'corpora/words',
}

missing_nltk_packages = []
for package, resource_path in NLTK_RESOURCES.items():
    try:
        nltk.data.find(resource_path)
    except LookupError:
        missing_nltk_packages.append(package)

if missing_nltk_packages:
    print("Downloading required NLTK data...")
    for package in missing_nltk_packages:
        nltk.download(package)

class UniversalResumeRanker:
    def __init__(self) -> None:
        """Initialize the Universal Resume Ranking System.

        Sets up four attributes:

        * ``nlp`` - the spaCy ``en_core_web_sm`` pipeline, or ``None`` (after
          printing a warning) when the model is not installed.
        * ``vectorizer`` - a TF-IDF vectorizer over 1- to 3-grams.
        * ``universal_skills`` - skill category name -> list of lower-case
          keywords/phrases.
        * ``certifications`` - certification category name -> list of
          lower-case certification keywords/phrases.
        """
        try:
            # Load spaCy model with more components
            self.nlp = spacy.load('en_core_web_sm')
        except OSError:
            print("Warning: spaCy model not found. Install with: python -m spacy download en_core_web_sm")
            # Methods that need the pipeline check for None and return early.
            self.nlp = None
        
        self.vectorizer = TfidfVectorizer(
            max_features=8000,
            stop_words='english',
            ngram_range=(1, 3),  # Include trigrams for better skill matching
            lowercase=True,
            min_df=2  # Ignore terms that appear in less than 2 documents
        )
        
        # Universal skill categories covering all major sectors.
        # All entries are lower-case because they are compared against
        # lower-cased text. A few phrases are listed under more than one
        # category ('compliance', 'risk management').
        self.universal_skills = {
            # Technical Skills
            'programming': ['python', 'java', 'javascript', 'c++', 'c#', 'php', 'ruby', 'go', 'rust', 'scala', 'kotlin', 'swift'],
            'web_development': ['html', 'css', 'react', 'angular', 'vue', 'node', 'express', 'django', 'flask', 'bootstrap'],
            'data_analysis': ['excel', 'sql', 'tableau', 'power bi', 'sas', 'spss', 'r', 'pandas', 'numpy', 'statistics'],
            'databases': ['mysql', 'postgresql', 'mongodb', 'oracle', 'sqlite', 'redis', 'elasticsearch'],
            'cloud_tech': ['aws', 'azure', 'gcp', 'docker', 'kubernetes', 'terraform', 'jenkins'],
            
            # Financial & Accounting
            'accounting': ['gaap', 'ifrs', 'quickbooks', 'sap', 'accounts payable', 'accounts receivable', 'financial reporting', 'tax preparation', 'audit', 'reconciliation'],
            'finance': ['financial modeling', 'valuation', 'investment', 'portfolio management', 'risk management', 'derivatives', 'forex', 'bloomberg', 'capital markets'],
            'banking': ['credit analysis', 'loan processing', 'compliance', 'kyc', 'aml', 'banking regulations', 'retail banking', 'investment banking'],
            
            # Healthcare
            'healthcare': ['patient care', 'medical records', 'hipaa', 'clinical research', 'pharmacy', 'nursing', 'diagnosis', 'treatment planning'],
            'medical_tech': ['emr', 'ehr', 'medical imaging', 'laboratory', 'radiology', 'pathology', 'telemedicine'],
            
            # Legal
            'legal': ['litigation', 'contract law', 'corporate law', 'intellectual property', 'compliance', 'legal research', 'case management', 'court proceedings'],
            
            # Sales & Marketing
            'sales': ['lead generation', 'crm', 'salesforce', 'cold calling', 'negotiation', 'account management', 'b2b sales', 'b2c sales', 'sales forecasting'],
            'marketing': ['digital marketing', 'seo', 'sem', 'social media', 'content marketing', 'email marketing', 'brand management', 'market research'],
            'advertising': ['google ads', 'facebook ads', 'campaign management', 'media planning', 'creative development', 'brand strategy'],
            
            # Human Resources
            'hr': ['recruitment', 'talent acquisition', 'employee relations', 'performance management', 'compensation', 'training', 'hris', 'payroll'],
            
            # Education
            'education': ['curriculum development', 'lesson planning', 'classroom management', 'student assessment', 'educational technology', 'special education'],
            
            # Engineering & Construction
            'mechanical': ['autocad', 'solidworks', 'catia', 'manufacturing', 'quality control', 'lean manufacturing', 'six sigma'],
            'civil': ['structural design', 'project management', 'construction', 'surveying', 'environmental engineering', 'building codes'],
            'electrical': ['circuit design', 'power systems', 'plc programming', 'matlab', 'electrical safety', 'automation'],
            
            # Design & Creative
            'design': ['photoshop', 'illustrator', 'indesign', 'figma', 'sketch', 'ui/ux', 'graphic design', 'web design', 'brand design'],
            'creative': ['photography', 'video editing', 'animation', 'creative writing', 'content creation', 'storytelling'],
            
            # Operations & Supply Chain
            'operations': ['process improvement', 'supply chain', 'logistics', 'inventory management', 'vendor management', 'quality assurance'],
            'project_management': ['pmp', 'agile', 'scrum', 'waterfall', 'jira', 'project planning', 'risk management', 'stakeholder management'],
            
            # Agriculture & Food
            'agriculture': ['crop management', 'soil science', 'irrigation', 'pest control', 'organic farming', 'agricultural technology'],
            'food_service': ['food safety', 'menu planning', 'kitchen management', 'culinary arts', 'nutrition', 'food cost control'],
            
            # Automotive & Aviation
            'automotive': ['automotive repair', 'diagnostics', 'engine repair', 'transmission', 'brake systems', 'electrical systems'],
            'aviation': ['flight operations', 'aircraft maintenance', 'aviation safety', 'flight planning', 'air traffic control'],
            
            # Soft Skills (Universal)
            'communication': ['public speaking', 'presentation', 'writing', 'interpersonal', 'customer service', 'conflict resolution'],
            'leadership': ['team leadership', 'mentoring', 'strategic planning', 'decision making', 'change management', 'coaching'],
            'analytical': ['problem solving', 'critical thinking', 'data analysis', 'research', 'analytical thinking', 'attention to detail'],
            'organizational': ['time management', 'multitasking', 'organization', 'planning', 'prioritization', 'workflow management']
        }
        
        # Industry-specific certifications and qualifications.
        # Lower-case, like the skill keywords; 'pmp' also appears in the
        # 'project_management' skill list above.
        self.certifications = {
            'finance': ['cfa', 'cpa', 'frm', 'caia', 'series 7', 'series 63'],
            'it': ['cissp', 'ccna', 'ccnp', 'aws certified', 'azure certified', 'comptia'],
            'project_management': ['pmp', 'prince2', 'capm', 'agile certified'],
            'hr': ['phr', 'sphr', 'shrm-cp', 'shrm-scp'],
            'healthcare': ['rn', 'md', 'pharmd', 'cna', 'medical license'],
            'legal': ['bar admission', 'juris doctor', 'paralegal certification']
        }
    
    def extract_text_from_pdf(self, pdf_path):
        """Read the PDF at ``pdf_path`` and return its text.

        Page texts are concatenated in page order and the result is stripped.
        Any exception raised while opening or reading the file is printed and
        an empty string is returned instead.
        """
        try:
            with fitz.open(pdf_path) as doc:
                return "".join(page.get_text() for page in doc).strip()
        except Exception as e:
            print(f"Error extracting text from {pdf_path}: {e}")
            return ""
    
    def clean_text(self, text):
        """Clean and preprocess text"""
        if not text:
            return ""
        
        # Remove extra whitespace and normalize
        text = re.sub(r'\s+', ' ', text)
        text = re.sub(r'[^\w\s\-\.\,\(\)]', '', text)
        return text.lower().strip()
    
    def extract_dynamic_skills(self, text, reference_skills=None):
        """
        Extract skills dynamically using NLP techniques
        If reference_skills provided (from job description), focus on those areas
        """
        if not text:
            return {}
        
        text_lower = text.lower()
        found_skills = {}
        
        # If we have reference skills from job description, prioritize those categories
        if reference_skills:
            skill_categories_to_check = {k: v for k, v in self.universal_skills.items() 
                                       if k in reference_skills or any(skill in text_lower for skill in v)}
        else:
            skill_categories_to_check = self.universal_skills
        
        # Check each skill category
        for category, skills in skill_categories_to_check.items():
            found_skills[category] = []
            for skill in skills:
                # Use more flexible matching
                if self._flexible_skill_match(skill, text_lower):
                    found_skills[category].append(skill)
        
        # Extract additional skills using NLP
        nlp_skills = self._extract_skills_with_nlp(text)
        
        # Merge NLP-extracted skills
        for skill in nlp_skills:
            # Try to categorize the skill
            categorized = False
            for category, known_skills in self.universal_skills.items():
                if any(known_skill in skill.lower() for known_skill in known_skills):
                    if category not in found_skills:
                        found_skills[category] = []
                    if skill not in found_skills[category]:
                        found_skills[category].append(skill)
                    categorized = True
                    break
            
            # If not categorized, add to general skills
            if not categorized:
                if 'general' not in found_skills:
                    found_skills['general'] = []
                found_skills['general'].append(skill)
        
        return found_skills
    
    def _flexible_skill_match(self, skill, text):
        """More flexible skill matching"""
        # Direct match
        if skill in text:
            return True
        
        # Handle variations (e.g., "c++" vs "c plus plus")
        skill_variations = {
            'c++': ['c plus plus', 'cplusplus'],
            'c#': ['c sharp', 'csharp'],
            'node': ['node.js', 'nodejs'],
            'react': ['react.js', 'reactjs'],
            'vue': ['vue.js', 'vuejs'],
            'ui/ux': ['ui ux', 'user interface', 'user experience']
        }
        
        if skill in skill_variations:
            return any(var in text for var in skill_variations[skill])
        
        return False
    
    def _extract_skills_with_nlp(self, text):
        """Extract potential skills using NLP techniques"""
        if not self.nlp:
            return []
        
        doc = self.nlp(text)
        potential_skills = []
        
        # Extract noun phrases that might be skills
        for chunk in doc.noun_chunks:
            chunk_text = chunk.text.lower().strip()
            # Filter out common non-skill phrases
            if (len(chunk_text.split()) <= 3 and 
                len(chunk_text) > 2 and 
                chunk_text not in ['experience', 'years', 'work', 'company', 'position', 'role']):
                potential_skills.append(chunk_text)
        
        # Extract entities that might be technologies or tools
        for ent in doc.ents:
            if ent.label_ in ['ORG', 'PRODUCT'] and len(ent.text) > 2:
                # Common technology companies/products that are also skills
                tech_indicators = ['microsoft', 'google', 'amazon', 'oracle', 'adobe', 'salesforce']
                if any(indicator in ent.text.lower() for indicator in tech_indicators):
                    potential_skills.append(ent.text.lower())
        
        return list(set(potential_skills))
    
    def extract_certifications(self, text):
        """Extract professional certifications"""
        text_lower = text.lower()
        found_certs = []
        
        for category, certs in self.certifications.items():
            for cert in certs:
                if cert in text_lower:
                    found_certs.append(cert)
        
        return found_certs
    
    def extract_experience_years(self, text):
        """Return the largest number of years of experience stated in text.

        The lowercased text is scanned with a fixed set of phrasings
        ("5+ years of experience", "over 10 years", "3 to 5 years", ...). For a
        range, the lower bound is the value captured.

        Args:
            text: Text to scan.

        Returns:
            The maximum captured year count as an int, or 0 when no phrasing matches.
        """
        experience_patterns = (
            r'(\d+)\+?\s*years?\s*(?:of\s*)?(?:experience|exp)',
            r'(\d+)\+?\s*years?\s*in',
            r'experience\s*(?:of\s*)?(\d+)\+?\s*years?',
            r'(\d+)\+?\s*yrs?\s*(?:of\s*)?(?:experience|exp)',
            r'over\s*(\d+)\s*years',
            r'more than\s*(\d+)\s*years',
            r'(\d+)\s*to\s*\d+\s*years',  # Range like "3 to 5 years"
        )

        lowered = text.lower()
        captured_years = [
            int(value)
            for pattern in experience_patterns
            for value in re.findall(pattern, lowered)
            if value.isdigit()
        ]

        return max(captured_years, default=0)
    
    def extract_education(self, text):
        """Score the highest education level mentioned in text.

        Keywords are matched as whole tokens. Plain substring matching makes
        short abbreviations such as "ma", "ms", "ba" or "med" fire inside
        ordinary words ("management", "systems", "diploma"), which would rate
        almost any text as masters level.

        Args:
            text: Text to scan.

        Returns:
            int score of the highest level found: 5 (PhD), 4 (masters),
            3 (bachelors), 2 (diploma), 1 (high school). Returns 1 when no
            level is mentioned.
        """
        # Ordered from highest to lowest so the first hit is the best level.
        education_levels = (
            (5, ['phd', 'ph.d', 'doctorate', 'doctoral']),
            (4, ['masters', 'master', 'mba', 'ms', 'ma', 'msc', 'meng', 'med']),
            (3, ['bachelors', 'bachelor', 'bs', 'ba', 'bsc', 'beng', 'bcom', 'bba']),
            (2, ['diploma', 'associate', 'certification']),
            (1, ['high school', 'secondary', 'matriculation']),
        )

        text_lower = text.lower()

        for score, keywords in education_levels:
            for keyword in keywords:
                # Lookarounds rather than \b so "ph.d" and "bachelor's" are
                # delimited correctly around punctuation.
                pattern = r'(?<!\w)' + re.escape(keyword) + r'(?!\w)'
                if re.search(pattern, text_lower):
                    return score

        # Nothing recognised: fall back to the lowest score.
        return 1
    
    def parse_resume(self, resume_text, filename, job_skills=None):
        """Parse resume and extract structured information"""
        cleaned_text = self.clean_text(resume_text)
        
        resume_data = {
            'filename': filename,
            'text': cleaned_text,
            'skills': self.extract_dynamic_skills(resume_text, job_skills),
            'certifications': self.extract_certifications(resume_text),
            'experience_years': self.extract_experience_years(resume_text),
            'education_score': self.extract_education(resume_text),
            'text_length': len(cleaned_text.split())
        }
        
        return resume_data
    
    def calculate_advanced_skill_match(self, job_skills, resume_skills, job_text, resume_text):
        """Score how well a resume's skills cover the skills a job asks for.

        For each job category that lists skills, the category score is the
        Jaccard similarity between the job's and the resume's skills in that
        category, plus 0.1 per matching skill. Category scores are averaged
        with each category weighted by the number of skills the job lists in
        it. Categories the resume does not cover contribute a score of 0 but
        still count towards the total weight, so a resume that matches only
        one small category cannot reach a perfect score. A bonus of 0.05 per
        category that both sides populate (capped at 0.2) is then added.

        Args:
            job_skills: Mapping of category to list of skills required by the job.
            resume_skills: Mapping of category to list of skills found in the resume.
            job_text: Unused; kept for interface compatibility.
            resume_text: Unused; kept for interface compatibility.

        Returns:
            float in [0.0, 1.0]; 0.0 when either mapping is empty.
        """
        if not job_skills or not resume_skills:
            return 0.0

        total_score = 0.0
        total_weight = 0
        shared_categories = 0

        for job_category, job_category_skills in job_skills.items():
            if not job_category_skills:
                continue

            # More skills listed by the job = more important category. The
            # weight is counted even when the resume has nothing in this
            # category, so missing categories lower the overall score.
            category_weight = len(job_category_skills)
            total_weight += category_weight

            resume_category_skills = resume_skills.get(job_category, [])
            if not resume_category_skills:
                continue
            shared_categories += 1

            job_set = set(job_category_skills)
            resume_set = set(resume_category_skills)
            matches = len(job_set & resume_set)

            # Jaccard similarity; the union is non-empty because job_set is.
            category_score = matches / len(job_set | resume_set)
            # Flat bonus per matching skill.
            category_score += 0.1 * matches

            total_score += category_score * category_weight

        # Normalize score
        base_score = total_score / total_weight if total_weight > 0 else 0

        # Bonus for covering several of the job's categories; only categories
        # where both sides actually list skills are counted.
        category_diversity_bonus = min(shared_categories * 0.05, 0.2)

        return min(base_score + category_diversity_bonus, 1.0)
    
    def calculate_text_similarity(self, job_description, resume_texts):
        """Calculate TF-IDF similarity between job description and resumes"""
        all_texts = [job_description] + resume_texts
        
        try:
            tfidf_matrix = self.vectorizer.fit_transform(all_texts)
            job_vector = tfidf_matrix[0:1]
            resume_vectors = tfidf_matrix[1:]
            
            similarities = cosine_similarity(job_vector, resume_vectors).flatten()
            return similarities
        except Exception as e:
            print(f"Error calculating text similarity: {e}")
            return np.zeros(len(resume_texts))
    
    def rank_resumes(self, job_description, resumes_data):
        """Rank resumes based on job description with advanced scoring"""
        # Extract job requirements
        job_skills = self.extract_dynamic_skills(job_description)
        job_exp_years = self.extract_experience_years(job_description)
        job_education = self.extract_education(job_description)
        job_certs = self.extract_certifications(job_description)
        cleaned_job_desc = self.clean_text(job_description)
        
        # Process all resumes with job context
        processed_resumes = []
        for resume in resumes_data:
            processed_resume = self.parse_resume(resume.get('text', ''), resume['filename'], job_skills)
            processed_resume['sector'] = resume.get('sector', 'Unknown')
            processed_resumes.append(processed_resume)
        
        # Calculate text similarities
        resume_texts = [resume['text'] for resume in processed_resumes]
        text_similarities = self.calculate_text_similarity(cleaned_job_desc, resume_texts)
        
        results = []
        
        for i, resume in enumerate(processed_resumes):
            # Advanced skill matching
            skill_score = self.calculate_advanced_skill_match(
                job_skills, resume['skills'], job_description, resume['text']
            )
            
            # Text similarity
            text_similarity = text_similarities[i]
            
            # Experience score (with diminishing returns for over-qualification)
            if job_exp_years > 0:
                exp_ratio = resume['experience_years'] / job_exp_years
                if exp_ratio <= 1:
                    exp_score = exp_ratio
                else:
                    # Diminishing returns for over-qualification
                    exp_score = 1.0 - min((exp_ratio - 1) * 0.1, 0.3)
            else:
                exp_score = min(resume['experience_years'] / 5, 1.0)  # Assume 5 years as baseline
            
            # Education score
            edu_score = min(resume['education_score'] / max(job_education, 3), 1.0)
            
            # Certification bonus
            cert_score = 0
            if job_certs and resume['certifications']:
                job_cert_set = set(job_certs)
                resume_cert_set = set(resume['certifications'])
                cert_matches = len(job_cert_set.intersection(resume_cert_set))
                cert_score = min(cert_matches / len(job_cert_set), 1.0) if job_cert_set else 0
            
            # Combined score with adaptive weights
            weights = self._calculate_adaptive_weights(job_skills, job_exp_years, job_certs)
            
            combined_score = (
                weights['text'] * text_similarity +
                weights['skills'] * skill_score +
                weights['experience'] * exp_score +
                weights['education'] * edu_score +
                weights['certifications'] * cert_score
            )
            
            results.append({
                'filename': resume['filename'],
                'sector': resume['sector'],
                'combined_score': combined_score,
                'text_similarity': text_similarity,
                'skill_score': skill_score,
                'experience_score': exp_score,
                'education_score': edu_score,
                'certification_score': cert_score,
                'experience_years': resume['experience_years'],
                'education_level': resume['education_score'],
                'skills_found': resume['skills'],
                'certifications': resume['certifications']
            })
        
        # Sort by combined score (descending)
        results.sort(key=lambda x: x['combined_score'], reverse=True)
        
        return results
    
    def _calculate_adaptive_weights(self, job_skills, job_exp_years, job_certs):
        """Calculate adaptive weights based on job requirements emphasis"""
        base_weights = {
            'text': 0.3,
            'skills': 0.35,
            'experience': 0.2,
            'education': 0.1,
            'certifications': 0.05
        }
        
        # Adjust weights based on job emphasis
        total_skills = sum(len(skills) for skills in job_skills.values())
        
        if total_skills > 15:  # Skill-heavy job
            base_weights['skills'] += 0.1
            base_weights['text'] -= 0.05
            base_weights['experience'] -= 0.05
        
        if job_exp_years > 7:  # Experience-heavy job
            base_weights['experience'] += 0.1
            base_weights['education'] -= 0.05
            base_weights['text'] -= 0.05
        
        if job_certs:  # Certification-required job
            base_weights['certifications'] += 0.1
            base_weights['text'] -= 0.05
            base_weights['skills'] -= 0.05
        
        return base_weights
    
    def process_resume_folder(self, data_path):
        """Process all resumes in the data folder"""
        resumes_data = []
        
        for sector_folder in Path(data_path).iterdir():
            if sector_folder.is_dir():
                print(f"Processing {sector_folder.name}...")
                
                for pdf_file in sector_folder.glob("*.pdf"):
                    try:
                        text = self.extract_text_from_pdf(pdf_file)
                        if text and len(text.strip()) > 50:  # Minimum text threshold
                            resumes_data.append({
                                'filename': pdf_file.name,
                                'sector': sector_folder.name,
                                'text': text
                            })
                            #print(f"  Processed: {pdf_file.name}")
                        else:
                            print(f"  Warning: Insufficient text extracted from {pdf_file.name}")
                    except Exception as e:
                        print(f"  Error processing {pdf_file.name}: {e}")
        
        return resumes_data
    
    def display_results(self, ranked_results, top_n=15):
        """Print a human-readable report of the best-ranked resumes.

        For each of the first ``top_n`` results this prints the overall and
        component scores, up to 15 skills grouped by category (at most four
        categories shown), and up to five certifications.

        Args:
            ranked_results: Result dicts as returned by ``rank_resumes``, best first.
            top_n: Maximum number of resumes to print.
        """
        banner = '=' * 100
        shown_count = min(top_n, len(ranked_results))

        print(f"\n{banner}")
        print(f"TOP {shown_count} RANKED RESUMES")
        print(f"{banner}")

        max_skills_shown = 15  # cap on skills listed per resume, across categories

        for position, entry in enumerate(ranked_results[:top_n], 1):
            print(f"\n{position}. {entry['filename']} (Sector: {entry['sector']})")
            print(f"   Overall Score: {entry['combined_score']:.3f}")
            print(f"   📄 Text Match: {entry['text_similarity']:.3f}")
            print(f"   🛠️  Skill Match: {entry['skill_score']:.3f}")
            print(f"   💼 Experience: {entry['experience_score']:.3f} ({entry['experience_years']} years)")
            print(f"   🎓 Education: {entry['education_score']:.3f}")
            # The certification score is only worth a line when something matched.
            if entry['certification_score'] > 0:
                print(f"   📜 Certifications: {entry['certification_score']:.3f}")

            # Build one "category: skill, skill" summary per non-empty category
            # until the skill cap is reached.
            category_summaries = []
            skills_listed = 0
            for category, skills in entry['skills_found'].items():
                if not skills or skills_listed >= max_skills_shown:
                    continue
                visible = skills[:max_skills_shown - skills_listed]
                category_summaries.append(f"{category}: {', '.join(visible)}")
                skills_listed += len(visible)

            if category_summaries:
                print(f"   🔧 Key Skills: {' | '.join(category_summaries[:4])}")

            if entry['certifications']:
                print(f"   📋 Certifications: {', '.join(entry['certifications'][:5])}")

            print("-" * 100)
    
    def export_detailed_results(self, ranked_results, filename='detailed_resume_ranking.csv'):
        """Write the ranking to a CSV file, one row per resume.

        The rank is the 1-based position in ``ranked_results``. It comes from
        enumeration rather than ``list.index``, which is quadratic and gives
        every result that compares equal to an earlier one that earlier
        result's rank.

        Args:
            ranked_results: Result dicts as returned by ``rank_resumes``, best first.
            filename: Path of the CSV file to write.

        Side effects:
            Writes ``filename`` (with a header row even when there are no
            results) and prints a confirmation message.
        """
        columns = [
            'Rank', 'Filename', 'Sector', 'Overall_Score', 'Text_Similarity',
            'Skill_Score', 'Experience_Score', 'Education_Score',
            'Certification_Score', 'Experience_Years', 'Education_Level',
            'All_Skills', 'Certifications',
        ]
        export_data = []

        for rank, result in enumerate(ranked_results, 1):
            # Flatten the per-category skills into "category:skill" entries.
            all_skills = [
                f"{category}:{skill}"
                for category, skills in result['skills_found'].items()
                for skill in skills
            ]

            export_data.append({
                'Rank': rank,
                'Filename': result['filename'],
                'Sector': result['sector'],
                'Overall_Score': result['combined_score'],
                'Text_Similarity': result['text_similarity'],
                'Skill_Score': result['skill_score'],
                'Experience_Score': result['experience_score'],
                'Education_Score': result['education_score'],
                'Certification_Score': result['certification_score'],
                'Experience_Years': result['experience_years'],
                'Education_Level': result['education_level'],
                'All_Skills': ' | '.join(all_skills),
                'Certifications': ' | '.join(result['certifications']) if result['certifications'] else ''
            })

        # Explicit columns keep the header stable even for an empty ranking.
        df = pd.DataFrame(export_data, columns=columns)
        df.to_csv(filename, index=False)
        print(f"\nDetailed results saved to '{filename}'")


def main():
    """Demonstrate the universal resume ranking system end to end.

    Loads every resume under the data folder, ranks them against one of the
    built-in example job descriptions, prints the top matches, exports the
    full ranking to CSV and prints how the top 20 results are distributed
    across sectors. Returns early with a message when the data folder does
    not exist.
    """
    # Initialize the ranker
    ranker = UniversalResumeRanker()

    # Path to your data folder (adjust as needed)
    data_path = Path("../data/data/data")

    if not data_path.exists():
        print(f"Data path {data_path} does not exist. Please check the path.")
        return

    # Process all resumes
    print("Processing resumes from all sectors...")
    resumes_data = ranker.process_resume_folder(data_path)
    print(f"\nTotal resumes processed: {len(resumes_data)}")

    # Example job descriptions for different sectors
    job_examples = {
        'software_engineer': """
        We are looking for a Senior Software Engineer with 5+ years of experience.
        Required skills: Python, JavaScript, React, SQL, AWS
        Experience with agile development and team leadership preferred.
        Bachelor's degree in Computer Science required.
        """,

        'financial_analyst': """
        Seeking a Financial Analyst with 3+ years experience in financial modeling.
        Required: Excel, SQL, financial reporting, budgeting, forecasting
        CFA or MBA preferred. Strong analytical and communication skills.
        Experience with Bloomberg or similar financial systems a plus.
        """,

        'marketing_manager': """
        Marketing Manager needed with 4+ years digital marketing experience.
        Required: Google Ads, Facebook Ads, SEO, content marketing, analytics
        Experience with CRM systems and marketing automation.
        Bachelor's degree in Marketing or related field.
        """,

        'hr_specialist': """
        HR Specialist position requiring 2+ years recruitment experience.
        Skills needed: talent acquisition, employee relations, HRIS, payroll
        PHR or SHRM certification preferred.
        Strong interpersonal and communication skills essential.
        """
    }

    # List the available examples
    print("\nAvailable job description examples:")
    for i, key in enumerate(job_examples, 1):
        print(f"{i}. {key.replace('_', ' ').title()}")

    # Change this key to rank against another example. The announcement is
    # derived from the key so it always names the description actually used.
    selected_job = 'marketing_manager'
    selected_label = selected_job.replace('_', ' ').title()
    print(f"\nUsing {selected_label} job description as example:")
    job_description = job_examples[selected_job]

    print("\nJob Description:")
    print("-" * 80)
    print(job_description)

    # Rank resumes
    print("\nRanking resumes...")
    ranked_results = ranker.rank_resumes(job_description, resumes_data)

    # Display results
    ranker.display_results(ranked_results, top_n=15)

    # Export detailed results
    ranker.export_detailed_results(ranked_results)

    # Show sector distribution in top results
    print(f"\n{'='*60}")
    print("SECTOR DISTRIBUTION IN TOP 20 RESULTS")
    print(f"{'='*60}")

    # most_common() orders by count, descending; ties keep first-seen order.
    sector_counts = Counter(result['sector'] for result in ranked_results[:20])
    for sector, count in sector_counts.most_common():
        print(f"{sector}: {count} resumes")

# Entry point: run the demo pipeline when this cell (or script) is executed directly.
if __name__ == "__main__":
    main()



Processing resumes from all sectors...
Processing ACCOUNTANT...
Processing ADVOCATE...
Processing AGRICULTURE...
Processing APPAREL...
Processing ARTS...
Processing AUTOMOBILE...
Processing AVIATION...
Processing BANKING...
Processing BPO...
Processing BUSINESS-DEVELOPMENT...
Processing CHEF...
Processing CONSTRUCTION...
Processing CONSULTANT...
Processing DESIGNER...
Processing DIGITAL-MEDIA...
Processing ENGINEERING...
Processing FINANCE...
Processing FITNESS...
Processing HEALTHCARE...
Processing HR...
Processing INFORMATION-TECHNOLOGY...
Processing PUBLIC-RELATIONS...
Processing SALES...
Processing TEACHER...

Total resumes processed: 2484

Available job description examples:
1. Software Engineer
2. Financial Analyst
3. Marketing Manager
4. Hr Specialist

Using Software Engineer job description as example:

Job Description:
--------------------------------------------------------------------------------

        Marketing Manager needed with 4+ years digital marketing experience.
 

In [22]:
# Run the demo pipeline again.
# NOTE(review): the output recorded for this cell starts with "Processing resumes..."
# and shows a different job description than main() as defined earlier in this
# notebook would print, so it was presumably produced by another definition of
# main() — confirm with Restart Kernel -> Run All.
if __name__ == "__main__":
    main()


Processing resumes...
Processing ACCOUNTANT...
Processing ADVOCATE...
Processing AGRICULTURE...
Processing APPAREL...
Processing ARTS...
Processing AUTOMOBILE...
Processing AVIATION...
Processing BANKING...
Processing BPO...
Processing BUSINESS-DEVELOPMENT...
Processing CHEF...
Processing CONSTRUCTION...
Processing CONSULTANT...
Processing DESIGNER...
Processing DIGITAL-MEDIA...
Processing ENGINEERING...
Processing FINANCE...
Processing FITNESS...
Processing HEALTHCARE...
Processing HR...
Processing INFORMATION-TECHNOLOGY...
Processing PUBLIC-RELATIONS...
Processing SALES...
Processing TEACHER...

Total resumes processed: 2484

Job Description:
--------------------------------------------------
Ministry of Finance

Rural Financial Inclusion Project in Palestine (RUFIPP)

Terms of Reference

for Hiring a Full-Time Senior Monitoring and Evaluation (M&E) Specialist

Job Announcement: Senior Monitoring and Evaluation (M&E) Specialist
Project: Rural Financial Inclusion Project in Palestine 