In [1]:
import os
import re
import fitz  # PyMuPDF
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
import spacy
from collections import Counter
import warnings
warnings.filterwarnings('ignore')


In [2]:
# Path to the data folder: one sub-folder per sector, each holding PDF resumes
data_path = Path("../data/data/data")

# List specialities (each is a folder containing resumes).
# Path.iterdir() yields entries in arbitrary order, so sort the names to keep
# the listing (and every later loop over `sectors`) stable across runs/platforms.
sectors = sorted(folder.name for folder in data_path.iterdir() if folder.is_dir())
print("Sectors found:", sectors)

Sectors found: ['ACCOUNTANT', 'ADVOCATE', 'AGRICULTURE', 'APPAREL', 'ARTS', 'AUTOMOBILE', 'AVIATION', 'BANKING', 'BPO', 'BUSINESS-DEVELOPMENT', 'CHEF', 'CONSTRUCTION', 'CONSULTANT', 'DESIGNER', 'DIGITAL-MEDIA', 'ENGINEERING', 'FINANCE', 'FITNESS', 'HEALTHCARE', 'HR', 'INFORMATION-TECHNOLOGY', 'PUBLIC-RELATIONS', 'SALES', 'TEACHER']


In [3]:
# Tally the PDF resumes stored under each sector folder (sector name -> count)
resume_count = {
    sector: len(list((data_path / sector).glob("*.pdf")))
    for sector in sectors
}

# Report the tally, one sector per line, in the same order as `sectors`
print("Resumes per sector:")
for sector, count in resume_count.items():
    print(f"{sector}: {count} resumes")


Resumes per sector:
ACCOUNTANT: 118 resumes
ADVOCATE: 118 resumes
AGRICULTURE: 63 resumes
APPAREL: 97 resumes
ARTS: 104 resumes
AUTOMOBILE: 36 resumes
AVIATION: 117 resumes
BANKING: 115 resumes
BPO: 22 resumes
BUSINESS-DEVELOPMENT: 120 resumes
CHEF: 118 resumes
CONSTRUCTION: 112 resumes
CONSULTANT: 115 resumes
DESIGNER: 107 resumes
DIGITAL-MEDIA: 96 resumes
ENGINEERING: 118 resumes
FINANCE: 118 resumes
FITNESS: 117 resumes
HEALTHCARE: 115 resumes
HR: 110 resumes
INFORMATION-TECHNOLOGY: 120 resumes
PUBLIC-RELATIONS: 111 resumes
SALES: 116 resumes
TEACHER: 102 resumes


In [4]:
from random import Random, sample  # `sample` is kept in scope for any later cell that relies on it

# A test to make sure we can access our sample files!
# A seeded generator plus a sorted file listing make the printed sample
# reproducible on "Restart & Run All" (glob order is otherwise arbitrary).
sample_rng = Random(42)
for sector in sectors:
    resume_files = sorted((data_path / sector).glob("*.pdf"))
    print(f"\n{sector} - Sample files:")
    # Show at most two files; min() guards sectors holding fewer than two PDFs
    for f in sample_rng.sample(resume_files, min(2, len(resume_files))):
        print(" * ", f.name)



ACCOUNTANT - Sample files:
 *  25462793.pdf
 *  23734441.pdf

ADVOCATE - Sample files:
 *  91051945.pdf
 *  15313140.pdf

AGRICULTURE - Sample files:
 *  28247753.pdf
 *  24001783.pdf

APPAREL - Sample files:
 *  13386301.pdf
 *  35121930.pdf

ARTS - Sample files:
 *  43622023.pdf
 *  73497035.pdf

AUTOMOBILE - Sample files:
 *  25047127.pdf
 *  14455622.pdf

AVIATION - Sample files:
 *  11137306.pdf
 *  94137171.pdf

BANKING - Sample files:
 *  25624652.pdf
 *  29093426.pdf

BPO - Sample files:
 *  69097572.pdf
 *  13964744.pdf

BUSINESS-DEVELOPMENT - Sample files:
 *  20317319.pdf
 *  24647386.pdf

CHEF - Sample files:
 *  12155206.pdf
 *  25924968.pdf

CONSTRUCTION - Sample files:
 *  17342969.pdf
 *  21782152.pdf

CONSULTANT - Sample files:
 *  39441617.pdf
 *  20176584.pdf

DESIGNER - Sample files:
 *  93301686.pdf
 *  18979238.pdf

DIGITAL-MEDIA - Sample files:
 *  20490741.pdf
 *  11270462.pdf

ENGINEERING - Sample files:
 *  19124258.pdf
 *  12472574.pdf

FINANCE - Sample file

In [5]:
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of the PDF at `pdf_path`, concatenated in page order.

    The document is opened with PyMuPDF (`fitz`) and closed again when the
    `with` block exits. Errors raised while opening or reading propagate to
    the caller.
    """
    with fitz.open(pdf_path) as document:
        page_texts = [page.get_text() for page in document]
    return "".join(page_texts)

In [6]:
# Example: take the first PDF listed in the ENGINEERING sector folder
engineering_dir = data_path / "ENGINEERING"
sample_pdf = list(engineering_dir.glob("*.pdf"))[0]

# Pull the raw text out of that resume
sample_text = extract_text_from_pdf(sample_pdf)

# Preview only the first 1000 characters to keep the cell output short
preview_length = 1000
print(sample_text[:preview_length])


ENGINEERING LAB TECHNICIAN
Career Focus
My main objective in seeking employment with Triumph Actuation Systems Inc. is to work in a professional atmosphere where I can utilize my
skills and continue to gain experience in the aerospace industry to advance in my career.
Professional Experience
Engineering Lab Technician Oct 2016 to Current 
Company Name ï¼​ City , State
Responsible for testing various seat structures to meet specific certification requirements. Â 
Maintain and calibrate test instruments to ensure testing capabilities are maintained.
Ensure data is captured and recorded correctly for certification test reports.
Duties also dynamic test set-up and static suite testing. 
Engineering Lab Technician, Sr. Specialist Apr 2012 to Oct 2016 
Company Name ï¼​ City , State
Utilized skills learned from LabView Course 1 training to construct and maintain LabView VI programs.
Responsible for fabricating and maintaining hydraulic/electrical test equipment to complete development and qua

In [7]:
# Extract the text of every PDF resume, sector by sector, keeping the sector
# name and file name alongside the extracted text.
resumes_data = [
    {
        "sector": sector,
        "filename": pdf_file.name,
        "text": extract_text_from_pdf(pdf_file),
    }
    for sector in sectors
    for pdf_file in (data_path / sector).glob("*.pdf")
]

print(f" Parsed {len(resumes_data)} resumes in total.")


 Parsed 2485 resumes in total.


In [16]:
class ResumeRanker:
    """Score and rank resumes against a job description.

    Every resume receives four component scores, each capped at 1.0:
    TF-IDF cosine similarity to the job text, keyword skill overlap,
    years of experience relative to the job's requirement, and highest
    degree found. The components are blended with `SCORE_WEIGHTS`.
    """

    # Weight of each component in the combined score (the weights sum to 1.0).
    SCORE_WEIGHTS = {'text': 0.4, 'skills': 0.3, 'experience': 0.2, 'education': 0.1}

    # Degree keyword -> education level (4 = doctorate, 3 = master's, 2 = bachelor's).
    DEGREE_SCORES = {
        'phd': 4, 'ph.d': 4, 'doctorate': 4,
        'masters': 3, 'master': 3, 'mba': 3, 'ms': 3, 'ma': 3,
        'bachelor': 2, 'bachelors': 2, 'bs': 2, 'ba': 2,
    }

    def __init__(self):
        """Initialize the Resume Ranking System"""
        # NOTE(review): no method of this class reads `self.nlp`; it is kept
        # because code outside this class may access the attribute.
        try:
            self.nlp = spacy.load('en_core_web_sm')
        except OSError:
            print("Warning: spaCy model not found. Install with: python -m spacy download en_core_web_sm")
            self.nlp = None

        self.vectorizer = TfidfVectorizer(
            max_features=5000,
            stop_words='english',
            ngram_range=(1, 2),
            lowercase=True
        )

        # Category name -> lower-case keywords searched for in resumes and job text.
        self.skill_categories = {
            'programming': ['python', 'java', 'javascript', 'c++', 'c#', 'php', 'ruby', 'go', 'rust', 'scala', 'kotlin'],
            'web_development': ['html', 'css', 'react', 'angular', 'vue', 'node', 'express', 'django', 'flask'],
            'data_science': ['pandas', 'numpy', 'matplotlib', 'seaborn', 'scikit-learn', 'tensorflow', 'pytorch', 'r'],
            'databases': ['sql', 'mysql', 'postgresql', 'mongodb', 'redis', 'elasticsearch', 'oracle'],
            'cloud': ['aws', 'azure', 'gcp', 'docker', 'kubernetes', 'terraform', 'jenkins'],
            'soft_skills': ['leadership', 'communication', 'teamwork', 'problem-solving', 'analytical', 'creative']
        }

    @staticmethod
    def _contains_term(text_lower, term):
        """Return True if `term` occurs in `text_lower` as a whole token.

        A plain substring test finds 'r' in "for", 'go' in "ongoing", 'java' in
        "javascript" and 'ms' in "programs". Here the term must not be glued to
        a neighbouring word character. The guard is only applied on a side
        where the term itself starts/ends with a word character, so 'c++' and
        'c#' (which end in punctuation) still match.
        """
        prefix = r'(?<!\w)' if re.match(r'\w', term[:1]) else ''
        suffix = r'(?!\w)' if re.match(r'\w', term[-1:]) else ''
        return re.search(prefix + re.escape(term) + suffix, text_lower) is not None

    def extract_text_from_pdf(self, pdf_path):
        """Extract text from PDF file.

        Returns the concatenated, stripped text of all pages. Best effort: any
        error is printed and an empty string is returned instead of raising.
        """
        try:
            with fitz.open(pdf_path) as doc:
                text = "".join(page.get_text() for page in doc)
            return text.strip()
        except Exception as e:
            print(f"Error extracting text from {pdf_path}: {e}")
            return ""

    def clean_text(self, text):
        """Clean and preprocess text.

        Lower-cases the text, keeps only word characters, whitespace and
        ``- . , ( )``, and collapses whitespace runs to a single space.
        Returns "" for empty or None input.
        """
        if not text:
            return ""
        # Drop apostrophes outright so "don't" becomes "dont" rather than "don t".
        text = re.sub(r"['\u2019]", '', text)
        # Replace (rather than delete) other unwanted characters so words joined
        # by them stay separate: "hydraulic/electrical" -> "hydraulic electrical".
        text = re.sub(r'[^\w\s\-\.\,\(\)]', ' ', text)
        # Collapse whitespace last so the replacements above leave no double spaces.
        text = re.sub(r'\s+', ' ', text)
        return text.lower().strip()

    def extract_skills(self, text):
        """Extract skills from text based on predefined categories.

        Returns a dict with one entry per category in `self.skill_categories`,
        each a list (possibly empty) of the category's keywords that occur in
        `text` as whole tokens, case-insensitively.
        """
        text_lower = text.lower() if text else ""
        return {
            category: [skill for skill in skills if self._contains_term(text_lower, skill)]
            for category, skills in self.skill_categories.items()
        }

    def extract_experience_years(self, text):
        """Extract years of experience from text.

        Returns the largest number found in phrases such as "5+ years of
        experience", "3 years in", "experience of 4 years" or "6 yrs
        experience"; 0 when none is found.
        """
        if not text:
            return 0
        # Only standalone 1-2 digit numbers are accepted, so calendar years and
        # figures like "150 years in business" cannot win the max() below.
        patterns = [
            r'\b(\d{1,2})\+?\s*years?\s*(?:of\s*)?experience',
            r'\b(\d{1,2})\+?\s*years?\s*in',
            r'experience\s*(?:of\s*)?\b(\d{1,2})\+?\s*years?',
            r'\b(\d{1,2})\+?\s*yrs?\s*(?:of\s*)?experience'
        ]
        text_lower = text.lower()
        years = [
            int(match)
            for pattern in patterns
            for match in re.findall(pattern, text_lower)
        ]
        return max(years) if years else 0

    def extract_education(self, text):
        """Extract education information.

        Returns the highest `DEGREE_SCORES` level whose keyword appears in
        `text` as a whole token, or 1 when no degree keyword is found.
        """
        text_lower = text.lower() if text else ""
        # Whole-token matching keeps 'ma'/'ms'/'ba'/'bs' from matching inside
        # ordinary words ("managed", "programs", "database").
        # NOTE(review): standalone abbreviations such as "MS" in "MS Office"
        # or "MA" as a state code still count as degrees.
        scores = [
            score
            for degree, score in self.DEGREE_SCORES.items()
            if self._contains_term(text_lower, degree)
        ]
        return max(scores, default=1)

    def parse_resume(self, resume_text, filename):
        """Parse resume and extract structured information.

        Skills, experience and education are extracted from the raw text so
        that tokens such as 'c++' survive; `clean_text` would strip the
        punctuation. 'text' holds the cleaned text and 'text_length' its word
        count.
        """
        cleaned_text = self.clean_text(resume_text)
        return {
            'filename': filename,
            'text': cleaned_text,
            'skills': self.extract_skills(resume_text),
            'experience_years': self.extract_experience_years(resume_text),
            'education_score': self.extract_education(resume_text),
            'text_length': len(cleaned_text.split())
        }

    def calculate_text_similarity(self, job_description, resume_texts):
        """Calculate TF-IDF similarity between job description and resumes.

        The vectorizer is re-fitted on the job description plus all resumes on
        every call. Returns a 1-D array with one cosine similarity per resume,
        or zeros if vectorisation fails.
        """
        if not resume_texts:
            # Nothing to compare against; skip the fit instead of failing inside it.
            return np.zeros(0)
        all_texts = [job_description] + list(resume_texts)
        try:
            tfidf_matrix = self.vectorizer.fit_transform(all_texts)
            job_vector = tfidf_matrix[0:1]
            resume_vectors = tfidf_matrix[1:]
            return cosine_similarity(job_vector, resume_vectors).flatten()
        except Exception as e:
            print(f"Error calculating text similarity: {e}")
            return np.zeros(len(resume_texts))

    def calculate_skill_match(self, job_skills, resume_skills):
        """Calculate skill matching score.

        For every category in which the job lists at least one skill, computes
        the fraction of those skills the resume also has, then averages the
        fractions. Returns 0 when the job lists no skills in any category.
        """
        category_scores = []
        for category in self.skill_categories.keys():
            job_set = set(job_skills.get(category, []))
            if not job_set:
                continue  # the job asks for nothing in this category
            resume_set = set(resume_skills.get(category, []))
            category_scores.append(len(job_set & resume_set) / len(job_set))
        return sum(category_scores) / len(category_scores) if category_scores else 0

    def rank_resumes(self, job_description, resumes_data):
        """Rank resumes based on job description.

        Args:
            job_description: raw job text.
            resumes_data: list of dicts shaped like the output of
                `parse_resume` ('filename', 'text', 'skills',
                'experience_years', 'education_score').

        Returns:
            A list of result dicts sorted by 'combined_score', best first.
        """
        job_skills = self.extract_skills(job_description)
        job_exp_years = self.extract_experience_years(job_description)
        cleaned_job_desc = self.clean_text(job_description)
        resume_texts = [resume['text'] for resume in resumes_data]
        text_similarities = self.calculate_text_similarity(cleaned_job_desc, resume_texts)
        weights = self.SCORE_WEIGHTS

        results = []
        for i, resume in enumerate(resumes_data):
            skill_score = self.calculate_skill_match(job_skills, resume['skills'])
            text_similarity = text_similarities[i]
            # max(..., 1) avoids dividing by zero when the job states no
            # requirement; in that case a single year already scores 1.0.
            exp_score = min(resume['experience_years'] / max(job_exp_years, 1), 1.0)
            # 4 is the highest level in DEGREE_SCORES.
            edu_score = min(resume['education_score'] / 4.0, 1.0)
            combined_score = (
                weights['text'] * text_similarity +
                weights['skills'] * skill_score +
                weights['experience'] * exp_score +
                weights['education'] * edu_score
            )
            results.append({
                'filename': resume['filename'],
                'combined_score': combined_score,
                'text_similarity': text_similarity,
                'skill_score': skill_score,
                'experience_score': exp_score,
                'education_score': edu_score,
                'experience_years': resume['experience_years'],
                'skills_found': resume['skills']
            })

        return sorted(results, key=lambda x: x['combined_score'], reverse=True)

    def process_resume_folder(self, data_path):
        """Process all resumes in the data folder.

        Expects one sub-folder per sector under `data_path`, each containing
        PDF files. Returns a list of `parse_resume` dicts with an added
        'sector' key. PDFs that yield no text or raise are reported and
        skipped.
        """
        resumes_data = []
        # Sorted because iterdir()/glob() yield in arbitrary order; a stable
        # order keeps the log and tie-breaking among equal scores reproducible.
        for sector_folder in sorted(Path(data_path).iterdir()):
            if not sector_folder.is_dir():
                continue
            print(f"Processing {sector_folder.name}...")
            for pdf_file in sorted(sector_folder.glob("*.pdf")):
                try:
                    text = self.extract_text_from_pdf(pdf_file)
                    if text:
                        resume_data = self.parse_resume(text, pdf_file.name)
                        resume_data['sector'] = sector_folder.name
                        resumes_data.append(resume_data)
                    else:
                        print(f"  Warning: No text extracted from {pdf_file.name}")
                except Exception as e:
                    print(f"  Error processing {pdf_file.name}: {e}")
        return resumes_data

    def display_results(self, ranked_results, top_n=10):
        """Display ranking results in a formatted way.

        Prints the first `top_n` entries of `ranked_results` (as returned by
        `rank_resumes`) with their component scores and up to three skills
        from each of at most three skill categories.
        """
        print(f"\n{'='*80}")
        print(f"TOP {min(top_n, len(ranked_results))} RANKED RESUMES")
        print(f"{'='*80}")
        for i, result in enumerate(ranked_results[:top_n], 1):
            print(f"\n{i}. {result['filename']}")
            print(f"   Combined Score: {result['combined_score']:.3f}")
            print(f"   Text Similarity: {result['text_similarity']:.3f}")
            print(f"   Skill Match: {result['skill_score']:.3f}")
            print(f"   Experience Score: {result['experience_score']:.3f} ({result['experience_years']} years)")
            print(f"   Education Score: {result['education_score']:.3f}")
            skills_summary = [f"{cat}: {', '.join(skills[:3])}" for cat, skills in result['skills_found'].items() if skills]
            if skills_summary:
                print(f"   Key Skills: {' | '.join(skills_summary[:3])}")
            print("-" * 80)


In [20]:
def main(data_path="../data/data/data", job_description=None, top_n=10,
         output_csv='resume_ranking_results.csv'):
    """Main function to demonstrate the resume ranking system.

    Parses every PDF resume under `data_path`, ranks the resumes against the
    job description, prints the best `top_n` and writes the full ranking to
    `output_csv`.

    Args:
        data_path: folder holding one sub-folder of PDF resumes per sector.
        job_description: job text to rank against; when None, the built-in
            Senior M&E Specialist announcement below is used.
        top_n: number of top-ranked resumes to print.
        output_csv: path of the CSV file that receives the full ranking.

    Called with no arguments, it uses the original demo's path, job text,
    top-10 display and CSV file name.
    """
    ranker = ResumeRanker()
    data_path = Path(data_path)
    if not data_path.exists():
        print(f"Data path {data_path} does not exist. Please check the path.")
        return
    print("Processing resumes...")
    resumes_data = ranker.process_resume_folder(data_path)
    print(f"\nTotal resumes processed: {len(resumes_data)}")
    if not resumes_data:
        # Without resumes there is nothing to vectorise or rank; stop here
        # rather than writing an empty results file.
        print("No resumes could be processed; nothing to rank.")
        return
    if job_description is None:
        job_description = """Ministry of Finance

Rural Financial Inclusion Project in Palestine (RUFIPP)

Terms of Reference

for Hiring a Full-Time Senior Monitoring and Evaluation (M&E) Specialist

Job Announcement: Senior Monitoring and Evaluation (M&E) Specialist
Project: Rural Financial Inclusion Project in Palestine (RUFIPP)
Lead Executing Agency (EA): Ministry of Finance (MOF)
Implementing Agencies (IA): Ministry of Finance (MOF) and Ministry of Agriculture (MOA)
Location: Ramallah, with possible travel in West Bank areas.
Contract Duration: One-year contract with possibility of extension based on satisfactory performance. A three-month probationary period applies.

Background:

The Palestinian Authority has received financing from the International Fund for Agricultural Development (IFAD), the European Union (EU), and the Spanish Agency for International Development Cooperation (AECID) towards the cost of the Rural Financial Inclusion Project in Palestine (RUFIPP). 

The Ministry of Finance is seeking a highly qualified Senior Monitoring and Evaluation (M&E) Specialist to join the RUFIPP Project Management Unit (PMU) in Ramallah.
The project aims to enhance resilient economic growth and incomes of smallholder farmers and rural enterprises by increasing access to affordable financial products, raising financial literacy, and mainstreaming social inclusion. Specifically, RUFIPP focuses on reaching marginalized and vulnerable groups, including women and youth. 

RUFIPP will be implemented over a four-year period starting in the second semester of 2025. Project activities are organized into three major components: (i) Inclusive rural finance literacy and awareness raising; (ii) Providing inclusive and green rural finance products and services; (iii) Capacity development and strengthening of rural finance institutions.

Job Purpose:

The Senior M&E Specialist will report directly to the Project Director. S/he will ensure that the project has a robust, effective, and manageable M & E system in place. The Specialist will also ensure that project component leads, key implementing partners, ministries, and agencies are capable of, and timely respond to planning, monitoring, and evaluation needs with respect to tracking and reporting against targets and agreed results-based indicators as per the project log frame (LF). The Senior M&E Specialist will also ensure quality assurance, integrity, and accountability across the M&E function and information. S/he will ensure the M & E system supports the Knowledge Management, Learning and Communication system of the project.

Key Duties and Responsibilities:

Study the Theory of Change, Logical Framework, and project components to conceptualize the M&E system.
Develop and implement a cost-effective monitoring and evaluation strategy to track all project inputs, outputs, outcomes, and impacts.
Develop data collection systems for beneficiaries and financing institutions.
Create participatory M&E tools and processes to enhance project effectiveness and accountability.
Conduct data analysis and prepare regular progress and results reports (quarterly, annual, and thematic).
Train Project Implementation Teams in M&E system use and tools.
Establish a reporting framework for all project beneficiaries, financial institutions, and enterprises.
Support studies, annual outcome surveys, mid-term review (MTR), and end-of-project evaluations, including preparation of terms of reference.
Ensure quality control of data collection, analysis, and reporting inline with IFAD, AECID and EU reporting guidelines.
Maintain internal communication on M&E findings and impacts.
Integrate feedback from beneficiaries and partners into project implementation.
Support the development and management of the project’s Management Information System (MIS). Ensure alignment with IFAD, AECID and EU reporting guidelines. 
Facilitate the design and implementation of studies, annual outcome surveys, mid-term review and end of Project and impact evaluations as required including the preparation of terms of references ensuring adherence to the IFAD “Core Outcome Indicator” survey guidelines;
Facilitate supervision missions by providing timely M&E and progress information.
Perform any other duties assigned by the Project director.
Job Performance Indicators:

M&E system/Strategy for RUFIPP developed and implemented.
Project Implementation Teams trained on M&E.
Data collection, reporting, and learning activities coordinated effectively.
Timely preparation of quarterly, biannual, annual, and thematic reports.
Ongoing monitoring of project activities against targets and schedules.
Effective design and implementation of studies, MTR, and end-of-project evaluations.

Qualifications and Experience Required:

Postgraduate degree in Monitoring and Evaluation, Demography, Population Studies, Statistics, Economics, Rural Development, or a related field.
At least 8 years of experience in M&E roles in rural development and donor-funded projects.
Strong experience in designing and implementing M&E systems and large-scale survey data management.
Demonstrated use of statistical and data analysis software at least 3 years (e.g., SAS, SPSS, Advanced Excel).
Direct experience in donor-funded projects (experience with IFAD-funded projects or relevant experiences is an advantage).
Experience with research studies, full evaluation cycles, and impact assessments.
Strong report writing, analytical, negotiation, and communication skills.
Excellent command of written and spoken English.
High integrity and ability to work under pressure and tight deadlines.
Experience in using IT solutions for M&E is an added advantage.

"""
    print("\nJob Description:")
    print("-" * 50)
    print(job_description)
    print("\nRanking resumes...")
    ranked_results = ranker.rank_resumes(job_description, resumes_data)
    ranker.display_results(ranked_results, top_n=top_n)
    # One CSV row per resume; the 'skills_found' dict is written as its string form.
    pd.DataFrame(ranked_results).to_csv(output_csv, index=False)
    print(f"\nResults saved to '{output_csv}'")


In [22]:
# Run the demo only when this code is executed directly, not when it is imported.
if __name__ == "__main__":
    main()


Processing resumes...
Processing ACCOUNTANT...
Processing ADVOCATE...
Processing AGRICULTURE...
Processing APPAREL...
Processing ARTS...
Processing AUTOMOBILE...
Processing AVIATION...
Processing BANKING...
Processing BPO...
Processing BUSINESS-DEVELOPMENT...
Processing CHEF...
Processing CONSTRUCTION...
Processing CONSULTANT...
Processing DESIGNER...
Processing DIGITAL-MEDIA...
Processing ENGINEERING...
Processing FINANCE...
Processing FITNESS...
Processing HEALTHCARE...
Processing HR...
Processing INFORMATION-TECHNOLOGY...
Processing PUBLIC-RELATIONS...
Processing SALES...
Processing TEACHER...

Total resumes processed: 2484

Job Description:
--------------------------------------------------
Ministry of Finance

Rural Financial Inclusion Project in Palestine (RUFIPP)

Terms of Reference

for Hiring a Full-Time Senior Monitoring and Evaluation (M&E) Specialist

Job Announcement: Senior Monitoring and Evaluation (M&E) Specialist
Project: Rural Financial Inclusion Project in Palestine 