In [3]:
import fitz
import os
import spacy

In [17]:
def extract_text_from_pdf(pdf_path):
    """Return the plain text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Path of the PDF file; passed unchanged to ``fitz.open``.

    Returns:
        One ``str`` containing each page's ``get_text()`` output back to back
        (no separator is inserted between pages). Empty string for a document
        with no pages.
    """
    # The context manager closes the document (and its underlying file handle)
    # even if extraction raises part-way through; the original never closed it.
    with fitz.open(pdf_path) as document:
        # A single join avoids rebuilding the string on every page.
        return "".join(page.get_text() for page in document)

# Smoke-test the extractor on a single sample résumé; point pdf_path at another
# file to inspect a different document.
# NOTE(review): print() below writes the full résumé text (including contact
# details) into the saved cell output — clear the output before sharing.
pdf_path = "September.pdf"
resume_text = extract_text_from_pdf(pdf_path)
print(resume_text)

SUDHAKARAN
THIRUNAVUKKARASU
B.Tech. - Information Technology
Ph: +91-9952655991
Email: sudhakaran2110507@ssn.edu.in
Vandavasi, Tamil Nadu, India - 604501
LinkedIn: https://www.linkedin.com/in/sudhakaran-gt 
Python  Java  C++  C  Javascript  Web Development  ReactJS  Node.js  Express.js  Spring Boot  SQL
PostgreSQL  MongoDB  R  Excel  Power BI  Tableau
BRIEF SUMMARY
I am a final year IT student who focuses on Full stack development and data analysis, with expertise in Python, C++, R, SQL, and
Java, excelling in handling intricate datasets and extracting valuable insights. Proficient in data visualization with Tableau and Excel,
and skilled in web development with HTML, CSS, and JavaScript, I am excited to take on real-world challenges. I utilise React.js for
interactive frontend development, Node.js/Express for flexible backend APIs, and Spring Boot for Java-based backend solutions to
incorporate advanced ML and AI techniques in deploying predictive models and optimizing data processing

In [26]:
# Load the spaCy "en_core_web_sm" pipeline once; extract_skills() calls this
# module-level object to tokenise text.
nlp = spacy.load("en_core_web_sm")

# Skill vocabulary to look for in résumé text.
predefined_skills = ["Python","Java","C","C++","ReactJS","javascript"]
# Lowercased copy of the vocabulary, used for case-insensitive comparison
# against lowercased token text.
predefined_skills_lower = [skill.lower() for skill in predefined_skills]

def extract_skills(text, predefined_skills):
    """Return the tokens of ``text`` that name one of ``predefined_skills``.

    Matching is case-insensitive on both sides: each token and each skill name
    is lowercased before comparison, so callers no longer have to pre-lowercase
    the skill list. Comparison is per token, so a skill spelled as several
    tokens can never match.

    Args:
        text: Text to scan. It is tokenised with the module-level spaCy
            pipeline ``nlp``.
        predefined_skills: Iterable of skill names, in any casing.

    Returns:
        A list of the matching tokens' text, in document order, with their
        original casing. Repeated mentions are kept, so de-duplicate in the
        caller if needed.
    """
    # Normalise the vocabulary here rather than trusting the caller, and use a
    # set so each token lookup is O(1) instead of a list scan.
    skill_lookup = {skill.lower() for skill in predefined_skills}
    return [token.text for token in nlp(text) if token.text.lower() in skill_lookup]

# set() drops repeated mentions of the same token text. Tokens keep their
# original casing, so differently-cased spellings of one skill remain separate
# entries, and the resulting list order is arbitrary.
skills_extracted = list(set(extract_skills(resume_text, predefined_skills_lower)))
print("Extracted skills: ",skills_extracted)

Extracted skills:  ['Python', 'C', 'ReactJS', 'Java', 'C++', 'JavaScript', 'Javascript']


In [30]:
def match_and_rank_resumes(resume_texts, required_skills):
    """Score each résumé by the number of distinct required skills it mentions.

    Args:
        resume_texts: Iterable of résumé texts (``str``).
        required_skills: Iterable of skill names, in any casing.

    Returns:
        A list of ``(resume_text, score)`` tuples sorted by ``score``
        descending. ``resume_text`` is the original (not lowercased) text and
        ``score`` is the count of distinct required skills found in it. Ties
        keep their input order because the sort is stable.
    """
    required_skills_lower = {skill.lower() for skill in required_skills}
    resume_rankings = []
    for resume_text in resume_texts:
        # Search for the required skills themselves. The previous version
        # searched for the notebook-global ``predefined_skills_lower`` and then
        # intersected, so a required skill missing from that global could never
        # score, and the function silently depended on cell execution order.
        extracted_skills = extract_skills(resume_text.lower(), required_skills_lower)
        # Collapse repeated mentions so each skill counts once.
        matching_skills = {skill.lower() for skill in extracted_skills} & required_skills_lower
        resume_rankings.append((resume_text, len(matching_skills)))

    return sorted(resume_rankings, key=lambda x: x[1], reverse=True)



def process_resumes(pdf_folder, required_skills):
    """Extract text from every PDF in a folder and rank the résumés by skill match.

    Args:
        pdf_folder: Directory to scan (not recursive) for PDF files.
        required_skills: Skill names forwarded to ``match_and_rank_resumes``.

    Returns:
        Whatever ``match_and_rank_resumes`` returns for the extracted texts:
        a list of ``(resume_text, score)`` tuples, best score first.
    """
    resume_texts = []

    # os.listdir() returns entries in arbitrary order; sorting makes the order
    # of equally-scored résumés reproducible across runs and platforms.
    for pdf_file in sorted(os.listdir(pdf_folder)):
        # Compare the extension case-insensitively so files such as "CV.PDF"
        # are not silently skipped.
        if pdf_file.lower().endswith(".pdf"):
            pdf_path = os.path.join(pdf_folder, pdf_file)
            resume_texts.append(extract_text_from_pdf(pdf_path))

    return match_and_rank_resumes(resume_texts, required_skills)

# Rank every PDF in the "resumes" folder against the skill list below.
pdf_folder = "resumes"
# Casing here does not matter: match_and_rank_resumes() lowercases both the
# required skills and the résumé text before comparing.
required_skills =  ["Python","Java","C","C++","ReactJs","javascript"]
ranked_resumes = process_resumes(pdf_folder, required_skills)

# Each entry is (resume_text, score); only the rank position and score are
# printed, so the output does not identify which file a score belongs to.
for idx, (resume, score) in enumerate(ranked_resumes):
    print(f"Resume {idx+1}: Score {score}")

Resume 1: Score 6
Resume 2: Score 3
Resume 3: Score 2
Resume 4: Score 2
Resume 5: Score 1


In [33]:
def match_and_rank_resumes2(resume_texts_with_paths, required_skills):
    """Rank résumé files by how many distinct required skills each one mentions.

    Args:
        resume_texts_with_paths: Iterable of ``(resume_text, path)`` pairs.
        required_skills: Iterable of skill names, in any casing.

    Returns:
        A list of ``(path, score)`` tuples ordered by ``score`` descending,
        where ``score`` is the number of distinct required skills found in the
        lowercased text. Equal scores keep their input order.
    """
    wanted = [name.lower() for name in required_skills]
    scored = []
    for text, source_path in resume_texts_with_paths:
        found = extract_skills(text.lower(), wanted)
        # Distinct skills only: repeated mentions collapse in the set.
        hits = {item.lower() for item in found}.intersection(wanted)
        scored.append((source_path, len(hits)))

    # In-place stable sort, highest score first.
    scored.sort(key=lambda entry: entry[1], reverse=True)
    return scored

def process_resumes2(pdf_folder, required_skills):
    """Extract text from every PDF in a folder and rank the files by skill match.

    Args:
        pdf_folder: Directory to scan (not recursive) for PDF files.
        required_skills: Skill names forwarded to ``match_and_rank_resumes2``.

    Returns:
        Whatever ``match_and_rank_resumes2`` returns for the extracted texts:
        a list of ``(path, score)`` tuples, best score first.
    """
    resume_texts_with_paths = []

    # os.listdir() returns entries in arbitrary order; sorting makes the order
    # of equally-scored résumés reproducible across runs and platforms.
    for pdf_file in sorted(os.listdir(pdf_folder)):
        # Compare the extension case-insensitively so files such as "CV.PDF"
        # are not silently skipped.
        if pdf_file.lower().endswith(".pdf"):
            pdf_path = os.path.join(pdf_folder, pdf_file)
            resume_texts_with_paths.append((extract_text_from_pdf(pdf_path), pdf_path))

    return match_and_rank_resumes2(resume_texts_with_paths, required_skills)

# Second ranking run with a different skill set. This rebinds pdf_folder,
# required_skills and ranked_resumes, replacing the values set by the earlier
# ranking cell.
pdf_folder = "resumes"
# Casing is irrelevant: match_and_rank_resumes2() lowercases the skills and text.
required_skills =  ["Python","Sql","Flask","API","OOPs"]
ranked_resumes = process_resumes2(pdf_folder, required_skills)

# Each entry is (path, score), so every score is reported with its source file.
for idx, (path, score) in enumerate(ranked_resumes):
    print(f"Resume {idx+1}: Path {path}, Score {score}")


Resume 1: Path resumes\Vijay Veerasekaran - Batch 2025 - B.Tech. - Information Technology - iF5nlmC.pdf, Score 5
Resume 2: Path resumes\Sujay R - Batch 2025 - B.Tech. - Information Technology - ydPZlmC (1).pdf, Score 4
Resume 3: Path resumes\Vasundhara Boominathan - Batch 2025 - B.Tech. - Information Technology - 30S8lmC.pdf, Score 4
Resume 4: Path resumes\Vishwaa D A - Batch 2025 - B.Tech. - Information Technology - mjXj5ZC (1).pdf, Score 4
Resume 5: Path resumes\September.pdf, Score 3
