In [1]:
import os
from sentence_transformers import SentenceTransformer, util
import torch
import pandas as pd
import fitz
import spacy
import re
import json
import spacy

In [2]:

def extract_text_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    text = ""
    for page in doc:
        text +=page.get_text()
    return text


In [3]:

def extract_all_resumes_text(folder_path):
    resume_texts = []
    filenames = []
    for file in os.listdir(folder_path):
        if file.endswith(".pdf"):
            path = os.path.join(folder_path, file)
            text = extract_text_from_pdf(path)
            resume_texts.append(text)
            filenames.append(file)
    return filenames, resume_texts

In [4]:
model = SentenceTransformer('all-MiniLM-L6-v2')

In [5]:
#from utils.pdf_reader import extract_all_resumes_text
resume_names, resume_texts =  extract_all_resumes_text("../data/resumes/INFORMATION-TECHNOLOGY")
with open("../data/job_descs/job1.txt", "r") as f:
    job_description = f.read()


resume_embeddings = model.encode(resume_texts, convert_to_tensor=True)
job_embedding = model.encode(job_description, convert_to_tensor=True)

# Compute similarity
import torch
cos_scores = util.cos_sim(job_embedding, resume_embeddings)[0]

In [6]:
def extract_cgpa(text):
    match = re.search(r"CGPA[:\s]*([0-9]+\.[0-9]+)", text)
    if match:
        return float(match.group(1))
    return None


In [7]:
nlp = spacy.load("en_core_web_sm")

SKILLS_DB = [
    "python", "java", "c++", "javascript", "html", "css", "sql", "react",
    "django", "flask", "machine learning", "deep learning", "data analysis",
    "pandas", "numpy", "git", "docker", "aws", "fastapi", "tensorflow", "keras",
    "linux", "bash", "api", "mongodb", "power bi", "excel", "matplotlib", "scikit-learn"
]

def extract_skills_from_text(text):
    doc = nlp(text.lower())
    tokens = set([token.text for token in doc if not token.is_stop and not token.is_punct])
    
    matched_skills = [skill for skill in SKILLS_DB if skill in tokens]
    return matched_skills

required_skills = extract_skills_from_text(job_description)


def match_resume_skills(resume_text, required_skills):
    resume_doc = nlp(resume_text.lower())
    resume_tokens = set([token.text for token in resume_doc if not token.is_stop and not token.is_punct])

    matched = [skill for skill in required_skills if skill in resume_tokens]
    match_score = len(matched) / len(required_skills) if required_skills else 0

    return match_score, matched



In [8]:
def compute_final_score(similarity, cgpa=None, skills_matched=0):
    score = 0.6 * similarity
    score += 0.2 * (cgpa / 10 if cgpa else 0.5)
    score += 0.2 * skills_matched  # Out of 1.0
    return round(score, 4)

# Extract required skills dynamically from job description
required_skills = extract_skills_from_text(job_description)

results = []

for i, text in enumerate(resume_texts):
    sim = float(cos_scores[i])
    cgpa = extract_cgpa(text)

    # Skill matching
    skills_score, matched_skills = match_resume_skills(text, required_skills)

    # Final score
    score = compute_final_score(sim, cgpa, skills_score)
    results.append({
        "resume": resume_names[i],
        "score": score,
        "cgpa": cgpa,
        "similarity": sim,
        "skills_matched_count": len(matched_skills),
        "matched_skills": matched_skills
    })

# Sort by score
results = sorted(results, key=lambda x: x["score"], reverse=True)


In [9]:
results

[{'resume': '20674668.pdf',
  'score': 0.4038,
  'cgpa': None,
  'similarity': 0.3952873647212982,
  'skills_matched_count': 1,
  'matched_skills': ['python']},
 {'resume': '26480367.pdf',
  'score': 0.3487,
  'cgpa': None,
  'similarity': 0.303388774394989,
  'skills_matched_count': 1,
  'matched_skills': ['python']},
 {'resume': '37242217.pdf',
  'score': 0.3476,
  'cgpa': None,
  'similarity': 0.4127435088157654,
  'skills_matched_count': 0,
  'matched_skills': []},
 {'resume': '13405733.pdf',
  'score': 0.3407,
  'cgpa': None,
  'similarity': 0.29011568427085876,
  'skills_matched_count': 1,
  'matched_skills': ['python']},
 {'resume': '12635195.pdf',
  'score': 0.3385,
  'cgpa': None,
  'similarity': 0.286440372467041,
  'skills_matched_count': 1,
  'matched_skills': ['python']},
 {'resume': '10265057.pdf',
  'score': 0.3112,
  'cgpa': None,
  'similarity': 0.2409076690673828,
  'skills_matched_count': 1,
  'matched_skills': ['python']},
 {'resume': '20237244.pdf',
  'score': 0.31

In [10]:
df = pd.DataFrame(results)
df

Unnamed: 0,resume,score,cgpa,similarity,skills_matched_count,matched_skills
0,20674668.pdf,0.4038,,0.395287,1,[python]
1,26480367.pdf,0.3487,,0.303389,1,[python]
2,37242217.pdf,0.3476,,0.412744,0,[]
3,13405733.pdf,0.3407,,0.290116,1,[python]
4,12635195.pdf,0.3385,,0.286440,1,[python]
...,...,...,...,...,...,...
115,39718499.pdf,0.1446,,0.074376,0,[]
116,27536013.pdf,0.1375,,0.062529,0,[]
117,18159866.pdf,0.1357,,0.059446,0,[]
118,41344156.pdf,0.1241,,0.040208,0,[]
