In [30]:
import os
from sentence_transformers import SentenceTransformer, util
import torch
import pandas as pd
import fitz
import spacy
import re
import json

In [31]:

def extract_text_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    text = ""
    for page in doc:
        text +=page.get_text()
    return text


In [32]:

def extract_all_resumes_text(folder_path):
    resume_texts = []
    filenames = []
    for file in os.listdir(folder_path):
        if file.endswith(".pdf"):
            path = os.path.join(folder_path, file)
            text = extract_text_from_pdf(path)
            resume_texts.append(text)
            filenames.append(file)
    return filenames, resume_texts

In [33]:
model = SentenceTransformer('all-MiniLM-L6-v2')

In [34]:
#from utils.pdf_reader import extract_all_resumes_text
resume_names, resume_texts =  extract_all_resumes_text("../data/resumes/INFORMATION-TECHNOLOGY")
with open("../data/job_descs/job1.txt", "r") as f:
    job_description = f.read()


resume_embeddings = model.encode(resume_texts, convert_to_tensor=True)
job_embedding = model.encode(job_description, convert_to_tensor=True)

# Compute similarity
import torch
cos_scores = util.cos_sim(job_embedding, resume_embeddings)[0]

In [35]:
def extract_cgpa(text):
    match = re.search(r"CGPA[:\s]*([0-9]+\.[0-9]+)", text)
    if match:
        return float(match.group(1))
    return None


In [36]:
nlp = spacy.load("en_core_web_sm")

def extract_skills_from_text(text):
    doc = nlp(text.lower())
    skills = set()

    for chunk in doc.noun_chunks:
        if 1 <= len(chunk.text.split()) <= 3:
            skills.add(chunk.text.strip())

    return list(skills)

# Extract from job description
required_skills = extract_skills_from_text(job_description)

# Then proceed as before
def match_resume_skills(resume_text, required_skills):
    resume_text = resume_text.lower()
    matched = [skill for skill in required_skills if skill in resume_text]
    match_score = len(matched) / len(required_skills) if required_skills else 0
    return match_score, matched


In [37]:
def compute_final_score(similarity, cgpa=None, skills_matched=0):
    score = 0.6 * similarity
    score += 0.2 * (cgpa / 10 if cgpa else 0.5)
    score += 0.2 * skills_matched  # Out of 1.0
    return round(score, 4)

# Extract required skills dynamically from job description
required_skills = extract_skills_from_text(job_description)

results = []

for i, text in enumerate(resume_texts):
    sim = float(cos_scores[i])
    cgpa = extract_cgpa(text)

    # Skill matching
    skills_score, matched_skills = match_resume_skills(text, required_skills)

    # Final score
    score = compute_final_score(sim, cgpa, skills_score)
    results.append({
        "resume": resume_names[i],
        "score": score,
        "cgpa": cgpa,
        "similarity": sim,
        "skills_matched_count": len(matched_skills),
        "matched_skills": matched_skills
    })

# Sort by score
results = sorted(results, key=lambda x: x["score"], reverse=True)


In [38]:
results

[{'resume': '20674668.pdf',
  'score': 0.4372,
  'cgpa': None,
  'similarity': 0.3952873647212982,
  'skills_matched_count': 4,
  'matched_skills': ['python', 'we', 'knowledge', 'experience']},
 {'resume': '37242217.pdf',
  'score': 0.3976,
  'cgpa': None,
  'similarity': 0.4127435088157654,
  'skills_matched_count': 2,
  'matched_skills': ['we', 'experience']},
 {'resume': '11580408.pdf',
  'score': 0.3831,
  'cgpa': None,
  'similarity': 0.3468329906463623,
  'skills_matched_count': 3,
  'matched_skills': ['we', 'knowledge', 'experience']},
 {'resume': '26480367.pdf',
  'score': 0.382,
  'cgpa': None,
  'similarity': 0.303388774394989,
  'skills_matched_count': 4,
  'matched_skills': ['python', 'we', 'knowledge', 'experience']},
 {'resume': '91635250.pdf',
  'score': 0.3796,
  'cgpa': None,
  'similarity': 0.340921550989151,
  'skills_matched_count': 3,
  'matched_skills': ['we', 'knowledge', 'experience']},
 {'resume': '39413067.pdf',
  'score': 0.3743,
  'cgpa': None,
  'similarity

In [39]:
df = pd.DataFrame(results)
df

Unnamed: 0,resume,score,cgpa,similarity,skills_matched_count,matched_skills
0,20674668.pdf,0.4372,,0.395287,4,"[python, we, knowledge, experience]"
1,37242217.pdf,0.3976,,0.412744,2,"[we, experience]"
2,11580408.pdf,0.3831,,0.346833,3,"[we, knowledge, experience]"
3,26480367.pdf,0.3820,,0.303389,4,"[python, we, knowledge, experience]"
4,91635250.pdf,0.3796,,0.340922,3,"[we, knowledge, experience]"
...,...,...,...,...,...,...
115,17681064.pdf,0.1957,,0.076095,2,"[we, experience]"
116,39718499.pdf,0.1946,,0.074376,2,"[we, experience]"
117,27295996.pdf,0.1813,,0.093803,1,[experience]
118,41344156.pdf,0.1741,,0.040208,2,"[we, experience]"
