# Baseline Resume-JD Similarity

## Imports

In [19]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
# Load the sentence-embedding model once at module level;
# embedding_base_score() reuses this single instance on every call.
embed_model = SentenceTransformer("all-MiniLM-L6-v2")
import re


## Sample Inputs

In [11]:
# Sample resumes keyed by a short descriptive candidate label. Each value is
# the raw resume text; the ranking cell further down iterates over this dict
# and scores every entry against the job description.
resumes = {
    "Strong ML Engineer": """
Machine Learning Engineer with experience in end-to-end model development and deployment.
Skilled in Python, SQL, pandas, numpy, scikit-learn, TensorFlow, and PyTorch.
Worked on classification, regression, and NLP projects including spam detection and expense prediction.
Familiar with Docker and cloud technologies like AWS.
""",
    "Cloud/MLOps Engineer": """
Hands-on experience with AWS ECS, EC2, Docker, CI/CD pipelines.
Knowledge of machine learning deployment and monitoring.
Proficient in Python and Linux administration.
""",
    "Junior ML Fresher": """
Entry-level ML engineer with knowledge of Python, pandas, numpy and sklearn.
Looking for opportunity to contribute to ML projects.
Familiar with classification and regression algorithms.
""",
    "Data Analyst": """
Strong skills in SQL, Power BI, Excel and data visualization.
Some exposure to Python and pandas.
""",
    "Frontend Developer": """
Frontend Developer skilled in JavaScript, HTML, CSS and React.
Built responsive single-page web applications and worked with REST APIs.
Most of my experience is in UI development and performance optimisation.
No direct experience with AWS, Docker or machine learning.
"""
}


## Text Cleaning

In [12]:
def clean_text(text: str) -> str:
    """Reduce free text to lowercase ASCII letters separated by single spaces.

    Every character that is neither ``a``-``z`` nor whitespace (digits,
    punctuation, accented letters, ...) is replaced by a space, so
    "scikit-learn" becomes "scikit learn". Whitespace runs are then collapsed
    to one space and the ends are trimmed.
    """
    lowered = text.lower()
    # Replace rather than delete, so words joined by punctuation stay separate.
    letters_only = re.sub(r'[^a-z\s]', ' ', lowered)
    return re.sub(r'\s+', ' ', letters_only).strip()

## Embedding Function

In [13]:
def embedding_base_score(resume_text: str, jd_text: str) -> float:
    """Return the embedding cosine similarity of the two texts, times 100.

    Both texts are encoded in one batch with the module-level ``embed_model``
    (with ``normalize_embeddings=True``) and compared with
    ``cosine_similarity``.

    Args:
        resume_text: Resume text to encode.
        jd_text: Job-description text to encode.

    Returns:
        The cosine similarity scaled by 100, as a built-in ``float``.
    """
    embeddings = embed_model.encode(
        [resume_text, jd_text],
        convert_to_numpy=True,
        normalize_embeddings=True,
    )

    # cosine_similarity works on 2-D inputs; slicing (rather than indexing)
    # keeps each embedding as a single-row matrix.
    cos_sim = cosine_similarity(embeddings[0:1], embeddings[1:2])[0][0]

    # The value read out of the result matrix is a NumPy scalar. Convert it
    # so the function honours its `-> float` annotation and callers get a
    # plain, JSON-serialisable number.
    return float(cos_sim) * 100


## Skills List & Extraction

In [14]:
# Skill vocabulary passed to extract_skills() for both the resume and the job
# description. Entries are written in lowercase and grouped by area.
skills_list = [
    # ML Core
    "python", "sql",
    "machine learning", "deep learning",
    "pandas", "numpy", "scikit-learn",
    "tensorflow", "pytorch",
    "nlp", "computer vision",

    # Cloud / DevOps
    "aws", "gcp", "azure",
    "docker", "kubernetes",
    "terraform", "cloudformation",

    # Deployment / Infra
    "linux", "networking",
]


In [15]:
def extract_skills(text: str, skills: list[str]) -> list[str]:
    """Return the skills that occur in *text* as whole words or phrases.

    Matching is case-insensitive. A skill edge that is a word character must
    not touch another word character in the text, so "java" does not match
    inside "javascript". Skills that start or end with punctuation ("c++",
    "c#", ".net") are matched as well.

    Args:
        text: Text to scan.
        skills: Candidate skill names; empty strings are ignored.

    Returns:
        Each matching skill once, spelled as given in *skills*, sorted
        alphabetically.
    """
    text_lower = text.lower()
    found = set()
    for skill in skills:
        needle = skill.lower()
        if not needle:
            # An empty pattern would "match" any text; it is not a skill.
            continue
        # A plain \b on both sides fails for skills with a punctuation edge:
        # \b needs a word character on one side, so r'\bc\+\+\b' cannot match
        # "c++ and". Anchor only the edges that are word characters; for
        # ordinary skills this is equivalent to \b...\b.
        head = r'(?<!\w)' if re.match(r'\w', needle[0]) else ''
        tail = r'(?!\w)' if re.match(r'\w', needle[-1]) else ''
        if re.search(head + re.escape(needle) + tail, text_lower):
            found.add(skill)
    return sorted(found)


## Final Score & Summary

In [21]:
# Compute final score using both similarity and skill coverage

def compute_final_score(base_score: float,
                        matched_skills: list[str],
                        jd_skills: list[str]) -> tuple[float, float]:
    """Blend a similarity score with the share of JD skills that were matched.

    Args:
        base_score: Similarity score to blend (weighted 0.6).
        matched_skills: Skills found in both the resume and the JD.
        jd_skills: Skills found in the JD.

    Returns:
        ``(final_score, skill_coverage)`` where ``skill_coverage`` is
        ``len(matched_skills) / len(jd_skills)`` as a percentage (weighted
        0.4 in the final score), or ``0.0`` when ``jd_skills`` is empty.
    """
    required = len(jd_skills)
    # The ratio is undefined when the JD lists no skills; count that as zero.
    skill_coverage = (len(matched_skills) / required) * 100 if required > 0 else 0.0
    return 0.6 * base_score + 0.4 * skill_coverage, skill_coverage


def build_summary(final_score: float,
                  matched_skills: list[str],
                  missing_skills: list[str]) -> str:
    """Render a three-line, human-readable verdict for one candidate.

    Line 1 is the fit label: "Strong fit" from 75 upward, "Moderate fit" from
    50 upward, otherwise "Weak fit". Lines 2 and 3 list the matched and the
    missing skills, comma-separated, or "None" when the list is empty.
    """
    def _listing(items: list[str]) -> str:
        # An empty list is shown as the word "None" rather than a blank.
        return ", ".join(items) if items else "None"

    # Thresholds are checked from highest to lowest; the first one reached wins.
    fit_label = "Weak fit"
    for threshold, label in ((75, "Strong fit"), (50, "Moderate fit")):
        if final_score >= threshold:
            fit_label = label
            break

    matched_text = _listing(matched_skills)
    missing_text = _listing(missing_skills)

    return "\n".join((
        f"{fit_label} for this role.",
        f"Matched skills: {matched_text}.",
        f"Missing skills: {missing_text}.",
    ))


## Evaluation 

In [22]:
def evaluate_resume(resume_text: str, jd_text: str) -> dict:
    """Score one resume against a job description and explain the result.

    Combines TF-IDF cosine similarity (on cleaned text), embedding similarity
    (on the raw text) and coverage of the skills found in the job description.

    Args:
        resume_text: Raw resume text.
        jd_text: Raw job-description text.

    Returns:
        A dict with the keys ``tfidf_base``, ``emb_base``, ``base_score``
        (0.3 * TF-IDF + 0.7 * embedding), ``skill_coverage``,
        ``final_score``, ``resume_skills``, ``jd_skills``,
        ``matched_skills``, ``missing_skills`` and ``summary``.
    """
    # 1. Clean input text (used for the TF-IDF comparison only)
    resume_clean = clean_text(resume_text)
    jd_clean = clean_text(jd_text)

    # 2. TF-IDF similarity
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform([resume_clean, jd_clean])
    cos_sim = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0]
    tfidf_base = cos_sim * 100

    # 3. Embedding similarity
    emb_base = embedding_base_score(resume_text, jd_text)

    # 4. Combine similarity scores
    combined_base = 0.3 * tfidf_base + 0.7 * emb_base

    # 5. Skill extraction
    # Run on the raw text, not the cleaned text: clean_text() replaces every
    # non-letter with a space, which turns "scikit-learn" into "scikit learn"
    # and makes that skills_list entry impossible to match. extract_skills()
    # already lowercases its input.
    resume_skills = extract_skills(resume_text, skills_list)
    jd_skills = extract_skills(jd_text, skills_list)
    matched_skills = sorted(set(resume_skills) & set(jd_skills))
    missing_skills = sorted(set(jd_skills) - set(resume_skills))

    # 6. Final match score + summary
    final_score, skill_coverage = compute_final_score(combined_base, matched_skills, jd_skills)
    summary = build_summary(final_score, matched_skills, missing_skills)

    return {
        "tfidf_base": tfidf_base,
        "emb_base": emb_base,
        "base_score": combined_base,
        "skill_coverage": skill_coverage,
        "final_score": final_score,
        "resume_skills": resume_skills,
        "jd_skills": jd_skills,
        "matched_skills": matched_skills,
        "missing_skills": missing_skills,
        "summary": summary,
    }


In [23]:
# Score every sample resume against the job description and rank them.
# NOTE(review): jd_text, result_good and result_bad are not defined in the
# cells visible here — presumably an earlier cell sets them; confirm before
# running this cell on a fresh kernel.
results = []
for name, text in resumes.items():
    res = evaluate_resume(text, jd_text)
    results.append(dict(name=name, score=res["final_score"], summary=res["summary"]))

# Highest final score first.
results_sorted = sorted(results, key=lambda entry: entry["score"], reverse=True)


print("Good candidate score:", result_good["final_score"])
print("Bad candidate score:", result_bad["final_score"])
print("\nGood summary:\n", result_good["summary"])
print("\nBad summary:\n", result_bad["summary"])


Good candidate score: 73.37789022604483
Bad candidate score: 34.545398316302176

Good summary:
 Moderate fit for this role.
Matched skills: aws, docker, machine learning, nlp, numpy, pandas, python, pytorch, sql, tensorflow.
Missing skills: computer vision, gcp.

Bad summary:
 Weak fit for this role.
Matched skills: aws, docker, machine learning.
Missing skills: computer vision, gcp, nlp, numpy, pandas, python, pytorch, sql, tensorflow.


In [24]:
# Print the candidates from best to worst, numbered from 1, with the score
# rounded to two decimals.
print("Ranked Candidates:\n")
for rank, entry in enumerate(results_sorted, start=1):
    score = round(entry["score"], 2)
    print(f"{rank}. {entry['name']} — Score: {score}")


Ranked Candidates:

1. Strong ML Engineer — Score: 74.8
2. Cloud/MLOps Engineer — Score: 45.59
3. Junior ML Fresher — Score: 43.72
4. Frontend Developer — Score: 33.18
5. Data Analyst — Score: 29.4
