In [17]:
# Install dependencies

!pip install tika PyMuPDF python-docx sentence-transformers rapidfuzz pandas scikit-learn spacy
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m54.3 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [18]:
# Import libraries

import os
import fitz   # PyMuPDF
from docx import Document
from tika import parser
import re
import pandas as pd
from rapidfuzz import fuzz
from sentence_transformers import SentenceTransformer, util
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import spacy
from google.colab import files

In [19]:
#  Text extraction

def _extract_with_tika(path):
    """Return the text Apache Tika extracts from ``path`` ('' when it finds none)."""
    parsed = parser.from_file(path)
    # "content" may be missing or None; normalise both to an empty string.
    return parsed.get("content", "") or ""


def extract_text(path):
    """Extract plain text from a document, choosing a reader by file extension.

    * ``.pdf``  - PyMuPDF page by page, falling back to Tika if PyMuPDF fails.
    * ``.docx`` / ``.doc`` - python-docx paragraphs (blank ones skipped), falling
      back to Tika if python-docx cannot open the file.
    * ``.txt``  - read as UTF-8, ignoring undecodable bytes.
    * anything else - Tika.

    Args:
        path: Path of the file to read; the extension is matched case-insensitively.

    Returns:
        The extracted text as a string (possibly empty).
    """
    ext = os.path.splitext(path)[1].lower()

    if ext == ".pdf":
        try:
            with fitz.open(path) as doc:
                return "".join(page.get_text() for page in doc)
        except Exception:
            # Only ordinary errors trigger the fallback; KeyboardInterrupt and
            # SystemExit must still be able to stop the cell.
            return _extract_with_tika(path)

    if ext in (".docx", ".doc"):
        try:
            doc = Document(path)
            return "\n".join(p.text for p in doc.paragraphs if p.text.strip())
        except Exception:
            # python-docx targets .docx; files it cannot open (presumably legacy
            # .doc files among them) are handed to Tika instead.
            return _extract_with_tika(path)

    if ext == ".txt":
        with open(path, "r", encoding="utf-8", errors="ignore") as f:
            return f.read()

    return _extract_with_tika(path)

In [20]:
# Expanded Skills Dictionary
SKILLS = [
    # Programming & ML
    "python","java","c++","machine learning","deep learning","nlp",
    "pytorch","tensorflow","scikit-learn","xgboost","transformers","huggingface","bert","spacy",
    # Data & Visualization
    "sql","postgresql","mysql","mongodb","excel","power bi","tableau","data visualization",
    # Cloud & DevOps
    "docker","kubernetes","aws","gcp","azure","git","bash","linux",
    # Soft skills
    "teamwork","leadership","problem solving","communication","critical thinking"
]


def extract_skills(text, skills_list=SKILLS, threshold=80, min_fuzzy_len=6):
    """Find which skills from ``skills_list`` are mentioned in ``text``.

    A skill that appears as a whole term (not embedded in a longer word) scores
    100. Otherwise, skills of at least ``min_fuzzy_len`` characters are compared
    with ``fuzz.partial_ratio`` so small typos still match. Shorter skills are
    never fuzzy-matched: a substring comparison would find "excel" inside
    "excellent" or "git" inside "digital".

    Args:
        text: Document text to search; empty or ``None`` yields ``{}``.
        skills_list: Iterable of skill names to look for.
        threshold: Minimum score (0-100) for a skill to be reported.
        min_fuzzy_len: Shortest skill name eligible for fuzzy matching.

    Returns:
        Dict mapping each matched skill (as spelled in ``skills_list``) to its score.
    """
    found = {}
    if not text:
        return found

    haystack = text.lower()
    for skill in skills_list:
        needle = skill.lower()
        # (?<!\w) / (?!\w) instead of \b so names ending in symbols ("c++") still work.
        whole_term = re.search(r"(?<!\w)" + re.escape(needle) + r"(?!\w)", haystack)
        if whole_term:
            score = 100.0
        elif len(needle) >= min_fuzzy_len:
            score = fuzz.partial_ratio(needle, haystack)
        else:
            continue
        if score >= threshold:
            found[skill] = score
    return found

In [21]:
# JD keyword extractor (noun phrases)
# The spaCy pipeline is loaded once at module level and shared by
# extract_jd_keywords; the "ner" component is disabled at load time.
nlp = spacy.load("en_core_web_sm", disable=["ner"])
def extract_jd_keywords(jd_text, top_k=30):
    """Return up to ``top_k`` lower-cased keywords from a job description.

    Candidates are noun chunks longer than two characters plus every token that
    is not a stop word, punctuation or whitespace. They are ranked by how often
    they occur, with ties kept in order of first appearance, so the selection is
    deterministic rather than an arbitrary slice of a set.

    Args:
        jd_text: Job description text; empty or ``None`` yields ``[]``.
        top_k: Maximum number of keywords to return.

    Returns:
        List of unique keyword strings, most frequent first.
    """
    from collections import Counter

    if not jd_text:
        return []

    doc = nlp(jd_text)
    phrases = [chunk.text.lower() for chunk in doc.noun_chunks if len(chunk.text) > 2]
    # Whitespace tokens (e.g. "\n") would match almost any resume and inflate
    # keyword coverage, so they are excluded along with stop words/punctuation.
    tokens = [
        t.text.lower()
        for t in doc
        if not (t.is_stop or t.is_punct or t.is_space)
    ]
    counts = Counter(phrases + tokens)
    return [keyword for keyword, _ in counts.most_common(top_k)]

In [22]:
# TF-IDF similarity
def compute_tfidf_similarity(jd, resumes):
    """Score each resume against the job description with TF-IDF similarity.

    The vectorizer (unigrams and bigrams, at most 5000 features) is fitted on
    the job description together with all resumes, then the JD row is compared
    with every resume row.

    Args:
        jd: Job description text (``None`` is treated as empty).
        resumes: Iterable of resume texts (``None`` entries are treated as empty).

    Returns:
        List of floats, one per resume in input order; ``[]`` when no resumes
        are given.
    """
    resume_texts = [text or "" for text in resumes]
    if not resume_texts:
        # Nothing to compare against; comparing with zero rows is expected to
        # raise inside scikit-learn, so return early.
        return []

    texts = [jd or ""] + resume_texts
    vect = TfidfVectorizer(ngram_range=(1, 2), max_features=5000)
    X = vect.fit_transform(texts)
    # Row 0 is the JD. TfidfVectorizer L2-normalises rows by default, so the
    # plain dot product from linear_kernel is the cosine similarity.
    sims = linear_kernel(X[0:1], X[1:]).flatten()
    return sims.tolist()

In [23]:
# Embedding similarity
# The sentence-transformers model is created once at module level so every
# embed() call reuses the same instance.
model = SentenceTransformer("all-MiniLM-L6-v2")
def embed(texts):
    """Encode ``texts`` with the shared sentence-transformers ``model``.

    The input is passed through to ``model.encode`` unchanged, asking for NumPy
    output and no progress bar; whatever ``encode`` returns is returned as is.
    """
    encode_options = {"convert_to_numpy": True, "show_progress_bar": False}
    return model.encode(texts, **encode_options)

In [24]:
# Score calculator
def parse_years_experience(text):
    """Return the first "N years" figure mentioned in ``text`` as an int.

    Recognises forms such as "5 years", "5+ yrs", "5-year" and "1.5 years"
    (the fractional part is dropped). The number must be a standalone one- or
    two-digit figure: the lookbehind stops a match from starting in the middle
    of a longer number, so "2015 years" gives 0 rather than 15 and "1.5 years"
    gives 1 rather than 5.

    Args:
        text: Text to scan; empty or ``None`` yields 0.

    Returns:
        The whole number of years from the first match, or 0 if none is found.
    """
    if not text:
        return 0
    m = re.search(
        r'(?<![\d.])(\d{1,2})(?:\.\d+)?\+?[\s\-]*(?:years?|yrs?)\b',
        text,
        re.I,
    )
    return int(m.group(1)) if m else 0

def compute_score(resume_text, jd_text, matched_skills, jd_keywords, resume_emb, jd_emb, tfidf_score):
    """Combine keyword, TF-IDF, semantic and experience signals into one score.

    Args:
        resume_text: Resume text (``None`` is treated as empty).
        jd_text: Job description text; not used in the calculation, kept for
            interface compatibility.
        matched_skills: Dict of matched skills; only its keys are reported.
        jd_keywords: Keywords expected in the resume (substring, case-insensitive
            on the resume side).
        resume_emb: Embedding of the resume.
        jd_emb: Embedding of the job description.
        tfidf_score: Precomputed TF-IDF similarity for this resume.

    Returns:
        Dict with the weighted ``final`` score, each component score,
        ``years_experience`` and the list of matched ``skills``.
    """
    resume_text = resume_text or ""
    # Lower-case once instead of once per keyword.
    resume_lower = resume_text.lower()
    hits = sum(1 for keyword in jd_keywords if keyword in resume_lower)
    kw_coverage = hits / max(1, len(jd_keywords))

    # Map cosine similarity from [-1, 1] to [0, 1] so it is on the same scale
    # as the other components.
    sem = util.cos_sim(resume_emb, jd_emb).item()
    semantic_score = (sem + 1) / 2

    # Ten or more years counts as full experience credit.
    years = parse_years_experience(resume_text)
    exp_score = min(years / 10.0, 1.0)

    # The "skills" weight is applied to JD keyword coverage.
    weights = {"skills":0.4, "tfidf":0.2, "semantic":0.3, "experience":0.1}
    final = (weights["skills"]*kw_coverage +
             weights["tfidf"]*tfidf_score +
             weights["semantic"]*semantic_score +
             weights["experience"]*exp_score)
    return {
        "final":final,
        "kw_coverage":kw_coverage,
        "tfidf_score":tfidf_score,
        "semantic_score":semantic_score,
        "experience_score":exp_score,
        "years_experience":years,
        "skills":list(matched_skills.keys())
    }

In [25]:
# Upload Files
# files.upload() returns a dict keyed by file name. An empty dict means nothing
# was uploaded (e.g. the dialog was cancelled), so fail with a clear message
# instead of an IndexError further down.

print(" Upload Job Description file (txt/pdf/docx)")
jd_file = files.upload()
if not jd_file:
    raise ValueError("No job description file was uploaded.")
# If several files were selected, the first one is used as the job description.
jd_path = next(iter(jd_file))
jd_text = extract_text(jd_path)

print(" Upload Resume files (pdf/docx/txt) - you can select multiple")
res_files = files.upload()
if not res_files:
    raise ValueError("No resume files were uploaded.")
resume_paths = list(res_files.keys())

 Upload Job Description file (txt/pdf/docx)


Saving Pranay_Soni_Resume (1).pdf to Pranay_Soni_Resume (1) (2).pdf
 Upload Resume files (pdf/docx/txt) - you can select multiple


Saving Pranay_Soni_Resume (1).pdf to Pranay_Soni_Resume (1) (3).pdf


In [26]:
# Run Matching

# Job-description side: keywords and embedding are computed once.
jd_keywords = extract_jd_keywords(jd_text, top_k=30)
jd_emb = embed(jd_text)

# Resume side: text, TF-IDF similarity and embedding for every uploaded file.
res_texts = [extract_text(p) for p in resume_paths]
tfidf_scores = compute_tfidf_similarity(jd_text, res_texts)
res_embs = embed(res_texts)

# One record per resume; the four sequences are aligned by position.
results = []
for path, text, emb, tfidf in zip(resume_paths, res_texts, res_embs, tfidf_scores):
    skills = extract_skills(text)
    score = compute_score(text, jd_text, skills, jd_keywords, emb, jd_emb, tfidf)
    results.append({"file": path, **score})

# Best match first; "final" is shown as a percentage rounded to two decimals.
output_columns = [
    "file", "match_percent", "kw_coverage", "tfidf_score",
    "semantic_score", "experience_score", "years_experience", "skills",
]
df = (
    pd.DataFrame(results)
    .sort_values("final", ascending=False)
    .assign(match_percent=lambda frame: (frame["final"] * 100).round(2))
    [output_columns]
)

In [27]:
# Output
# Print a plain-text heading, then render the ranked DataFrame with the
# notebook's rich display.
print("\n Ranked candidates:")
display(df)


 Ranked candidates:


Unnamed: 0,file,match_percent,kw_coverage,tfidf_score,semantic_score,experience_score,years_experience,skills
0,Pranay_Soni_Resume (1) (3).pdf,90.0,1.0,1.0,1.0,0.0,0,"[python, machine learning, nlp, scikit-learn, ..."
