## Importing Dependencies

In [13]:
import spacy

# Fetch the small English pipeline only when it is not already installed, so
# re-running the notebook does not download and reinstall it every time.
if not spacy.util.is_package("en_core_web_sm"):
    spacy.cli.download("en_core_web_sm")


[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [15]:
# Imports
import os
import re
import docx
import fitz  # PyMuPDF
import spacy
from sentence_transformers import SentenceTransformer, util

# Load spaCy & sentence-transformers once, up front; later cells reuse both objects.
# `nlp` is the spaCy pipeline that clean_text() runs over lower-cased text.
# `model` is the sentence-transformers encoder used to embed the job description
# and every resume before they are compared.
nlp = spacy.load("en_core_web_sm")
model = SentenceTransformer('all-MiniLM-L6-v2')  # Fast, small model


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


## Data Cleaning and Preprocessing Text

In [46]:
# Clean Text Using spaCy
def clean_text(text):
    """Normalise raw text into a space-separated string of lemmas.

    The text is lower-cased, passed through the module-level spaCy pipeline
    ``nlp``, and reduced to the lemmas of the tokens that remain after
    dropping stop words, punctuation, number-like tokens and whitespace.

    Args:
        text: Raw text to clean. Anything that is not a non-empty ``str``
            (``None``, ``""``, numbers, ...) is treated as "no text".

    Returns:
        str: The kept lemmas joined by single spaces, or ``""`` when there
        is no usable input.
    """
    if not text or not isinstance(text, str):
        return ""
    doc = nlp(text.lower())
    # Whitespace tokens (newlines, tabs, runs of spaces) are neither stop words
    # nor punctuation, so without the explicit is_space check their lemmas
    # would be joined into the result as stray line breaks and blanks.
    tokens = [
        token.lemma_ for token in doc
        if not (token.is_stop or token.is_punct or token.like_num or token.is_space)
    ]
    return ' '.join(tokens)


## Resume & JD Text Extraction

In [49]:
# Extract Text from File (PDF, DOCX, or TXT)
def extract_text(file_path):
    """Return the plain text stored in a PDF, DOCX or TXT file.

    The file type is chosen from the file extension, compared
    case-insensitively so that e.g. ``CV.PDF`` is handled like ``cv.pdf``.

    Args:
        file_path: Path of the file to read, as a string.

    Returns:
        str: For ``.pdf``, the text of every page concatenated; for
        ``.docx``, the paragraph texts joined with newlines; for ``.txt``,
        the file contents. Any other extension yields ``""``.
    """
    lowered = file_path.lower()
    if lowered.endswith('.pdf'):
        with fitz.open(file_path) as doc:
            # Join once instead of growing a string page by page.
            return "".join(page.get_text() for page in doc)
    if lowered.endswith('.docx'):
        document = docx.Document(file_path)
        return "\n".join(para.text for para in document.paragraphs)
    if lowered.endswith('.txt'):
        # utf-8-sig reads plain UTF-8 unchanged and drops a leading BOM when
        # one is present, so it does not end up as "\ufeff" in the text.
        with open(file_path, 'r', encoding='utf-8-sig') as f:
            return f.read()
    return ""


## Load and Clean All Resumes

In [52]:
# Step 4: Load and Process All Resumes
# Builds `resumes`: a dict mapping each resume's file name to its cleaned text.
resume_dir = os.path.abspath(os.path.join(
    "C:\\Users\\ronak\\Downloads\\Py_DS_ML_Bootcamp-master\\Refactored_Py_DS_ML_Bootcamp-master\\AI Resume matcher",
    "data",
    "resumes"
))
resumes = {}

# os.listdir returns entries in arbitrary order; sorting keeps the dict order
# (and therefore the order of equal-scoring resumes later on) reproducible.
for file in sorted(os.listdir(resume_dir)):
    if file.endswith(('.pdf', '.docx', '.txt')):
        path = os.path.join(resume_dir, file)
        text = extract_text(path)
        cleaned = clean_text(text)
        if not cleaned:
            # Nothing usable was extracted; an empty string would still be
            # embedded and ranked, producing a meaningless match score.
            print(f"⚠ Skipping {file}: no extractable text")
            continue
        resumes[file] = cleaned


## Load and Process Job Description

In [55]:
# Locate the job-description file inside the project's data folder.
jd_path = os.path.join(
    "C:\\Users\\ronak\\Downloads\\Py_DS_ML_Bootcamp-master\\Refactored_Py_DS_ML_Bootcamp-master\\AI Resume matcher",
    "data",
    "job_descriptions",
    "Senior Python Developer.txt",
)
jd_path = os.path.abspath(jd_path)

# Read the raw text, then clean it with the same clean_text() used for resumes.
jd_text = extract_text(jd_path)
cleaned_jd = clean_text(jd_text)

## Embed and Score Similarity

In [60]:
# Embed the cleaned job description once; every resume is compared against it.
jd_embedding = model.encode(cleaned_jd, convert_to_tensor=True)

results = []
for resume_name, resume_body in resumes.items():
    resume_embedding = model.encode(resume_body, convert_to_tensor=True)
    similarity = util.cos_sim(jd_embedding, resume_embedding).item()
    match_pct = round(similarity * 100, 2)  # cosine similarity as a percentage
    results.append((resume_name, match_pct))

# Best match first.
results.sort(key=lambda entry: entry[1], reverse=True)

for resume_name, match_pct in results:
    print(f"✅ {resume_name} → {match_pct}% match")

✅ senior-python-developer2 - Template 18.pdf → 78.33% match
✅ entry-level-software-engineer2 - Template 17.pdf → 61.14% match
✅ Full Stack Web Developer - Template 10.pdf → 53.08% match
✅ pl-sql-developer2  - Template 16 .pdf → 52.99% match
