In [None]:
#  3. Resume Matching & Scoring

#  1. Load Resume Texts & Job Description
# - Load raw resume text
# - Load a job description string

#  2. Preprocessing
# - Lowercase the text; remove punctuation and stop words
# - Lemmatize using spaCy

#  3. TF-IDF Vectorization
# - Use scikit-learn’s TfidfVectorizer
# - Compute vectors for all resumes + job description

#  4. Cosine Similarity
# - Compute similarity between job description and each resume
# - Score ranges from 0 (no match) to 1 (perfect match)

#  5. Display Results
# - Table: resume name and similarity score (top skills: TODO, not produced by the results cell shown here)


In [11]:
from pathlib import Path

# Folder where extracted resume text files are stored
resume_folder = Path("../data/extracted_texts")

# Map each file stem (e.g. "resume1_pdf") to the raw text of that resume.
# - sorted(): glob order is not guaranteed, so sort the paths to keep the
#   document order (and the row index of the results table) stable across runs.
# - encoding="utf-8": read_text() without an encoding falls back to the
#   platform locale, which makes the notebook behave differently per machine.
# - errors="replace": a stray undecodable byte becomes U+FFFD instead of
#   aborting the whole run.
# NOTE(review): assumes the extraction step wrote these files as UTF-8 — confirm.
resume_texts = {
    path.stem: path.read_text(encoding="utf-8", errors="replace")
    for path in sorted(resume_folder.glob("*.txt"))
}

# Example job description used as the matching target.
# Written as adjacent string literals (one per line of text) so that the
# leading newline and the trailing spaces of the original text are explicit.
job_description = (
    "\n"
    "Looking for a Data Scientist with strong skills in Python, SQL, and machine learning. \n"
    "Must be able to work with tools like Pandas, Scikit-learn, Power BI, and Streamlit. \n"
    "Experience with data analysis, model building, and data visualization is required. \n"
    "Good communication and problem-solving skills are a plus.\n"
)



In [12]:
import spacy
import string

# Load the small English spaCy pipeline once; preprocess() reuses it for every document.
nlp = spacy.load("en_core_web_sm")


def preprocess(text: str) -> str:
    """Normalize raw text into a space-separated string of lemmas.

    The text is lowercased and run through the spaCy pipeline; stop words,
    punctuation tokens and whitespace tokens (newlines, runs of spaces) are
    discarded, and the lemma of every remaining token is kept.

    Args:
        text: Raw resume or job-description text.

    Returns:
        The kept lemmas joined by single spaces (empty string if none remain).
    """
    doc = nlp(text.lower())
    lemmas = []
    for token in doc:
        # Whitespace tokens are neither stop words nor punctuation, so they
        # must be filtered explicitly or newlines end up in the output.
        if token.is_stop or token.is_punct or token.is_space:
            continue
        lemma = token.lemma_.strip()
        # Drop empty lemmas and lemmas made up solely of punctuation characters.
        # (A plain `lemma in string.punctuation` is a substring test and would
        # miss sequences such as "--".)
        if not lemma or all(ch in string.punctuation for ch in lemma):
            continue
        lemmas.append(lemma)
    return " ".join(lemmas)

# Run the job description and every resume through the same preprocessing,
# keeping the resumes keyed by their original names (insertion order preserved).
processed_jd = preprocess(job_description)
processed_resumes = dict(
    zip(resume_texts.keys(), map(preprocess, resume_texts.values()))
)



In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fail early with a clear message: with no resumes, the similarity step would
# otherwise be handed an empty resume matrix and fail with a far less helpful error.
if not processed_resumes:
    raise ValueError(
        "No resumes to score: processed_resumes is empty "
        "(check that the extracted-text folder contains .txt files)."
    )

vectorizer = TfidfVectorizer()

# Fit one shared vocabulary over all resumes plus the job description.
# Order matters: resumes first (in processed_resumes order), job description LAST,
# because the similarity cell takes the last matrix row as the job description.
documents = [*processed_resumes.values(), processed_jd]
tfidf_matrix = vectorizer.fit_transform(documents)


In [14]:
from sklearn.metrics.pairwise import cosine_similarity

# The job description was appended after the resumes, so it occupies the final
# row of the TF-IDF matrix; every row before it is a resume.
resume_vectors, jd_vector = tfidf_matrix[:-1], tfidf_matrix[-1]

# One row per resume, a single column holding its similarity to the job description.
similarities = cosine_similarity(X=resume_vectors, Y=jd_vector)


In [16]:
import pandas as pd

# `similarities` has one row per resume and a single column (one job description),
# so iterating it directly yields 1-element arrays. Calling float() on those is
# deprecated since NumPy 1.25; flatten first so every score is a true scalar.
scores = [
    {"resume": name, "similarity": float(score)}
    for name, score in zip(processed_resumes, similarities.ravel())
]

# Passing `columns` keeps the frame well-formed (and sortable) even when there
# are no rows. Best match first.
df = pd.DataFrame(scores, columns=["resume", "similarity"]).sort_values(
    by="similarity", ascending=False
)
print(df)


         resume  similarity
1   resume1_pdf    0.276204
0  resume1_docx    0.270264


  {"resume": name, "similarity": float(score)}
