In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Reading the resume and job description
with open("data/sreehari_resume.txt", "r", encoding="utf-8") as f:
    resume = f.read()

with open("data/data_scientist_jd.txt", "r", encoding="utf-8") as f:
    job_description = f.read()

#  Creating TF-IDF vectors
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform([resume, job_description])


similarity_matrix = cosine_similarity(vectors[0:1], vectors[1:2])
match_score = similarity_matrix[0][0] * 100  

# Getting the result
print(f"Resume Match Score: {match_score:.2f}%")


Resume Match Score: 30.81%


## Multiple resumes vs one jd (real time example) ##

In [1]:
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

data_folder = "data"

In [3]:
with open(os.path.join(data_folder, "data_scientist_jd.txt"), "r", encoding="utf-8") as f:
    jd_text = f.read()

resume_scores = []
for filename in os.listdir(data_folder):
    if filename.endswith(".txt") and filename != "data_scientist_jd.txt":
        filepath = os.path.join(data_folder, filename)
        with open(filepath, "r", encoding="utf-8") as f:
            resume_text = f.read()

        vectorizer = TfidfVectorizer()
        vectors = vectorizer.fit_transform([resume_text, jd_text])
        similarity = cosine_similarity(vectors[0:1], vectors[1:2])[0][0] * 100

        resume_scores.append((filename, round(similarity, 2)))

In [4]:
sorted_scores = sorted(resume_scores, key=lambda x: x[1], reverse=True)
for file, score in sorted_scores:
    print(f"{file} : {score}% match")

resume_3.txt : 45.32% match
resume_1.txt : 44.69% match
resume_2.txt : 41.93% match
sreehari_resume.txt : 14.58% match


In [None]:
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Folder containing your data
data_folder = "data"

# Step 1: Read the Job Description
with open(os.path.join(data_folder, "data_scientist_jd.txt"), "r", encoding="utf-8") as f:
    jd_text = f.read()

# Step 2: Load all resume files
resume_scores = []
for filename in os.listdir(data_folder):
    if filename.endswith(".txt") and filename != "data_scientist_jd.txt":
        filepath = os.path.join(data_folder, filename)
        with open(filepath, "r", encoding="utf-8") as f:
            resume_text = f.read()
        
        # Step 3: Vectorize JD and Resume
        vectorizer = TfidfVectorizer()
        vectors = vectorizer.fit_transform([resume_text, jd_text])
        similarity = cosine_similarity(vectors[0:1], vectors[1:2])[0][0] * 100
        
        # Store the results
        resume_scores.append((filename, round(similarity, 2)))

# Step 4: Sort and Display
sorted_scores = sorted(resume_scores, key=lambda x: x[1], reverse=True)
for file, score in sorted_scores:
    print(f"{file}: {score}% match")


In [5]:
import csv
with open("resume_match_scores.csv", "w", newline = "") as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(["Resume File", "Match Score (%)"])
    for file, score in sorted_scores:
        writer.writerow([file, score])

In [6]:
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

# 1. Vectorize with simple word counts
cv = CountVectorizer(stop_words='english')
cv_matrix = cv.fit_transform([jd_text, resume_text])

# 2. Get feature names (words)
words = cv.get_feature_names_out()

# 3. Convert to array to count frequency
counts = cv_matrix.toarray()

# 4. Find words in JD (row 0) but not in Resume (row 1)
jd_word_counts = counts[0]
resume_word_counts = counts[1]

missing_keywords = []
for idx, (jd_count, resume_count) in enumerate(zip(jd_word_counts, resume_word_counts)):
    if jd_count > 0 and resume_count == 0:
        missing_keywords.append(words[idx])

# 5. Show top 10 missing keywords
print("\nTop Keywords Missing from Resume:")
print(missing_keywords[:10])



Top Keywords Missing from Resume:
['build', 'business', 'cases', 'checks', 'chennai', 'client', 'collaborate', 'conduct', 'contribute', 'datasets']
