In [1]:
import os
import fitz
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from collections import defaultdict
import spacy
from sentence_transformers import SentenceTransformer, util
import torch

# Notebook-wide configuration.
# A max_colwidth of None removes pandas' limit on how much of each cell is displayed.
pd.options.display.max_colwidth = None
# Shared spaCy pipeline; tokenize() reads this module-level object.
nlp = spacy.load("en_core_web_sm")

def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page of the PDF at ``pdf_path``.

    Reading is best-effort: any exception raised while opening the document
    or reading a page is printed instead of propagated, and whatever text was
    collected before the failure (possibly the empty string) is returned.
    """
    page_texts = []
    try:
        with fitz.open(pdf_path) as pdf:
            for pdf_page in pdf:
                page_texts.append(pdf_page.get_text())
    except Exception as err:
        print(f"Error reading {pdf_path}: {err}")
    # Joined outside the try block so pages read before a failure are kept.
    return "".join(page_texts)

def tokenize(text):
    """Return the set of lemmas for the content words in ``text``.

    The text is lower-cased and run through the module-level ``nlp``
    pipeline. A token contributes its lemma only when it is alphabetic and is
    not flagged as a stop word, punctuation, or number-like.
    """
    parsed = nlp(text.lower())
    return {
        tok.lemma_
        for tok in parsed
        if tok.is_alpha and not (tok.is_stop or tok.is_punct or tok.like_num)
    }

def jaccard_similarity(set1, set2):
    """Return the Jaccard index of two sets: shared items over combined items.

    When both sets are empty the union is empty and 0 is returned, which
    avoids dividing by zero.
    """
    combined = set1.union(set2)
    if not combined:
        return 0
    shared = set1.intersection(set2)
    return len(shared) / len(combined)

def process_resumes(folder_path):
    """Tokenize every PDF found directly inside ``folder_path``.

    Args:
        folder_path: Directory to scan (not recursive).

    Returns:
        dict mapping each PDF's file name (not its full path) to the token
        set produced by ``tokenize`` from the text that
        ``extract_text_from_pdf`` returned for it. Entries are inserted in
        sorted file-name order.

    Raises:
        OSError: if ``folder_path`` cannot be listed.
    """
    resume_data = {}
    # os.listdir() returns entries in arbitrary order; sort for reproducible runs.
    for fname in sorted(os.listdir(folder_path)):
        # Match the extension case-insensitively so "CV.PDF" is not silently skipped.
        if not fname.lower().endswith('.pdf'):
            continue
        full_path = os.path.join(folder_path, fname)
        resume_data[fname] = tokenize(extract_text_from_pdf(full_path))
    return resume_data

def read_resumes_raw(base_dir, categories):
    """Extract the raw text of every PDF under the given category folders.

    Args:
        base_dir: Directory that contains one sub-folder per category.
        categories: Iterable of sub-folder names to scan, in order.

    Returns:
        ``(names, texts)``: two parallel lists holding each PDF's file name
        and the text ``extract_text_from_pdf`` returned for it. Within a
        category, files appear in sorted file-name order.

    Raises:
        OSError: if a category folder cannot be listed.

    Note:
        Only the bare file name is recorded, so two categories containing a
        file with the same name produce duplicate entries in ``names``.
    """
    names = []
    texts = []
    for category in categories:
        folder = os.path.join(base_dir, category)
        # os.listdir() returns entries in arbitrary order; sort for reproducible runs.
        for fname in sorted(os.listdir(folder)):
            # Match the extension case-insensitively so "CV.PDF" is not silently skipped.
            if not fname.lower().endswith(".pdf"):
                continue
            # Extract first so the two lists can never get out of step.
            text = extract_text_from_pdf(os.path.join(folder, fname))
            names.append(fname)
            texts.append(text)
    return names, texts

def process_job_descriptions(csv_path):
    """Load job descriptions from a CSV file and tokenize each one.

    Args:
        csv_path: Path to a CSV file with a ``Job Description`` column and,
            optionally, a ``Job Title`` column.

    Returns:
        dict keyed by the DataFrame index label of each row. Each value is a
        dict with:
          - ``'job_title'``: the row's title, or ``''`` when the column or
            the cell is missing;
          - ``'tokens'``: the result of ``tokenize`` on the description;
          - ``'full_desc'``: the description with surrounding whitespace
            stripped.

    Raises:
        KeyError: if the CSV has no ``Job Description`` column.
    """
    df = pd.read_csv(csv_path)
    jd_data = {}
    for idx, row in df.iterrows():
        raw_desc = row['Job Description']
        # An empty cell is read as NaN, and str(NaN) is the literal "nan",
        # which would otherwise be tokenized as if it were a real word.
        jd_text = '' if pd.isna(raw_desc) else str(raw_desc)
        title = row.get('Job Title', '')
        jd_data[idx] = {
            # Keep titles usable as text (e.g. for slicing) even when the cell is empty.
            'job_title': '' if pd.isna(title) else title,
            'tokens': tokenize(jd_text),
            'full_desc': jd_text.strip()
        }
    return jd_data

def read_job_descriptions(csv_path):
    """Load job titles and descriptions from a CSV file as plain strings.

    Args:
        csv_path: Path to a CSV file with ``Job Title`` and
            ``Job Description`` columns.

    Returns:
        ``(jd_titles, jd_texts)``: two parallel lists of ``str`` in row
        order. Missing cells become ``''``.

    Raises:
        KeyError: if either column is absent.
    """
    df = pd.read_csv(csv_path)

    def _as_text(column):
        # An empty cell is read as NaN; a plain str() conversion would turn
        # it into the literal text "nan" and feed that to the matchers.
        return ['' if pd.isna(value) else str(value) for value in column]

    jd_texts = _as_text(df['Job Description'])
    jd_titles = _as_text(df['Job Title'])
    return jd_titles, jd_texts

def match_resumes_to_jobs(resume_tokens, jd_data, top_k=10):
    """Rank job descriptions for each resume by Jaccard similarity of token sets.

    Args:
        resume_tokens: dict mapping resume name to its token set.
        jd_data: dict mapping job id to a dict with ``'tokens'`` and
            ``'job_title'`` entries.
        top_k: maximum number of matches kept per resume.

    Returns:
        ``defaultdict(list)`` mapping resume name to a list of
        ``(jd_id, score, job_title)`` tuples, highest score first. Jobs with
        equal scores keep their ``jd_data`` iteration order.
    """
    ranked_by_resume = defaultdict(list)
    for name, tokens in resume_tokens.items():
        scored = [
            (jd_id, jaccard_similarity(tokens, info['tokens']), info['job_title'])
            for jd_id, info in jd_data.items()
        ]
        # A stable descending sort leaves tied jobs in their original order.
        scored.sort(key=lambda entry: entry[1], reverse=True)
        ranked_by_resume[name] = scored[:top_k]
    return ranked_by_resume

def match_resumes_tfidf_cosine(resume_names, resume_texts, jd_titles, jd_texts, top_k=10):
    """Rank job descriptions for each resume by TF-IDF cosine similarity.

    A single ``TfidfVectorizer`` (English stop words) is fitted on the
    resumes and job descriptions together, so both share one vocabulary.

    Args:
        resume_names: resume identifiers, parallel to ``resume_texts``.
        resume_texts: list of raw resume texts.
        jd_titles: job titles, parallel to ``jd_texts``.
        jd_texts: list of raw job-description texts.
        top_k: maximum number of matches kept per resume.

    Returns:
        ``(results, vectorizer, jd_texts)`` where ``results`` maps resume
        name to a list of ``(jd_index, score, job_title)`` tuples, highest
        score first; ``vectorizer`` is the fitted vectorizer; and
        ``jd_texts`` is the argument passed in, returned unchanged.
    """
    n_resumes = len(resume_texts)
    vectorizer = TfidfVectorizer(stop_words='english')
    tfidf_matrix = vectorizer.fit_transform(resume_texts + jd_texts)

    # Rows [0, n_resumes) are resumes; the remaining rows are job descriptions.
    sim_matrix = cosine_similarity(tfidf_matrix[:n_resumes], tfidf_matrix[n_resumes:])

    results = {}
    for row, resume_name in enumerate(resume_names):
        ranked = sorted(enumerate(sim_matrix[row]), key=lambda pair: pair[1], reverse=True)
        results[resume_name] = [
            (jd_idx, score, jd_titles[jd_idx]) for jd_idx, score in ranked[:top_k]
        ]
    return results, vectorizer, jd_texts

def match_resumes_sbert(resume_names, resume_texts, jd_titles, jd_texts, top_k=10):
    """Rank job descriptions for each resume by sentence-embedding cosine similarity.

    Both text collections are encoded with the ``all-MiniLM-L6-v2``
    SentenceTransformer model, and each resume embedding is compared against
    every job-description embedding.

    Args:
        resume_names: resume identifiers, parallel to ``resume_texts``.
        resume_texts: list of raw resume texts.
        jd_titles: job titles, parallel to ``jd_texts``.
        jd_texts: list of raw job-description texts.
        top_k: maximum number of matches kept per resume. Values larger than
            ``len(jd_texts)`` are clamped instead of raising.

    Returns:
        ``(results, model)`` where ``results`` maps resume name to a list of
        ``(jd_index, score, job_title)`` tuples (``int``, ``float``, title),
        highest score first, and ``model`` is the loaded SentenceTransformer.
        When either text list is empty every resume maps to ``[]``.
    """
    model = SentenceTransformer('all-MiniLM-L6-v2')

    # Nothing to rank: skip encoding so empty batches never reach the model.
    if len(resume_texts) == 0 or len(jd_texts) == 0:
        return {resume_name: [] for resume_name in resume_names}, model

    resume_embeddings = model.encode(resume_texts, convert_to_tensor=True, show_progress_bar=True)
    jd_embeddings = model.encode(jd_texts, convert_to_tensor=True, show_progress_bar=True)

    # torch.topk raises when k exceeds the number of candidates, so clamp it.
    k = min(top_k, len(jd_texts))

    results = {}
    for i, resume_name in enumerate(resume_names):
        # cos_sim returns a 2-D matrix; row 0 holds this resume's score against every JD.
        cosine_scores = util.cos_sim(resume_embeddings[i], jd_embeddings)[0]
        top = torch.topk(cosine_scores, k=k)
        results[resume_name] = [
            (idx, score, jd_titles[idx])
            for idx, score in zip(top.indices.tolist(), top.values.tolist())
        ]
    return results, model

def compare_topk_overlap(resume_name, jaccard_results, tfidf_results):
    """Print how the Jaccard and TF-IDF top matches for one resume overlap.

    Args:
        resume_name: key to look up in both result mappings; a missing key
            is treated as having no matches.
        jaccard_results: mapping of resume name to ``(jd_id, score, title)``
            tuples from the Jaccard matcher.
        tfidf_results: same structure, from the TF-IDF matcher.

    Prints the sorted job ids chosen by each method, the ids they share, and
    the ids unique to each. Returns ``None``.
    """
    def _ids(results):
        return {jd_id for jd_id, _, _ in results.get(resume_name, [])}

    from_jaccard = _ids(jaccard_results)
    from_tfidf = _ids(tfidf_results)
    shared = from_jaccard & from_tfidf

    print(f"\nResume: {resume_name}")
    sections = (
        ("Jaccard Top-K IDs", from_jaccard),
        ("TF-IDF Top-K IDs", from_tfidf),
        (f"Overlap ({len(shared)})", shared),
        ("Jaccard-Only", from_jaccard - from_tfidf),
        ("TF-IDF-Only", from_tfidf - from_jaccard),
    )
    for label, ids in sections:
        print(f"{label}: {sorted(ids)}")

def plot_score_distribution(jaccard_results, tfidf_results):
    """Draw overlaid histograms of the Jaccard and TF-IDF match scores.

    Args:
        jaccard_results: mapping of resume name to ``(jd_id, score, title)``
            tuples from the Jaccard matcher.
        tfidf_results: same structure, from the TF-IDF matcher.

    Scores from all resumes are pooled per method, and both histograms
    (20 bins each) are drawn on one new 10x5 figure, which is then shown.
    Returns ``None``.
    """
    def _flatten(results):
        return [score for matches in results.values() for _, score, _ in matches]

    # (scores, legend label, bar colour) for each method, in drawing order.
    series = (
        (_flatten(jaccard_results), 'Jaccard', 'skyblue'),
        (_flatten(tfidf_results), 'TF-IDF', 'salmon'),
    )

    plt.figure(figsize=(10, 5))
    for scores, label, color in series:
        plt.hist(scores, bins=20, alpha=0.6, label=label, color=color)
    plt.title("Similarity Score Distribution: Jaccard vs TF-IDF")
    plt.xlabel("Similarity Score")
    plt.ylabel("Frequency")
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.show()


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Inputs: resume PDFs live in <base_resume_dir>/<category>/; job postings come from one CSV.
categories = ['INFORMATION-TECHNOLOGY']
base_resume_dir = './resume_pds/data/data'
jd_csv_path = 'job_title_des.csv'

# Extract each resume PDF once and reuse the raw text for every matcher.
# Tokenizing these texts yields the same name -> token-set mapping that
# process_resumes() builds, without opening every PDF a second time.
resume_names, resume_texts = read_resumes_raw(base_resume_dir, categories)
resume_tokens = {name: tokenize(text) for name, text in zip(resume_names, resume_texts)}

jd_titles, jd_texts = read_job_descriptions(jd_csv_path)
jd_data = process_job_descriptions(jd_csv_path)

# Run the three matchers; each maps resume name -> [(jd_id, score, title), ...].
results_jaccard = match_resumes_to_jobs(resume_tokens, jd_data)
results_tfidf, vectorizer, all_jd_texts = match_resumes_tfidf_cosine(resume_names, resume_texts, jd_titles, jd_texts)
results_sbert, sbert_model = match_resumes_sbert(resume_names, resume_texts, jd_titles, jd_texts)

# Print top matches using SBERT for a specific resume
target_resume = "20674668.pdf"
print(f"\nTop SBERT matches for {target_resume}:")
target_matches = results_sbert.get(target_resume, [])
if not target_matches:
    # Make a mistyped or missing file name visible instead of printing an empty list.
    print(f"  (no SBERT results for {target_resume}; check the file name)")
for jd_id, score, title in target_matches:
    print(f"  JD ID: {jd_id} | Title: {title[:40]}... | SBERT Score: {score:.3f}")

Batches: 100%|█████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:01<00:00,  3.62it/s]
Batches: 100%|███████████████████████████████████████████████████████████████████████████████████| 72/72 [00:08<00:00,  8.94it/s]



Top SBERT matches for 20674668.pdf:
  JD ID: 1088 | Title: PHP Developer... | SBERT Score: 0.728
  JD ID: 754 | Title: Node js developer... | SBERT Score: 0.728
  JD ID: 611 | Title: Node js developer... | SBERT Score: 0.713
  JD ID: 1673 | Title: Wordpress Developer... | SBERT Score: 0.703
  JD ID: 1214 | Title: PHP Developer... | SBERT Score: 0.702
  JD ID: 265 | Title: Node js developer... | SBERT Score: 0.700
  JD ID: 940 | Title: JavaScript Developer... | SBERT Score: 0.699
  JD ID: 97 | Title: Wordpress Developer... | SBERT Score: 0.697
  JD ID: 2213 | Title: Node js developer... | SBERT Score: 0.695
  JD ID: 2154 | Title: JavaScript Developer... | SBERT Score: 0.694
