In [1]:
import pandas as pd
import os
import fitz  # PyMuPDF
import pandas as pd
import re
from collections import defaultdict
import spacy
# Never truncate long cell values when pandas displays a DataFrame or Series.
pd.set_option('display.max_colwidth', None)
# Load the small English spaCy pipeline once; tokenize() reads this module-level `nlp`.
nlp = spacy.load("en_core_web_sm")

In [2]:
def show_common_tokens(resume_name, jd_id, resume_tokens, jd_data):
    """Print the tokens one resume shares with one job description.

    Args:
        resume_name: Key into ``resume_tokens`` (the resume's file name).
        jd_id: Key into ``jd_data``.
        resume_tokens: Mapping of resume name -> set of tokens.
        jd_data: Mapping of JD id -> dict with at least the keys
            ``'tokens'`` (a set) and ``'job_title'``.

    Returns:
        None. Everything is written to stdout. If either key is unknown a
        "not found" message is printed and nothing else happens.
    """
    # Guard clauses: report whichever lookup key is missing and stop.
    if resume_name not in resume_tokens:
        print(f"Resume '{resume_name}' not found.")
        return
    if jd_id not in jd_data:
        print(f"JD ID '{jd_id}' not found.")
        return

    resume_set = resume_tokens[resume_name]
    posting = jd_data[jd_id]
    posting_set = posting['tokens']
    shared = resume_set.intersection(posting_set)

    print(f"\n=== Common Tokens between Resume '{resume_name}' and JD ID {jd_id} ===")
    print(f"=== Resume token length: {len(resume_set)} and JD token length: {len(posting_set)} ===")
    print(f"Job Title: {posting['job_title']}")
    print(f"Jaccard Score: {jaccard_similarity(resume_set, posting_set):.3f}")
    print("Common Tokens (sorted):")
    # One bullet per shared token, in alphabetical order.
    for shared_token in sorted(shared):
        print(f"- {shared_token}")

In [3]:
def print_full_jd(jd_data, jd_id):
    """Print the title and full description stored for one job posting.

    Args:
        jd_data: Mapping of JD id -> dict with the keys ``'job_title'``
            and ``'full_desc'``.
        jd_id: Key of the posting to print.

    Returns:
        None. Output goes to stdout; an unknown id prints a "not found" line.
    """
    if jd_id in jd_data:
        posting = jd_data[jd_id]
        print(f"\n=== JD ID {jd_id} ===")
        print(f"Job Title: {posting['job_title']}")
        print(f"Description:\n{posting['full_desc']}")
    else:
        print(f"JD ID {jd_id} not found.")

In [4]:
def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page in a PDF.

    The file is opened with ``fitz`` (PyMuPDF). This is best-effort: any
    exception raised while opening or reading is printed rather than
    propagated, and whatever text was collected before the failure is
    returned (an empty string if nothing was read).

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        str: Page texts joined in page order, with no separator added.
    """
    page_texts = []
    try:
        with fitz.open(pdf_path) as pdf:
            for pg in pdf:
                page_texts.append(pg.get_text())
    except Exception as err:
        # Deliberately non-fatal so one unreadable file does not stop a batch.
        print(f"Error reading {pdf_path}: {err}")
    return "".join(page_texts)

In [5]:
def tokenize(text):
    """Turn raw text into a set of lemmatized content-word tokens.

    The text is lower-cased and run through the module-level spaCy pipeline
    ``nlp``. A token is kept only if it is purely alphabetic and is not a
    stop word, not punctuation and not number-like. The kept tokens are
    reduced to their lemma.

    Args:
        text: The string to tokenize.

    Returns:
        set[str]: Unique lemmas of the surviving tokens.
    """
    def _is_content_word(tok):
        # Reject stop words, punctuation and number-like tokens first.
        if tok.is_stop or tok.is_punct or tok.like_num:
            return False
        return tok.is_alpha

    # `lemma_` is spaCy's lemmatized form of the token.
    return {tok.lemma_ for tok in nlp(text.lower()) if _is_content_word(tok)}

In [6]:
def jaccard_similarity(set1, set2):
    """Compute the Jaccard similarity |A ∩ B| / |A ∪ B| of two collections.

    Args:
        set1: A set, frozenset, or any iterable of hashable items.
        set2: A set, frozenset, or any iterable of hashable items.

    Returns:
        float: A value in [0.0, 1.0]. When both inputs are empty the union
        is empty and 0.0 is returned instead of dividing by zero.
    """
    # Accept lists/tuples too, but do not copy inputs that are already sets.
    first = set1 if isinstance(set1, (set, frozenset)) else set(set1)
    second = set2 if isinstance(set2, (set, frozenset)) else set(set2)

    shared_count = len(first & second)
    # |A ∪ B| = |A| + |B| - |A ∩ B|, so the union set need not be built.
    union_count = len(first) + len(second) - shared_count
    if union_count == 0:
        return 0.0
    return shared_count / union_count

In [7]:
def process_resumes(folder_path):
    """Tokenize every PDF resume found directly inside a folder.

    Args:
        folder_path: Directory to scan (not recursive).

    Returns:
        dict[str, set[str]]: Maps each PDF file name to its token set, in
        sorted file-name order. A PDF whose text could not be extracted
        still gets an entry, holding whatever ``tokenize`` returns for the
        text that was recovered.

    Raises:
        OSError: If ``folder_path`` cannot be listed.
    """
    resume_data = {}
    # os.listdir returns entries in arbitrary order; sort so results are
    # reproducible across runs and machines.
    for fname in sorted(os.listdir(folder_path)):
        # Compare the extension case-insensitively so "CV.PDF" is not skipped.
        if not fname.lower().endswith('.pdf'):
            continue
        full_path = os.path.join(folder_path, fname)
        resume_data[fname] = tokenize(extract_text_from_pdf(full_path))
    return resume_data

In [8]:
def process_job_descriptions(csv_path):
    """Load job postings from a CSV file and tokenize each description.

    Args:
        csv_path: Path of a CSV file with a ``'Job Description'`` column
            and, optionally, a ``'Job Title'`` column.

    Returns:
        dict: Maps each DataFrame row index to a dict with the keys
        ``'job_title'`` (str, ``''`` when absent or missing), ``'tokens'``
        (result of ``tokenize`` on the description) and ``'full_desc'``
        (the stripped description text).

    Raises:
        KeyError: If the CSV has rows but no ``'Job Description'`` column.
    """
    df = pd.read_csv(csv_path)
    jd_data = {}
    for idx, row in df.iterrows():
        raw_desc = row['Job Description']
        # An empty CSV cell is read as NaN; str(NaN) would give the literal
        # text "nan" and add a bogus "nan" token, so treat it as empty text.
        jd_text = '' if pd.isna(raw_desc) else str(raw_desc)

        raw_title = row.get('Job Title', '')
        # Keep the title a string so code that slices or formats it does
        # not fail on a float NaN.
        job_title = '' if pd.isna(raw_title) else str(raw_title)

        jd_data[idx] = {
            'job_title': job_title,
            'tokens': tokenize(jd_text),
            'full_desc': jd_text.strip()
        }
    return jd_data

In [9]:
def match_resumes_to_jobs(resume_tokens, jd_data, top_k=10):
    """Rank job descriptions for every resume by Jaccard similarity.

    Args:
        resume_tokens: Mapping of resume name -> set of tokens.
        jd_data: Mapping of JD id -> dict with the keys ``'tokens'`` and
            ``'job_title'``.
        top_k: Number of best matches kept per resume (default 10).

    Returns:
        defaultdict(list): Maps each resume name to a list of
        ``(jd_id, score, job_title)`` tuples, highest score first. Postings
        with equal scores keep their ``jd_data`` iteration order.
    """
    results = defaultdict(list)
    for resume_name, r_tokens in resume_tokens.items():
        ranked = [
            (jd_id, jaccard_similarity(r_tokens, info['tokens']), info['job_title'])
            for jd_id, info in jd_data.items()
        ]
        # Stable descending sort on the score element.
        ranked.sort(key=lambda entry: entry[1], reverse=True)
        results[resume_name] = ranked[:top_k]
    return results

In [10]:
# Resume category sub-folders to process; each is joined onto base_resume_dir.
categories = ['INFORMATION-TECHNOLOGY']
# Root directory containing one sub-folder of PDF resumes per category.
base_resume_dir = './resume_pds/data/data'
# CSV of job postings; process_job_descriptions reads its 'Job Description'
# and 'Job Title' columns.
jd_csv_path = 'job_title_des.csv'

In [11]:
# Tokenize every job description once; keys are the CSV row indices.
jd_data = process_job_descriptions(jd_csv_path)

In [12]:
# Collect token sets for the resumes of every configured category into one
# dict keyed by file name. A file name repeated across categories would be
# overwritten by the later category, because dict.update replaces existing keys.
resume_tokens = {}
for category in categories:
    folder_path = os.path.join(base_resume_dir, category)
    resume_tokens.update(process_resumes(folder_path))

In [13]:
# Score every resume against every job description and keep the default top 10 per resume.
results = match_resumes_to_jobs(resume_tokens, jd_data)

In [14]:
# Inspect the full text of one posting (row index 1088 of the job CSV).
print_full_jd(jd_data, jd_id=1088)


=== JD ID 1088 ===
Job Title: PHP Developer
Description:
1. Minimum 3+ years of solid development experience in developing web applications with Core PHP, MySQL
HTML,HTML5, CSS3, Jquery/Ajax and JavaScript.
3. Good Experience on web technologies including HTML, Javascript, JQuery, AJAX.
4. Strong knowledge client-side scripting and JavaScript libraries, jQuery etc.
5. Strong knowledge on Database/Web application Design (Mysql, MSSQL).
6. Good understanding of Web and Social Media applications/ Cross Browser / Mobile Best Practices.
7. Good understanding of asynchronous request handling, partial page updates, and AJAX.
8. Knowledge on Flash, Flex, actionscript will be a plus.
9. Excellent communication, self-motivator, team player, ability to solve complex problems, design &
requirements documentation.
10. Candidate should have good knowledge of the OOPS concept.
Job Type: Full-time
Salary: ₹30,000.00 - ₹45,000.00 per month
Schedule:
Day shift
Experience:
software development: 3 years 

In [15]:
# Scratch state for the best-match search that is commented out below; these
# four names are assigned here but not read by any active code in this cell.
ct = 0
sc = -1
nm = ''
jd = 0
# Print the ranked matches for a single resume, skipping every other entry.
for resume, matches in results.items():
    if not resume == '20674668.pdf':
        continue
    print(f"\nResume: {resume}")
    for jd_id, score, title in matches:
        # NOTE(review): "..." is appended even when the title is shorter than
        # 40 characters, and title[:40] requires the title to be a string.
        print(f"  JD ID: {jd_id} | Title: {title[:40]}... | Score: {score:.3f}")
        # Disabled: track the best-scoring (resume, JD) pair seen so far.
        # if(score > sc):
        #     sc = score
        #     nm = resume
        #     jd = jd_id

    # Disabled: stop after roughly 20 resumes.
    # ct+=1
    # if(ct > 20):
    #     break

# Disabled: report the best pair. NOTE(review): this prints jd_id (the last
# loop value), not jd (the tracked best id) — confirm which was intended.
# print(nm, sc, jd_id)


Resume: 20674668.pdf
  JD ID: 2244 | Title: Backend Developer... | Score: 0.230
  JD ID: 374 | Title: Java Developer... | Score: 0.213
  JD ID: 302 | Title: Full Stack Developer... | Score: 0.213
  JD ID: 234 | Title: Java Developer... | Score: 0.203
  JD ID: 1749 | Title: Software Engineer... | Score: 0.201
  JD ID: 431 | Title: DevOps Engineer... | Score: 0.200
  JD ID: 736 | Title: Full Stack Developer... | Score: 0.200
  JD ID: 1822 | Title: Software Engineer... | Score: 0.199
  JD ID: 625 | Title: Database Administrator... | Score: 0.199
  JD ID: 1991 | Title: Software Engineer... | Score: 0.198


In [16]:
# Requires resume_tokens and jd_data from the cells above. Lists the tokens
# that this resume shares with JD 1088, along with their Jaccard score.
show_common_tokens("20674668.pdf", 1088, resume_tokens, jd_data)


=== Common Tokens between Resume '20674668.pdf' and JD ID 1088 ===
=== Resume token length: 286 and JD token length: 81 ===
Job Title: PHP Developer
Jaccard Score: 0.116
Common Tokens (sorted):
- ability
- ajax
- application
- client
- complex
- core
- css
- database
- design
- develop
- development
- documentation
- excellent
- experience
- high
- html
- include
- javascript
- jquery
- knowledge
- language
- need
- php
- practice
- problem
- programming
- request
- requirement
- scripting
- software
- solve
- strong
- team
- technology
- update
- web
- work
- year
