In [1]:
import pandas as pd
import os
import fitz  # PyMuPDF
import pandas as pd
import re
from collections import defaultdict
pd.set_option('display.max_colwidth', None)

In [2]:
def show_common_tokens(resume_name, jd_id, resume_tokens, jd_data):
    """Print the tokens that one resume and one job description share.

    Args:
        resume_name: Key into ``resume_tokens`` identifying the resume.
        jd_id: Key into ``jd_data`` identifying the job description.
        resume_tokens: Mapping of resume name -> set of tokens.
        jd_data: Mapping of JD id -> dict holding at least the ``'tokens'``
            and ``'job_title'`` entries.

    Returns:
        None. When either key is unknown a "not found" message is printed and
        nothing else happens; otherwise the token counts, job title, Jaccard
        score and the alphabetically sorted shared tokens are printed.
    """
    if resume_name not in resume_tokens:
        print(f"Resume '{resume_name}' not found.")
    elif jd_id not in jd_data:
        print(f"JD ID '{jd_id}' not found.")
    else:
        resume_set = resume_tokens[resume_name]
        jd_entry = jd_data[jd_id]
        jd_set = jd_entry['tokens']
        shared = resume_set.intersection(jd_set)

        print(f"\n=== Common Tokens between Resume '{resume_name}' and JD ID {jd_id} ===")
        print(f"=== Resume token length: {len(resume_set)} and JD token length: {len(jd_set)} ===")
        print(f"Job Title: {jd_entry['job_title']}")
        score = jaccard_similarity(resume_set, jd_set)
        print(f"Jaccard Score: {score:.3f}")
        print("Common Tokens (sorted):")
        for token in sorted(shared):
            print(f"- {token}")

In [3]:
def print_full_jd(jd_data, jd_id):
    """Print the title and full description of a single job posting.

    Args:
        jd_data: Mapping of JD id -> dict holding ``'job_title'`` and
            ``'full_desc'``.
        jd_id: Key of the posting to show.

    Returns:
        None. An unknown ``jd_id`` only produces a "not found" message.
    """
    if jd_id in jd_data:
        entry = jd_data[jd_id]
        print(f"\n=== JD ID {jd_id} ===")
        print(f"Job Title: {entry['job_title']}")
        print(f"Description:\n{entry['full_desc']}")
    else:
        print(f"JD ID {jd_id} not found.")

In [4]:
def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page in a PDF.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF (``fitz``).

    Returns:
        The page texts joined in page order. Any exception raised while
        opening or reading is reported via ``print`` and the text gathered
        up to that point (possibly ``""``) is returned instead of raising.
    """
    page_texts = []
    try:
        with fitz.open(pdf_path) as document:
            for page in document:
                page_texts.append(page.get_text())
    except Exception as err:
        # Best-effort: one unreadable file must not abort a whole batch.
        print(f"Error reading {pdf_path}: {err}")
    return "".join(page_texts)

In [5]:
def tokenize(text):
    """Split text into a set of unique lowercase word tokens.

    Args:
        text: The string to tokenize.

    Returns:
        A ``set`` of the distinct ``\\w+`` runs found in the lowercased text
        (letters, digits and underscores; punctuation acts as a separator).
    """
    lowered = text.lower()
    return {match.group() for match in re.finditer(r'\b\w+\b', lowered)}

In [6]:
def jaccard_similarity(set1, set2):
    """Compute the Jaccard index |A ∩ B| / |A ∪ B| of two sets.

    Args:
        set1: First set.
        set2: Second set.

    Returns:
        The ratio of shared elements to total distinct elements, or ``0``
        when both inputs are empty (avoids dividing by zero).
    """
    shared = set1.intersection(set2)
    combined = set1.union(set2)
    if not combined:
        return 0
    return len(shared) / len(combined)

In [7]:
def process_resumes(folder_path):
    """Tokenize every ``.pdf`` file found directly inside a folder.

    Args:
        folder_path: Directory to scan (not recursive).

    Returns:
        Dict mapping each file name ending in ``'.pdf'`` (case-sensitive) to
        the token set of that file's extracted text.
    """
    return {
        fname: tokenize(extract_text_from_pdf(os.path.join(folder_path, fname)))
        for fname in os.listdir(folder_path)
        if fname.endswith('.pdf')
    }

In [8]:
def process_job_descriptions(csv_path):
    """Load job postings from a CSV and tokenize each description.

    Args:
        csv_path: Path to a CSV with a ``'Job Description'`` column and,
            optionally, a ``'Job Title'`` column.

    Returns:
        Dict mapping each DataFrame index label to a dict with:
            ``'job_title'``: the row's title, or ``''`` when the column or
                the value is missing;
            ``'tokens'``: token set of the description;
            ``'full_desc'``: the description with surrounding whitespace
                stripped.

    Raises:
        KeyError: If the CSV has no ``'Job Description'`` column.
    """
    df = pd.read_csv(csv_path)
    jd_data = {}
    for idx, row in df.iterrows():
        # An empty CSV cell is read as NaN; str(NaN) would yield the bogus
        # description/token "nan", so treat missing descriptions as empty.
        raw_desc = row['Job Description']
        jd_text = '' if pd.isna(raw_desc) else str(raw_desc)

        # Likewise keep the title a string: a NaN float here would break
        # callers that slice or format the title.
        raw_title = row.get('Job Title', '')
        job_title = '' if pd.isna(raw_title) else raw_title

        jd_data[idx] = {
            'job_title': job_title,
            'tokens': tokenize(jd_text),
            'full_desc': jd_text.strip()
        }
    return jd_data

In [9]:
def match_resumes_to_jobs(resume_tokens, jd_data, top_k=10):
    """Rank job descriptions for every resume by Jaccard similarity.

    Args:
        resume_tokens: Mapping of resume name -> set of tokens.
        jd_data: Mapping of JD id -> dict holding ``'tokens'`` and
            ``'job_title'``.
        top_k: Maximum number of matches kept per resume.

    Returns:
        A ``defaultdict(list)`` mapping each resume name to its best
        ``(jd_id, score, job_title)`` tuples, highest score first. Equal
        scores keep the iteration order of ``jd_data``.
    """
    results = defaultdict(list)
    for resume_name, r_tokens in resume_tokens.items():
        ranked = [
            (jd_id, jaccard_similarity(r_tokens, jd_info['tokens']), jd_info['job_title'])
            for jd_id, jd_info in jd_data.items()
        ]
        # Stable descending sort, so ties stay in their original order.
        ranked.sort(key=lambda entry: entry[1], reverse=True)
        results[resume_name] = ranked[:top_k]
    return results

In [10]:
# Resume category sub-folders to process; each is joined onto base_resume_dir.
categories = ['INFORMATION-TECHNOLOGY']
# Root directory under which the per-category resume folders are looked up.
base_resume_dir = './resume_pds/data/data'
# CSV of job postings; process_job_descriptions reads its 'Job Title' and
# 'Job Description' columns.
jd_csv_path = 'job_title_des.csv'

In [11]:
# Parse the postings CSV into {row index: {'job_title', 'tokens', 'full_desc'}}.
jd_data = process_job_descriptions(jd_csv_path)

In [12]:
# Collect the token set of every PDF resume across the configured categories.
# Keys are bare file names, so a name that occurs in two categories would be
# overwritten by the later one.
resume_tokens = {}
for category in categories:
    folder_path = os.path.join(base_resume_dir, category)
    resume_tokens.update(process_resumes(folder_path))

In [13]:
# Score every resume against every posting and keep the best matches
# (top_k defaults to 10 per resume).
results = match_resumes_to_jobs(resume_tokens, jd_data)

In [14]:
# Inspect one posting in full; 2137 is a key of jd_data (the DataFrame index
# label assigned in process_job_descriptions).
print_full_jd(jd_data, jd_id=2137)


=== JD ID 2137 ===
Job Title: Network Administrator
Description:
The person will be responsible for providing technical support for hardware and software functionality as related to desktop technology, and act as a system administrators for the organization. He should be delivering quick response technical assistance with strong problem solving capabilities and attention to service. Identifying network issues and fixing them. He should keep up with constantly evolving technology through continuing education.
Experience : 3 to 5 years
Location : Pune
Responsibilities :
Upgrade, install and support all necessary hardware and software.
Help desk operations including phone support, emails or walk-ins.
Manage, track and follow-up on IT related calls through an incident reporting/tracking system.
Perform troubleshooting and problem resolution while communicating with staff until final resolution is achieved.
Install hardware (which can include PC’s laptops, printers, video conferences, etc.

In [15]:
# Print the ranked matches per resume. The counter is checked after the
# increment, so the loop stops once 41 resumes have been shown.
ct = 0
for resume, matches in results.items():
    print(f"\nResume: {resume}")
    for jd_id, score, title in matches:
        # title[:40] caps the title at 40 characters; the "..." is appended
        # even when nothing was cut off.
        print(f"  JD ID: {jd_id} | Title: {title[:40]}... | Score: {score:.3f}")

    ct+=1
    if(ct > 40):
        break


Resume: 18176523.pdf
  JD ID: 2146 | Title: Network Administrator... | Score: 0.181
  JD ID: 580 | Title: Network Administrator... | Score: 0.168
  JD ID: 530 | Title: Network Administrator... | Score: 0.161
  JD ID: 133 | Title: DevOps Engineer... | Score: 0.157
  JD ID: 399 | Title: Network Administrator... | Score: 0.152
  JD ID: 2228 | Title: Network Administrator... | Score: 0.150
  JD ID: 949 | Title: Software Engineer... | Score: 0.150
  JD ID: 653 | Title: DevOps Engineer... | Score: 0.150
  JD ID: 1850 | Title: Network Administrator... | Score: 0.149
  JD ID: 1655 | Title: Network Administrator... | Score: 0.148

Resume: 25857360.pdf
  JD ID: 2137 | Title: Network Administrator... | Score: 0.122
  JD ID: 1770 | Title: Machine Learning... | Score: 0.115
  JD ID: 1459 | Title: Database Administrator... | Score: 0.115
  JD ID: 1401 | Title: Java Developer... | Score: 0.113
  JD ID: 420 | Title: Database Administrator... | Score: 0.113
  JD ID: 1996 | Title: Network Administrator

In [16]:
# Explain one resume/JD score by listing the tokens the two share.
# Needs resume_tokens and jd_data from the cells above.
# NOTE(review): the recorded output is dominated by stop words ('a', 'and',
# 'the', ...), which suggests filtering them out before scoring.
show_common_tokens("39718499.pdf", 2137, resume_tokens, jd_data)


=== Common Tokens between Resume '39718499.pdf' and JD ID 2137 ===
=== Resume token length: 188 and JD token length: 178 ===
Job Title: Network Administrator
Jaccard Score: 0.140
Common Tokens (sorted):
- 00
- 20
- 3
- a
- all
- an
- and
- as
- assist
- bachelor
- calls
- communication
- customer
- education
- equipment
- etc
- exceptional
- experience
- for
- in
- including
- interpersonal
- issues
- maintain
- maintenance
- necessary
- of
- on
- phone
- reliable
- run
- s
- science
- service
- skills
- staff
- strong
- the
- through
- to
- up
- walk
- with
- work
- years
