In [100]:
import os
import fitz
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import math
pd.set_option('display.max_colwidth', None)

In [101]:
def print_full_jd(jd_titles, jd_texts, jd_id):
    if jd_id >= len(jd_titles):
        print(f"JD ID {jd_id} out of range.")
        return
    print(f"\n=== JD ID {jd_id} ===")
    print(f"Job Title: {jd_titles[jd_id]}")
    print(f"Description:\n{jd_texts[jd_id]}")

In [102]:
def extract_text_from_pdf(pdf_path):
    text = ""
    try:
        with fitz.open(pdf_path) as doc:
            for page in doc:
                text += page.get_text()
    except Exception as e:
        print(f"Error reading {pdf_path}: {e}")
    return text

In [103]:
def read_resumes(base_dir, categories):
    texts = []
    names = []
    for category in categories:
        folder = os.path.join(base_dir, category)
        for fname in os.listdir(folder):
            if fname.endswith(".pdf"):
                full_path = os.path.join(folder, fname)
                text = extract_text_from_pdf(full_path)
                texts.append(text)
                names.append(fname)
    return names, texts

In [104]:
def read_job_descriptions(csv_path):
    df = pd.read_csv(csv_path)
    jd_texts = df['Job Description'].astype(str).tolist()
    jd_titles = df['Job Title'].astype(str).tolist()
    return jd_titles, jd_texts

In [105]:
def match_resumes_tfidf_cosine(resume_names, resume_texts, jd_titles, jd_texts, top_k=10):
    all_docs = resume_texts + jd_texts
    vectorizer = TfidfVectorizer(stop_words='english')
    tfidf_matrix = vectorizer.fit_transform(all_docs)

    resume_vecs = tfidf_matrix[:len(resume_texts)]
    jd_vecs = tfidf_matrix[len(resume_texts):]

    sim_matrix = cosine_similarity(resume_vecs, jd_vecs)

    results = {}
    for i, resume_name in enumerate(resume_names):
        top_matches = sorted(
            list(enumerate(sim_matrix[i])), key=lambda x: -x[1]
        )[:top_k]
        results[resume_name] = [(idx, score, jd_titles[idx]) for idx, score in top_matches]
    return results, vectorizer, jd_texts

In [106]:
def show_top_tokens_for_match(resume_text, jd_text, vectorizer, top_n=20):
    tfidf = vectorizer.transform([resume_text, jd_text])
    feature_names = vectorizer.get_feature_names_out()
    
    r_vec = tfidf[0].toarray()[0]
    jd_vec = tfidf[1].toarray()[0]
    common_indices = (r_vec > 0) & (jd_vec > 0)
    importance = r_vec * jd_vec  # element-wise

    scored_tokens = [(feature_names[i], importance[i]) for i in range(len(importance)) if common_indices[i]]
    top_tokens = sorted(scored_tokens, key=lambda x: -x[1])[:top_n]

    print("\nTop Overlapping Tokens by TF-IDF Score:")
    for token, score in top_tokens:
        print(f"- {token}: {score:.4f}")

In [107]:
categories = ['INFORMATION-TECHNOLOGY']
base_resume_dir = './resume_pds/data/data'
jd_csv_path = 'job_title_des.csv'

In [108]:
resume_names, resume_texts = read_resumes(base_resume_dir, categories)
jd_titles, jd_texts = read_job_descriptions(jd_csv_path)

In [109]:
print(len(jd_titles), len(resume_names))

2277 120


In [110]:
results, vectorizer, all_jd_texts = match_resumes_tfidf_cosine(
    resume_names, resume_texts, jd_titles, jd_texts, top_k=5
)

In [111]:
results

{'18176523.pdf': [(580,
   np.float64(0.2593151513793338),
   'Network Administrator'),
  (1655, np.float64(0.23153351455868848), 'Network Administrator'),
  (399, np.float64(0.22670050939708306), 'Network Administrator'),
  (1459, np.float64(0.21557553551490205), 'Database Administrator'),
  (530, np.float64(0.20662352413453444), 'Network Administrator')],
 '25857360.pdf': [(420,
   np.float64(0.19220159704518303),
   'Database Administrator'),
  (2033, np.float64(0.16027720773733667), 'Database Administrator'),
  (534, np.float64(0.1404553674648251), 'Database Administrator'),
  (102, np.float64(0.13623401202449956), 'Node js developer'),
  (2169, np.float64(0.1269059853711599), 'Database Administrator')],
 '39718499.pdf': [(2172,
   np.float64(0.10647038931026832),
   'Database Administrator'),
  (847, np.float64(0.09471837140949621), 'Software Engineer'),
  (1245, np.float64(0.08254224213650725), 'Backend Developer'),
  (1409, np.float64(0.08189912088968837), 'Full Stack Developer'

In [112]:
ct = 0
for resume, matches in results.items():
    print(f"\nResume: {resume}")
    for jd_id, score, title in matches:
        print(f"  JD ID: {jd_id} | Title: {title[:40]}... | Cosine Score: {score:.3f}")

    ct += 1
    if ct > 1:
        break
            
        # Optional: View contributing tokens
        # show_top_tokens_for_match(resume_texts[resume_names.index(resume)], jd_texts[jd_id], vectorizer)


Resume: 18176523.pdf
  JD ID: 580 | Title: Network Administrator... | Cosine Score: 0.259
  JD ID: 1655 | Title: Network Administrator... | Cosine Score: 0.232
  JD ID: 399 | Title: Network Administrator... | Cosine Score: 0.227
  JD ID: 1459 | Title: Database Administrator... | Cosine Score: 0.216
  JD ID: 530 | Title: Network Administrator... | Cosine Score: 0.207

Resume: 25857360.pdf
  JD ID: 420 | Title: Database Administrator... | Cosine Score: 0.192
  JD ID: 2033 | Title: Database Administrator... | Cosine Score: 0.160
  JD ID: 534 | Title: Database Administrator... | Cosine Score: 0.140
  JD ID: 102 | Title: Node js developer... | Cosine Score: 0.136
  JD ID: 2169 | Title: Database Administrator... | Cosine Score: 0.127


In [113]:
# View full JD
# print_full_jd(jd_titles, jd_texts, jd_id=1027)

In [114]:
target_resume = '15651486.pdf'

In [115]:
# View contributing tokens
show_top_tokens_for_match(
    resume_texts[resume_names.index(target_resume)],
    jd_texts[903],
    vectorizer
)


Top Overlapping Tokens by TF-IDF Score:
- cisco: 0.0871
- windows: 0.0189
- network: 0.0180
- 2003: 0.0116
- hardware: 0.0077
- administration: 0.0070
- connectivity: 0.0069
- exchange: 0.0066
- management: 0.0066
- switches: 0.0063
- printers: 0.0055
- vpn: 0.0052
- server: 0.0051
- calls: 0.0048
- routers: 0.0043
- firewall: 0.0041
- lan: 0.0041
- setup: 0.0038
- symantec: 0.0032
- hp: 0.0028


In [116]:
res_df = pd.read_csv('gold_resumes.csv')
jd_df  = pd.read_csv('gold_jds.csv')

In [117]:
gold = {
    "R1": ["J1","J2","J3","J4"],
    "R2": ["J5","J6","J7","J8"],
    "R3": ["J9","J10","J11","J12"],
    "R4": ["J13","J14","J15","J16"],
    "R5": ["J17","J18","J19","J20"],
}

In [118]:
# Build resume names/texts from the gold CSV
resume_names = res_df['Resume_ID'].astype(str).tolist()
resume_texts = res_df['Resume_Text'].astype(str).tolist()

# Build JD titles/texts from the gold CSV
jd_titles = jd_df['Job_ID'].astype(str).tolist()
jd_texts  = jd_df['Job_Text'].astype(str).tolist()

# Now re-run your TFâ€‘IDF matcher on the CSV data
results, vectorizer, _ = match_resumes_tfidf_cosine(
    resume_names,
    resume_texts,
    jd_titles,
    jd_texts,
    top_k=4
)

In [119]:
results

{'R1': [(0, np.float64(0.35224496704842806), 'J1'),
  (2, np.float64(0.2070004260494528), 'J3'),
  (1, np.float64(0.19206210052692727), 'J2'),
  (4, np.float64(0.09375087652042687), 'J5')],
 'R2': [(19, np.float64(0.16360361889851152), 'J20'),
  (6, np.float64(0.14533547335639505), 'J7'),
  (4, np.float64(0.12114093092163422), 'J5'),
  (5, np.float64(0.11684457993776862), 'J6')],
 'R3': [(8, np.float64(0.40563918407431593), 'J9'),
  (15, np.float64(0.3781934027194118), 'J16'),
  (9, np.float64(0.32804339092387474), 'J10'),
  (10, np.float64(0.17833857579262372), 'J11')],
 'R4': [(12, np.float64(0.18632821283191955), 'J13'),
  (13, np.float64(0.14016774590035128), 'J14'),
  (14, np.float64(0.12349738676805093), 'J15'),
  (7, np.float64(0.08667120017807878), 'J8')],
 'R5': [(16, np.float64(0.2519410943729674), 'J17'),
  (17, np.float64(0.15326028993554586), 'J18'),
  (18, np.float64(0.06646908233668641), 'J19'),
  (4, np.float64(0.06568203249477786), 'J5')]}

In [120]:
def precision_at_k(recs, gold_set, k):
    return len(set(recs[:k]) & gold_set) / k

def recall_at_k(recs, gold_set, k):
    return len(set(recs[:k]) & gold_set) / len(gold_set)

def topk_accuracy(recs, gold_set, k):
    return 1.0 if set(recs[:k]) & gold_set else 0.0

def reciprocal_rank(recs, gold_set):
    for i, jid in enumerate(recs, start=1):
        if jid in gold_set:
            return 1.0 / i
    return 0.0

def dcg_at_k(rels, k):
    return sum(r / math.log2(i+1) for i, r in enumerate(rels[:k], start=1))

def ndcg_at_k(recs, gold_set, k):
    rels = [1 if jid in gold_set else 0 for jid in recs[:k]]
    dcg  = dcg_at_k(rels, k)
    ideal = [1] * min(len(gold_set), k)
    idcg = dcg_at_k(ideal, k)
    return dcg / idcg if idcg > 0 else 0.0

In [121]:
def evaluate(results, gold, k=4):
    P, R, T, RR, N = [], [], [], [], []
    for rid, ranked in results.items():
        if rid not in gold: continue
        gs   = set(gold[rid])
        recs = [title for _, _, title in ranked]
        P.append( precision_at_k(recs, gs, k) )
        R.append( recall_at_k(recs, gs, k)    )
        T.append( topk_accuracy(recs, gs, k)  )
        RR.append(reciprocal_rank(recs, gs)   )
        N.append( ndcg_at_k(recs, gs, k)      )
    return {
        f"Precision@{k}": sum(P)/len(P),
        f"Recall@{k}":    sum(R)/len(R),
        f"Top-{k} Acc.":  sum(T)/len(T),
        "MRR":            sum(RR)/len(RR),
        f"NDCG@{k}":      sum(N)/len(N),
    }

In [122]:
metrics = evaluate(results, gold, k=4)
for name, val in metrics.items():
    print(f"{name}: {val:.3f}")

Precision@4: 0.750
Recall@4: 0.750
Top-4 Acc.: 1.000
MRR: 0.900
NDCG@4: 0.772
