In [56]:
import re
from pathlib import Path
import pandas as pd
import numpy as np
from tqdm.auto import tqdm   
from sentence_transformers import SentenceTransformer, util
from sklearn.metrics import pairwise_distances
import unicodedata

In [57]:
# Load the sampled resumes and job descriptions from CSV. Later cells read the
# "resume_text", "job_title", "job_description" and "jd_id" columns of these frames.
res_df = pd.read_csv("./gold_samples/resumes_samples.csv")  
jd_df  = pd.read_csv("./gold_samples/job_desc_sampled.csv") 

In [58]:
def clean(text: str) -> str:
    """Normalise a text field before it is embedded.

    Applies Unicode NFKC normalisation, collapses every run of whitespace
    into a single space, trims both ends and lower-cases the result.
    """
    normalised = unicodedata.normalize("NFKC", text)
    single_spaced = re.sub(r"\s+", " ", normalised)
    return single_spaced.strip().lower()

In [59]:
# Earlier variant without text cleaning, kept for comparison:
# res_texts = res_df["resume_text"].fillna("").tolist()
# jd_texts  = (jd_df["job_title"].fillna("")  + " " +
#              jd_df["job_description"].fillna("")).tolist()

# Current inputs: missing values become "", then every field goes through clean().
# A JD's text is its cleaned title and cleaned description joined by one space.
res_texts = [clean(resume) for resume in res_df["resume_text"].fillna("")]
jd_texts = [
    clean(title) + " " + clean(description)
    for title, description in zip(
        jd_df["job_title"].fillna(""),
        jd_df["job_description"].fillna(""),
    )
]

In [60]:

# Parse the gold-label file into {resume id: [JD ids]}.
# Each useful line looks like "R<number> : [ ... JD<number> ... ]"; any line that
# does not match that shape is ignored.
gold_file = Path("gold_res.txt")
row_pat   = re.compile(r"R(\d+)\s*:\s*\[(.*)\]")
jd_pat    = re.compile(r"JD\d+")

gold_dict = {}
with gold_file.open() as fh:
    # The generator is consumed inside the with-block, while the file is still open.
    row_matches = (row_pat.match(line.strip()) for line in fh)
    for hit in row_matches:
        if hit is None:
            continue
        resume_no, jd_blob = hit.groups()
        gold_dict["R" + resume_no] = jd_pat.findall(jd_blob)

assert len(gold_dict) == 50, "Gold file should contain 50 rows (R1…R50)"

In [61]:

# Load the bi-encoder and embed both corpora with identical settings.
model_name = "sentence-transformers/all-MiniLM-L6-v2"
print(f"Loading SBERT model: {model_name}")
model = SentenceTransformer(model_name, device="cpu")  # change to "cuda" if available

# Shared encode options: embeddings are requested already L2-normalised.
encode_kwargs = dict(batch_size=16, normalize_embeddings=True, show_progress_bar=True)

print("Encoding resumes …")
res_emb = model.encode(res_texts, **encode_kwargs)

print("Encoding job descriptions …")
jd_emb = model.encode(jd_texts, **encode_kwargs)

Loading SBERT model: sentence-transformers/all-MiniLM-L6-v2
Encoding resumes …


Batches: 100%|████████████████████████████████████| 4/4 [00:00<00:00,  7.19it/s]


Encoding job descriptions …


Batches: 100%|████████████████████████████████████| 4/4 [00:00<00:00,  8.79it/s]


In [62]:
# Disabled alternative (kept for reference): score every resume/JD pair with a
# cross-encoder and reshape the flat scores into a (resumes x JDs) matrix.
# NOTE(review): if re-enabled as-is, model_name is overwritten here and the next
# cell still reassigns sim_matrix from the bi-encoder embeddings — confirm intent.
# model_name = "cross-encoder/ms-marco-MiniLM-L6-v2"
# print(f"Loading Cross‑Encoder model: {model_name}")
# from sentence_transformers import CrossEncoder
# cross_encoder = CrossEncoder(model_name, device="cpu")   # "cuda" if GPU

# pairs = [(res, jd) for res in res_texts for jd in jd_texts]

# print("Scoring all resume ↔ JD pairs …")
# scores = cross_encoder.predict(pairs, batch_size=32, show_progress_bar=True)


# import numpy as np
# sim_matrix = np.array(scores).reshape(len(res_texts), len(jd_texts))

In [63]:

# Cosine similarity between every resume embedding (rows) and JD embedding (columns).
sim_matrix = util.cos_sim(res_emb, jd_emb).cpu().numpy()

topk = 10

# For each resume row keep the topk highest-scoring JDs, best first.
# argsort is ascending, so the last topk positions are taken and then reversed.
# Resume ids are positional: row 0 -> "R1", row 1 -> "R2", ...
retrieved = {
    f"R{row_no}": jd_df["jd_id"].iloc[np.argsort(scores)[-topk:][::-1]].tolist()
    for row_no, scores in enumerate(sim_matrix, start=1)
}

In [64]:

def precision_at_k(pred, gold, k=10):
    """Fraction of the k ranking slots filled by a gold item.

    pred: ranked list of predicted ids (best first); only the first k are used.
    gold: collection of relevant ids.
    The denominator is always k, even when pred holds fewer than k items.
    """
    hits = set(pred[:k]).intersection(gold)
    return len(hits) / k

def recall_at_k(pred, gold, k=10):
    """Fraction of the distinct gold items that appear in the top-k predictions.

    pred: ranked list of predicted ids (best first); only the first k are used.
    gold: collection of relevant ids; duplicates are counted once.
    Returns 0.0 when gold is empty instead of dividing by zero.
    """
    gold_set = set(gold)
    if not gold_set:
        # Nothing is relevant, so nothing can be recalled.
        return 0.0
    # Divide by the number of distinct gold ids so that the numerator (a set
    # intersection) and the denominator count the same thing.
    return len(set(pred[:k]) & gold_set) / len(gold_set)

def mrr_at_k(pred, gold, k=10):
    """Reciprocal rank of the first gold item within the top-k predictions.

    Ranks are 1-based. Returns 0.0 when none of the first k predictions is in gold.
    """
    reciprocal_ranks = (
        1.0 / position
        for position, jd in enumerate(pred[:k], start=1)
        if jd in gold
    )
    # Only the first (best-ranked) hit matters; fall back to 0.0 when there is none.
    return next(reciprocal_ranks, 0.0)

def ndcg_at_k(pred, gold, k=10):
    """Binary-relevance nDCG over the top-k predictions.

    Every prediction found in gold contributes 1 / log2(rank + 1) to the DCG
    (ranks are 1-based). The ideal DCG assumes min(len(gold), k) gold items
    occupy the top ranks. Returns 0.0 when the ideal DCG is zero.
    """
    def discount(position):
        return 1.0 / np.log2(position + 1)

    dcg = 0.0
    for position, jd in enumerate(pred[:k], start=1):
        if jd not in gold:
            continue
        dcg += discount(position)

    # Best achievable DCG: as many gold items as fit in k, packed at the top.
    ideal_hits = min(len(gold), k)
    idcg = sum(discount(position) for position in range(1, ideal_hits + 1))

    if idcg > 0:
        return dcg / idcg
    return 0.0

def topk_accuracy(pred, gold, k=10):
    """Return 1 if any of the first k predictions is a gold item, otherwise 0."""
    has_hit = not set(pred[:k]).isdisjoint(gold)
    return int(has_hit)

In [82]:
# Score each resume's ranked JD list against its gold labels.
#
# Raw (unrounded) scores are accumulated so the reported averages are not skewed
# by per-resume rounding; rounding happens only where values are displayed.
# Labels are derived from `topk` and the number of evaluated resumes so the
# report stays correct if either changes.

# Fail early, with the offending ids, if any retrieved resume has no gold entry.
missing_gold = [rid for rid in retrieved if rid not in gold_dict]
if missing_gold:
    raise KeyError(f"No gold labels for resumes: {missing_gold}")

P, R, ACC, MRR, NDCG = [], [], [], [], []
per_resume_metrics = []

for rid, pred_jds in retrieved.items():
    gold_jds = gold_dict[rid]
    p = precision_at_k(pred_jds, gold_jds, topk)
    r = recall_at_k(pred_jds, gold_jds, topk)
    acc = topk_accuracy(pred_jds, gold_jds, topk)
    mrr = mrr_at_k(pred_jds, gold_jds, topk)
    ndcg = ndcg_at_k(pred_jds, gold_jds, topk)

    P.append(p)
    R.append(r)
    ACC.append(acc)
    MRR.append(mrr)
    NDCG.append(ndcg)

    # One decimal keeps the wide per-resume table compact; this rounding is for
    # display only and does not feed into the averages below.
    per_resume_metrics.append({
        "Resume": rid,
        f"Precision@{topk}": round(p, 1),
        f"Recall@{topk}": round(r, 1),
        # Top-k accuracy is intentionally left out of the per-resume table.
        f"MRR@{topk}": round(mrr, 1),
        f"NDCG@{topk}": round(ndcg, 1),
    })

metrics_df = pd.DataFrame({
    "Metric": [f"Precision@{topk}", f"Recall@{topk}", f"Top‑{topk} accuracy",
               f"MRR@{topk}", f"NDCG@{topk}"],
    "Value":  [np.mean(P), np.mean(R), np.mean(ACC), np.mean(MRR), np.mean(NDCG)],
})

# Transposed so metrics are rows and resumes are columns.
per_resume_df = pd.DataFrame(per_resume_metrics).set_index("Resume").T

print(f"model_name = {model_name}")
print(f"\n===  Retrieval quality (average over {len(P)} resumes)  ===")
print(metrics_df.to_string(index=False, float_format=lambda x: f"{x:0.4f}"))

model_name = sentence-transformers/all-MiniLM-L6-v2

===  Retrieval quality (average over 50 resumes)  ===
         Metric  Value
   Precision@10 0.4680
      Recall@10 0.4680
Top‑10 accuracy 0.9800
         MRR@10 0.7965
        NDCG@10 0.5165


In [83]:
# Show the per-resume table (metrics as rows, resumes as columns). The bare
# expression on the last line is what makes the notebook render the DataFrame.
print("\n===  Per-Resume Metrics  ===")
per_resume_df


===  Per-Resume Metrics  ===


Resume,R1,R2,R3,R4,R5,R6,R7,R8,R9,R10,...,R41,R42,R43,R44,R45,R46,R47,R48,R49,R50
Precision@10,0.7,0.7,0.7,0.7,0.6,0.3,0.7,0.4,0.4,0.4,...,0.4,0.3,0.6,0.4,0.5,0.7,0.7,0.7,0.5,0.4
Recall@10,0.7,0.7,0.7,0.7,0.6,0.3,0.7,0.4,0.4,0.4,...,0.4,0.3,0.6,0.4,0.5,0.7,0.7,0.7,0.5,0.4
MRR@10,1.0,1.0,1.0,1.0,1.0,0.5,1.0,1.0,0.5,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
NDCG@10,0.7,0.8,0.8,0.8,0.7,0.3,0.8,0.4,0.4,0.5,...,0.5,0.4,0.7,0.5,0.5,0.8,0.8,0.8,0.6,0.6


In [66]:
# R6 : [JD6, JD8, JD9, JD12, JD14, JD15, JD26, JD27, JD28, JD30]
# (presumably R6's gold list, kept for eyeballing against the output — confirm
# against gold_res.txt)

# Print every resume's retrieved JDs in numeric resume order (R1, R2, ..., R10, ...)
# rather than lexicographic order, by sorting on the integer after the "R".
ordered_ids = sorted(retrieved, key=lambda label: int(label[1:]))
for rid in ordered_ids:
    print(f"{rid} : {retrieved[rid]}")

R1 : ['JD3', 'JD22', 'JD23', 'JD40', 'JD4', 'JD37', 'JD38', 'JD2', 'JD5', 'JD34']
R2 : ['JD5', 'JD3', 'JD1', 'JD2', 'JD4', 'JD34', 'JD37', 'JD23', 'JD36', 'JD50']
R3 : ['JD5', 'JD1', 'JD3', 'JD4', 'JD2', 'JD10', 'JD37', 'JD47', 'JD40', 'JD23']
R4 : ['JD3', 'JD5', 'JD4', 'JD2', 'JD23', 'JD1', 'JD37', 'JD36', 'JD34', 'JD25']
R5 : ['JD3', 'JD5', 'JD2', 'JD1', 'JD4', 'JD23', 'JD47', 'JD34', 'JD37', 'JD10']
R6 : ['JD3', 'JD8', 'JD4', 'JD27', 'JD23', 'JD6', 'JD22', 'JD25', 'JD5', 'JD34']
R7 : ['JD15', 'JD11', 'JD14', 'JD6', 'JD9', 'JD8', 'JD7', 'JD27', 'JD3', 'JD13']
R8 : ['JD15', 'JD11', 'JD7', 'JD14', 'JD4', 'JD46', 'JD25', 'JD3', 'JD28', 'JD13']
R9 : ['JD46', 'JD8', 'JD36', 'JD27', 'JD37', 'JD3', 'JD6', 'JD4', 'JD15', 'JD24']
R10 : ['JD3', 'JD4', 'JD27', 'JD23', 'JD25', 'JD36', 'JD5', 'JD24', 'JD22', 'JD8']
R11 : ['JD28', 'JD20', 'JD13', 'JD27', 'JD6', 'JD15', 'JD29', 'JD3', 'JD26', 'JD24']
R12 : ['JD47', 'JD3', 'JD32', 'JD23', 'JD31', 'JD37', 'JD2', 'JD40', 'JD28', 'JD22']
R13 : ['JD47',