In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse
import joblib
from pathlib import Path

In [2]:
def train_tfidf(
        resumes_df: pd.DataFrame,
        jds_df: pd.DataFrame,
        *,
        ngram_range=(1, 2),
        min_df=25,
        max_df=0.8,
        max_features=50_000,
        stop_words="english",
        save_dir: str | Path | None = None
):
    """Fit a single TF-IDF vocabulary on resumes and job descriptions together.

    The two text columns are stacked (resumes first, then job descriptions)
    and one ``TfidfVectorizer`` is fitted on the combined corpus, so both kinds
    of document are projected into the same feature space.

    Parameters
    ----------
    resumes_df : pd.DataFrame
        Must contain a ``"resume_text"`` column.
    jds_df : pd.DataFrame
        Must contain a ``"job_description"`` column.
    ngram_range, min_df, max_df, max_features, stop_words
        Forwarded unchanged to ``TfidfVectorizer``. The vectorizer is always
        built with ``lowercase=True``, ``sublinear_tf=True`` and ``norm="l2"``.
    save_dir : str | Path | None
        When given, the directory is created if needed and three artifacts are
        written into it: ``tfidf_vectorizer.joblib``, ``resume_tfidf.npz`` and
        ``jd_tfidf.npz``.

    Returns
    -------
    tuple
        ``(vectorizer, resume_tfidf, jd_tfidf)``: the fitted vectorizer and the
        two sparse matrices, with one row per input row of ``resumes_df`` and
        ``jds_df`` respectively, in the original row order.

    Raises
    ------
    KeyError
        If either expected text column is missing.
    """
    # A missing text cell (NaN/None) makes TfidfVectorizer raise on fit.
    # Replacing it with an empty document (rather than dropping the row) keeps
    # every matrix row aligned with its source DataFrame row.
    corpus = pd.concat(
        [resumes_df["resume_text"], jds_df["job_description"]],
        ignore_index=True
    ).fillna("")

    vectorizer = TfidfVectorizer(
        lowercase=True,
        ngram_range=ngram_range,
        min_df=min_df,
        max_df=max_df,
        max_features=max_features,
        stop_words=stop_words,
        sublinear_tf=True,
        norm="l2"
    )
    tfidf_all = vectorizer.fit_transform(corpus)

    # Resumes were stacked first, so the first len(resumes_df) rows are theirs.
    n_resumes = len(resumes_df)
    resume_tfidf = tfidf_all[:n_resumes]
    jd_tfidf     = tfidf_all[n_resumes:]

    if save_dir is not None:
        save_dir = Path(save_dir)
        save_dir.mkdir(parents=True, exist_ok=True)
        joblib.dump(vectorizer, save_dir / "tfidf_vectorizer.joblib")
        sparse.save_npz(save_dir / "resume_tfidf.npz", resume_tfidf)
        sparse.save_npz(save_dir / "jd_tfidf.npz", jd_tfidf)

    return vectorizer, resume_tfidf, jd_tfidf

In [3]:
# Training corpora: both frames are handed to train_tfidf in the next cell.
df_resumes = pd.read_parquet(path="resumes.parquet")
df_jds = pd.read_parquet(path="job_description.parquet")

In [4]:
# Fit the shared vocabulary and persist the artifacts under ./tfidf_joblib.
vectorizer, resume_X, jd_X = train_tfidf(
    resumes_df=df_resumes,
    jds_df=df_jds,
    save_dir="tfidf_joblib",
)

In [5]:
# Gold-sample resumes and job descriptions scored in the cells that follow.
GOLD_DIR = Path("./gold_samples")
gold_resumes_df = pd.read_csv(GOLD_DIR / "resumes_samples.csv")
gold_jds_df = pd.read_csv(GOLD_DIR / "job_desc_sampled.csv")

In [6]:
# Project both gold text columns with the already-fitted vectorizer
# (transform only, so the vocabulary learned during training is reused as-is).
gold_res_X, gold_jd_X = (
    vectorizer.transform(texts)
    for texts in (gold_resumes_df["resume_text"], gold_jds_df["job_description"])
)

In [7]:
k = 10

# Similarity matrix: one row per gold resume, one column per gold JD.
# Dense output is acceptable here (the original note sizes it at 50×50).
cosim_dense = cosine_similarity(gold_res_X, gold_jd_X, dense_output=True)

# Negating the scores lets an ascending argsort put the best match first;
# only the first k column positions of each row are kept.
topk_idx = np.argsort(-cosim_dense, axis=1)[:, :k]
topk_vals = np.take_along_axis(cosim_dense, topk_idx, axis=1)

jd_ids = gold_jds_df["jd_id"].to_numpy()


def _ranked_pairs(col_positions, similarities):
    """Turn one row of ranked column positions into (jd_id, percent score) tuples."""
    pairs = []
    for col, sim in zip(col_positions, similarities):
        pairs.append((jd_ids[col], round(sim * 100, 4)))  # 0.1054 → 10.54 %
    return pairs


topk_jd_scores = list(map(_ranked_pairs, topk_idx, topk_vals))

pred_df = pd.DataFrame({
    "resume_id": gold_resumes_df["resume_id"],
    "top10_jds": topk_jd_scores,
})

pred_df.head(2)

Unnamed: 0,resume_id,top10_jds
0,R1,"[(JD38, 6.1375), (JD2, 5.8182), (JD50, 5.7929)..."
1,R2,"[(JD38, 7.8727), (JD40, 6.9629), (JD1, 6.45), ..."


In [8]:
pred_dict = dict(zip(pred_df["resume_id"], pred_df["top10_jds"]))


def format_top_matches(jd_pairs, with_scores=False):
    """Render ranked (jd_id, score) pairs as one comma-separated string.

    Parameters
    ----------
    jd_pairs : iterable of (jd_id, score)
        Ranked matches for a single resume.
    with_scores : bool
        False (default) emits only the ids, e.g. ``"JD38, JD2"``.
        True emits ``"(JD38, 6.14), (JD2, 5.82)"`` with two-decimal scores.
    """
    if with_scores:
        return ", ".join(f"({jid}, {score:.2f})" for jid, score in jd_pairs)
    return ", ".join(f"{jid}" for jid, _ in jd_pairs)


# NOTE(review): assumes the gold resume ids are exactly R1..R50; an id missing
# from pred_dict prints an empty list rather than failing. Confirm against
# gold_resumes_df.
for i in range(1, 51):
    rid = f"R{i}"
    jd_pairs = pred_dict.get(rid, [])
    formatted = format_top_matches(jd_pairs)  # ids only; pass with_scores=True to include scores
    print(f"{rid} : [{formatted}]")


R1 : [JD38, JD2, JD50, JD5, JD18, JD1, JD36, JD4, JD47, JD40]
R2 : [JD38, JD40, JD1, JD37, JD6, JD31, JD17, JD5, JD2, JD3]
R3 : [JD1, JD5, JD49, JD38, JD48, JD4, JD46, JD10, JD2, JD36]
R4 : [JD1, JD6, JD38, JD31, JD40, JD5, JD17, JD32, JD2, JD34]
R5 : [JD1, JD4, JD2, JD5, JD40, JD10, JD38, JD23, JD6, JD19]
R6 : [JD36, JD38, JD40, JD31, JD7, JD3, JD5, JD23, JD6, JD22]
R7 : [JD14, JD11, JD15, JD7, JD6, JD8, JD28, JD22, JD27, JD39]
R8 : [JD14, JD15, JD11, JD7, JD27, JD47, JD17, JD22, JD28, JD20]
R9 : [JD6, JD7, JD8, JD12, JD27, JD17, JD49, JD48, JD47, JD3]
R10 : [JD31, JD6, JD21, JD32, JD40, JD34, JD38, JD24, JD1, JD37]
R11 : [JD6, JD7, JD12, JD47, JD20, JD24, JD8, JD28, JD14, JD11]
R12 : [JD47, JD39, JD2, JD32, JD27, JD33, JD18, JD38, JD22, JD17]
R13 : [JD47, JD17, JD2, JD20, JD14, JD19, JD11, JD15, JD27, JD18]
R14 : [JD18, JD47, JD44, JD22, JD17, JD39, JD6, JD7, JD38, JD28]
R15 : [JD14, JD11, JD15, JD7, JD12, JD27, JD38, JD6, JD28, JD13]
R16 : [JD17, JD19, JD6, JD16, JD38, JD18, JD20, J

In [13]:
import re, numpy as np, pandas as pd
from pathlib import Path

K          = 10                     # ranking cut-off used by every metric below
GOLD_PATH  = Path("gold_res.txt")   # one "<resume_id>: ...JD ids..." line per resume
PRED_DF    = pred_df                # predictions frame built earlier (resume_id, top10_jds)

# Parse the gold file into {resume_id: [jd_id, ...]}.
# Everything before the first ":" is the resume id; every "JD<digits>" token
# after it is taken as a relevant job description, in file order.
gold = {}
# Explicit encoding so parsing does not depend on the platform default.
with GOLD_PATH.open(encoding="utf-8") as f:
    for line_no, line in enumerate(f, start=1):
        if not line.strip():
            continue
        rid, sep, rest = line.partition(":")
        if not sep:
            # Previously surfaced as an opaque tuple-unpacking ValueError.
            raise ValueError(
                f"{GOLD_PATH}:{line_no}: expected '<resume_id>: <JD ids>', got {line!r}"
            )
        rid = rid.strip()
        gold[rid] = re.findall(r'JD\d+', rest)


# {resume_id: [jd_id, ...]}: keep only the ids of the first K ranked pairs.
pred = {
    rid: [jd for jd, _ in ranked_pairs[:K]]
    for rid, ranked_pairs in zip(PRED_DF["resume_id"], PRED_DF["top10_jds"])
}


def dcg(relevances):
    """Discounted cumulative gain of a ranked list of relevance values.

    The item at 0-based rank ``i`` contributes ``rel / log2(i + 2)``, so the
    first item is undiscounted. An empty list yields 0.
    """
    total = 0
    # Starting the counter at 2 gives the log2 argument (rank + 2) directly.
    for discount_arg, rel in enumerate(relevances, start=2):
        total += rel / np.log2(discount_arg)
    return total

# Per-resume metric values, averaged afterwards for the summary.
prec, rec, acc, rr, ndcg = [], [], [], [], []
per_resume = {}

for rid, gold_ids in gold.items():
    if rid not in pred:
        # No prediction for this gold resume: it is left out of every average.
        continue

    topk = pred[rid]
    # Set gives O(1) membership and counts each gold id once, so a duplicated
    # gold label cannot inflate the recall / ideal-DCG denominators.
    gold_set = set(gold_ids)
    hits = [int(j in gold_set) for j in topk]   # 1 where the ranked id is gold

    # Fraction of returned ids that are gold (equals P@K when the ranking has
    # exactly K entries). np.mean([]) is nan, which would poison the averages,
    # so an empty ranking scores 0 instead.
    p_at_k   = np.mean(hits) if hits else 0.0
    r_at_k   = sum(hits) / len(gold_set) if gold_set else 0
    has_hit  = any(hits)
    # Reciprocal rank of the first hit; 0 when nothing in the ranking is gold.
    recip_r  = next((1/(i+1) for i,h in enumerate(hits) if h), 0)

    # Ideal ranking places every gold id (at most K of them) at the top.
    ideal_len  = min(len(gold_set), K)
    ndcg_score = dcg(hits) / dcg([1]*ideal_len) if ideal_len else 0

    prec.append(p_at_k)
    rec.append(r_at_k)
    acc.append(1 if has_hit else 0)
    rr.append(recip_r)
    ndcg.append(ndcg_score)

    # NOTE: one-decimal rounding is for display only and is coarse for MRR
    # (first hits at ranks 7 through 10 all show as 0.1); the unrounded values
    # live in the lists above.
    per_resume[rid] = {
        f"Precision@{K}": round(p_at_k, 1),
        f"Recall@{K}":    round(r_at_k, 1),
        f"MRR@{K}":       round(recip_r, 1),
        f"NDCG@{K}":      round(ndcg_score, 1)
    }

# Summary: each metric is the plain mean over the evaluated resumes.
print(
    f"Evaluated {len(prec)} resumes (present in both gold & preds)\n",
    f"Precision@{K}:   {np.mean(prec):.4f}",
    f"Recall@{K}:      {np.mean(rec):.4f}",
    f"Top‑{K} accuracy: {np.mean(acc):.4f}",
    f"MRR@{K}:          {np.mean(rr):.4f}",
    f"NDCG@{K}:         {np.mean(ndcg):.4f}",
    sep="\n",
)

Evaluated 50 resumes (present in both gold & preds)

Precision@10:   0.4200
Recall@10:      0.4200
Top‑10 accuracy: 1.0000
MRR@10:          0.7429
NDCG@10:         0.4607


In [15]:
# per_df holds one row per resume and one column per metric; the displayed
# view is transposed back so resumes run across the columns.
per_df = pd.DataFrame(per_resume).transpose().rename_axis(index="Resume")
print("\nPer‑resume metrics:")
per_df.transpose()


Per‑resume metrics:


Resume,R1,R2,R3,R4,R5,R6,R7,R8,R9,R10,...,R41,R42,R43,R44,R45,R46,R47,R48,R49,R50
Precision@10,0.7,0.7,0.6,0.5,0.6,0.1,0.7,0.4,0.4,0.4,...,0.4,0.2,0.3,0.2,0.4,0.4,0.7,0.6,0.5,0.5
Recall@10,0.7,0.7,0.6,0.5,0.6,0.1,0.7,0.4,0.4,0.4,...,0.4,0.2,0.3,0.2,0.4,0.4,0.7,0.6,0.5,0.5
MRR@10,1.0,1.0,1.0,1.0,1.0,0.1,1.0,1.0,1.0,0.2,...,0.3,0.2,0.3,0.3,1.0,1.0,1.0,1.0,1.0,1.0
NDCG@10,0.7,0.8,0.7,0.6,0.7,0.1,0.8,0.5,0.5,0.3,...,0.3,0.2,0.3,0.2,0.5,0.5,0.7,0.7,0.6,0.6
