In [1]:
import re, string, random, math, pickle, numpy as np, pandas as pd
from collections import Counter
from pathlib import Path
from tqdm import tqdm
import spacy
from rank_bm25 import BM25Okapi

RESUME_PARQUET       = "resumes.parquet"
JD_PARQUET           = "job_description.parquet"
GOLD_RESUMES_CSV     = "./gold_samples/resumes_samples.csv"
GOLD_JDS_CSV         = "./gold_samples/job_desc_sampled.csv"
GOLD_TOP10_PATH      = "gold_res.txt"

CACHE_DIR            = Path("./cache_bm25")
CACHE_DIR.mkdir(exist_ok=True)

K = 10

In [2]:
EXTRA_STOP = {
    "experience","experiences","years","year","requirements","requirement",
    "responsibilities","responsibility","ability","excellent","strong",
    "demonstrated","proven","successful","successfully","including","etc",
    "must","will","role","position","candidate","applicant","opportunity",
    "team","teams","work","working","environment","environments","dynamic",
    "fast‑paced","communication","communications","written","verbal",
    "skills","skill","interpersonal","detail","details","organization",
    "organizational","problem","problems","solve","solving","solutions",
    "solution","manage","management","managed","managing","lead","leading",
    "leadership","support","supporting","supported","ensure","ensuring",
    "responsible","provide","providing","provided","perform","performing",
    "performed","deliver","delivering","delivered","design","designing",
    "designed","develop","developing","developed","development","process",
    "processes","project","projects","business","client","clients","customer",
    "customers","stakeholder","stakeholders","company","companies","function",
    "functions","functional","collaborate","collaborating","collaboration",
    "across","within","preferred","plus","bonus","equivalent","related",
    "knowledge","familiar","familiarity","understanding","concepts"
}

ALIAS_MAP = {
    "k8s":"kubernetes","gke":"kubernetes","eks":"kubernetes","aks":"kubernetes",
    "gcp":"google‑cloud","aws":"amazon‑web‑services","azure":"microsoft‑azure",
    "js":"javascript","nodejs":"node‑js","node.js":"node‑js",
    ".net":"dotnet","asp.net":"aspnet","c#":"csharp","c++":"cpp",
    "py":"python","ts":"typescript","tf":"tensorflow","tfserving":"tensorflow‑serving",
    "np":"numpy","ml":"machine‑learning","dl":"deep‑learning","pytorch":"torch",
    "gql":"graphql","sqlserver":"sql‑server","tsql":"t‑sql","postgres":"postgresql",
    "psql":"postgresql","mongo":"mongodb","ci/cd":"continuous‑integration‑continuous‑delivery",
    "ci":"continuous‑integration","cd":"continuous‑delivery","infra":"infrastructure",
    "svc":"service","svc‑mesh":"service‑mesh","msgq":"message‑queue",
    "msg‑q":"message‑queue","gh":"github"
}

In [4]:

nlp = spacy.load("en_core_web_sm", disable=["ner", "parser"])
BASE_STOP = set(nlp.Defaults.stop_words)
punct_tbl = str.maketrans("", "", string.punctuation)

def normalize(text: str) -> list[str]:
    text = text.translate(punct_tbl).lower()
    for k, v in ALIAS_MAP.items():
        text = text.replace(k, v)
    doc = nlp(text)
    return [t.lemma_ for t in doc if t.is_alpha and len(t) > 2]


BIG_JD_TOK_PATH   = CACHE_DIR / "jd_tokens.pkl"
BIG_RES_TOK_PATH  = CACHE_DIR / "resume_tokens_full.pkl"
AUTO_STOP_PATH    = CACHE_DIR / "auto_stop.pkl"

if BIG_JD_TOK_PATH.exists() and AUTO_STOP_PATH.exists():
    print("Loading cached big‑corpus tokens …")
    with BIG_JD_TOK_PATH.open("rb") as fh:   big_jd_tokens  = pickle.load(fh)
    with BIG_RES_TOK_PATH.open("rb") as fh:  big_res_tokens = pickle.load(fh)
    with AUTO_STOP_PATH.open("rb") as fh:    AUTO_STOP      = pickle.load(fh)
else:
    print("Reading parquet …")
    df_resumes = pd.read_parquet(RESUME_PARQUET)
    df_jds     = pd.read_parquet(JD_PARQUET)

    print("Tokenizing big JD corpus …")
    big_jd_tokens  = [normalize(t) for t in tqdm(df_jds.job_description)]

    print("Tokenizing big résumé corpus …")
    big_res_tokens = [normalize(t) for t in tqdm(df_resumes.resume_text)]

    def learn_stop(tok_lists, thresh=0.75, sample=50_000):
        sample_lists = random.sample(tok_lists, min(sample, len(tok_lists)))
        df = Counter()
        for lst in sample_lists: df.update(set(lst))
        cut = math.ceil(thresh * len(sample_lists))
        return {t for t, n in df.items() if n >= cut}

    AUTO_STOP = learn_stop(big_jd_tokens + big_res_tokens, 0.75)

    for path, obj in [(BIG_JD_TOK_PATH, big_jd_tokens),
                      (BIG_RES_TOK_PATH, big_res_tokens),
                      (AUTO_STOP_PATH,    AUTO_STOP)]:
        with path.open("wb") as fh: pickle.dump(obj, fh, protocol=4)
    print(f"Cached tokens & auto‑stop‑list → {CACHE_DIR}")

STOPWORDS = BASE_STOP | EXTRA_STOP | AUTO_STOP
def drop_stop(lst): return [t for t in lst if t not in STOPWORDS]

Loading cached big‑corpus tokens …


In [5]:

gold_resumes_df = pd.read_csv(GOLD_RESUMES_CSV)
gold_jds_df     = pd.read_csv(GOLD_JDS_CSV)

print("Tokenizing gold JDs …")
gold_jd_tokens  = [drop_stop(normalize(t)) for t in tqdm(gold_jds_df.job_description)]

print("Tokenizing gold résumés …")
gold_res_tokens = [drop_stop(normalize(t)) for t in tqdm(gold_resumes_df.resume_text)]


bm25 = BM25Okapi(gold_jd_tokens)      
jd_ids_arr = gold_jds_df.jd_id.to_numpy()


pred = {}
for rid, qtok in zip(gold_resumes_df.resume_id, gold_res_tokens):
    scores = bm25.get_scores(qtok)
    top_idx = np.argpartition(-scores, K-1)[:K]
    top_idx = top_idx[np.argsort(scores[top_idx])[::-1]]
    pred[rid] = jd_ids_arr[top_idx].tolist()

pred_df = pd.DataFrame({"resume_id": list(pred.keys()),
                        "top10_jds": list(pred.values())})
print("\nPrediction sample:\n", pred_df.head())


gold_map = {}
with open(GOLD_TOP10_PATH) as fh:
    for ln in fh:
        if not ln.strip(): continue
        rid, rest = ln.split(":", 1)
        gold_map[rid.strip()] = re.findall(r'JD\d+', rest)

def dcg(rel): return sum(r/np.log2(i+2) for i,r in enumerate(rel))

prec, rec, acc, rr, ndcg = [], [], [], [], []
for rid, gold_jds in gold_map.items():
    if rid not in pred: continue
    hits = [int(j in gold_jds) for j in pred[rid]]
    prec.append(np.mean(hits))
    rec.append(sum(hits)/len(gold_jds))
    acc.append(int(any(hits)))
    rr.append(next((1/(i+1) for i,h in enumerate(hits) if h), 0))
    ndcg.append(dcg(hits)/dcg([1]*min(len(gold_jds),K)))

print("\n──────── Accuracy on gold 50×50 ────────")
print(f"Precision@{K}:   {np.mean(prec):.4f}")
print(f"Recall@{K}:      {np.mean(rec):.4f}")
print(f"Top‑{K} accuracy: {np.mean(acc):.4f}")
print(f"MRR@{K}:          {np.mean(rr):.4f}")
print(f"NDCG@{K}:         {np.mean(ndcg):.4f}")

Tokenizing gold JDs …


100%|███████████████████████████████████████████| 50/50 [00:01<00:00, 25.93it/s]


Tokenizing gold résumés …


100%|███████████████████████████████████████████| 50/50 [00:00<00:00, 58.87it/s]



Prediction sample:
   resume_id                                          top10_jds
0        R1  [JD2, JD38, JD40, JD4, JD18, JD1, JD17, JD5, J...
1        R2  [JD38, JD1, JD3, JD40, JD5, JD37, JD31, JD2, J...
2        R3  [JD1, JD10, JD5, JD38, JD2, JD31, JD36, JD3, J...
3        R4  [JD1, JD38, JD31, JD5, JD47, JD40, JD6, JD37, ...
4        R5  [JD1, JD5, JD38, JD4, JD2, JD37, JD7, JD40, JD...

──────── Accuracy on gold 50×50 ────────
Precision@10:   0.4060
Recall@10:      0.4060
Top‑10 accuracy: 0.9600
MRR@10:          0.6593
NDCG@10:         0.4350
