In [24]:
import re, string, random, math, numpy as np, pandas as pd
from collections import Counter
from pathlib import Path
from tqdm import tqdm
import spacy
from rank_bm25 import BM25Okapi   # BM25 implementation

In [25]:
RESUME_PARQUET       = "resumes.parquet"
JD_PARQUET           = "job_description.parquet"
GOLD_RESUMES_CSV     = "./gold_samples/resumes_samples.csv"
GOLD_JDS_CSV         = "./gold_samples/job_desc_sampled.csv"
GOLD_TOP10_PATH      = "gold_res.txt"

K = 10                            # evaluate @ top‑K

In [26]:
print("Loading parquet …")
df_resumes = pd.read_parquet(RESUME_PARQUET)          # ≈ 4 k rows
df_jds     = pd.read_parquet(JD_PARQUET)              # ≈ 492 k rows

gold_resumes_df = pd.read_csv(GOLD_RESUMES_CSV)       # 50 rows
gold_jds_df     = pd.read_csv(GOLD_JDS_CSV)           # 50 rows

Loading parquet …


In [27]:
EXTRA_STOP = {
    "experience","experiences","years","year","requirements","requirement",
    "responsibilities","responsibility","ability","excellent","strong",
    "demonstrated","proven","successful","successfully","including","etc",
    "must","will","role","position","candidate","applicant","opportunity",
    "team","teams","work","working","environment","environments","dynamic",
    "fast‑paced","communication","communications","written","verbal",
    "skills","skill","interpersonal","detail","details","organization",
    "organizational","problem","problems","solve","solving","solutions",
    "solution","manage","management","managed","managing","lead","leading",
    "leadership","support","supporting","supported","ensure","ensuring",
    "responsible","provide","providing","provided","perform","performing",
    "performed","deliver","delivering","delivered","design","designing",
    "designed","develop","developing","developed","development","process",
    "processes","project","projects","business","client","clients","customer",
    "customers","stakeholder","stakeholders","company","companies","function",
    "functions","functional","collaborate","collaborating","collaboration",
    "across","within","preferred","plus","bonus","equivalent","related",
    "knowledge","familiar","familiarity","understanding","concepts"
}

ALIAS_MAP = {
    "k8s":"kubernetes","gke":"kubernetes","eks":"kubernetes","aks":"kubernetes",
    "gcp":"google‑cloud","aws":"amazon‑web‑services","azure":"microsoft‑azure",
    "js":"javascript","nodejs":"node‑js","node.js":"node‑js",
    ".net":"dotnet","asp.net":"aspnet","c#":"csharp","c++":"cpp",
    "py":"python","ts":"typescript","tf":"tensorflow","tfserving":"tensorflow‑serving",
    "np":"numpy","ml":"machine‑learning","dl":"deep‑learning","pytorch":"torch",
    "gql":"graphql","sqlserver":"sql‑server","tsql":"t‑sql","postgres":"postgresql",
    "psql":"postgresql","mongo":"mongodb","ci/cd":"continuous‑integration‑continuous‑delivery",
    "ci":"continuous‑integration","cd":"continuous‑delivery","infra":"infrastructure",
    "svc":"service","svc‑mesh":"service‑mesh","msgq":"message‑queue",
    "msg‑q":"message‑queue","gh":"github"
}

In [28]:
print("Setting up spaCy …")
nlp = spacy.load("en_core_web_sm", disable=["ner","parser"])
BASE_STOP = set(nlp.Defaults.stop_words)
punct_tbl = str.maketrans("", "", string.punctuation)

def tok_normalize(text:str)->list[str]:
    text = text.translate(punct_tbl).lower()
    for k,v in ALIAS_MAP.items():
        text = text.replace(k, v)
    doc = nlp(text)
    return [tok.lemma_ for tok in doc
            if tok.is_alpha and len(tok)>2]

Setting up spaCy …


In [30]:
def learn_stop(tokens_lists, df_thresh=0.75, sample_size=50_000):
    sample = random.sample(tokens_lists, min(sample_size,len(tokens_lists)))
    df_counter = Counter()
    for toks in sample:
        df_counter.update(set(toks))
    cut = math.ceil(df_thresh * len(sample))
    return {t for t,df in df_counter.items() if df>=cut}

In [31]:
import joblib, os, pickle

CACHE_DIR = Path("./cache_bm25")
CACHE_DIR.mkdir(exist_ok=True)

JDTOK_PATH   = CACHE_DIR / "jd_tokens.pkl"
RESTOK_PATH  = CACHE_DIR / "resume_tokens_full.pkl"
STOP_PATH    = CACHE_DIR / "auto_stop.pkl"

def dump(obj, path):
    with path.open("wb") as fh:
        pickle.dump(obj, fh, protocol=pickle.HIGHEST_PROTOCOL)

def load(path):
    with path.open("rb") as fh:
        return pickle.load(fh)

if JDTOK_PATH.exists() and RESTOK_PATH.exists() and STOP_PATH.exists():
    print("Loading cached token lists …")
    jd_tokens           = load(JDTOK_PATH)
    resume_tokens_full  = load(RESTOK_PATH)
    AUTO_STOP           = load(STOP_PATH)
else:
    print("Tokenizing JD corpus …")
    jd_tokens = [tok_normalize(t) for t in tqdm(df_jds.job_description)]

    print("Tokenizing resume corpus …")
    resume_tokens_full = [tok_normalize(t) for t in tqdm(df_resumes.resume_text)]

    print("Deriving auto stop‑words …")
    AUTO_STOP = learn_stop(jd_tokens + resume_tokens_full, 0.75)

    dump(jd_tokens,          JDTOK_PATH)
    dump(resume_tokens_full, RESTOK_PATH)
    dump(AUTO_STOP,          STOP_PATH)
    print(f"Saved token lists & auto‑stop list → {CACHE_DIR}")

STOPWORDS = BASE_STOP | EXTRA_STOP | AUTO_STOP
def drop_stop(tok_list): return [t for t in tok_list if t not in STOPWORDS]

jd_tokens          = [drop_stop(t) for t in jd_tokens]
resume_tokens_full = [drop_stop(t) for t in resume_tokens_full]

Loading cached token lists …


In [32]:
AUTO_STOP = learn_stop(jd_tokens + resume_tokens_full, 0.75)
STOPWORDS = BASE_STOP | EXTRA_STOP | AUTO_STOP

def drop_stop(toks): return [t for t in toks if t not in STOPWORDS]

jd_tokens = [drop_stop(t) for t in jd_tokens]
resume_tokens_full = [drop_stop(t) for t in resume_tokens_full]

print(f"Stop‑word tally → default:{len(BASE_STOP)} "
      f"+extra:{len(EXTRA_STOP)} +auto:{len(AUTO_STOP)} = {len(STOPWORDS)}")

Stop‑word tally → default:326 +extra:109 +auto:0 = 431


In [33]:
print("Fitting BM25 …")
bm25 = BM25Okapi(jd_tokens)

jd_id_to_idx = {jd_id: i for i,jd_id in zip(df_jds.jd_id, range(len(df_jds)))}

gold_jd_indices = [jd_id_to_idx[jid] for jid in gold_jds_df.jd_id]
gold_jd_ids_arr = gold_jds_df.jd_id.to_numpy()

Fitting BM25 …


KeyError: 'JD1'

In [None]:
print("Tokenizing gold résumés …")
gold_resume_tokens = [drop_stop(tok_normalize(t))
                      for t in tqdm(gold_resumes_df.resume_text)]

In [None]:
print("Scoring BM25 …")
pred = {}               # resume_id  →  [top‑K jd_ids]

for rid, r_toks in zip(gold_resumes_df.resume_id, gold_resume_tokens):
    if not r_toks:
        pred[rid] = []
        continue
    
    scores = bm25.get_scores(r_toks)
    
    sub_scores = scores[gold_jd_indices]   
    top_idx_sub = np.argpartition(-sub_scores, K-1)[:K]
    
    top_idx_sub = top_idx_sub[np.argsort(sub_scores[top_idx_sub])[::-1]]
    pred[rid] = gold_jd_ids_arr[top_idx_sub].tolist()

pred_df = pd.DataFrame({"resume_id":list(pred.keys()),
                        "top10_jds":list(pred.values())})
print("\nPredictions sample:\n")
pred_df.head()

In [None]:
gold_map={}
with open(GOLD_TOP10_PATH) as fh:
    for line in fh:
        if not line.strip(): continue
        rid, rest = line.split(":",1)
        gold_map[rid.strip()] = re.findall(r'JD\d+', rest)

In [None]:
def dcg(rel): return sum(r/np.log2(i+2) for i,r in enumerate(rel))

prec, rec, acc, rr, ndcg = [], [], [], [], []
for rid, gold_jds in gold_map.items():
    if rid not in pred: continue
    hits = [int(j in gold_jds) for j in pred[rid]]
    prec.append(np.mean(hits))
    rec.append(sum(hits)/len(gold_jds))
    acc.append(int(any(hits)))
    rr.append(next((1/(i+1) for i,h in enumerate(hits) if h), 0))
    ideal = min(len(gold_jds), K)
    ndcg.append(dcg(hits)/dcg([1]*ideal) if ideal else 0)

print("\n──────── Accuracy on gold 50×50 ────────")
print(f"Precision@{K}:   {np.mean(prec):.4f}")
print(f"Recall@{K}:      {np.mean(rec):.4f}")
print(f"Top‑{K} accuracy: {np.mean(acc):.4f}")
print(f"MRR@{K}:          {np.mean(rr):.4f}")
print(f"NDCG@{K}:         {np.mean(ndcg):.4f}")
