In [1]:
!pip install -q rank_bm25 pandas tqdm transformers

In [34]:
import pandas as pd
import re
from rank_bm25 import BM25Okapi
from tqdm import tqdm
from transformers import BertTokenizerFast
from transformers import AutoTokenizer

# Tokenizer checkpoints listed for this experiment:
#   roberta-base, allenai/longformer-base-4096, bert-base-uncased, microsoft/mpnet-base
# Only a tokenizer is loaded here (no model); this run uses the Longformer checkpoint.
# Presumably the name below is swapped by hand to produce the other runs — confirm.
tokenizer = AutoTokenizer.from_pretrained("allenai/longformer-base-4096")


config.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [None]:
# Load the two input tables from the sibling data directory.
# Later cells read the "job_matching" column of job_df and the "Resume_str" column of resume_df.
job_df = pd.read_csv("../data/job_postings_cleaned.csv")
resume_df = pd.read_csv("../data/resume_queries.csv")

In [36]:
def clean_text(text):
    """Normalize raw text before it is handed to the tokenizer.

    Steps, in order: lowercase, replace ``<...>`` tag-like spans with a space,
    delete every character that is neither a word character nor whitespace,
    collapse whitespace runs into one space, and strip both ends.

    Args:
        text: Value to clean. Non-string values are converted with ``str()``.
            ``None`` and float NaN (how pandas represents a missing cell) are
            treated as empty text.

    Returns:
        The cleaned string, or ``""`` for missing input.
    """
    # Without this guard str(None) / str(nan) would produce the literal words
    # "none" / "nan", which would then be tokenized like real content.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    text = str(text).lower()
    # Non-greedy so each tag is removed separately; "." does not cross newlines,
    # so a tag split over several lines is not matched here.
    text = re.sub(r'<.*?>', ' ', text)
    # Punctuation is deleted, not replaced by a space: "state-of-the-art"
    # becomes "stateoftheart". Underscores count as word characters and stay.
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

def tokenize(text):
    """Clean ``text`` with ``clean_text`` and split it with the notebook-level ``tokenizer``.

    Args:
        text: Raw text (any value ``clean_text`` accepts).

    Returns:
        Whatever ``tokenizer.tokenize`` returns for the cleaned string.
    """
    return tokenizer.tokenize(clean_text(text))

In [None]:
# Tokenize every job posting once up front; tqdm reports progress over the postings.
job_texts = job_df["job_matching"].astype(str).tolist()
job_tokens = []
for posting_text in tqdm(job_texts):
    job_tokens.append(tokenize(posting_text))

Tokenizing job postings with tokenizer...


100%|██████████| 10000/10000 [00:27<00:00, 367.69it/s]


In [38]:
# Build the BM25 (Okapi) index over the tokenized job postings, using the
# library's default parameters (none are overridden here).
bm25 = BM25Okapi(job_tokens)

In [None]:
# For each resume, score every job posting with BM25 and record the 10 highest-scoring ones.
results = []
job_postings = job_df["job_matching"]

for resume_text in tqdm(resume_df["Resume_str"].astype(str)):
    query_tokens = tokenize(resume_text)
    scores = bm25.get_scores(query_tokens)
    # argsort is ascending: take the last 10 positions and reverse them so the
    # best match comes first.
    best_first = scores.argsort()[-10:][::-1]

    results.extend(
        {
            "Resume_str": resume_text,
            "Job_posting": job_postings.iloc[job_idx],
            "BM25_score": float(scores[job_idx]),
        }
        for job_idx in best_first
    )

Matching resumes using BM25...


100%|██████████| 20/20 [01:05<00:00,  3.26s/it]


In [None]:
# One row per (resume, matched job posting) pair, with columns
# Resume_str, Job_posting and BM25_score taken from the result dicts.
final_df = pd.DataFrame(results)

In [41]:
# Display the match table as the cell output.
final_df

Unnamed: 0,Resume_str,Job_posting,BM25_score
0,"DIRECTOR, GLOBAL BUSINESS DEVELOPMENT...","Senior Manager, Wireless Infrastructure Respon...",766.510187
1,"DIRECTOR, GLOBAL BUSINESS DEVELOPMENT...",Commercial Sales Manager (TX) SolarEdge (NASDA...,751.034521
2,"DIRECTOR, GLOBAL BUSINESS DEVELOPMENT...","Director, North America Business Product Owner...",750.014382
3,"DIRECTOR, GLOBAL BUSINESS DEVELOPMENT...",Public Sector Partner Manager WHAT IS BOX?\n\n...,740.646725
4,"DIRECTOR, GLOBAL BUSINESS DEVELOPMENT...","Director, Corporate Strategy Company Descripti...",736.857116
...,...,...,...
195,ENGLISH LANGUAGE ARTS TEACHER S...,High School Special Education Teacher Company ...,552.487228
196,ENGLISH LANGUAGE ARTS TEACHER S...,Mastery Teaching Residency About Mastery:\n\nF...,548.528223
197,ENGLISH LANGUAGE ARTS TEACHER S...,"K-12 Teacher - Elementary, Middle, High School...",545.549363
198,ENGLISH LANGUAGE ARTS TEACHER S...,High School Math Teacher Company Description\n...,544.614997


In [None]:
# Persist the matches without the row index; the file name records which tokenizer produced this run.
# NOTE(review): "/data_results" is an absolute path at the filesystem root, whereas the inputs
# are read from the relative "../data" directory — confirm this location is intended and exists.
final_df.to_csv("/data_results/4096_LongFormer_Tokenizer_BM25_matches.csv", index=False)

In [None]:
# Heatmap of the BM25 scores saved for one tokenizer run:
# one row per resume, one column per match rank.
import matplotlib.pyplot as plt
import seaborn as sns

df = pd.read_csv("/data_results/4096_LongFormer_Tokenizer_BM25_matches.csv")

# Number the resumes in order of first appearance, then number each resume's
# rows in file order (the matching cell stores them best match first).
df = df.assign(Resume_ID=df.groupby("Resume_str", sort=False).ngroup())
df = df.assign(Match_Rank=df.groupby("Resume_ID").cumcount() + 1)

heatmap_data = df.pivot(index="Resume_ID", columns="Match_Rank", values="BM25_score")

fig, ax = plt.subplots(figsize=(14, 8))
sns.heatmap(
    heatmap_data,
    annot=True,
    fmt=".0f",
    cmap="YlGnBu",
    linewidths=0.5,
    cbar_kws={"label": "BM25 Score"},
    ax=ax,
)

ax.set_title("BM25 Match Scores (Top 10 Job Matches per Resume)")
ax.set_xlabel("Match Rank (1 = Best Match)")
ax.set_ylabel("Resume ID")
fig.tight_layout()
plt.show()

In [None]:
# Compare tokenizer runs by the average gap between each resume's first (best)
# and last stored BM25 score; a larger gap means the top match stands out more.
tokenizer_files = {
    "Roberta-Base": "/data_results/Roberta_Base_Tokenizer_BM25_matches.csv",
    "MPNet": "/data_results/MpNet_Tokenizer_BM25_matches.csv",
    # Directory was spelled "/data_result/" here, unlike every other entry and
    # the path used when the result files are saved.
    "BERT": "/data_results/Bert_Tokenizer_BM25_matches.csv",
    "LongFormer-4096": "/data_results/4096_LongFormer_Tokenizer_BM25_matches.csv"
}

gap_scores_summary = {}

for name, path in tokenizer_files.items():
    df = pd.read_csv(path)

    df["Resume_ID"] = df.groupby("Resume_str", sort=False).ngroup()
    df["Match_Rank"] = df.groupby("Resume_ID").cumcount() + 1

    # Relies on each resume's rows being stored best match first, so
    # first-minus-last is the top-to-bottom score spread.
    gap_scores = df.groupby("Resume_ID")["BM25_score"].apply(lambda x: x.iloc[0] - x.iloc[-1])
    avg_gap = gap_scores.mean()

    gap_scores_summary[name] = avg_gap

sorted_gaps = dict(sorted(gap_scores_summary.items(), key=lambda item: item[1], reverse=True))

print("🔍 BM25 Top-to-Bottom Avg Score Gaps by Tokenizer:")
# The loop variable must not be called `tokenizer`: that would overwrite the
# notebook-level tokenizer object used by tokenize() with a plain string.
for tokenizer_name, gap in sorted_gaps.items():
    print(f"- {tokenizer_name}: {gap:.2f}")

best = max(sorted_gaps, key=sorted_gaps.get)
print(f"\nBest separation (highest avg gap): {best} with {sorted_gaps[best]:.2f}")