In [3]:
import pandas as pd
import numpy as np
import xgboost as xgb
import re
import time
from datasets import load_dataset
from collections import Counter
from scipy.sparse import hstack, csr_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import accuracy_score, f1_score,precision_score, recall_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer, EarlyStoppingCallback

#### Loading the Cleaned Resume dataset

In [2]:
# cleaned_dataset = load_dataset("shesadree15/Cleaned_resume_dataset")
cleaned_dataset = load_dataset("persona-156/tokenized-resume-fit-data")

In [55]:
train_ds = cleaned_dataset["train"]
train_df = train_ds.to_pandas()
print(train_df.shape)
train_df.head()

(5616, 5)


Unnamed: 0,labels,jd_cleaned,resume_cleaned,input_ids,attention_mask
0,1,lrs consulting services has been delivering th...,<<summarysenior lead release engineer speciali...,"[101, 1048, 2869, 10552, 2578, 2038, 2042, 127...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
1,2,"driven, empowering, transformative. academic p...",<<summaryleadership-oriented certified technol...,"[101, 5533, 1010, 7861, 23948, 1010, 10938, 80...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
2,1,position title: senior accountant organization...,<<profilededicated epidemiologist/data manager...,"[101, 2597, 2516, 1024, 3026, 17907, 3029, 102...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
3,1,"at farmer brothers, sales tax accountant will ...",<<summaryseasoned data architect adept at unde...,"[101, 2012, 7500, 3428, 1010, 4341, 4171, 1790...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
4,0,job description i am actively seeking an exper...,<<summaryenergetic and personableadministrativ...,"[101, 3105, 6412, 1045, 2572, 8851, 6224, 2019...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


In [56]:
val_ds   = cleaned_dataset["validation"]
val_df = val_ds.to_pandas()
print(val_df.shape)
val_df.head()

(625, 5)


Unnamed: 0,labels,jd_cleaned,resume_cleaned,input_ids,attention_mask
0,0,job description job title: salesforce communic...,<<summaryi am a computer engineer with over 12...,"[101, 3105, 6412, 3105, 2516, 1024, 4341, 1482...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
1,1,"at farmer brothers, sales tax accountant will ...",<<profilehighly motivated sales associate with...,"[101, 2012, 7500, 3428, 1010, 4341, 4171, 1790...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
2,0,"hello,greetings from devcare solutionsi got an...",<<professional summaryhighly motivated sales a...,"[101, 7592, 1010, 14806, 2015, 2013, 16475, 16...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
3,1,if you can handle the accounting responsibilit...,<<summarymy name is jessica claire and i am in...,"[101, 2065, 2017, 2064, 5047, 1996, 9529, 1019...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
4,0,job purpose: perform designated tasks in the a...,<<summarycapable accountant successful at mana...,"[101, 3105, 3800, 1024, 4685, 4351, 8518, 1999...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


In [57]:
test_ds  = cleaned_dataset["test"]
test_df = test_ds.to_pandas()
print(test_df.shape)
test_df.head()

(1759, 5)


Unnamed: 0,labels,jd_cleaned,resume_cleaned,input_ids,attention_mask
0,1,key responsibilities:create intricate wiring n...,<<summary7+ years of experience as a bi develo...,"[101, 3145, 10198, 1024, 3443, 17796, 27930, 6...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
1,1,personal development and becoming the best you...,<<professional backgroundanalyst versed in dat...,"[101, 3167, 2458, 1998, 3352, 1996, 2190, 2017...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
2,1,"location: tampa, fl exp: 7-10 yrs spoc: tushar...",<<executive profilededicated professional with...,"[101, 3295, 1024, 9925, 1010, 13109, 4654, 236...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
3,1,"primary location: melbourne, florida v-soft co...","<<summarytyee highlightsmicrosoft excel, word,...","[101, 3078, 3295, 1024, 4940, 1010, 3516, 1058...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
4,1,at oregon specialty group the accounting & pay...,<<summaryeit certified engineer and astqb cert...,"[101, 2012, 5392, 12233, 2177, 1996, 9529, 100...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


### To check if the cleaned text already contains words such as good fit, no fit , potental fit

In [6]:
def contains_label_words(text):
    return any(lbl.lower() in text.lower() 
               for lbl in ["good fit", "no fit", "potential fit"])


#### Data leakage before removing fit labels

In [7]:
train_df["leakage_jd"] = train_df["jd_cleaned"].apply(contains_label_words)
train_df["leakage_jd"].value_counts()

leakage_jd
False    5616
Name: count, dtype: int64

In [8]:
train_df["leakage_resume"] = train_df["resume_cleaned"].apply(contains_label_words)
train_df["leakage_resume"].value_counts()

leakage_resume
False    5616
Name: count, dtype: int64

##### Check to confirm if there are words like good fit, no fit, potential fit and to remove any as this can lead the model to cheat instead to find patterns

In [9]:
def remove_label_words(text):
    patterns = [
        r"good fit", 
        r"no fit", 
        r"potential fit",
        r"fit score[: ]?\d+",
        r"label[: ]?\w+",
    ]
    for p in patterns:
        text = re.sub(p, " ", text, flags=re.IGNORECASE)
    return " ".join(text.split())

In [40]:
train_df["cleaned_text"] = train_df["cleaned_text"].apply(remove_label_words)
val_df["cleaned_text"]   = val_df["cleaned_text"].apply(remove_label_words)
test_df["cleaned_text"]  = test_df["cleaned_text"].apply(remove_label_words)

#### Data leakage after fit labels are removed

In [None]:
train_df["leakage"] = train_df["cleaned_text"].apply(contains_label_words)
train_df["leakage"].value_counts()

### Converting the cleaned text to numeric vectors

In [58]:
all_jd_text = " ".join(train_df['jd_cleaned'])
words = all_jd_text.split()
freq = Counter(words)

In [59]:
jd_keyword_tfidf = TfidfVectorizer(max_features=3000, stop_words='english')
jd_keyword_tfidf.fit(train_df["jd_cleaned"])

AUTO_SKILLS = list(jd_keyword_tfidf.vocabulary_.keys())

In [60]:
def extract_skills_auto(jd_text):
    jd_text = jd_text.lower().split()
    return [word for word in jd_text if word in AUTO_SKILLS]

In [61]:
train_df["jd_skill_list"] = train_df["jd_cleaned"].apply(extract_skills_auto)
val_df["jd_skill_list"]   = val_df["jd_cleaned"].apply(extract_skills_auto)
test_df["jd_skill_list"]  = test_df["jd_cleaned"].apply(extract_skills_auto)

In [62]:
def resume_skill_match(row):
    resume = row["resume_cleaned"].split()
    jd_skills = row["jd_skill_list"]
    return sum(skill in resume for skill in jd_skills)

In [63]:
train_df["skill_match_count"] = train_df.apply(resume_skill_match, axis=1)
val_df["skill_match_count"]   = val_df.apply(resume_skill_match, axis=1)
test_df["skill_match_count"]  = test_df.apply(resume_skill_match, axis=1)


In [64]:
def skill_match_ratio(row):
    jd_skills = row["jd_skill_list"]
    if len(jd_skills) == 0: 
        return 0
    resume = row["resume_cleaned"].split()
    matched = sum(skill in resume for skill in jd_skills)
    return matched / len(jd_skills)

In [65]:
train_df["skill_ratio"] = train_df.apply(skill_match_ratio, axis=1)
val_df["skill_ratio"]   = val_df.apply(skill_match_ratio, axis=1)
test_df["skill_ratio"]  = test_df.apply(skill_match_ratio, axis=1)


In [66]:
def numeric_feats(df):
    return csr_matrix(df[[
        "skill_match_count",
        "skill_ratio",
    ]].values)

train_num = numeric_feats(train_df)
val_num   = numeric_feats(val_df)
test_num  = numeric_feats(test_df)


In [67]:
tfidf_jd = TfidfVectorizer(max_features=12000, ngram_range=(1,2), stop_words="english")
X_train_jd = tfidf_jd.fit_transform(train_df["jd_cleaned"])
X_val_jd   = tfidf_jd.transform(val_df["jd_cleaned"])
X_test_jd  = tfidf_jd.transform(test_df["jd_cleaned"])


In [68]:
tfidf_res = TfidfVectorizer(max_features=12000, ngram_range=(1,2), stop_words="english")
X_train_res = tfidf_res.fit_transform(train_df["resume_cleaned"])
X_val_res   = tfidf_res.transform(val_df["resume_cleaned"])
X_test_res  = tfidf_res.transform(test_df["resume_cleaned"])


In [69]:
def cos_sim(mat1, mat2):
    sims = [cosine_similarity(mat1[i], mat2[i])[0][0] for i in range(mat1.shape[0])]
    return csr_matrix(np.array(sims).reshape(-1,1))

train_cos = cos_sim(X_train_jd, X_train_res)
val_cos   = cos_sim(X_val_jd, X_val_res)
test_cos  = cos_sim(X_test_jd, X_test_res)


In [70]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler(with_mean=False)   # IMPORTANT for sparse matrices

train_num_scaled = scaler.fit_transform(train_num)
val_num_scaled   = scaler.transform(val_num)
test_num_scaled  = scaler.transform(test_num)

In [78]:
X_train = hstack([X_train_jd, X_train_res, train_cos, train_num_scaled])
X_val   = hstack([X_val_jd,   X_val_res,   val_cos,   val_num_scaled])
X_test  = hstack([X_test_jd,  X_test_res,  test_cos,  test_num_scaled])

y_train = train_df["labels"].values
y_val   = val_df["labels"].values
y_test  = test_df["labels"].values

print(X_train.shape)   # Expected ~25k – 30k features

(5616, 24003)


In [79]:
svm = SGDClassifier(
    loss="hinge",       # linear SVM
    penalty="l2",       
    alpha=1e-4,         # inverse of C
    max_iter=8000,
    tol=1e-3,
)
svm.fit(X_train, y_train)


0,1,2
,loss,'hinge'
,penalty,'l2'
,alpha,0.0001
,l1_ratio,0.15
,fit_intercept,True
,max_iter,8000
,tol,0.001
,shuffle,True
,verbose,0
,epsilon,0.1


In [80]:
logreg = LogisticRegression(solver="saga",
    penalty="l2",
    max_iter=10000,
    tol=1e-2,
    n_jobs=-1)
logreg.fit(X_train, y_train)


0,1,2
,penalty,'l2'
,dual,False
,tol,0.01
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'saga'
,max_iter,10000


In [81]:
xgb_model = xgb.XGBClassifier(
    objective="multi:softprob",
    num_class=3,
    eval_metric="mlogloss",
    n_estimators=250,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8
)
xgb_model.fit(X_train, y_train)

0,1,2
,objective,'multi:softprob'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.8
,device,
,early_stopping_rounds,
,enable_categorical,False


In [82]:
print("TRAIN:", X_train.shape)
print("VAL:", X_val.shape)
print("TEST:", X_test.shape)

TRAIN: (5616, 24003)
VAL: (625, 24003)
TEST: (1759, 24003)


In [86]:
def eval_model(model, X, y):
    preds = model.predict(X)
    if preds.ndim > 1:
        preds = preds.argmax(axis=1)
    return accuracy_score(y, preds), f1_score(y, preds, average="weighted")

models = { "LOGREG": logreg,"SVM": svm, "XGB": xgb_model}

for name, m in models.items():
    acc, f1 = eval_model(m, X_train, y_train)
    print(f"{name} → TRAIN ACC: {acc:.4f}, TRAIN F1: {f1:.4f}")

LOGREG → TRAIN ACC: 0.7534, TRAIN F1: 0.7495
SVM → TRAIN ACC: 0.7961, TRAIN F1: 0.7955
XGB → TRAIN ACC: 0.9991, TRAIN F1: 0.9991


In [87]:
for name, m in models.items():
    acc, f1 = eval_model(m, X_val, y_val)
    print(f"{name} → VAL ACC: {acc:.4f}, VAL F1: {f1:.4f}")

LOGREG → VAL ACC: 0.6672, VAL F1: 0.6596
SVM → VAL ACC: 0.7024, VAL F1: 0.7030
XGB → VAL ACC: 0.7408, VAL F1: 0.7367
