In [11]:
import pandas as pd
import xgboost as xgb
import re
import time
from datasets import load_dataset
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer, EarlyStoppingCallback

#### Loading the Cleaned Resume dataset

In [None]:
# Download (or reuse the local cache of) the cleaned resume/job-description dataset from the Hugging Face Hub.
cleaned_dataset = load_dataset(path="shesadree15/Cleaned_resume_dataset")

In [19]:
# Training split: a pandas copy for inspection plus the original Dataset handle.
train_df = cleaned_dataset["train"].to_pandas()
train_ds = cleaned_dataset["train"]
print(train_df.shape)
train_df.head(n=5)

(5616, 5)


Unnamed: 0,text,label,cleaned_text,label_id,text_length
0,For the given job description <<LRS Consulting...,No Fit,LRS Consulting Services has been delivering th...,1,8762
1,"For the given job description <<Driven, empowe...",Potential Fit,"Driven, empowering, transformative. Academic P...",2,8248
2,For the given job description <<Position Title...,No Fit,Position Title: Senior Accountant Organization...,1,12933
3,For the given job description <<At Farmer Brot...,No Fit,"At Farmer Brothers, Sales Tax Accountant will ...",1,13409
4,For the given job description <<Job Descriptio...,Good Fit,Job Description I am actively seeking an exper...,0,7312


In [20]:
# Validation split: a pandas copy for inspection plus the original Dataset handle.
val_df = cleaned_dataset["validation"].to_pandas()
val_ds = cleaned_dataset["validation"]
print(val_df.shape)
val_df.head(n=5)

(625, 5)


Unnamed: 0,text,label,cleaned_text,label_id,text_length
0,For the given job description <<Job Descriptio...,Good Fit,Job Description Job Title: Salesforce Communic...,0,6372
1,For the given job description <<At Farmer Brot...,No Fit,"At Farmer Brothers, Sales Tax Accountant will ...",1,8473
2,"For the given job description <<Hello,Greeting...",Good Fit,"Hello,Greetings from DevCare SolutionsI got an...",0,7882
3,For the given job description <<If you can han...,No Fit,If you can handle the accounting responsibilit...,1,7819
4,For the given job description <<Job Purpose: P...,Good Fit,Job Purpose: Perform designated tasks in the a...,0,6636


In [21]:
# Test split: a pandas copy for inspection plus the original Dataset handle.
test_df = cleaned_dataset["test"].to_pandas()
test_ds = cleaned_dataset["test"]
print(test_df.shape)
test_df.head(n=5)

(1759, 5)


Unnamed: 0,text,label,cleaned_text,label_id,text_length
0,For the given job description <<Key Responsibi...,No Fit,Key Responsibilities:Create intricate wiring n...,1,9907
1,For the given job description <<Personal devel...,No Fit,Personal development and becoming the best you...,1,12665
2,For the given job description <<Location: Tamp...,No Fit,"Location: Tampa, FL Exp: 7-10 Yrs SPOC: Tushar...",1,8929
3,For the given job description <<Primary Locati...,No Fit,"Primary Location: Melbourne, Florida V-Soft Co...",1,7461
4,For the given job description <<At Oregon Spec...,No Fit,At Oregon Specialty Group the Accounting & Pay...,1,11198


### Check whether the cleaned text already contains label phrases such as "good fit", "no fit", or "potential fit"

In [None]:
def contains_label_words(text):
    """Return True when ``text`` contains any class-label phrase.

    The comparison is case-insensitive and purely substring based, so a
    phrase embedded in longer words (e.g. "piano fitness" contains "no fit")
    also counts as a hit.

    Args:
        text: string to inspect.

    Returns:
        bool: True if "good fit", "no fit" or "potential fit" occurs in ``text``.
    """
    lowered = text.lower()
    for phrase in ("good fit", "no fit", "potential fit"):
        if phrase in lowered:
            return True
    return False


#### Data leakage before removing fit labels

In [None]:
# Flag each training row whose cleaned text mentions a class-label phrase, then count hits vs. misses.
train_df["leakage"] = [contains_label_words(doc) for doc in train_df["cleaned_text"]]
train_df["leakage"].value_counts()

In [39]:
def remove_label_words(text):
    """Strip label-revealing phrases from ``text`` and normalise whitespace.

    Each pattern is applied in order, case-insensitively, and every match is
    replaced by a single space. Afterwards all whitespace runs are collapsed
    to one space and leading/trailing whitespace is dropped.

    Args:
        text: string to clean.

    Returns:
        str: the cleaned text.
    """
    patterns = [
        r"good fit",
        r"no fit",
        r"potential fit",
        # Allow any mix of colons/whitespace between "fit score" and the number,
        # so "fit score: 85" (colon followed by a space) is removed as well as
        # "fit score 85" and "fit score:85".
        r"fit score[:\s]*\d+",
        # NOTE: matches "label" + at most one ':' or ' ' + word characters, so
        # it also removes words such as "labels" or "labeling".
        r"label[: ]?\w+",
    ]
    for p in patterns:
        text = re.sub(p, " ", text, flags=re.IGNORECASE)
    # split()/join collapses the gaps left behind by the substitutions.
    return " ".join(text.split())

In [40]:
# Apply the same label-phrase scrubbing to every split so train/val/test are processed identically.
for split_frame in (train_df, val_df, test_df):
    split_frame["cleaned_text"] = split_frame["cleaned_text"].apply(remove_label_words)

#### Data leakage after fit labels are removed

In [None]:
# Re-run the leakage check on the scrubbed training text and count the rows still flagged.
train_df["leakage"] = [contains_label_words(doc) for doc in train_df["cleaned_text"]]
train_df["leakage"].value_counts()

In [43]:
# Features are the scrubbed text; targets are the integer class ids.
X_train, y_train = train_df["cleaned_text"], train_df["label_id"]
X_val, y_val = val_df["cleaned_text"], val_df["label_id"]
X_test, y_test = test_df["cleaned_text"], test_df["label_id"]

### Converting the cleaned text to numeric vectors

In [50]:
# Unigram + bigram TF-IDF, capped at 5000 features, with English stop words removed.
tfidf_params = {
    "max_features": 5000,
    "ngram_range": (1, 2),
    "stop_words": "english",
}
vectorizer = TfidfVectorizer(**tfidf_params)

# Fit on the training split only; validation and test are transformed with the fitted vectorizer.
X_train_vec = vectorizer.fit_transform(X_train)
X_val_vec, X_test_vec = (vectorizer.transform(split) for split in (X_val, X_test))

### Evaluation metric function

In [51]:
def compute_metrics(model, X, y):
    """Score a fitted classifier on one data split.

    Args:
        model: object exposing ``predict(X)``.
        X: feature matrix passed to ``model.predict``.
        y: true labels for ``X``.

    Returns:
        dict with keys "accuracy" and "f1_weighted".
    """
    predicted = model.predict(X)
    accuracy = accuracy_score(y, predicted)
    weighted_f1 = f1_score(y, predicted, average="weighted")
    return {"accuracy": accuracy, "f1_weighted": weighted_f1}

### Checking evaluation across 3 models Logistic regression, SVC, XGB

In [52]:
# Fixed seed so repeated runs of the baseline comparison give the same numbers.
RANDOM_STATE = 42

# Candidate classical baselines, keyed by the display name used in the results table.
models = {
    "Logistic Regression": LogisticRegression(max_iter=2500),
    # LinearSVC uses random_state for its internal data shuffling; pin it for reproducibility.
    "Linear SVM": LinearSVC(random_state=RANDOM_STATE),
    "XGBoost": xgb.XGBClassifier(
        objective="multi:softmax",
        num_class=len(y_train.unique()),  # one class per distinct label id in the training targets
        eval_metric="mlogloss",
        n_estimators=350,
        learning_rate=0.15,
        max_depth=6,
        random_state=RANDOM_STATE,
    )
}

### Iterating over the 3 models, storing the results and time taken by each model in the results dictionary

In [56]:
# Fit every baseline on the TF-IDF training matrix and score it on the validation split.
# NOTE: this relies on the compute_metrics(model, X, y) helper; a later cell defines
# another function with the same name, so run this loop before that cell.
results = {}

for name, model in models.items():
    print(f"Training {name}...")

    # perf_counter is a monotonic, high-resolution clock meant for measuring
    # elapsed time; time.time() can jump if the system clock is adjusted.
    start = time.perf_counter()
    model.fit(X_train_vec, y_train)
    train_time = time.perf_counter() - start
    print(f"Training time: {train_time:.3f} sec")

    res = compute_metrics(model, X_val_vec, y_val)

    # Store the fit duration next to the quality metrics for the comparison table.
    res["train_time_sec"] = round(train_time, 3)
    results[name] = res

Training Logistic Regression...
Training time: 1.771 sec
Training Linear SVM...
Training time: 2.183 sec
Training XGBoost...
Training time: 338.923 sec


In [57]:
# One row per model, one column per metric (the dict of dicts is column-oriented, hence the transpose).
metrics_by_model = pd.DataFrame(results)
comparison_res_df = metrics_by_model.transpose()
comparison_res_df

Unnamed: 0,accuracy,f1_weighted,train_time_sec
Logistic Regression,0.6224,0.606954,1.771
Linear SVM,0.6608,0.654788,2.183
XGBoost,0.6912,0.685638,338.923


### Inference

##### After removing the label leakage (phrases such as "good fit", "no fit", and "potential fit" let the model cheat instead of learning, because classification reduced to matching an identical string), the classical machine-learning models trained on TF-IDF features fall within the expected performance range for 3-class text classification. The baseline ranges from 55-70% accuracy and 60-75% weighted F1.

##### Our results fall within this baseline range, indicating moderate dataset difficulty and no data leakage.

##### Given the complexity of resume-JD semantic matching, classical models reach their natural performance ceiling. This motivates exploring advanced transformer-based models such as BERT/ALBERT.

#### Loading the tokenized Resume dataset

In [3]:
# Download (or reuse the local cache of) the pre-tokenized resume dataset from the Hugging Face Hub.
tokenized_dataset = load_dataset(path="sibi-seeni/tokenized-resume-data")

In [4]:
# Pull the three tokenized splits out of the DatasetDict in one go.
train_tok_ds, val_tok_ds, test_tok_ds = (
    tokenized_dataset[split_name] for split_name in ("train", "validation", "test")
)

In [5]:
# Switch every tokenized split to the "torch" output format (set_format modifies the dataset in place).
for tok_split in (train_tok_ds, val_tok_ds, test_tok_ds):
    tok_split.set_format("torch")

#### Using the ALBERT model

In [None]:
# ALBERT base checkpoint with a 3-way sequence-classification head (one output per fit class).
albert_checkpoint = "albert-base-v2"
model = AutoModelForSequenceClassification.from_pretrained(albert_checkpoint, num_labels=3)

Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


#### Training arguments with early stopping

In [12]:
args = TrainingArguments(
    output_dir="albert-fit-model",
    # Optimisation settings
    learning_rate=2e-5,
    num_train_epochs=5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    # Evaluate, checkpoint and log once per epoch (eval and save strategies match,
    # which load_best_model_at_end needs)
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    # Best-checkpoint selection: highest validation accuracy wins
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True,
)

#### To compute metrics

In [13]:
def compute_metrics(eval_pred):
    """Compute evaluation metrics for the Hugging Face ``Trainer``.

    NOTE: this reuses the name of the ``compute_metrics(model, X, y)`` helper
    defined earlier for the classical models and replaces it in the kernel
    namespace once this cell has run.

    Args:
        eval_pred: pair ``(logits, labels)``; logits are assumed to be a 2-D
            array of shape (n_samples, n_classes) — TODO confirm.

    Returns:
        dict with "accuracy" (the key used for best-model selection) and
        "f1_weighted", the same two metrics reported for the classical
        baselines so the results are directly comparable.
    """
    logits, labels = eval_pred
    # The predicted class is the index of the largest logit in each row.
    preds = logits.argmax(axis=1)
    return {
        "accuracy": accuracy_score(labels, preds),
        "f1_weighted": f1_score(labels, preds, average="weighted"),
    }

#### Training with early stopping callback

In [14]:
# Halt training once the monitored metric has not improved for 2 consecutive evaluations.
early_stopping = EarlyStoppingCallback(early_stopping_patience=2)

# Train on the tokenized training split; evaluate on the tokenized validation split.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_tok_ds,
    eval_dataset=val_tok_ds,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)

In [None]:
# Fine-tune the model; keep the returned training summary and show it as the cell output.
train_output = trainer.train()
train_output

In [None]:
# Train accuracy
# Predict on the tokenized split the Trainer was built with; the raw `train_ds`
# holds only the text/label columns shown earlier, not tokenized model inputs.
train_pred = trainer.predict(train_tok_ds)
train_acc = accuracy_score(train_pred.label_ids, train_pred.predictions.argmax(axis=1))
print("Train Accuracy:", round(train_acc, 4))

# Validation accuracy
# Same reasoning: use the tokenized validation split rather than the raw `val_ds`.
val_pred = trainer.predict(val_tok_ds)
val_acc = accuracy_score(val_pred.label_ids, val_pred.predictions.argmax(axis=1))
print("Validation Accuracy:", round(val_acc, 4))