**Method 1 Transformer based architecture**

Since the dataset contains text data (job titles) with multiple possible labels, a transformer-based model like BERT is a perfect fit. Transformers are great at capturing the relationships between words in context, which makes them particularly effective for understanding job titles, especially when the titles include specific industry terms.

In [4]:
import os
import numpy as np
import pandas as pd
import torch
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset
from transformers import DataCollatorWithPadding
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
from sklearn.feature_extraction.text import TfidfVectorizer
from catboost import CatBoostClassifier
import joblib

In [5]:
data = pd.read_excel("JobLevelData.xlsx")
data.head()

Unnamed: 0,Title,Column 1,Column 2,Column 3,Column 4
0,Vice President / Director of Systems Engineering,Vice President,,,
1,Systems Engineer; Systems Architect,Manager,Individual Contributor/Staff,,
2,"Executive Director, Global IT Infrastructure /...",Director,Chief Officer,,
3,CTO/Executive Director of Technology Services,Director,Chief Officer,,
4,"Vice President, CIO",Vice President,,,


In [6]:
# Step 1: Data Preprocessing
# Filling missing values in the dataset with an empty string to ensure consistent handling of missing data
data = data.drop(["Column 3", "Column 4"], axis=1)
data.fillna("", inplace=True)


# Combining labels from Columns 1 and 2 into a single list for each job title
# Only non-empty labels are included in the list
data["labels"] = data[["Column 1", "Column 2"]].apply(
    lambda x: [label for label in x if label], axis=1
)

# Applying MultiLabelBinarizer to transform labels into a binary format
# This allows the model to process multi-label classification effectively
mlb = MultiLabelBinarizer()
labels = mlb.fit_transform(data["labels"])


# Step 2: Dataset Creation
# Creating a custom dataset class for PyTorch to handle job titles and their corresponding labels
class JobTitlesDataset(Dataset):
    def __init__(self, titles, labels):
        self.titles = titles
        self.labels = labels

    def __len__(self):
        return len(self.titles)

    def __getitem__(self, idx):
        # Tokenizing the text of the job title
        # Using padding, truncation, and a maximum length of 128 tokens to ensure uniformity
        encodings = tokenizer(
            self.titles[idx],
            padding="max_length",
            truncation=True,
            max_length=128,
            return_tensors="pt"
        )
        # Preparing the input for the model
        item = {key: val.squeeze(0) for key, val in encodings.items()}
        # Converting labels to a tensor of type float
        item["labels"] = torch.tensor(self.labels[idx], dtype=torch.float)
        return item

# Step 3: Data Splitting
# Splitting the data into training and validation sets
# 80% of the data is used for training, and 20% is used for validation
train_texts, val_texts, train_labels, val_labels = train_test_split(
    data["Title"].tolist(), labels, test_size=0.2, random_state=42
)

# Creating training and validation datasets using the custom dataset class
train_dataset = JobTitlesDataset(train_texts, train_labels)
val_dataset = JobTitlesDataset(val_texts, val_labels)


In [7]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=len(mlb.classes_), problem_type="multi_label_classification")
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [9]:
os.environ["WANDB_MODE"] = "disabled"

In [10]:
# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    run_name="text-classification",
    eval_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    learning_rate=5e-5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    logging_dir="./logs",
    logging_strategy="steps",
    logging_steps=50,

)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator
)

# Train and evaluate
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.172,0.133858
2,0.0917,0.105571
3,0.084,0.087674
4,0.0682,0.090183
5,0.0522,0.091789


TrainOutput(global_step=560, training_loss=0.10934469577457223, metrics={'train_runtime': 301.1324, 'train_samples_per_second': 29.754, 'train_steps_per_second': 1.86, 'total_flos': 589389930823680.0, 'train_loss': 0.10934469577457223, 'epoch': 5.0})

In [11]:
# Save the model and tokenizer
model.save_pretrained("./job_title_classifier")
tokenizer.save_pretrained("./job_title_classifier")

('./job_title_classifier/tokenizer_config.json',
 './job_title_classifier/special_tokens_map.json',
 './job_title_classifier/vocab.txt',
 './job_title_classifier/added_tokens.json')

#This cell runs take some time so you can do not run it I have calculated and hard coded best threshold

In [18]:
mlb_classes = ['Chief Officer', 'Director', 'Individual Contributor/Staff', 'Manager', 'Owner', 'Vice President']

def predict_labels_with_threshold(job_titles, threshold):

    predictions = []
    for title in job_titles:
        # Tokenize the input title
        inputs = tokenizer(title, padding="max_length", truncation=True, max_length=128, return_tensors="pt")
        inputs = inputs.to(device)  # Move inputs to the same device as the model

        # Perform inference
        with torch.no_grad():
            outputs = model(**inputs)

        # Apply sigmoid to get probabilities
        probs = torch.sigmoid(outputs.logits).squeeze().tolist()

        # Convert probabilities to binary predictions using the threshold
        binary_predictions = [1 if prob > threshold else 0 for prob in probs]
        predictions.append(binary_predictions)

    return np.array(predictions)

def find_best_threshold(job_titles, y_true, thresholds=np.arange(0.15, 0.95, 0.1)):

    best_threshold = 0.0
    best_metric = (-float("inf"), 0, 0)  # Prioritize F1 score, then accuracy
    results = []

    for threshold in tqdm(thresholds):
        y_pred = predict_labels_with_threshold(job_titles, threshold)

        # Calculate metrics
        f1 = f1_score(y_true, y_pred, average="macro", zero_division=1)
        accuracy = accuracy_score(y_true, y_pred)

        # Custom metric prioritization (F1 first, then accuracy)
        metric = (f1, accuracy)

        # Append results for logging
        results.append((threshold, f1, accuracy))

        # Update best threshold if metrics improve
        if metric > best_metric:
            best_metric = metric
            best_threshold = threshold

    return best_threshold

# Evaluate thresholds
best_threshold_for_BERT = 0.45
# best_threshold_for_BERT = find_best_threshold(val_texts, val_labels)

100%|██████████| 8/8 [00:56<00:00,  7.03s/it]


In [20]:
def evaluate_with_best_threshold(job_titles, y_true, best_threshold):

    # Predict labels with the best threshold
    y_pred = predict_labels_with_threshold(job_titles, best_threshold)

    # Calculate metrics
    precision = precision_score(y_true, y_pred, average="macro", zero_division=1)
    recall = recall_score(y_true, y_pred, average="macro", zero_division=1)
    f1 = f1_score(y_true, y_pred, average="macro", zero_division=1)
    accuracy = accuracy_score(y_true, y_pred)

    return {
        "precision": precision,
        "recall": recall,
        "f1_score": f1,
        "accuracy": accuracy
    }


# Calculate metrics with the best threshold
final_metrics = evaluate_with_best_threshold(val_texts, val_labels, best_threshold_for_BERT)

print(f"Best Threshold: {best_threshold_for_BERT:.2f}")
print("Final Evaluation Metrics:")
print(f"Precision: {final_metrics['precision']:.4f}")
print(f"Recall: {final_metrics['recall']:.4f}")
print(f"F1 Score: {final_metrics['f1_score']:.4f}")
print(f"Accuracy: {final_metrics['accuracy']:.4f}")

Best Threshold: 0.45
Final Evaluation Metrics:
Precision: 0.9197
Recall: 0.8617
F1 Score: 0.8865
Accuracy: 0.8973


**Method 2 CatBoost Algorithm**

CatBoost is an excellent second choice for this task due to its ability to handle categorical data natively and its robustness with small- to medium-sized datasets.

In [21]:
data = pd.read_excel("JobLevelData.xlsx")
data.head()

Unnamed: 0,Title,Column 1,Column 2,Column 3,Column 4
0,Vice President / Director of Systems Engineering,Vice President,,,
1,Systems Engineer; Systems Architect,Manager,Individual Contributor/Staff,,
2,"Executive Director, Global IT Infrastructure /...",Director,Chief Officer,,
3,CTO/Executive Director of Technology Services,Director,Chief Officer,,
4,"Vice President, CIO",Vice President,,,


In [22]:
# Step 1: Data Preprocessing
# Filling missing values in the dataset with an empty string to ensure consistent handling of missing data
data = data.drop(["Column 3", "Column 4"], axis=1)
data.fillna("", inplace=True)

# Combining labels from Columns 1 and 2 into a single list for each job title
# Only non-empty labels are included in the list
data["labels"] = data[["Column 1", "Column 2"]].apply(
    lambda x: [label for label in x if label], axis=1
)

# Applying MultiLabelBinarizer to transform labels into a binary format
# This allows the model to process multi-label classification effectively
mlb = MultiLabelBinarizer()
labels = mlb.fit_transform(data["labels"])

In [23]:
# Preprocess the "Title" column
def preprocess_text(text):
    # Lowercase the text and remove unnecessary characters
    return text.lower()

data['Title'] = data['Title'].apply(preprocess_text)

# Convert "Title" to numeric features using TfidfVectorizer
tfidf = TfidfVectorizer(max_features=500)
X_features = tfidf.fit_transform(data['Title']).toarray()

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_features, labels, test_size=0.2, random_state=42)

In [24]:
# Initialize the CatBoostClassifier for multi-label classification
cat_boost_model = CatBoostClassifier(
    iterations=1000,
    depth=7,
    learning_rate=0.1,
    loss_function='MultiLogloss',
    task_type="CPU",
    random_seed=42
)

# Train the model
cat_boost_model.fit(X_train, y_train, verbose=50)

0:	learn: 0.5724708	total: 224ms	remaining: 3m 44s
50:	learn: 0.1090643	total: 3.68s	remaining: 1m 8s
100:	learn: 0.0799556	total: 9.16s	remaining: 1m 21s
150:	learn: 0.0625659	total: 13.6s	remaining: 1m 16s
200:	learn: 0.0509757	total: 17s	remaining: 1m 7s
250:	learn: 0.0438193	total: 19.9s	remaining: 59.4s
300:	learn: 0.0369664	total: 22.8s	remaining: 52.9s
350:	learn: 0.0322088	total: 26.4s	remaining: 48.8s
400:	learn: 0.0278267	total: 30.7s	remaining: 45.8s
450:	learn: 0.0245993	total: 33.6s	remaining: 40.9s
500:	learn: 0.0217819	total: 36.5s	remaining: 36.3s
550:	learn: 0.0195969	total: 39.3s	remaining: 32.1s
600:	learn: 0.0179853	total: 44.3s	remaining: 29.4s
650:	learn: 0.0164627	total: 47.2s	remaining: 25.3s
700:	learn: 0.0150754	total: 50.2s	remaining: 21.4s
750:	learn: 0.0139315	total: 53.1s	remaining: 17.6s
800:	learn: 0.0128983	total: 57.9s	remaining: 14.4s
850:	learn: 0.0121258	total: 1m 3s	remaining: 11s
900:	learn: 0.0113991	total: 1m 5s	remaining: 7.24s
950:	learn: 0.01

<catboost.core.CatBoostClassifier at 0x79d9749207d0>

In [26]:
cat_boost_model.save_model("cat_boost_model/catboost_model.cbm")
joblib.dump(tfidf, "cat_boost_model/tfidf_vectorizer.pkl")

['cat_boost_model/tfidf_vectorizer.pkl']

In [27]:
def predict_labels_with_threshold(job_titles, threshold):

    # Perform inference
    probs = cat_boost_model.predict_proba(job_titles)

    # Convert probabilities to binary predictions using the threshold
    binary_predictions = (probs > threshold).astype(int)

    return binary_predictions

def find_best_threshold(job_titles, y_true, thresholds=np.arange(0.25, 0.95, 0.01)):

    best_threshold = 0.0
    best_metric = (-float("inf"), 0, 0)  # Prioritize F1 score, then accuracy
    results = []

    for threshold in thresholds:
        y_pred = predict_labels_with_threshold(job_titles, threshold)

        # Calculate metrics
        f1 = f1_score(y_true, y_pred, average="macro", zero_division=1)
        accuracy = accuracy_score(y_true, y_pred)

        # Custom metric prioritization (F1 first, then accuracy)
        metric = (f1, accuracy)

        # Append results for logging
        results.append((threshold, f1, accuracy))

        # Update best threshold if metrics improve
        if metric > best_metric:
            best_metric = metric
            best_threshold = threshold

    return best_threshold

# Evaluate thresholds
best_threshold_for_catboost = find_best_threshold(X_test, y_test)

In [28]:
def evaluate_with_best_threshold(job_titles, y_true, best_threshold_for_catboost):

    # Predict labels with the best threshold
    y_pred = predict_labels_with_threshold(job_titles, best_threshold_for_catboost)

    # Calculate metrics
    precision = precision_score(y_true, y_pred, average="macro", zero_division=1)
    recall = recall_score(y_true, y_pred, average="macro", zero_division=1)
    f1 = f1_score(y_true, y_pred, average="macro", zero_division=1)
    accuracy = accuracy_score(y_true, y_pred)

    return {
        "precision": precision,
        "recall": recall,
        "f1_score": f1,
        "accuracy": accuracy
    }

# Calculate metrics with the best threshold
final_metrics = evaluate_with_best_threshold(X_test, y_test, best_threshold_for_catboost)

print(f"Best Threshold: {best_threshold_for_catboost:.2f}")
print("Final Evaluation Metrics:")
print(f"Precision: {final_metrics['precision']:.4f}")
print(f"Recall: {final_metrics['recall']:.4f}")
print(f"F1 Score: {final_metrics['f1_score']:.4f}")
print(f"Accuracy: {final_metrics['accuracy']:.4f}")


Best Threshold: 0.59
Final Evaluation Metrics:
Precision: 0.9059
Recall: 0.8322
F1 Score: 0.8642
Accuracy: 0.8504


#INFERENCE

In [34]:
# Load models
catboost_model = CatBoostClassifier()
catboost_model.load_model("cat_boost_model/catboost_model.cbm")
tfidf = joblib.load("cat_boost_model/tfidf_vectorizer.pkl")



model_path = "job_title_classifier"
tokenizer = BertTokenizer.from_pretrained(model_path)
model = BertForSequenceClassification.from_pretrained(model_path)
model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [30]:
mlb_classes = ['Chief Officer', 'Director', 'Individual Contributor/Staff', 'Manager', 'Owner', 'Vice President']

In [31]:
def classify_with_bert(job_title, threshold):

    # Tokenize the input title
    inputs = tokenizer(job_title, padding="max_length", truncation=True, max_length=128, return_tensors="pt")

    # Perform inference
    with torch.no_grad():
        outputs = model(**inputs)

    # Apply sigmoid to get probabilities
    probs = torch.sigmoid(outputs.logits).squeeze().tolist()

    # Convert probabilities to binary predictions using the threshold
    binary_predictions = [mlb.classes_[i] for i, prob in enumerate(probs) if prob > threshold]

    return binary_predictions

In [32]:
def classify_with_catboost(job_title, threshold):

    # Convert the single job title to features using the same TfidfVectorizer
    features = tfidf.transform([job_title]).toarray()

    # Perform inference
    probs = cat_boost_model.predict_proba(features)

    # Get the predicted binary labels based on the threshold
    binary_predictions = (probs > threshold).astype(int)

    # Map the binary predictions back to the corresponding labels
    predicted_labels = [mlb_classes[i] for i, val in enumerate(binary_predictions[0]) if val == 1]

    return predicted_labels


In [33]:
job_titles = "Marketing Manager"

# Classify using CatBoost
catboost_predictions = classify_with_catboost(job_titles, threshold=best_threshold_for_catboost)
print("CatBoost Predictions:", catboost_predictions)

# Classify using BERT
bert_predictions = classify_with_bert(job_titles, threshold=best_threshold_for_BERT)
print("BERT Predictions:", bert_predictions)

CatBoost Predictions: ['Manager']
BERT Predictions: ['Manager']


**I didn't have sufficient time to tune all the hyperparameters. With proper tuning, I believe all the metrics could be improved.**