**Method 1 Transformer based architecture**

Since the dataset contains text data (job titles) with multiple possible labels, a transformer-based model like BERT is a perfect fit. Transformers are great at capturing the relationships between words in context, which makes them particularly effective for understanding job titles, especially when the titles include specific industry terms.

In [42]:
import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset
from transformers import DataCollatorWithPadding
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
from sklearn.feature_extraction.text import TfidfVectorizer
from catboost import CatBoostClassifier
import joblib

In [43]:
data = pd.read_excel("JobLevelData.xlsx")
data.head()

Unnamed: 0,Title,Column 1,Column 2,Column 3,Column 4
0,Vice President / Director of Systems Engineering,Vice President,,,
1,Systems Engineer; Systems Architect,Manager,Individual Contributor/Staff,,
2,"Executive Director, Global IT Infrastructure /...",Director,Chief Officer,,
3,CTO/Executive Director of Technology Services,Director,Chief Officer,,
4,"Vice President, CIO",Vice President,,,


In [44]:
# Step 1: Data Preprocessing
# Filling missing values in the dataset with an empty string to ensure consistent handling of missing data
data.fillna("", inplace=True)

# Combining labels from Columns 1 to 4 into a single list for each job title
# Only non-empty labels are included in the list
data["labels"] = data[["Column 1", "Column 2", "Column 3", "Column 4"]].apply(
    lambda x: [label for label in x if label], axis=1
)

# Applying MultiLabelBinarizer to transform labels into a binary format
# This allows the model to process multi-label classification effectively
mlb = MultiLabelBinarizer()
labels = mlb.fit_transform(data["labels"])


# Step 2: Dataset Creation
# Creating a custom dataset class for PyTorch to handle job titles and their corresponding labels
class JobTitlesDataset(Dataset):
    def __init__(self, titles, labels):
        self.titles = titles
        self.labels = labels

    def __len__(self):
        return len(self.titles)

    def __getitem__(self, idx):
        # Tokenizing the text of the job title
        # Using padding, truncation, and a maximum length of 128 tokens to ensure uniformity
        encodings = tokenizer(
            self.titles[idx],
            padding="max_length",
            truncation=True,
            max_length=128,
            return_tensors="pt"
        )
        # Preparing the input for the model
        item = {key: val.squeeze(0) for key, val in encodings.items()}
        # Converting labels to a tensor of type float
        item["labels"] = torch.tensor(self.labels[idx], dtype=torch.float)
        return item

# Step 3: Data Splitting
# Splitting the data into training and validation sets
# 80% of the data is used for training, and 20% is used for validation
train_texts, val_texts, train_labels, val_labels = train_test_split(
    data["Title"].tolist(), labels, test_size=0.2, random_state=42
)

# Creating training and validation datasets using the custom dataset class
train_dataset = JobTitlesDataset(train_texts, train_labels)
val_dataset = JobTitlesDataset(val_texts, val_labels)


In [45]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=len(mlb.classes_), problem_type="multi_label_classification")
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [46]:
import os
os.environ["WANDB_MODE"] = "disabled"


In [47]:
# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    learning_rate=5e-5,
    weight_decay=0.01,
    load_best_model_at_end=True,
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator
)

# Train and evaluate
trainer.train()

# Save the model and tokenizer
model.save_pretrained("./job_title_classifier")
tokenizer.save_pretrained("./job_title_classifier")



Epoch,Training Loss,Validation Loss
1,0.1832,0.133489
2,0.0949,0.108019
3,0.0877,0.09641


('./job_title_classifier/tokenizer_config.json',
 './job_title_classifier/special_tokens_map.json',
 './job_title_classifier/vocab.txt',
 './job_title_classifier/added_tokens.json')

In [49]:
# Load the fine-tuned model and tokenizer
model_path = "./job_title_classifier"
tokenizer = BertTokenizer.from_pretrained(model_path)
model = BertForSequenceClassification.from_pretrained(model_path)
model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [50]:
mlb_classes = ['Chief Officer', 'Director', 'Individual Contributor/Staff', 'Manager', 'Owner', 'Vice President']

def predict_labels_with_threshold(job_titles, threshold):
    """
    Predicts labels for a list of job titles using the fine-tuned BERT model and a given threshold.

    Args:
        job_titles (list): List of job titles as strings.
        threshold (float): Probability threshold for classification.

    Returns:
        np.ndarray: Predicted binary labels for each job title.
    """
    predictions = []
    for title in job_titles:
        # Tokenize the input title
        inputs = tokenizer(title, padding="max_length", truncation=True, max_length=128, return_tensors="pt")

        # Perform inference
        with torch.no_grad():
            outputs = model(**inputs)

        # Apply sigmoid to get probabilities
        probs = torch.sigmoid(outputs.logits).squeeze().tolist()

        # Convert probabilities to binary predictions using the threshold
        binary_predictions = [1 if prob > threshold else 0 for prob in probs]
        predictions.append(binary_predictions)

    return np.array(predictions)

def find_best_threshold(job_titles, y_true, thresholds=np.arange(0.15, 0.95, 0.1)):
    """
    Finds the best threshold for classification based on a prioritized metric (F1, accuracy, etc.).

    Args:
        job_titles (list): List of job titles as strings.
        y_true (np.ndarray): Ground truth binary labels.
        thresholds (np.ndarray): List of thresholds to evaluate.

    Returns:
        float: Best threshold.
        dict: Metrics (best threshold, F1 score, accuracy).
    """
    best_threshold = 0.0
    best_metric = (-float("inf"), 0, 0)  # Prioritize F1 score, then accuracy
    results = []

    for threshold in thresholds:
        y_pred = predict_labels_with_threshold(job_titles, threshold)

        # Calculate metrics
        f1 = f1_score(y_true, y_pred, average="macro", zero_division=1)
        accuracy = accuracy_score(y_true, y_pred)

        # Custom metric prioritization (F1 first, then accuracy)
        metric = (f1, accuracy)

        # Append results for logging
        results.append((threshold, f1, accuracy))

        # Update best threshold if metrics improve
        if metric > best_metric:
            best_metric = metric
            best_threshold = threshold

    return best_threshold

# Evaluate thresholds
best_threshold_for_BERT = find_best_threshold(val_texts, val_labels)

In [51]:
def evaluate_with_best_threshold(job_titles, y_true, best_threshold):
    """
    Evaluates the model using the best threshold and calculates precision, recall, F1 score, and accuracy.

    Args:
        job_titles (list): List of job titles as strings.
        y_true (np.ndarray): Ground truth binary labels.
        best_threshold (float): Best threshold for classification.

    Returns:
        dict: Evaluation metrics (precision, recall, F1 score, accuracy).
    """
    # Predict labels with the best threshold
    y_pred = predict_labels_with_threshold(job_titles, best_threshold)

    # Calculate metrics
    precision = precision_score(y_true, y_pred, average="macro", zero_division=1)
    recall = recall_score(y_true, y_pred, average="macro", zero_division=1)
    f1 = f1_score(y_true, y_pred, average="macro", zero_division=1)
    accuracy = accuracy_score(y_true, y_pred)

    return {
        "precision": precision,
        "recall": recall,
        "f1_score": f1,
        "accuracy": accuracy
    }


# Calculate metrics with the best threshold
final_metrics = evaluate_with_best_threshold(val_texts, val_labels, best_threshold_for_BERT)

print(f"Best Threshold: {best_threshold_for_BERT:.2f}")
print("Final Evaluation Metrics:")
print(f"Precision: {final_metrics['precision']:.4f}")
print(f"Recall: {final_metrics['recall']:.4f}")
print(f"F1 Score: {final_metrics['f1_score']:.4f}")
print(f"Accuracy: {final_metrics['accuracy']:.4f}")

Best Threshold: 0.45
Final Evaluation Metrics:
Precision: 0.8925
Recall: 0.8626
F1 Score: 0.8764
Accuracy: 0.8750


**Method 2 CatBoost Algorithm**  

CatBoost is an excellent second choice for this task due to its ability to handle categorical data natively and its robustness with small- to medium-sized datasets.

In [52]:
# Preprocess the "Title" column
def preprocess_text(text):
    # Lowercase the text and remove unnecessary characters
    return text.lower()

data['Title'] = data['Title'].apply(preprocess_text)

# Convert "Title" to numeric features using TfidfVectorizer
tfidf = TfidfVectorizer(max_features=500)
X_features = tfidf.fit_transform(data['Title']).toarray()

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_features, labels, test_size=0.2, random_state=42)

In [67]:
# Initialize the CatBoostClassifier for multi-label classification
cat_boost_model = CatBoostClassifier(
    iterations=2000,
    depth=7,
    learning_rate=0.05,
    loss_function='MultiLogloss',
    task_type="CPU",
    random_seed=42
)

# Train the model
cat_boost_model.fit(X_train, y_train, verbose=50)

0:	learn: 0.6288488	total: 83ms	remaining: 2m 45s
50:	learn: 0.1399074	total: 6.3s	remaining: 4m
100:	learn: 0.1089963	total: 11.7s	remaining: 3m 40s
150:	learn: 0.0950120	total: 15.2s	remaining: 3m 6s
200:	learn: 0.0839259	total: 19.6s	remaining: 2m 55s
250:	learn: 0.0717664	total: 22.6s	remaining: 2m 37s
300:	learn: 0.0635438	total: 25.5s	remaining: 2m 23s
350:	learn: 0.0573158	total: 28.5s	remaining: 2m 13s
400:	learn: 0.0522393	total: 32.9s	remaining: 2m 11s
450:	learn: 0.0472612	total: 35.9s	remaining: 2m 3s
500:	learn: 0.0434622	total: 38.8s	remaining: 1m 56s
550:	learn: 0.0401983	total: 41.8s	remaining: 1m 49s
600:	learn: 0.0364641	total: 46.2s	remaining: 1m 47s
650:	learn: 0.0344435	total: 49.1s	remaining: 1m 41s
700:	learn: 0.0325126	total: 52.1s	remaining: 1m 36s
750:	learn: 0.0304285	total: 55s	remaining: 1m 31s
800:	learn: 0.0286401	total: 59.4s	remaining: 1m 28s
850:	learn: 0.0269234	total: 1m 3s	remaining: 1m 25s
900:	learn: 0.0251387	total: 1m 6s	remaining: 1m 21s
950:	l

<catboost.core.CatBoostClassifier at 0x7ae48c110c50>

In [68]:
cat_boost_model.save_model("catboost_model.cbm")
joblib.dump(tfidf, "tfidf_vectorizer.pkl")

['tfidf_vectorizer.pkl']

In [69]:
catboost_model = CatBoostClassifier()
catboost_model.load_model("catboost_model.cbm")
tfidf = joblib.load("tfidf_vectorizer.pkl")

In [70]:
def predict_labels_with_threshold(job_titles, threshold):
    """
    Predicts labels for a list of job titles using the trained CatBoost model and a given threshold.

    Args:
        job_titles (list): List of job titles as strings.
        threshold (float): Probability threshold for classification.

    Returns:
        np.ndarray: Predicted binary labels for each job title.
    """
    # Convert job titles to features using the same TfidfVectorizer
    features = tfidf.transform(job_titles).toarray()

    # Perform inference
    probs = catboost_model.predict_proba(features)

    # Convert probabilities to binary predictions using the threshold
    binary_predictions = (probs > threshold).astype(int)

    return binary_predictions

def find_best_threshold(job_titles, y_true, thresholds=np.arange(0.25, 0.95, 0.01)):
    """
    Finds the best threshold for classification based on a prioritized metric (F1, accuracy, etc.).

    Args:
        job_titles (list): List of job titles as strings.
        y_true (np.ndarray): Ground truth binary labels.
        thresholds (np.ndarray): List of thresholds to evaluate.

    Returns:
        float: Best threshold.
        dict: Metrics (best threshold, F1 score, accuracy).
    """
    best_threshold = 0.0
    best_metric = (-float("inf"), 0, 0)  # Prioritize F1 score, then accuracy
    results = []

    for threshold in thresholds:
        y_pred = predict_labels_with_threshold(job_titles, threshold)

        # Calculate metrics
        f1 = f1_score(y_true, y_pred, average="macro", zero_division=1)
        accuracy = accuracy_score(y_true, y_pred)

        # Custom metric prioritization (F1 first, then accuracy)
        metric = (f1, accuracy)

        # Append results for logging
        results.append((threshold, f1, accuracy))

        # Update best threshold if metrics improve
        if metric > best_metric:
            best_metric = metric
            best_threshold = threshold

    return best_threshold

# Evaluate thresholds
best_threshold_for_catboost = find_best_threshold(val_texts, val_labels)

In [71]:
def evaluate_with_best_threshold(job_titles, y_true, best_threshold):
    """
    Evaluates the model using the best threshold and calculates precision, recall, F1 score, and accuracy.

    Args:
        job_titles (list): List of job titles as strings.
        y_true (np.ndarray): Ground truth binary labels.
        best_threshold (float): Best threshold for classification.

    Returns:
        dict: Evaluation metrics (precision, recall, F1 score, accuracy).
    """
    # Predict labels with the best threshold
    y_pred = predict_labels_with_threshold(job_titles, best_threshold)

    # Calculate metrics
    precision = precision_score(y_true, y_pred, average="macro", zero_division=1)
    recall = recall_score(y_true, y_pred, average="macro", zero_division=1)
    f1 = f1_score(y_true, y_pred, average="macro", zero_division=1)
    accuracy = accuracy_score(y_true, y_pred)

    return {
        "precision": precision,
        "recall": recall,
        "f1_score": f1,
        "accuracy": accuracy
    }

# Calculate metrics with the best threshold
final_metrics = evaluate_with_best_threshold(val_texts, val_labels, best_threshold_for_catboost)

print(f"Best Threshold: {best_threshold_for_catboost:.2f}")
print("Final Evaluation Metrics:")
print(f"Precision: {final_metrics['precision']:.4f}")
print(f"Recall: {final_metrics['recall']:.4f}")
print(f"F1 Score: {final_metrics['f1_score']:.4f}")
print(f"Accuracy: {final_metrics['accuracy']:.4f}")


Best Threshold: 0.28
Final Evaluation Metrics:
Precision: 0.8665
Recall: 0.8531
F1 Score: 0.8584
Accuracy: 0.8504


In [72]:
def classify_with_bert(job_titles, threshold=0.5):
    """
    Classifies a list of job titles using the fine-tuned BERT model and a given threshold.

    Args:
        job_titles (list): List of job titles as strings.
        threshold (float): Probability threshold for classification.

    Returns:
        list: Predicted labels for each job title.
    """
    predictions = []

    for title in job_titles:
        # Tokenize the input title
        inputs = tokenizer(title, padding="max_length", truncation=True, max_length=128, return_tensors="pt")

        # Perform inference
        with torch.no_grad():
            outputs = model(**inputs)

        # Apply sigmoid to get probabilities
        probs = torch.sigmoid(outputs.logits).squeeze().tolist()

        # Convert probabilities to binary predictions using the threshold
        binary_predictions = [mlb.classes_[i] for i, prob in enumerate(probs) if prob > threshold]

        predictions.append(binary_predictions)

    return predictions


In [73]:
def classify_with_catboost(job_titles, threshold=0.5):
    """
    Classifies a list of job titles using the trained CatBoost model and a given threshold.

    Args:
        job_titles (list): List of job titles as strings.
        threshold (float): Probability threshold for classification.

    Returns:
        list: Predicted labels for each job title.
    """
    # Convert job titles to features using the same TfidfVectorizer
    features = tfidf.transform(job_titles).toarray()

    # Perform inference
    probs = cat_boost_model.predict_proba(features)

    # Get the predicted binary labels based on the threshold
    binary_predictions = (probs > threshold).astype(int)

    # Map the binary predictions back to the corresponding labels
    predictions = []
    for pred in binary_predictions:
        predicted_labels = [mlb_classes[i] for i, val in enumerate(pred) if val == 1]
        predictions.append(predicted_labels)

    return predictions


In [74]:
# Example job titles
job_titles = [
    "Senior Software Engineer",
    "Chief Executive Officer",
    "Marketing Manager"
]

# Classify using BERT
bert_predictions = classify_with_bert(job_titles, threshold=best_threshold_for_BERT)
print("BERT Predictions:", bert_predictions)

# Classify using CatBoost
catboost_predictions = classify_with_catboost(job_titles, threshold=best_threshold_for_catboost)
print("CatBoost Predictions:", catboost_predictions)


BERT Predictions: [['Individual Contributor/Staff'], ['Chief Officer'], ['Manager']]
CatBoost Predictions: [['Individual Contributor/Staff'], ['Chief Officer'], ['Manager']]


**Did not have enogh time to tune all hyperparameters, in case of tuning I think all metrics will be increased**