# Fine-tuned classification model



## 1 Setup

In [1]:
!pip install transformers datasets accelerate -q

import os
import torch
import numpy as np
import pandas as pd
import json
import pickle
from google.colab import drive
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score, classification_report
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    EarlyStoppingCallback
)
from datasets import Dataset
from sklearn.utils.class_weight import compute_class_weight

# Set WANDB_DISABLED before any Trainer is created (presumably switches off the
# Weights & Biases integration; the TrainingArguments below also pass report_to="none").
os.environ['WANDB_DISABLED'] = 'true'
# Mount Google Drive: the input CSVs, the annotated JSON and the saved artefacts
# are all read from / written to paths under /content/drive/MyDrive.
drive.mount('/content/drive')


Mounted at /content/drive


## 2 Load and Prepare Data

In [2]:

df_seniority = pd.read_csv('/content/drive/MyDrive/seniority-v2.csv')
df_department = pd.read_csv('/content/drive/MyDrive/department-v2.csv')

# Normalise the text column: missing values become "", everything is a stripped str.
df_seniority['text'] = df_seniority['text'].fillna("").astype(str).str.strip()
df_department['text'] = df_department['text'].fillna("").astype(str).str.strip()

# Seniority Ordinal Encoding
seniority_hierarchy = {'Junior': 0, 'Senior': 1, 'Lead': 2, 'Management': 3, 'Director': 4}
sen_codes = df_seniority['label'].map(seniority_hierarchy)

# Series.map returns NaN for any label that is not a key of the mapping, which
# would silently turn the column into floats and only blow up later inside the
# split or the loss. Fail here instead, naming the offending labels.
unmapped_sen = sorted(df_seniority.loc[sen_codes.isna(), 'label'].astype(str).unique())
if unmapped_sen:
    raise ValueError(f"Seniority labels missing from seniority_hierarchy: {unmapped_sen}")
df_seniority['label_encoded'] = sen_codes.astype(int)

# The encoder is not fitted; its classes are listed in hierarchy order so that
# inverse_transform(i) returns the label whose ordinal code is i.
# NOTE(review): classes_ is not alphabetically sorted, so only rely on
# le_sen.inverse_transform; le_sen.transform presumably assumes sorted classes — confirm before using it.
le_sen = LabelEncoder()
le_sen.classes_ = np.array(sorted(seniority_hierarchy, key=seniority_hierarchy.get))

# Department Label Encoding
le_dept = LabelEncoder()
df_department['label_encoded'] = le_dept.fit_transform(df_department['label'])

# Train-Test Splits (80/20, stratified on the encoded label, fixed seed)
train_sen, test_sen = train_test_split(
    df_seniority,
    test_size=0.2,
    stratify=df_seniority['label_encoded'],
    random_state=42
)
train_dept, test_dept = train_test_split(
    df_department,
    test_size=0.2,
    stratify=df_department['label_encoded'],
    random_state=42
)

print(f"Seniority - Train: {len(train_sen)}, Test: {len(test_sen)}")
print(f"Department - Train: {len(train_dept)}, Test: {len(test_dept)}")

Seniority - Train: 7542, Test: 1886
Department - Train: 8116, Test: 2029


## 3 Custom weighted trainer

In [3]:
class WeightedTrainer(Trainer):
    """Trainer whose loss is a class-weighted cross-entropy over the model logits.

    Args:
        class_weights: tensor of per-class weights, passed as ``weight`` to the
            cross-entropy loss.
        *args, **kwargs: forwarded unchanged to ``Trainer.__init__``.
    """

    def __init__(self, class_weights, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weights = class_weights

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the weighted cross-entropy for one batch.

        ``"labels"`` is removed from ``inputs`` before the forward pass, so the
        model receives only the remaining entries. ``num_items_in_batch`` is
        accepted but not used.

        Returns:
            The loss, or ``(loss, outputs)`` when ``return_outputs`` is true.
        """
        targets = inputs.pop("labels")
        model_out = model(**inputs)
        scores = model_out.logits
        # Weights are moved to wherever the logits live on every call.
        batch_loss = torch.nn.functional.cross_entropy(
            scores,
            targets,
            weight=self.class_weights.to(scores.device),
        )
        if return_outputs:
            return batch_loss, model_out
        return batch_loss

def compute_metrics(eval_pred):
    """Score one evaluation pass.

    Args:
        eval_pred: a pair ``(logits, labels)``; the predicted class is the
            argmax of the logits along the last axis.

    Returns:
        dict with ``'accuracy'`` and ``'f1'`` (weighted-average F1).
    """
    raw_scores, gold = eval_pred
    predicted = np.argmax(raw_scores, axis=-1)

    metrics = {}
    metrics['accuracy'] = accuracy_score(gold, predicted)
    metrics['f1'] = f1_score(gold, predicted, average='weighted')
    return metrics

def prepare_dataset_for_model(df, tokenizer, label_col='label_encoded'):
    """Build a tokenized ``Dataset`` from a frame's ``text`` column and a label column.

    Args:
        df: frame providing ``'text'`` and ``label_col``.
        tokenizer: callable applied to each batch of texts.
        label_col: column copied into the dataset's ``'labels'`` field.

    Returns:
        The dataset after a batched ``map`` with the tokenizer; every text is
        truncated and padded to 128 tokens.
    """
    def _tokenize(batch):
        return tokenizer(batch['text'], padding='max_length', truncation=True, max_length=128)

    columns = {
        'text': df['text'].tolist(),
        'labels': df[label_col].tolist(),
    }
    return Dataset.from_dict(columns).map(_tokenize, batched=True)

## 4 Configuration

In [4]:
# Checkpoints compared against each other in the model-comparison cells.
MODELS_TO_TEST = [
    'distilbert-base-multilingual-cased',
    'xlm-roberta-base',
    'bert-base-multilingual-cased',
]

# Fixed hyperparameters shared by every run of the model comparison;
# learning rate and weight decay are also reused by the tuning cells.
STANDARD_HP = dict(
    learning_rate=2e-5,
    weight_decay=0.01,
    num_epochs=3,
    batch_size=16,
)

# Grid explored for the winning model: every batch size x every epoch count.
TUNING_HP = dict(
    batch_sizes=[8, 16],
    num_epochs=[3, 4],
)

## 5 Model Comparison (Seniority)

In [5]:
print("MODEL COMPARISON - SENIORITY\n")

results_model_comparison_sen = []
best_model_name_sen = None
best_acc_model_sen = 0

# 'balanced' class weights, computed from the training split only.
class_weights_sen = torch.tensor(
    compute_class_weight('balanced', classes=np.unique(train_sen['label_encoded']), y=train_sen['label_encoded']),
    dtype=torch.float
)

# NOTE(review): the test split is used both as eval_dataset (checkpoint
# selection via load_best_model_at_end) and to pick the winning architecture,
# so the accuracies printed here are optimistic; consider a separate validation split.
for m_name in MODELS_TO_TEST:
    print(f"\nTesting: {m_name}")

    # Pre-bind the names so the finally clause can release whatever was created,
    # even when the failure happens before all three exist.
    model = trainer = tokenizer = None

    try:
        # Load tokenizer and prepare datasets
        tokenizer = AutoTokenizer.from_pretrained(m_name)
        ds_train = prepare_dataset_for_model(train_sen, tokenizer)
        ds_test = prepare_dataset_for_model(test_sen, tokenizer)

        # Load model
        model = AutoModelForSequenceClassification.from_pretrained(m_name, num_labels=len(le_sen.classes_))

        # Training arguments (standard HP)
        args = TrainingArguments(
            output_dir=f'./results_sen_comparison_{m_name.replace("/", "_")}',
            num_train_epochs=STANDARD_HP['num_epochs'],
            per_device_train_batch_size=STANDARD_HP['batch_size'],
            per_device_eval_batch_size=32,
            learning_rate=STANDARD_HP['learning_rate'],
            weight_decay=STANDARD_HP['weight_decay'],
            eval_strategy='epoch',
            save_strategy='epoch',
            load_best_model_at_end=True,
            metric_for_best_model='accuracy',
            save_total_limit=1,
            logging_steps=100,
            report_to="none"
        )

        # Trainer
        trainer = WeightedTrainer(
            model=model,
            args=args,
            train_dataset=ds_train,
            eval_dataset=ds_test,
            compute_metrics=compute_metrics,
            class_weights=class_weights_sen,
            callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
        )

        # Train
        trainer.train()

        # Evaluate
        acc = trainer.evaluate()['eval_accuracy']
        print(f"  → Accuracy: {acc:.4f}")

        results_model_comparison_sen.append({
            'model': m_name,
            'accuracy': acc
        })

        # Track best model
        if acc > best_acc_model_sen:
            best_acc_model_sen = acc
            best_model_name_sen = m_name

    except Exception as e:
        # Best effort: record the failure and carry on with the next candidate.
        print(f" Error: {e}")
        results_model_comparison_sen.append({
            'model': m_name,
            'accuracy': 0.0,
            'error': str(e)
        })

    finally:
        # Cleanup on success AND failure: previously a failed candidate (e.g. a
        # CUDA OOM) kept its model/trainer referenced while the next one loaded.
        model = trainer = tokenizer = None
        torch.cuda.empty_cache()

# Results
print("\nMODEL COMPARISON RESULTS - SENIORITY")
df_model_comp_sen = pd.DataFrame(results_model_comparison_sen).sort_values('accuracy', ascending=False)
print("\n", df_model_comp_sen.to_string(index=False))

# Without a winner the following cells cannot run; stop here with a clear message
# instead of failing later on from_pretrained(None).
if best_model_name_sen is None:
    raise RuntimeError("No seniority candidate reached a non-zero accuracy; see the errors printed above.")

print(f"\n Best Model: {best_model_name_sen} (Acc: {best_acc_model_sen:.4f})")


MODEL COMPARISON - SENIORITY


Testing: distilbert-base-multilingual-cased


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/466 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

Map:   0%|          | 0/7542 [00:00<?, ? examples/s]

Map:   0%|          | 0/1886 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/542M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.068,0.059694,0.992047,0.992041
2,0.0386,0.044022,0.994698,0.994682
3,0.0157,0.02572,0.993637,0.99363


  → Accuracy: 0.9947

Testing: xlm-roberta-base


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

Map:   0%|          | 0/7542 [00:00<?, ? examples/s]

Map:   0%|          | 0/1886 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.1512,0.06488,0.988335,0.988325
2,0.053,0.026985,0.994698,0.994697
3,0.0292,0.010405,0.997349,0.99735


  → Accuracy: 0.9973

Testing: bert-base-multilingual-cased


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

Map:   0%|          | 0/7542 [00:00<?, ? examples/s]

Map:   0%|          | 0/1886 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.0464,0.078351,0.990986,0.990971
2,0.0369,0.042642,0.994698,0.994712
3,0.0288,0.023308,0.995758,0.995763


  → Accuracy: 0.9958

MODEL COMPARISON RESULTS - SENIORITY

                              model  accuracy
                  xlm-roberta-base  0.997349
      bert-base-multilingual-cased  0.995758
distilbert-base-multilingual-cased  0.994698

 Best Model: xlm-roberta-base (Acc: 0.9973)


## 6 Hyperparameter Tuning (Seniority)

In [6]:
print(f"HYPERPARAMETER TUNING - SENIORITY ({best_model_name_sen})")


results_tuning_sen = []
best_acc_sen = 0
best_model_sen = None
best_tokenizer_sen = None
best_params_sen = {}

# Tokenizer for best model; the datasets are tokenized once and shared by every run.
tokenizer_sen = AutoTokenizer.from_pretrained(best_model_name_sen)
ds_train_sen = prepare_dataset_for_model(train_sen, tokenizer_sen)
ds_test_sen = prepare_dataset_for_model(test_sen, tokenizer_sen)

# NOTE(review): as in the model comparison, the test split doubles as the
# selection set, so the "best accuracy" is optimistic.
for bs in TUNING_HP['batch_sizes']:
    for epochs in TUNING_HP['num_epochs']:
        print(f"\nTesting: Batch Size={bs}, Epochs={epochs}")

        # Pre-bind so the finally clause can always release them.
        model = trainer = None

        try:
            # Load model (fresh weights for every grid point)
            model = AutoModelForSequenceClassification.from_pretrained(
                best_model_name_sen,
                num_labels=len(le_sen.classes_)
            )

            # Training arguments
            args = TrainingArguments(
                output_dir=f'./results_sen_tune_bs{bs}_ep{epochs}',
                num_train_epochs=epochs,
                per_device_train_batch_size=bs,
                per_device_eval_batch_size=32,
                learning_rate=STANDARD_HP['learning_rate'],
                weight_decay=STANDARD_HP['weight_decay'],
                eval_strategy='epoch',
                save_strategy='epoch',
                load_best_model_at_end=True,
                metric_for_best_model='accuracy',
                save_total_limit=1,
                logging_steps=50,
                report_to="none"
            )

            # Trainer
            trainer = WeightedTrainer(
                model=model,
                args=args,
                train_dataset=ds_train_sen,
                eval_dataset=ds_test_sen,
                compute_metrics=compute_metrics,
                class_weights=class_weights_sen,
                callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
            )

            # Train
            trainer.train()

            # Evaluate
            acc = trainer.evaluate()['eval_accuracy']
            print(f"  → Accuracy: {acc:.4f}")

            results_tuning_sen.append({
                'batch_size': bs,
                'epochs': epochs,
                'accuracy': acc
            })

            # Track best: best_model_sen keeps the winning model alive after
            # the loop-local names are cleared below.
            if acc > best_acc_sen:
                best_acc_sen = acc
                best_model_sen = model
                best_tokenizer_sen = tokenizer_sen
                best_params_sen = {'batch_size': bs, 'epochs': epochs}

        except Exception as e:
            # Best effort: report and continue with the next grid point.
            print(f"Error: {e}")

        finally:
            # Runs on success and on failure, so a crashed run no longer keeps
            # its model/trainer referenced while the next one is loaded.
            model = trainer = None
            torch.cuda.empty_cache()

# If every run failed, the summary below would raise an opaque KeyError
# ('accuracy' / 'batch_size'); report the real problem instead.
if not results_tuning_sen or best_model_sen is None:
    raise RuntimeError("No seniority tuning run completed successfully; see the errors printed above.")

# Results
print("\nHYPERPARAMETER TUNING RESULTS - SENIORITY")
df_tuning_sen = pd.DataFrame(results_tuning_sen).sort_values('accuracy', ascending=False)
print("\n", df_tuning_sen.to_string(index=False))
print(f"\nBest Parameters: BS={best_params_sen['batch_size']}, Epochs={best_params_sen['epochs']}")
print(f"   Best Accuracy: {best_acc_sen:.4f}")

HYPERPARAMETER TUNING - SENIORITY (xlm-roberta-base)


Map:   0%|          | 0/7542 [00:00<?, ? examples/s]

Map:   0%|          | 0/1886 [00:00<?, ? examples/s]


Testing: Batch Size=8, Epochs=3


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.1618,0.193023,0.97561,0.975489
2,0.037,0.038612,0.994698,0.994685
3,0.0128,0.013629,0.995228,0.995237


  → Accuracy: 0.9952

Testing: Batch Size=8, Epochs=4


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.1421,0.037759,0.988335,0.988347
2,0.0355,0.020644,0.994698,0.99471
3,0.0182,0.03128,0.996288,0.996291
4,0.0303,0.017657,0.996819,0.996826


  → Accuracy: 0.9968

Testing: Batch Size=16, Epochs=3


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.2039,0.078643,0.97561,0.975616
2,0.0349,0.022202,0.994168,0.9942
3,0.0318,0.014785,0.996288,0.996301


  → Accuracy: 0.9963

Testing: Batch Size=16, Epochs=4


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.1343,0.037904,0.991516,0.99152
2,0.0322,0.009091,0.996819,0.996825
3,0.0112,0.012305,0.995228,0.995226
4,0.0064,0.005794,0.997349,0.997351


  → Accuracy: 0.9973

HYPERPARAMETER TUNING RESULTS - SENIORITY

  batch_size  epochs  accuracy
         16       4  0.997349
          8       4  0.996819
         16       3  0.996288
          8       3  0.995228

Best Parameters: BS=16, Epochs=4
   Best Accuracy: 0.9973


## 7 Model Comparison (Department)

In [7]:

print(" MODEL COMPARISON - DEPARTMENT")

results_model_comparison_dept = []
best_model_name_dept = None
best_acc_model_dept = 0

# 'balanced' class weights, computed from the training split only.
class_weights_dept = torch.tensor(
    compute_class_weight('balanced', classes=np.unique(train_dept['label_encoded']), y=train_dept['label_encoded']),
    dtype=torch.float
)

# NOTE(review): the test split is used both as eval_dataset (checkpoint
# selection via load_best_model_at_end) and to pick the winning architecture,
# so the accuracies printed here are optimistic; consider a separate validation split.
for m_name in MODELS_TO_TEST:
    print(f"\n{'='*70}")
    print(f"Testing: {m_name}")
    print(f"{'='*70}")

    # Pre-bind the names so the finally clause can release whatever was created,
    # even when the failure happens before all three exist.
    model = trainer = tokenizer = None

    try:
        # Load tokenizer and prepare datasets
        tokenizer = AutoTokenizer.from_pretrained(m_name)
        ds_train = prepare_dataset_for_model(train_dept, tokenizer)
        ds_test = prepare_dataset_for_model(test_dept, tokenizer)

        # Load model
        model = AutoModelForSequenceClassification.from_pretrained(m_name, num_labels=len(le_dept.classes_))

        # Training arguments (standard HP)
        args = TrainingArguments(
            output_dir=f'./results_dept_comparison_{m_name.replace("/", "_")}',
            num_train_epochs=STANDARD_HP['num_epochs'],
            per_device_train_batch_size=STANDARD_HP['batch_size'],
            per_device_eval_batch_size=32,
            learning_rate=STANDARD_HP['learning_rate'],
            weight_decay=STANDARD_HP['weight_decay'],
            eval_strategy='epoch',
            save_strategy='epoch',
            load_best_model_at_end=True,
            metric_for_best_model='accuracy',
            save_total_limit=1,
            logging_steps=100,
            report_to="none"
        )

        # Trainer
        trainer = WeightedTrainer(
            model=model,
            args=args,
            train_dataset=ds_train,
            eval_dataset=ds_test,
            compute_metrics=compute_metrics,
            class_weights=class_weights_dept,
            callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
        )

        # Train
        trainer.train()

        # Evaluate
        acc = trainer.evaluate()['eval_accuracy']
        print(f"  → Accuracy: {acc:.4f}")

        results_model_comparison_dept.append({
            'model': m_name,
            'accuracy': acc
        })

        # Track best model
        if acc > best_acc_model_dept:
            best_acc_model_dept = acc
            best_model_name_dept = m_name

    except Exception as e:
        # Best effort: record the failure and carry on with the next candidate.
        print(f" Error: {e}")
        results_model_comparison_dept.append({
            'model': m_name,
            'accuracy': 0.0,
            'error': str(e)
        })

    finally:
        # Cleanup on success AND failure: previously a failed candidate (e.g. a
        # CUDA OOM) kept its model/trainer referenced while the next one loaded.
        model = trainer = tokenizer = None
        torch.cuda.empty_cache()

# Results
print("\nMODEL COMPARISON RESULTS - DEPARTMENT")
df_model_comp_dept = pd.DataFrame(results_model_comparison_dept).sort_values('accuracy', ascending=False)
print("\n", df_model_comp_dept.to_string(index=False))

# Without a winner the following cells cannot run; stop here with a clear message
# instead of failing later on from_pretrained(None).
if best_model_name_dept is None:
    raise RuntimeError("No department candidate reached a non-zero accuracy; see the errors printed above.")

print(f"\nBest Model: {best_model_name_dept} (Acc: {best_acc_model_dept:.4f})")


 MODEL COMPARISON - DEPARTMENT

Testing: distilbert-base-multilingual-cased


Map:   0%|          | 0/8116 [00:00<?, ? examples/s]

Map:   0%|          | 0/2029 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.3593,0.269901,0.989157,0.98837
2,0.1686,0.043505,0.997043,0.997048
3,0.0322,0.027309,0.997043,0.997043


  → Accuracy: 0.9970

Testing: xlm-roberta-base


Map:   0%|          | 0/8116 [00:00<?, ? examples/s]

Map:   0%|          | 0/2029 [00:00<?, ? examples/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.6958,0.813639,0.969936,0.964986
2,0.4386,0.283925,0.991621,0.991366
3,0.0952,0.183568,0.992607,0.992562


  → Accuracy: 0.9926

Testing: bert-base-multilingual-cased


Map:   0%|          | 0/8116 [00:00<?, ? examples/s]

Map:   0%|          | 0/2029 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.5106,0.607639,0.982257,0.97975
2,0.3986,0.172717,0.992607,0.991814
3,0.092,0.156325,0.994086,0.994068


  → Accuracy: 0.9941

MODEL COMPARISON RESULTS - DEPARTMENT

                              model  accuracy
distilbert-base-multilingual-cased  0.997043
      bert-base-multilingual-cased  0.994086
                  xlm-roberta-base  0.992607

Best Model: distilbert-base-multilingual-cased (Acc: 0.9970)


## 8 Hyperparameter Tuning (Department)

In [8]:

print(f"HYPERPARAMETER TUNING - DEPARTMENT ({best_model_name_dept})")

results_tuning_dept = []
best_acc_dept = 0
best_model_dept = None
best_tokenizer_dept = None
best_params_dept = {}

# Tokenizer for best model; the datasets are tokenized once and shared by every run.
tokenizer_dept = AutoTokenizer.from_pretrained(best_model_name_dept)
ds_train_dept = prepare_dataset_for_model(train_dept, tokenizer_dept)
ds_test_dept = prepare_dataset_for_model(test_dept, tokenizer_dept)

# NOTE(review): as in the model comparison, the test split doubles as the
# selection set, so the "best accuracy" is optimistic.
for bs in TUNING_HP['batch_sizes']:
    for epochs in TUNING_HP['num_epochs']:
        print(f"\nTesting: Batch Size={bs}, Epochs={epochs}")

        # Pre-bind so the finally clause can always release them.
        model = trainer = None

        try:
            # Load model (fresh weights for every grid point)
            model = AutoModelForSequenceClassification.from_pretrained(
                best_model_name_dept,
                num_labels=len(le_dept.classes_)
            )

            # Training arguments
            args = TrainingArguments(
                output_dir=f'./results_dept_tune_bs{bs}_ep{epochs}',
                num_train_epochs=epochs,
                per_device_train_batch_size=bs,
                per_device_eval_batch_size=32,
                learning_rate=STANDARD_HP['learning_rate'],
                weight_decay=STANDARD_HP['weight_decay'],
                eval_strategy='epoch',
                save_strategy='epoch',
                load_best_model_at_end=True,
                metric_for_best_model='accuracy',
                save_total_limit=1,
                logging_steps=50,
                report_to="none"
            )

            # Trainer
            trainer = WeightedTrainer(
                model=model,
                args=args,
                train_dataset=ds_train_dept,
                eval_dataset=ds_test_dept,
                compute_metrics=compute_metrics,
                class_weights=class_weights_dept,
                callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
            )

            # Train
            trainer.train()

            # Evaluate
            acc = trainer.evaluate()['eval_accuracy']
            print(f"  → Accuracy: {acc:.4f}")

            results_tuning_dept.append({
                'batch_size': bs,
                'epochs': epochs,
                'accuracy': acc
            })

            # Track best: best_model_dept keeps the winning model alive after
            # the loop-local names are cleared below.
            if acc > best_acc_dept:
                best_acc_dept = acc
                best_model_dept = model
                best_tokenizer_dept = tokenizer_dept
                best_params_dept = {'batch_size': bs, 'epochs': epochs}

        except Exception as e:
            # Best effort: report and continue with the next grid point.
            print(f" Error: {e}")

        finally:
            # Runs on success and on failure, so a crashed run no longer keeps
            # its model/trainer referenced while the next one is loaded.
            model = trainer = None
            torch.cuda.empty_cache()

# If every run failed, the summary below would raise an opaque KeyError
# ('accuracy' / 'batch_size'); report the real problem instead.
if not results_tuning_dept or best_model_dept is None:
    raise RuntimeError("No department tuning run completed successfully; see the errors printed above.")

# Results
print("HYPERPARAMETER TUNING RESULTS - DEPARTMENT")
df_tuning_dept = pd.DataFrame(results_tuning_dept).sort_values('accuracy', ascending=False)
print("\n", df_tuning_dept.to_string(index=False))
print(f"\n Best Parameters: BS={best_params_dept['batch_size']}, Epochs={best_params_dept['epochs']}")
print(f"   Best Accuracy: {best_acc_dept:.4f}")


HYPERPARAMETER TUNING - DEPARTMENT (distilbert-base-multilingual-cased)


Map:   0%|          | 0/8116 [00:00<?, ? examples/s]

Map:   0%|          | 0/2029 [00:00<?, ? examples/s]


Testing: Batch Size=8, Epochs=3


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.2923,0.28171,0.990143,0.988843
2,0.0443,0.156618,0.995071,0.995016
3,0.0416,0.040686,0.997043,0.997046


  → Accuracy: 0.9970

Testing: Batch Size=8, Epochs=4


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.2814,0.281503,0.990636,0.989298
2,0.0426,0.042553,0.997043,0.997033
3,0.0326,0.025207,0.997043,0.997053
4,0.0011,0.023479,0.997536,0.997532


  → Accuracy: 0.9975

Testing: Batch Size=16, Epochs=3


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.4913,0.269901,0.989157,0.98837
2,0.0819,0.043505,0.997043,0.997048
3,0.0411,0.027309,0.997043,0.997043


  → Accuracy: 0.9970

Testing: Batch Size=16, Epochs=4


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.493,0.282945,0.988172,0.988136


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.493,0.282945,0.988172,0.988136
2,0.0969,0.038173,0.995071,0.995101
3,0.0301,0.034607,0.994579,0.994627
4,0.0236,0.02632,0.995071,0.995106


  → Accuracy: 0.9951
HYPERPARAMETER TUNING RESULTS - DEPARTMENT

  batch_size  epochs  accuracy
          8       4  0.997536
          8       3  0.997043
         16       3  0.997043
         16       4  0.995071

 Best Parameters: BS=8, Epochs=4
   Best Accuracy: 0.9975


## 9 Evaluation on SnapAddy labeled dataset

In [9]:
print("Evaluation on SnapAddy labeled dataset")

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Put both models in inference mode explicitly (disables dropout) rather than
# relying on whatever mode the last Trainer call left them in.
best_model_sen.to(device).eval()
best_model_dept.to(device).eval()

with open('/content/drive/MyDrive/linkedin-cvs-annotated.json', 'r', encoding='utf-8') as f:
    linkedin_data = json.load(f)


def predict_active_jobs(data, model, tokenizer, encoder, label_key, device):
    """Classify the position title of every ACTIVE job and collect gold labels.

    Args:
        data: iterable of persons, each an iterable of job dicts.
        model: sequence-classification model already on ``device``.
        tokenizer: tokenizer matching ``model``.
        encoder: label encoder whose ``inverse_transform`` maps a class index
            back to its label string.
        label_key: key of the gold label in each job dict.
        device: device the tokenized inputs are moved to.

    Returns:
        ``(true_labels, predicted_labels)``: two lists of equal length, one
        entry per job whose ``status`` is ``'ACTIVE'``.

    Raises:
        KeyError: if an ACTIVE job has no ``label_key`` entry.
    """
    true_labels, predicted_labels = [], []
    for person in data:
        for job in person:
            if job.get('status') != 'ACTIVE':
                continue
            # Read the gold label first so a missing key cannot leave the two lists misaligned.
            gold = job[label_key]
            text = str(job.get('position', '')).strip()
            inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=128)
            inputs = {k: v.to(device) for k, v in inputs.items()}

            with torch.no_grad():
                logits = model(**inputs).logits

            predicted_labels.append(encoder.inverse_transform([logits.argmax().item()])[0])
            true_labels.append(gold)
    return true_labels, predicted_labels


# Seniority Evaluation
# NOTE(review): the recorded output of this cell lists a 'Professional' class in
# the annotated seniority labels, which is not a key of seniority_hierarchy, so
# the seniority model can never predict it — confirm the label vocabularies match.
true_s, pred_s = predict_active_jobs(
    linkedin_data, best_model_sen, best_tokenizer_sen, le_sen, 'seniority', device
)

# Department Evaluation
true_d, pred_d = predict_active_jobs(
    linkedin_data, best_model_dept, best_tokenizer_dept, le_dept, 'department', device
)

# Results
# zero_division=0 reports 0.0 for classes that are never predicted / never
# present, without emitting an UndefinedMetricWarning per metric.

print("\nSENIORITY RESULTS")
print(f"Accuracy: {accuracy_score(true_s, pred_s):.4f}")
print("\n" + classification_report(true_s, pred_s, zero_division=0))

print("\nDEPARTMENT RESULTS")
print(f"Accuracy: {accuracy_score(true_d, pred_d):.4f}")
print("\n" + classification_report(true_d, pred_d, zero_division=0))

Evaluation on SnapAddy labeled dataset

SENIORITY RESULTS
Accuracy: 0.4671

              precision    recall  f1-score   support

    Director       0.60      1.00      0.75        34
      Junior       0.05      0.33      0.09        12
        Lead       0.71      0.59      0.65       125
  Management       0.83      0.72      0.77       192
Professional       0.00      0.00      0.00       216
      Senior       0.19      0.91      0.31        44

    accuracy                           0.47       623
   macro avg       0.40      0.59      0.43       623
weighted avg       0.45      0.47      0.43       623


DEPARTMENT RESULTS
Accuracy: 0.2857

                        precision    recall  f1-score   support

        Administrative       0.06      0.29      0.10        14
  Business Development       0.30      0.30      0.30        20
            Consulting       0.23      0.59      0.33        39
      Customer Support       0.50      0.17      0.25         6
       Human Resources

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## 10 Save Model

In [12]:
print("SAVING MODELS")

# Each fine-tuned model is written to Drive next to its tokenizer, seniority first.
for task_model, task_tokenizer, target_dir, done_msg in (
    (best_model_sen, best_tokenizer_sen, '/content/drive/MyDrive/best_sen_model', " Seniority model saved"),
    (best_model_dept, best_tokenizer_dept, '/content/drive/MyDrive/best_dept_model', " Department model saved"),
):
    task_model.save_pretrained(target_dir)
    task_tokenizer.save_pretrained(target_dir)
    print(done_msg)

# Both label encoders go into a single pickle, keyed 'sen' and 'dept'.
encoders = {'sen': le_sen, 'dept': le_dept}
with open('/content/drive/MyDrive/label_encoders.pkl', 'wb') as encoder_file:
    pickle.dump(encoders, encoder_file)
print(" Label encoders saved")

# Save experiment results: per task, the comparison and tuning tables, the
# selected model and parameters, the held-out CSV accuracy, and the accuracy on
# the annotated LinkedIn data.
seniority_summary = {
    'model_comparison': results_model_comparison_sen,
    'best_model': best_model_name_sen,
    'hyperparameter_tuning': results_tuning_sen,
    'best_params': best_params_sen,
    'csv_accuracy': best_acc_sen,
    'linkedin_accuracy': accuracy_score(true_s, pred_s),
}
department_summary = {
    'model_comparison': results_model_comparison_dept,
    'best_model': best_model_name_dept,
    'hyperparameter_tuning': results_tuning_dept,
    'best_params': best_params_dept,
    'csv_accuracy': best_acc_dept,
    'linkedin_accuracy': accuracy_score(true_d, pred_d),
}
results_summary = {
    'seniority': seniority_summary,
    'department': department_summary,
}

with open('/content/drive/MyDrive/experiment_results.json', 'w') as results_file:
    json.dump(results_summary, results_file, indent=2)
print("Experiment results saved")



SAVING MODELS
 Seniority model saved
 Department model saved
 Label encoders saved
Experiment results saved
