In [None]:
import json

import pandas as pd
from datasets import Dataset
from evaluate import load as load_metric
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer




In [2]:
# Configuration: everything a reader might want to tune lives in this cell.

# Pretrained checkpoint that is fine-tuned below.
MODEL_NAME = "algiraldohe/lm-ner-linkedin-skills-recognition"

# Output locations. The model directory is derived from OUTPUT_DIR so the two
# paths cannot drift apart if OUTPUT_DIR is changed.
OUTPUT_DIR = "./ner-finetuned"
MODEL_OUTPUT_DIR = f"{OUTPUT_DIR}/model"

# Training hyperparameters.
BATCH_SIZE = 16
EPOCHS = 3
LR = 5e-5

# Fixed sequence length used when tokenizing (inputs are padded/truncated to it).
MAX_LEN = 128

# Seed shared by the train/test split and the training arguments.
SEED = 42

In [3]:
# Load and preprocess the dataset
def load_data(path):
    """Read a JSONL annotation file and convert it to word-level BIO examples.

    Every non-blank line must be a JSON object with a ``"text"`` string and,
    optionally, an ``"entities"`` list whose items carry character offsets
    ``"start"`` (inclusive) and ``"end"`` (exclusive) into ``text``.

    The text is split on whitespace. Every word that overlaps an entity span
    is tagged: the first overlapping word gets ``"B-SKILL"``, the remaining
    ones ``"I-SKILL"``. All other words are tagged ``"O"``.

    Args:
        path: Location of the JSONL file (read as UTF-8).

    Returns:
        A list of ``{"tokens": [...], "ner_tags": [...]}`` dicts, one per
        non-blank input line, with both lists having the same length.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record has no ``"text"`` or an entity lacks
            ``"start"``/``"end"``.
    """
    examples = []
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not line.strip():
                continue
            obj = json.loads(line)
            text = obj["text"]
            tokens = text.split()

            # One slot per character of `text`; whitespace stays None. Tokens
            # are placed at the position where they actually occur, so runs of
            # spaces, tabs or leading whitespace do not shift the mapping.
            char_to_token = [None] * len(text)
            offset = 0
            for i, tok in enumerate(tokens):
                start_idx = text.find(tok, offset)
                offset = start_idx + len(tok)
                char_to_token[start_idx:offset] = [i] * len(tok)

            labels = ["O"] * len(tokens)
            for ent in obj.get("entities", []):
                start, end = ent["start"], ent["end"]
                # Word indices touched by the span, in reading order.
                covered = sorted(
                    {idx for idx in char_to_token[start:end] if idx is not None}
                )
                # The first covered word opens the entity even when the span's
                # start offset happens to fall on whitespace.
                for position, idx in enumerate(covered):
                    labels[idx] = "B-SKILL" if position == 0 else "I-SKILL"
            examples.append({"tokens": tokens, "ner_tags": labels})
    return examples

In [4]:
# Tokenize and align labels
def tokenize_and_align(examples, tokenizer, label2id):
    """Tokenize a batch of pre-split words and build per-sub-token label ids.

    Args:
        examples: Batch mapping with ``"tokens"`` (one list of words per
            example) and ``"ner_tags"`` (one list of string tags per example).
        tokenizer: Callable tokenizer whose result exposes
            ``word_ids(batch_index=...)`` and supports item assignment.
        label2id: Mapping from string tag to integer label id.

    Returns:
        The tokenizer output with an added ``"labels"`` entry. Positions whose
        word id is ``None`` are labelled ``-100``; the first sub-token of a
        word keeps the word's tag; later sub-tokens of the same word use the
        ``I-`` form of that tag.
    """
    encoding = tokenizer(
        examples["tokens"],
        is_split_into_words=True,
        truncation=True,
        padding="max_length",
        max_length=MAX_LEN,
    )

    batch_label_ids = []
    for row, word_tags in enumerate(examples["ner_tags"]):
        row_label_ids = []
        last_word = None
        for word_idx in encoding.word_ids(batch_index=row):
            if word_idx is None:
                # No source word for this position: use the -100 sentinel.
                row_label_ids.append(-100)
            else:
                tag = word_tags[word_idx]
                # Continuation piece of the previous word: demote B- to I-.
                if word_idx == last_word and tag.startswith("B-"):
                    tag = tag.replace("B-", "I-")
                row_label_ids.append(label2id[tag])
            last_word = word_idx
        batch_label_ids.append(row_label_ids)

    encoding["labels"] = batch_label_ids
    return encoding


In [5]:
# Parse the annotated JSONL file into word-level {"tokens", "ner_tags"} examples,
# then hold out 20% of them for evaluation. SEED fixes the split's random state.
data = load_data("jobs-new_with_offsets.jsonl")
train, test = train_test_split(data, test_size=0.2, random_state=SEED)

In [6]:
# Wrap the lists of example dicts as `Dataset` objects so they can be
# processed with `.map(...)` in the tokenization cell.
train_ds = Dataset.from_list(train)
test_ds = Dataset.from_list(test)

In [7]:
# Sanity check: number of examples in the training and evaluation splits.
len(train_ds), len(test_ds)

(400, 100)

In [8]:
# Tag inventory for the single SKILL entity type; the list position of each
# tag is its integer class id.
labels = ["O", "B-SKILL", "I-SKILL"]
id2label = dict(enumerate(labels))
label2id = {tag: idx for idx, tag in id2label.items()}

In [9]:
# Load the tokenizer and the token-classification model from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# The model is instantiated with this notebook's 3-tag label set.
# ignore_mismatched_sizes=True is passed because the checkpoint's classifier has a
# different shape (9 outputs vs. 3 here, per the warning printed below), so the
# classifier weights are newly initialized and have to be trained.
model = AutoModelForTokenClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True
)
# NOTE(review): manual device placement is left disabled. `torch` is not imported in
# the cells shown, so enabling the lines below as-is would raise NameError.
# use gpu if available
# device = 0 if torch.cuda.is_available() else -1
# model.to(device)

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at algiraldohe/lm-ner-linkedin-skills-recognition and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([9, 768]) in the checkpoint and torch.Size([3, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([9]) in the checkpoint and torch.Size([3]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
def _encode_batch(batch):
    """Tokenize one batch of examples and attach the aligned label ids."""
    return tokenize_and_align(batch, tokenizer, label2id)


# Encode both splits with the same function; batched=True passes whole batches
# (as consumed by tokenize_and_align) rather than single examples.
train_tokenized = train_ds.map(_encode_batch, batched=True)
test_tokenized = test_ds.map(_encode_batch, batched=True)

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [11]:
metric = load_metric("seqeval")


def compute_metrics(p):
    """Compute overall seqeval precision/recall/F1/accuracy for an evaluation pass.

    Args:
        p: A ``(predictions, labels)`` pair. ``predictions`` is reduced with
            ``argmax`` over its last axis; ``labels`` holds integer label ids
            with ``-100`` at positions to ignore.
            (Assumes array-like inputs as handed over by the Trainer — TODO confirm.)

    Returns:
        Dict with ``"precision"``, ``"recall"``, ``"f1"`` and ``"accuracy"``;
        each falls back to ``0.0`` when the metric returns ``None`` or lacks
        the corresponding ``overall_*`` key.
    """
    scores, gold_ids = p
    pred_ids = scores.argmax(axis=-1)

    # Gold tag sequences with the ignored (-100) positions removed.
    true_labels = [
        [id2label[gold] for gold in gold_row if gold != -100]
        for gold_row in gold_ids
    ]

    # Predicted tag sequences, kept only where the gold label is not -100.
    true_predictions = []
    for pred_row, gold_row in zip(pred_ids, gold_ids):
        kept = [
            id2label[pred]
            for pred, gold in zip(pred_row, gold_row)
            if gold != -100
        ]
        true_predictions.append(kept)

    results = metric.compute(predictions=true_predictions, references=true_labels)

    # Output name -> key in the seqeval result.
    wanted = (
        ("precision", "overall_precision"),
        ("recall", "overall_recall"),
        ("f1", "overall_f1"),
        ("accuracy", "overall_accuracy"),
    )

    # Guard against the metric returning nothing at all.
    if results is None:
        return {out_name: 0.0 for out_name, _ in wanted}

    return {out_name: results.get(src_key, 0.0) for out_name, src_key in wanted}

In [12]:
# Training configuration, built from the constants in the configuration cell.
# Evaluation runs on a step schedule; no eval_steps is given here, and the training
# log further down shows an evaluation row every 10 steps (same value as logging_steps).
args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    eval_strategy="steps",
    learning_rate=LR,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=EPOCHS,
    weight_decay=0.01,
    seed=SEED,
    logging_dir=f"{OUTPUT_DIR}/logs",
    logging_steps=10,  # training loss is logged every 10 steps
    save_steps=50  # checkpoint interval, in steps
)

In [13]:
# Assemble the Trainer. The held-out split doubles as the evaluation set, and
# compute_metrics supplies the seqeval scores. No tokenizer or data collator is
# passed: the inputs were already padded to MAX_LEN in tokenize_and_align.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_tokenized,
    eval_dataset=test_tokenized,
    compute_metrics=compute_metrics
)

In [14]:
# Fine-tune the model; the table below reports loss and seqeval metrics at each evaluation step.
trainer.train()




Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
10,0.5241,0.11828,0.863415,0.917098,0.889447,0.972294
20,0.1719,0.074207,0.924623,0.953368,0.938776,0.980087
30,0.0685,0.060844,0.953846,0.963731,0.958763,0.981818
40,0.0414,0.050592,0.984456,0.984456,0.984456,0.988745
50,0.0692,0.051185,0.979275,0.979275,0.979275,0.987879
60,0.0344,0.048125,0.979275,0.979275,0.979275,0.987879
70,0.0264,0.045442,0.979275,0.979275,0.979275,0.987879




TrainOutput(global_step=75, training_loss=0.1260528187950452, metrics={'train_runtime': 332.0567, 'train_samples_per_second': 3.614, 'train_steps_per_second': 0.226, 'total_flos': 39196638720000.0, 'train_loss': 0.1260528187950452, 'epoch': 3.0})

In [15]:
# Final evaluation on the Trainer's eval_dataset (the held-out split) after training.
trainer.evaluate()



{'eval_loss': 0.045934613794088364,
 'eval_precision': 0.9792746113989638,
 'eval_recall': 0.9792746113989638,
 'eval_f1': 0.9792746113989638,
 'eval_accuracy': 0.9878787878787879,
 'eval_runtime': 5.734,
 'eval_samples_per_second': 17.44,
 'eval_steps_per_second': 1.221,
 'epoch': 3.0}

In [16]:
# Save the fine-tuned model to MODEL_OUTPUT_DIR. The tokenizer was not handed to the
# Trainer above, so it is written to the same directory explicitly.
trainer.save_model(MODEL_OUTPUT_DIR)
tokenizer.save_pretrained(MODEL_OUTPUT_DIR)
print("Model saved to", MODEL_OUTPUT_DIR)

Model saved to ./ner-finetuned/model


In [18]:
# Persist the final evaluation metrics as a one-row CSV under OUTPUT_DIR.
# (pandas is imported in the notebook's top import cell, not here, so that all
# dependencies are declared in one place.)
eval_results = trainer.evaluate()

# Round float metrics to 4 decimal places; any non-float value passes through unchanged.
eval_results_rounded = {
    name: round(value, 4) if isinstance(value, float) else value
    for name, value in eval_results.items()
}

# One row, one column per metric.
metrics_df = pd.DataFrame([eval_results_rounded])

# Destination for the CSV report.
metrics_file_path = f"{OUTPUT_DIR}/evaluation_metrics.csv"

# index=False: the single-row index carries no information.
metrics_df.to_csv(metrics_file_path, index=False)

print(f"Evaluation metrics saved to {metrics_file_path}")
print(metrics_df)



Evaluation metrics saved to ./ner-finetuned/evaluation_metrics.csv
   eval_loss  eval_precision  eval_recall  eval_f1  eval_accuracy  \
0     0.0459          0.9793       0.9793   0.9793         0.9879   

   eval_runtime  eval_samples_per_second  eval_steps_per_second  epoch  
0         5.779                   17.304                  1.211    3.0  
