In [1]:
# import required libraries
import pandas as pd
import numpy as np
import re
import ast

import torch
from transformers import RobertaTokenizerFast, AutoModelForTokenClassification, DataCollatorForTokenClassification, Trainer, TrainingArguments
from datasets import Dataset

import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"


  from pandas.core.computation.check import NUMEXPR_INSTALLED


In [None]:
import wandb

wandb.init()

# Tokenize Data

In [7]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = RobertaTokenizerFast.from_pretrained("Jean-Baptiste/roberta-large-ner-english")

label_list = ['O', 'B-SKILL', 'I-SKILL']
label2id = {label: i for i, label in enumerate(label_list)}
id2label = {i: label for i, label in enumerate(label_list)}

def tokenize_and_label(row):
    text = row["Qualification"]
    skills = row["Skills_Dict"]

    # tokenize the text into a dict output with input_ids and offset_mapping as keys
    encoding = tokenizer(text,
                         return_offsets_mapping=True,
                         return_attention_mask=True,
                         truncation=True,
                         padding="max_length",
                         max_length=128)


    offsets = encoding["offset_mapping"]
    labels = [0] * len(offsets)

    skill_spans = []
    for skill in skills:
        for match in re.finditer(r'\b{}\b'.format(re.escape(skill)), text):
            skill_spans.append((match.start(), match.end()))

    for span_start, span_end in skill_spans:
        inside = False
        for i, (token_start, token_end) in enumerate(offsets):
            if token_start == token_end:
                labels[i] = -100  # special token like [CLS], [SEP], [PAD]
                continue

            if token_end <= span_start:
                continue
            if token_start >= span_end:
                break

            if token_start < span_end and token_end > span_start:
                if labels[i] == 0:
                    labels[i] = 1 if not inside else 2
                    inside = True

    for i, (start, end) in enumerate(offsets):
        if start == end:
            labels[i] = -100

    return {
        "input_ids": encoding["input_ids"],
        "attention_mask": encoding["attention_mask"],
        "labels": labels
    }


df_filtered = pd.read_csv("/Users/tracy/Desktop/留学/UMich/SI 630/Final Project/Data/labeled_train.csv", index_col=0)
df_filtered["Skills_Dict"] = df_filtered["Skills"].apply(ast.literal_eval)

tokenized_data = df_filtered.apply(tokenize_and_label, axis=1)
dataset = Dataset.from_list(tokenized_data.tolist())

dataset = dataset.train_test_split(test_size=0.1)
train_dataset = dataset["train"]
eval_dataset = dataset["test"]


# Load Model

In [8]:
model = AutoModelForTokenClassification.from_pretrained(
    "Jean-Baptiste/roberta-large-ner-english",
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True
)

model.to(device)


Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at Jean-Baptiste/roberta-large-ner-english and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([5]) in the checkpoint and torch.Size([3]) in the model instantiated
- classifier.weight: found shape torch.Size([5, 1024]) in the checkpoint and torch.Size([3, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaForTokenClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 1024, padding_idx=1)
      (position_embeddings): Embedding(514, 1024, padding_idx=1)
      (token_type_embeddings): Embedding(1, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-23): 24 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)


# Train and Evaluate model

In [10]:
def extract_entities(labels):
    entities = []
    start = None
    entity_type = None

    for i, tag in enumerate(labels):
        if tag.startswith("B-"):
            if start is not None:
                entities.append((start, i - 1, entity_type))
            start = i
            entity_type = tag[2:]
        elif tag.startswith("I-"):
            if entity_type is None:
                start = i
                entity_type = tag[2:]
        else:
            if start is not None:
                entities.append((start, i - 1, entity_type))
                start = None
                entity_type = None

    if start is not None:
        entities.append((start, len(labels) - 1, entity_type))

    return entities

def compute_custom_ner_metrics(predictions, labels, id2label):
    total_pred = 0
    total_true = 0
    correct = 0
    token_correct = 0
    token_total = 0

    for pred_seq, label_seq in zip(predictions, labels):
        pred_labels = [id2label[p] for p in pred_seq]
        true_labels = [id2label[l] for l in label_seq]

        pred_entities = set(extract_entities(pred_labels))
        true_entities = set(extract_entities(true_labels))

        total_pred += len(pred_entities)
        total_true += len(true_entities)
        correct += len(pred_entities & true_entities)

        for pl, tl in zip(pred_labels, true_labels):
            if tl != "O":
                token_total += 1
                if pl == tl:
                    token_correct += 1

    precision = correct / total_pred if total_pred else 0
    recall = correct / total_true if total_true else 0
    f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0
    token_acc = token_correct / token_total if token_total else 0

    return {
        "precision": round(precision, 4),
        "recall": round(recall, 4),
        "f1": round(f1, 4),
        "token_accuracy": round(token_acc, 4)
    }

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true = []
    pred = []

    for p_seq, l_seq in zip(predictions, labels):
        filtered_preds = []
        filtered_labels = []
        for p, l in zip(p_seq, l_seq):
            if l != -100:
                filtered_preds.append(p)
                filtered_labels.append(l)
        pred.append(filtered_preds)
        true.append(filtered_labels)

    return compute_custom_ner_metrics(pred, true, id2label)


In [20]:
training_args = TrainingArguments(
    output_dir="./ner-model",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.02,
    logging_dir="./logs",
    logging_steps=10,
    use_cpu=True
)

data_collator = DataCollatorForTokenClassification(tokenizer)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)


In [21]:
trainer.train()

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Token Accuracy
1,0.0093,0.087241,0.8889,0.6154,0.7273,0.593
2,0.0052,0.094641,0.7321,0.7885,0.7593,0.8721
3,0.005,0.068571,0.8095,0.6538,0.7234,0.6744
4,0.0018,0.063129,0.8,0.7692,0.7843,0.814
5,0.0014,0.063887,0.7843,0.7692,0.7767,0.8256


TrainOutput(global_step=50, training_loss=0.00454567264765501, metrics={'train_runtime': 534.8619, 'train_samples_per_second': 1.44, 'train_steps_per_second': 0.093, 'total_flos': 178776702128640.0, 'train_loss': 0.00454567264765501, 'epoch': 5.0})

In [22]:
trainer.evaluate(eval_dataset)

{'eval_loss': 0.06388738751411438,
 'eval_precision': 0.7843,
 'eval_recall': 0.7692,
 'eval_f1': 0.7767,
 'eval_token_accuracy': 0.8256,
 'eval_runtime': 3.1075,
 'eval_samples_per_second': 5.792,
 'eval_steps_per_second': 0.644,
 'epoch': 5.0}

# Predict Skill keywords

In [24]:
def pred_on_text(text):
    encoding = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=256,
        return_offsets_mapping=True
    )
    input_ids = encoding["input_ids"]
    attention_mask = encoding["attention_mask"]
    offsets = encoding["offset_mapping"][0].tolist()

    trainer.model.eval()
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)[0].tolist()


    tokens = tokenizer.convert_ids_to_tokens(input_ids[0])
    results = []
    for token, pred, (start, end) in zip(tokens, predictions, offsets):
        if start == end or pred == -100:
            continue
        label = id2label[pred]
        results.append((text[start:end], label))

    return results

def pred_label(text):
    pred_result = pred_on_text(text)

    list_skills = []
    for p in pred_result:
        if p[1] in ["B-SKILL", "I-SKILL"]:
            list_skills.append(p[0])

    return list_skills

In [25]:
df_pred = pd.read_excel("/Users/tracy/Desktop/留学/UMich/SI 630/Final Project/Data/summarized_test.xlsx", index_col=0)
df_pred = df_pred.dropna()
df_pred.reset_index(inplace=True, drop=True)
df_pred['Skills_Pred'] = None

for idx, row in df_pred.iterrows():
    text = row["Qualification"]
    pred_res = pred_label(text)
    df_pred.at[idx, 'Skills_Pred'] = pred_res

print("Finish Prediction!")

Finish Prediction!


In [None]:
trainer.push_to_hub()