In [None]:
!pip install -U accelerate>=0.21.0
!pip install torch -U
!pip install transformers -U
!pip install datasets spacy tqdm
!pip install seqeval
!pip install evaluate
!pip install pandas seaborn



In [None]:
import torch

# Seed PyTorch's random number generator and ask cuDNN for deterministic behaviour.
SEED = 1234
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# Use CUDA when PyTorch reports it as available, otherwise stay on the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")

print("PyTorch Version: ", torch.__version__)
print(f"Using {'GPU' if DEVICE.type == 'cuda' else 'CPU'}.")

In [None]:
from huggingface_hub import notebook_login
# Authenticate with the Hugging Face Hub from inside the notebook.
# NOTE(review): presumably required because the training arguments further down ask
# for the model to be pushed to the Hub — confirm.
notebook_login()

In [None]:
from datasets import load_dataset, load_dataset_builder, get_dataset_split_names, ClassLabel, Sequence

# Inspect the dataset's metadata before loading it: the builder's `info` carries the
# description and the feature schema, both of which are printed below.
dataset_name = "surrey-nlp/PLOD-CW"
ds_builder = load_dataset_builder(dataset_name)
print(ds_builder.info.description)
print(ds_builder.info.features)


In [None]:
# Load the dataset named by `dataset_name`; later cells index the result by split
# name (for example ds["test"]).
ds = load_dataset(dataset_name)

In [None]:
from datasets import ClassLabel, Value, Sequence, Features

# Label vocabularies: the position of a name in its list is the integer id it maps to.
_POS_TAG_NAMES = ['ADJ', 'ADP', 'ADV', 'AUX', 'CONJ', 'CCONJ', 'DET', 'INTJ', 'NOUN', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM', 'VERB', 'X', 'SPACE']
_NER_TAG_NAMES = ['B-O', 'B-AC', 'B-LF', 'I-LF']

# Build each ClassLabel once. Constructing one inside the per-tag loop repeated the
# same work for every tag of every example.
_POS_CLASS_LABEL = ClassLabel(names=_POS_TAG_NAMES)
_NER_CLASS_LABEL = ClassLabel(names=_NER_TAG_NAMES)


def convert_format(example):
    """Replace the string POS and NER tags of one example with integer class ids.

    Args:
        example: a single dataset row with 'pos_tags' and 'ner_tags' entries, each an
            iterable of tag strings.

    Returns:
        The same `example` object, modified in place, with both tag entries replaced
        by lists of integer ids.

    Raises:
        ValueError: propagated from `ClassLabel.str2int` when a tag is not a known
            class name.
    """
    example['pos_tags'] = [_POS_CLASS_LABEL.str2int(tag) for tag in example['pos_tags']]
    example['ner_tags'] = [_NER_CLASS_LABEL.str2int(tag) for tag in example['ner_tags']]
    return example

 * Serving Flask app '__main__'
 * Debug mode: on


 * Running on http://127.0.0.1:5000
INFO:werkzeug:Press CTRL+C to quit
INFO:werkzeug: * Restarting with stat


In [None]:
# Target schema for the converted dataset: tokens remain strings, while both tag
# columns become variable-length sequences of class labels.
new_features = Features(
    dict(
        tokens=Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
        pos_tags=Sequence(
            feature=ClassLabel(
                names=[
                    'ADJ', 'ADP', 'ADV', 'AUX', 'CONJ', 'CCONJ', 'DET', 'INTJ', 'NOUN', 'NUM',
                    'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM', 'VERB', 'X', 'SPACE',
                ]
            ),
            length=-1,
            id=None,
        ),
        ner_tags=Sequence(
            feature=ClassLabel(names=['B-O', 'B-AC', 'B-LF', 'I-LF']),
            length=-1,
            id=None,
        ),
    )
)

# Apply the string-to-id conversion to the loaded dataset under the schema above.
new_dataset = ds.map(convert_format, features=new_features)

In [None]:
# Class names of the NER tag feature, read from the train split's schema; other
# cells index this list with integer label ids.
label_list = (
    new_dataset["train"]
    .features["ner_tags"]
    .feature
    .names
)

In [None]:
# When True, every sub-token of a word carries that word's label; when False, only the
# first sub-token does and the remaining ones are set to -100.
label_all_tokens = True


def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and align word labels to sub-tokens.

    Args:
        examples: batch mapping with "tokens" (one list of words per sentence) and
            "ner_tags" (one list of label ids per sentence, indexed by word position).

    Returns:
        The tokenizer output with an added "labels" entry: one list per sentence,
        with one label per produced token. Tokens whose word id is None (special
        tokens) get -100 so that the loss function ignores them.
    """
    # For some models, max_length may need to be set to roughly 500 here.
    encoded = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    aligned_batch = []
    for row, word_labels in enumerate(examples["ner_tags"]):
        word_ids = encoded.word_ids(batch_index=row)
        aligned = []
        for position, word_id in enumerate(word_ids):
            # Word id of the token immediately before this one (None at the start).
            preceding = word_ids[position - 1] if position > 0 else None
            if word_id is None:
                aligned.append(-100)
            elif word_id != preceding or label_all_tokens:
                # First sub-token of a word, or a continuation that should be labelled.
                aligned.append(word_labels[word_id])
            else:
                # Continuation sub-token that is excluded from the loss.
                aligned.append(-100)
        aligned_batch.append(aligned)

    encoded["labels"] = aligned_batch
    return encoded

In [None]:
# `evaluate` is pip-installed in the first cell but is not imported by any visible
# cell, so import it here where it is first used.
import evaluate

# seqeval metric used to score the tagged sequences.
metric = evaluate.load("seqeval")
def compute_metrics(eval_preds):
    """Score predictions against references with the module-level `metric`.

    Args:
        eval_preds: a `(logits, labels)` pair. `logits` has the class scores on its
            last axis; `labels` holds integer label ids, with -100 marking positions
            to ignore.

    Returns:
        A dict with "precision", "recall", "f1" and "accuracy", taken from the
        metric's "overall_*" entries.
    """
    # Imported locally because no visible cell brings NumPy into scope as `np`.
    import numpy as np

    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    # Drop ignored positions (label -100) and map the remaining ids to label names.
    true_labels = [[label_list[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": all_metrics["overall_precision"],
        "recall": all_metrics["overall_recall"],
        "f1": all_metrics["overall_f1"],
        "accuracy": all_metrics["overall_accuracy"],
    }


In [None]:
from transformers import TrainingArguments, AutoConfig, AutoModelForTokenClassification, DataCollatorForTokenClassification, AutoTokenizer, Trainer
import accelerate
import inspect

# Reload and re-convert the dataset (repeats the earlier load/convert cells).
ds = load_dataset(dataset_name)
new_dataset = ds.map(convert_format, features=new_features)

task = "ner" # Should be one of "ner", "pos" or "chunk"
model_checkpoint = "bert-base-cased"
batch_size = 4
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Tokenize every split and drop the original columns so that only the tokenizer's
# outputs and the aligned "labels" remain.
tokenized_datasets = new_dataset.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=new_dataset["train"].column_names,
)
# Pads the examples of a batch to a common length.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

# Two-way mapping between integer ids and label names, stored in the model config.
id2label = {i: label for i, label in enumerate(label_list)}
label2id = {v: k for k, v in id2label.items()}
print("id2label", id2label)
print("label2id", label2id)

model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    id2label=id2label,
    label2id=label2id,
)

# Newer transformers releases name this argument `eval_strategy`; older ones only
# accept `evaluation_strategy`. Use whichever name the installed version provides.
_training_arg_names = inspect.signature(TrainingArguments.__init__).parameters
_eval_strategy_key = "eval_strategy" if "eval_strategy" in _training_arg_names else "evaluation_strategy"

args = TrainingArguments(
    "bert-finetuned-ner",
    save_strategy="epoch",
    learning_rate=2e-5,
    # `batch_size` was defined above but never passed on; wire it in for both phases.
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=5,
    weight_decay=0.01,
    # Was misspelled `pust_to_hub`, which TrainingArguments rejects with a TypeError.
    push_to_hub=True,
    **{_eval_strategy_key: "epoch"},
)

In [None]:
import inspect

# Newer transformers releases accept the tokenizer as `processing_class`, older ones
# as `tokenizer`. Use whichever keyword the installed Trainer provides.
_trainer_arg_names = inspect.signature(Trainer.__init__).parameters
_tokenizer_key = "processing_class" if "processing_class" in _trainer_arg_names else "tokenizer"

# Fine-tune on the train split, evaluating on the validation split with
# `compute_metrics`.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    **{_tokenizer_key: tokenizer},
)
trainer.train()

In [None]:
# Run the model on one test sentence and reduce the per-sub-token predictions to one
# prediction per original word.
text = ds["test"][6]["tokens"]
labels = ds["test"][6]["ner_tags"]
model.to(DEVICE)
# Switch to evaluation mode so dropout is disabled during this forward pass.
model.eval()
inputs = tokenizer(text, return_tensors="pt",  truncation=True, is_split_into_words=True, return_offsets_mapping=True)
inputs = {key: value.to(DEVICE) for key, value in inputs.items()}
# The offsets are only needed for the word-level aggregation below; remove them from
# `inputs` before the dict is unpacked into the model call.
offset_mapping = inputs['offset_mapping'].cpu().tolist()[0]
del inputs["offset_mapping"]

# Forward pass without gradient tracking.
with torch.no_grad():
    outputs = model(**inputs)
    predictions = torch.argmax(outputs.logits, dim=-1)
    # Keep the tensor form, then flatten a CPU copy into a plain list of class ids.
    pre_predictions = predictions
    predictions = predictions.cpu().view(-1).tolist()

# Flattened copies of the model inputs (not used further in this cell).
input_ids = inputs['input_ids'].cpu().view(-1).tolist()
attention_mask = inputs['attention_mask'].cpu().view(-1).tolist()

aggregated_predictions = []
for i, offset in enumerate(offset_mapping):
    # Ignore special tokens, whose offsets sum to zero.
    if sum(offset) == 0:
        continue
    # A start offset of 0 marks the first sub-token of a new word; keep only that
    # sub-token's prediction for the word.
    if offset[0] == 0:
        aggregated_predictions.append(predictions[i])


print(len(aggregated_predictions))

In [None]:
# NumPy is not imported by any visible cell, so bring it into scope here.
import numpy as np

predictions, labels, _ = trainer.predict(tokenized_datasets["test"])
predictions = np.argmax(predictions, axis=2)

# `predictions` is what the model predicted; `labels` is the ground truth.
# Positions labelled -100 are dropped from both before scoring.
true_predictions = [
    [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]
true_labels = [
    [label_list[l] for l in label if l != -100]
    for label in labels
]

# Compute multiple metrics on the test results.
results = metric.compute(predictions=true_predictions, references=true_labels)
results