# In [None]:
# NER with BERT

from datasets import load_dataset
# Load the "conllpp" dataset; the code below indexes it by split name ("train", "test").
data = load_dataset("conllpp")

# Notebook-style inspection of the first training example (no effect when run as a script).
data["train"][0]

# Names of the NER tag classes as declared in the dataset's "ner_tags" feature;
# used later to turn label ids into tag strings.
label_list = data["train"].features["ner_tags"].feature.names

from transformers import AutoTokenizer

# Tokenizer matching the "distilbert-base-uncased" checkpoint that is fine-tuned further down.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# Tokenize one example to look at the tokens produced for it.
# is_split_into_words=True because example["tokens"] is already a list of words.
example = data["train"][0]
tokenized_input = tokenizer(example["tokens"], is_split_into_words=True)
tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])


def get_alignment(sequence):
    """Tokenize a batch of pre-split sentences and align NER labels to the tokens.

    The tokenizer may produce several tokens for one word, so the word-level
    ``ner_tags`` have to be re-aligned to token positions: only the first token
    of each word keeps the word's label, every other position gets ``-100``
    (the same sentinel that ``compute_metrics`` filters out).

    Args:
        sequence: A batch of examples, as passed by ``data.map(..., batched=True)``.
            ``sequence["tokens"]`` holds one word list per sentence and
            ``sequence["ner_tags"]`` the matching per-word label ids.

    Returns:
        The tokenizer output for the batch with an added ``"labels"`` entry:
        one list of label ids per sentence, aligned with that sentence's tokens.
    """
    tokenized_inputs = tokenizer(sequence["tokens"], truncation=True, is_split_into_words=True)

    labels = []
    # Iterate over the per-sentence NER tag lists of the batch.
    for i, label in enumerate(sequence["ner_tags"]):
        # Maps every token of sentence i to the index of the word it came from.
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                # Token without a source word (presumably special tokens such
                # as [CLS]/[SEP]): mark it as ignored.
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                # First token of a word: the label is assigned only here.
                label_ids.append(label[word_idx])
            else:
                # Continuation token of the same word.
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# Tokenize every split and attach the aligned labels. batched=True makes map()
# hand get_alignment a batch of examples per call.
tokenized_data = data.map(get_alignment, batched=True)

# Pasted notebook output (presumably the repr of tokenized_data). Kept for
# reference as a comment: executed as code it raises NameError.
#
# DatasetDict({
#     train: Dataset({
#         features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags', 'input_ids', 'attention_mask', 'labels'],
#         num_rows: 14041
#     }),
#     validation: Dataset({
#         features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags', 'input_ids', 'attention_mask', 'labels'],
#         num_rows: 3250
#     }),
#     test: Dataset({
#         features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags', 'input_ids', 'attention_mask', 'labels'],
#         num_rows: 3453
#     })
# })

# Notebook-style inspection of the first tokenized training row (no effect in a script).
tokenized_data['train'][0]


# Fixed-size subsets: the first 5000 training rows and the first 2500 rows of
# the *test* split (used as the evaluation set further down).
small_tr = tokenized_data['train'].select(range(5000))
small_ts = tokenized_data['test'].select(range(2500))


from transformers import DataCollatorForTokenClassification

# Collator that batches tokenized examples and returns TensorFlow tensors
# (return_tensors='tf'), as needed by the Keras training pipeline below.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer, return_tensors='tf')

# Pasted notebook output (presumably the repr of data_collator). Kept for
# reference as a comment: executed as code it raises NameError because
# DistilBertTokenizerFast is never imported.
#
# DataCollatorForTokenClassification(tokenizer=DistilBertTokenizerFast(name_or_path='distilbert-base-uncased',
#     vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right',
#     truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]',
#     'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'},
#     clean_up_tokenization_spaces=True), padding=True, max_length=None,
#     pad_to_multiple_of=None, label_pad_token_id=-100, return_tensors="tf")


import evaluate
# Load the "seqeval" metric; compute_metrics calls seqeval.compute() on tag-string sequences.
seqeval = evaluate.load("seqeval")

import numpy as np

# Tag names of the sample example (label ids mapped through label_list).
# Not used by the visible code below; compute_metrics has its own local `labels`.
labels = [label_list[i] for i in example["ner_tags"]]

def compute_metrics(p):
    """Score token-classification predictions with seqeval.

    Args:
        p: A ``(predictions, labels)`` pair. ``predictions`` holds per-class
            scores and is reduced with an argmax over axis 2; ``labels`` holds
            the reference label ids, with ``-100`` at positions to skip.

    Returns:
        A dict with the overall ``precision``, ``recall``, ``f1`` and
        ``accuracy`` reported by seqeval.
    """
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    true_predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        # Keep only the positions that carry a real label (-100 means "skip").
        kept_pairs = [
            (pred_id, gold_id)
            for pred_id, gold_id in zip(predicted_row, gold_row)
            if gold_id != -100
        ]
        # seqeval works on tag strings, so map ids through label_list.
        true_predictions.append([label_list[pred_id] for pred_id, _ in kept_pairs])
        true_labels.append([label_list[gold_id] for _, gold_id in kept_pairs])

    results = seqeval.compute(predictions=true_predictions, references=true_labels)

    summary = {}
    summary["precision"] = results["overall_precision"]
    summary["recall"] = results["overall_recall"]
    summary["f1"] = results["overall_f1"]
    summary["accuracy"] = results["overall_accuracy"]
    return summary
# Label id <-> tag name tables handed to the model configuration.
# Both are derived from one ordered tag list (position == label id), so the
# two mappings cannot drift apart.
id2label = dict(enumerate([
    'O',
    'B-PER',
    'I-PER',
    'B-ORG',
    'I-ORG',
    'B-LOC',
    'I-LOC',
    'B-MISC',
    'I-MISC',
]))

# Inverse mapping: tag name -> label id.
label2id = {tag_name: tag_id for tag_id, tag_name in id2label.items()}

from transformers import create_optimizer

# Batch size used to derive the number of optimizer steps. It must equal the
# batch size of the training tf.data pipeline (32); with the previous value of
# 16 the learning-rate schedule was laid out over twice as many steps as are
# actually taken per epoch.
batch_size = 32
# Number of epochs the schedule is laid out for.
# NOTE(review): make sure the epochs passed to model.fit() add up to this value.
num_train_epochs = 3
# Full batches per epoch times the number of epochs.
num_train_steps = (len(small_tr) // batch_size) * num_train_epochs
# Returns the optimizer together with its learning-rate schedule; configured
# with an initial LR of 2e-5, weight decay 0.01 and no warmup steps.
optimizer, lr_schedule = create_optimizer(
    init_lr=2e-5,
    num_train_steps=num_train_steps,
    weight_decay_rate=0.01,
    num_warmup_steps=0,
)

from transformers import TFAutoModelForTokenClassification

# TensorFlow token-classification model built from the "distilbert-base-uncased"
# checkpoint. num_labels=9 matches the nine entries of id2label/label2id, which
# are passed along so the model configuration carries the tag names.
model = TFAutoModelForTokenClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=9, id2label=id2label, label2id=label2id
)

 tf_train_set = model.prepare_tf_dataset(
    small_tr,
    shuffle=True,
    batch_size=32,
    collate_fn=data_collator,
)

tf_validation_set = model.prepare_tf_dataset(
    small_ts,
    shuffle=False,
    batch_size=32,
    collate_fn=data_collator,
)

 from transformers.keras_callbacks import KerasMetricCallback

metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_validation_set)

callbacks = [metric_callback]

model.compile(optimizer=optimizer)

model.fit(x=tf_train_set, validation_data=tf_validation_set, epochs=1, callbacks=callbacks)

model.fit(x=tf_train_set, validation_data=tf_validation_set, epochs=1, callbacks=callbacks)



from transformers import NerPipeline
# Sample sentence used to try out the fine-tuned model.
text = "Bill Gates and Microsoft will be the sponsors for one-day cricket international between Pakistan and New Zealand"
print(text)
# Inference pipeline built from the fine-tuned model and its tokenizer,
# using the 'simple' aggregation strategy.
ner = NerPipeline(tokenizer=tokenizer, model=model, aggregation_strategy='simple')
# Notebook-style call: the result is only displayed in a notebook; in a script
# it is computed and discarded.
ner(text)



