In [None]:
pip install transformers datasets tokenizers seqeval pyarrow -q

In [None]:
!pip install --upgrade datasets pyarrow

In [None]:
# Print the metadata pip has recorded for the installed pyarrow package (debugging aid).
!pip show pyarrow

In [None]:
# Workaround ("hack"): additionally install pyarrow from the conda-forge channel,
# auto-confirming the prompt (-y).
# NOTE(review): presumably this resolves a pyarrow/datasets incompatibility left by
# the pip installs above — confirm it is still needed.
!conda install -c conda-forge pyarrow -y

In [None]:
import datasets 
from datasets import Dataset, load_dataset, DatasetDict
import numpy as np 
from transformers import BertTokenizerFast 
from transformers import DataCollatorForTokenClassification 
from transformers import AutoModelForTokenClassification 
from datasets import Dataset

# Reference CoNLL-2003 corpus fetched by name.
# NOTE(review): `conll2003` is not used anywhere else in the visible notebook — confirm it is needed.
conll2003 = load_dataset("conll2003")

# ATCO corpus: one JSON file per split, read from the Kaggle input directory.
atco = load_dataset(
    "json",
    data_files=dict(
        train="/kaggle/input/allv3/train.json",
        validation="/kaggle/input/allv3/validation.json",
        test="/kaggle/input/allv3/test.json",
    ),
)

In [None]:
# Display the loaded `atco` dataset object to inspect its splits.
atco

In [None]:
# Fast BERT tokenizer for the same "bert-base-uncased" checkpoint the model is loaded from.
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased") 

In [None]:
def tokenize_and_align_labels(examples, label_all_tokens=True):
    """Tokenize pre-split words and expand word-level tags to token-level labels.

    Args:
        examples: batch mapping with "tokens" (one list of words per example) and
            "ner_tags" (one list of per-word tag ids per example).
        label_all_tokens: when true, every sub-token of a word receives that word's
            tag; when false, only the first sub-token does and the rest get -100.

    Returns:
        The output of the module-level ``tokenizer`` with an added "labels" entry
        holding one label list per example. Tokens with no word id are labelled -100.
    """
    encoded = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)
    aligned = []
    for batch_idx, word_tags in enumerate(examples["ner_tags"]):
        token_labels = []
        prev_word = None
        for word in encoded.word_ids(batch_index=batch_idx):
            if word is None:
                # No originating word for this token.
                token_label = -100
            elif word != prev_word or label_all_tokens:
                # First sub-token of a word, or a continuation that should be labelled too.
                token_label = word_tags[word]
            else:
                # Continuation sub-token that is excluded.
                token_label = -100
            token_labels.append(token_label)
            prev_word = word
        aligned.append(token_labels)
    encoded["labels"] = aligned
    return encoded

In [None]:
# Apply tokenization and label alignment to the whole `atco` dataset;
# batched=True hands the function batches of examples instead of single rows.
tokenized_datasets = atco.map(tokenize_and_align_labels, batched=True)

In [None]:

# Pretrained BERT with a token-classification head. num_labels=2 has to match the
# length of `label_list` (['O', 'CSG']) defined further down in this notebook.
model = AutoModelForTokenClassification.from_pretrained("bert-base-uncased", num_labels=2)

In [None]:
from transformers import TrainingArguments, Trainer 

# Fine-tuning hyperparameters; outputs are written under "test-ner" and
# evaluation runs once per epoch.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
args = TrainingArguments(
    output_dir="test-ner",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

In [None]:
# Batch collator built from the tokenizer; handed to the Trainer as `data_collator`.
# NOTE(review): presumably pads inputs and labels to a common length per batch — see transformers docs.
data_collator = DataCollatorForTokenClassification(tokenizer) 

In [None]:
# seqeval metric object used by compute_metrics.
# NOTE(review): `datasets.load_metric` is deprecated and dropped in newer `datasets`
# releases (superseded by the separate `evaluate` package); combined with the
# `--upgrade` install earlier this call may fail — confirm against the installed version.
metric = datasets.load_metric("seqeval") 

In [None]:

# First training record, used for the sanity checks in the following cells.
example = atco['train'][0]

In [None]:

# Label names, indexed by the integer ids stored in "ner_tags" (0 -> 'O', 1 -> 'CSG').
# NOTE(review): 'CSG' presumably stands for "callsign" — confirm.
label_list = ['O', 'CSG']

label_list

In [None]:
# Inspect the integer tag ids of the sample record.
example["ner_tags"]

In [None]:


# Sanity check of the metric: map the sample's tag ids to label names and score the
# sequence against itself.
labels = [label_list[i] for i in example["ner_tags"]] 

metric.compute(predictions=[labels], references=[labels]) 

In [None]:
def compute_metrics(eval_preds):
    """Turn raw evaluation output into overall seqeval scores.

    Args:
        eval_preds: pair ``(logits, label_ids)``. The logits are arg-maxed over
            axis 2 to obtain one predicted class id per token.

    Returns:
        Dict with "precision", "recall", "f1" and "accuracy", taken from the
        ``overall_*`` entries returned by the module-level ``metric``.
    """
    logits, gold_ids = eval_preds

    # Arg-max of the logits equals arg-max of the probabilities, so no softmax is needed.
    pred_ids = np.argmax(logits, axis=2)

    predictions = []
    true_labels = []
    for pred_row, gold_row in zip(pred_ids, gold_ids):
        # Positions whose gold label is -100 are excluded from scoring.
        kept = [(p, g) for p, g in zip(pred_row, gold_row) if g != -100]
        predictions.append([label_list[p] for p, _ in kept])
        true_labels.append([label_list[g] for _, g in kept])

    scores = metric.compute(predictions=predictions, references=true_labels)
    return {
        "precision": scores["overall_precision"],
        "recall": scores["overall_recall"],
        "f1": scores["overall_f1"],
        "accuracy": scores["overall_accuracy"],
    }

In [None]:
# Wire model, hyperparameters, tokenized train/validation splits, collator,
# tokenizer and metric callback into a single Trainer.
trainer = Trainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Run fine-tuning with the Trainer configured above.
trainer.train() 

In [None]:
# Evaluate on the held-out test split. The *tokenized* split must be used here:
# the raw `atco["test"]` only carries the original columns ("tokens", "ner_tags")
# and none of the model inputs produced by tokenize_and_align_labels.
predictions, label_ids, metrics = trainer.predict(tokenized_datasets["test"])

In [None]:
# Save the fine-tuned model into the "ner_model" directory (its config.json is patched below).
model.save_pretrained("ner_model")

In [None]:
# Save the tokenizer files into a separate "tokenizer" directory.
tokenizer.save_pretrained("tokenizer")

In [None]:
# id -> label name. Keys are strings because this map is written to JSON, where
# object keys are always strings.
id2label = {str(idx): name for idx, name in enumerate(label_list)}

# label name -> id. Ids are kept as integers (previously strings): the model config
# expects integer class ids here and does not convert string values when reloading.
label2id = {name: idx for idx, name in enumerate(label_list)}

In [None]:
import json

In [None]:
# Read the config saved next to the model; the context manager closes the file
# handle instead of leaving it open until garbage collection.
with open("ner_model/config.json", encoding="utf-8") as config_file:
    config = json.load(config_file)

In [None]:
# Add the label maps to the loaded config dict.
# NOTE(review): presumably so the reloaded model/pipeline reports label names rather
# than generic ids — confirm.
config["id2label"] = id2label
config["label2id"] = label2id

In [None]:
# Write the patched config back. Closing the file via the context manager guarantees
# the content is flushed to disk before the model is reloaded from this directory.
with open("ner_model/config.json", "w", encoding="utf-8") as config_file:
    json.dump(config, config_file)

In [None]:
# Reload the model from "ner_model" so it is built from the patched config.json.
model_fine_tuned = AutoModelForTokenClassification.from_pretrained("ner_model")

In [None]:
# Print the metadata pip has recorded for the installed transformers package (debugging aid).
!pip show transformers

In [None]:
from transformers import pipeline

In [None]:
import shutil

# Zip the current working directory into "output.zip".
# NOTE(review): the archive is created inside the directory being archived, so it may
# end up containing (a partial copy of) itself or a previous output.zip — confirm.
shutil.make_archive(base_name="output", format="zip", root_dir="./")

In [None]:
# Build a token-classification pipeline from the reloaded model and the tokenizer.
nlp = pipeline("token-classification", model=model_fine_tuned, tokenizer=tokenizer)

# NOTE(review): a bare expression in the middle of a cell is not displayed (only the
# cell's last expression is), so this line has no visible effect.
model_fine_tuned


# Sample utterance to tag. This rebinds `example`, which earlier held a training record.
example = "Alpha Charlie Zero Three You be leave tma praha"

ner_results = nlp(example)

# Print every item returned by the pipeline, one per line.
for res in ner_results:
    print(res)

In [None]:
# Tag a longer utterance and print each returned word; the predicted entity label is
# appended directly to the word only when the score is above 0.5.
for res in nlp("Alpha Charlie Zero Three You be leave tma praha switch to praha info one two six decimal one naslysenou"):
    final_str = res["word"] + (res["entity"] if res["score"] > 0.5 else "")
    print(final_str)

In [None]:
import torch

In [None]:
# Manual forward pass on two hand-written id sequences. The second row holds a single
# real token followed by padding, which the attention mask (zeros) marks as ignored.
input_ids = torch.tensor([[7592, 2057, 2097, 2393, 9611, 2115], [7592, 0, 0, 0, 0, 0]])
attention_mask = torch.tensor([[1, 1, 1, 1, 1, 1], [1, 0, 0, 0, 0, 0]])

# Inference only: disable autograd so no graph is built, and place the inputs on the
# model's device in case the model is no longer on the CPU.
with torch.no_grad():
    output = model_fine_tuned(
        input_ids.to(model_fine_tuned.device),
        attention_mask=attention_mask.to(model_fine_tuned.device),
    )
print(output.logits)