In [None]:
!pip install transformers
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.24.0-py3-none-any.whl (5.5 MB)
[K     |████████████████████████████████| 5.5 MB 29.4 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.10.1-py3-none-any.whl (163 kB)
[K     |████████████████████████████████| 163 kB 68.3 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 56.6 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.10.1 tokenizers-0.13.1 transformers-4.24.0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.6.1-py3-none-any.whl (441 kB)
[K     |████████████████████████████████| 441 kB 34.0 M

In [None]:
import re
import pandas as pd
from datasets import Dataset

def load_data_to_df(file):
  """Read a tab-separated token/POS/NER file into a sentence-level DataFrame.

  Every non-blank line must carry at least three tab-separated fields, read as
  token, POS tag and NER tag (any further fields are ignored). A line that
  consists of a newline only ends the current sentence.

  Args:
    file: Path of the annotation file to read.

  Returns:
    pandas.DataFrame with the columns "tokens", "pos" and "ner_tags"; each
    cell holds the list of values for one sentence.

  Raises:
    IndexError: If a non-blank line has fewer than three tab-separated fields.
  """
  tokens = []
  pos = []
  ner_tags = []

  tokens_temp = []
  pos_temp = []
  tags_temp = []

  # The context manager closes the handle even if parsing fails. The encoding
  # is explicit so German characters decode the same way on every platform.
  with open(file, "r", encoding="utf-8") as f:
    for line in f:
      if line == '\n':
        # Sentence boundary: store what was collected and start a new sentence.
        tokens.append(tokens_temp)
        pos.append(pos_temp)
        ner_tags.append(tags_temp)

        tokens_temp = []
        pos_temp = []
        tags_temp = []

      else:
        elements = line.rstrip('\n').split('\t')
        tokens_temp.append(elements[0])
        pos_temp.append(elements[1])
        tags_temp.append(elements[2])

  # A file that does not end with a blank line still has a pending sentence;
  # without this flush that last sentence would be silently lost.
  if tokens_temp:
    tokens.append(tokens_temp)
    pos.append(pos_temp)
    ner_tags.append(tags_temp)

  data = pd.DataFrame({"tokens" : tokens,
                      "pos": pos,
                      "ner_tags" : ner_tags})
  return data

# Parse the train / dev / test annotation files, in that order, into one
# sentence-level DataFrame per split.
train, dev, test = (
    load_data_to_df(split_path)
    for split_path in ('train2.txt', 'dev2.txt', 'test2.txt')
)

In [None]:
# Preview the first rows of the training split: one row per sentence, with parallel
# lists of tokens, POS tags and NER tags.
train.head()

Unnamed: 0,tokens,pos,ner_tags
0,"[Maßnahmenbekanntgabe, zu, MA, 40, ,, Prüfung,...","[NN, APPR, NE, CARD, $,, NN, ART, NN]","[O, O, B-ORG, I-ORG, O, O, O, O]"
1,[INHALTSVERZEICHNIS],[NE],[O]
2,[ABKÜRZUNGSVERZEICHNIS],[NE],[O]
3,"[bzw., beziehungsweise, Nr., Nummer]","[KON, KON, NN, NN]","[O, O, O, O]"
4,"[Erledigung, des, Prüfungsberichtes, Der, Stad...","[NN, ART, NN, ART, NN, NE, VVFIN, ART, NN, APP...","[O, O, O, O, B-ORG, I-ORG, O, O, O, O, O, O, O..."


In [None]:
# Tag inventory of the task. The position of a tag in this list is its integer id,
# and both lookup tables are derived from it so they cannot drift apart.
label_list = ['O','B-ORG','I-ORG']
label2id = {tag: tag_id for tag_id, tag in enumerate(label_list)}
id2label = {tag_id: tag for tag_id, tag in enumerate(label_list)}

# Name of the pretrained checkpoint that the config, tokenizer and model are loaded from.
model_checkpoint = "bert-base-german-dbmdz-cased"

In [None]:
from transformers import (
    AutoConfig,
    AutoModelForTokenClassification,
    AutoTokenizer
)

In [None]:
# Load the checkpoint's configuration and override the classification settings:
# the number of labels and both id<->tag mappings come from the task's tag inventory,
# so they are stored in the config together with the model.
config = AutoConfig.from_pretrained(
    model_checkpoint,
    num_labels=len(label_list),
    id2label = id2label,
    label2id = label2id
)


Downloading:   0%|          | 0.00/433 [00:00<?, ?B/s]

In [None]:
# Tokenizer and model are loaded from the same checkpoint so vocabulary and weights match.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Instantiate a token-classification model using the config built above. The load log
# below reports that some BertForTokenClassification weights are not initialized from
# the checkpoint, so the model has to be fine-tuned before use.
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    config=config
)

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/240k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/479k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/442M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-german-dbmdz-cased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at 

In [None]:
def align_labels(data, label_encoding_dict):
    """Tokenize one pre-split sentence and expand its word-level tags to token-level label ids.

    The sentence is encoded with the module-level ``tokenizer`` (padded and
    truncated to 128 positions). Every position that maps back to a word
    receives the id of that word's tag; positions without a word id receive
    -100, the value ``compute_metrics`` filters out.

    NOTE(review): every sub-token of a word gets the word's tag, so a word tagged
    ``B-ORG`` that is split into several pieces yields several ``B-ORG`` labels.
    Confirm this is the intended labelling scheme.

    Args:
        data: Mapping-like row with "tokens" (the words of one sentence) and
            "ner_tags" (one tag string per word).
        label_encoding_dict: Mapping from tag string to integer label id.

    Returns:
        The tokenizer output for the sentence with an added "labels" entry that
        holds one label id per encoded position.

    Raises:
        KeyError: If a tag is missing from ``label_encoding_dict`` or a word id
            has no corresponding tag.
    """
    tokenized_inputs = tokenizer(data["tokens"],
                        max_length = 128, padding = 'max_length',
                        truncation=True, is_split_into_words=True)

    # Word position -> tag string for this sentence.
    tag_by_word = dict(enumerate(data['ner_tags']))

    # Use the mapping that was passed in rather than a module-level global, so the
    # function honours its own parameter.
    labels = [
        -100 if w_id is None else label_encoding_dict[tag_by_word[w_id]]
        for w_id in tokenized_inputs.word_ids(batch_index=0)
    ]

    tokenized_inputs["labels"] = labels

    return tokenized_inputs

def tokenized_for_bert(df):
  """Encode every sentence of a split and wrap the result in a ``datasets.Dataset``.

  Each row is passed to ``align_labels`` together with the module-level
  ``label2id`` mapping. The resulting "input_ids", "token_type_ids",
  "attention_mask" and "labels" are added as columns to a copy of ``df``; the
  "pos" column is dropped and the frame is converted with
  ``Dataset.from_pandas``. The dataset is printed before it is returned.

  Args:
    df: DataFrame with the columns "tokens", "pos" and "ner_tags".

  Returns:
    The ``Dataset`` built from the encoded frame.

  Raises:
    KeyError: If ``df`` has no "pos" column.
  """
  # Encode all rows first, then assign whole columns. This avoids pre-filling the
  # columns with "" and writing lists into single cells, which only works for
  # object-dtype columns and a unique index.
  encodings = [align_labels(row, label2id) for _, row in df.iterrows()]

  for_bert = df.copy()
  for column in ('input_ids', 'token_type_ids', 'attention_mask', 'labels'):
    for_bert[column] = [encoding[column] for encoding in encodings]

  for_bert = for_bert.drop('pos', axis = 1)
  dataset_for_bert = Dataset.from_pandas(for_bert)
  print(dataset_for_bert)
  return dataset_for_bert

In [None]:
# Encode the three splits, in train / dev / test order; each call prints a summary
# of the dataset it produced.
train_dataset, dev_dataset, test_dataset = (
    tokenized_for_bert(split_frame) for split_frame in (train, dev, test)
)

Dataset({
    features: ['tokens', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 1800
})
Dataset({
    features: ['tokens', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 954
})
Dataset({
    features: ['tokens', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 948
})


In [None]:
!pip install seqeval

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[K     |████████████████████████████████| 43 kB 89 kB/s 
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16180 sha256=71c5322e8c63202acc5c337c00b4ea91564df74fcde0434306fafcaf1a699579
  Stored in directory: /root/.cache/pip/wheels/05/96/ee/7cac4e74f3b19e3158dce26a20a1c86b3533c43ec72a549fd7
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


In [None]:
from transformers import TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
from transformers import EarlyStoppingCallback
import torch
from seqeval.metrics import classification_report

In [None]:
import numpy as np
def compute_metrics(p):
    """Score the predictions of one evaluation run for the ORG entity type.

    Args:
        p: Pair ``(predictions, labels)``. ``predictions`` holds per-position
            class scores, whose argmax over axis 2 gives the predicted label
            id. ``labels`` holds the gold label ids, with -100 marking
            positions that must be skipped.

    Returns:
        dict: The "ORG" entry of seqeval's ``classification_report``; in the
        logged runs it contains "precision", "recall", "f1-score" and
        "support". If the report has no "ORG" entry (presumably when neither
        the gold nor the predicted tags contain an ORG entity), zeros are
        returned under the same keys so that evaluation does not abort.
    """
    scores, labels = p
    predicted_ids = np.argmax(scores, axis=2)

    true_predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted_ids, labels):
        # Keep only positions that carry a real gold label.
        kept = [
            (predicted_id, gold_id)
            for predicted_id, gold_id in zip(predicted_row, gold_row)
            if gold_id != -100
        ]
        true_predictions.append([label_list[predicted_id] for predicted_id, _ in kept])
        true_labels.append([label_list[gold_id] for _, gold_id in kept])

    result = classification_report(true_labels, true_predictions, output_dict = True)

    return result.get(
        'ORG',
        {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 0},
    )



# Hyper-parameters of the fine-tuning run.
batch_size = 16
learning_rate = 5e-5
epoch = 10

args = TrainingArguments(
    "test-ner",  # output directory for checkpoints and logs
    evaluation_strategy = "steps",
    eval_steps = 50,
    # Save on the same schedule as evaluation. With the default (larger) save interval,
    # most evaluations have no checkpoint, so the best-scoring step could not be restored
    # by load_best_model_at_end. NOTE(review): best-metric tracking, and with it early
    # stopping, also appears to update only at save steps - confirm against the Trainer docs.
    save_strategy = "steps",
    save_steps = 50,
    logging_steps = 50,
    learning_rate=learning_rate,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=epoch,
    weight_decay=1e-5,
    # Key of the dict returned by compute_metrics that ranks checkpoints.
    metric_for_best_model = 'f1-score',
    # Keep disk usage bounded now that a checkpoint is written at every evaluation.
    save_total_limit=5,
    load_best_model_at_end = True
)

data_collator = DataCollatorForTokenClassification(tokenizer)


trainer = Trainer(
    model,
    args,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    # Stop once the tracked metric has failed to improve for three consecutive evaluations.
    callbacks = [EarlyStoppingCallback(early_stopping_patience = 3)]
)

print("Training...")
trainer.train()
print("Evaluating...")
trainer.evaluate()

The following columns in the training set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: ner_tags, tokens. If ner_tags, tokens are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 1800
  Num Epochs = 10
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 1130
  Number of trainable parameters = 109339395
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Training...


Step,Training Loss,Validation Loss,Precision,Recall,F1-score,Support
50,0.1192,0.078285,0.643384,0.714355,0.677014,2076
100,0.0539,0.062279,0.736081,0.808767,0.770714,2076
150,0.0416,0.058118,0.776591,0.805395,0.790731,2076
200,0.036,0.055591,0.707424,0.858382,0.775626,2076
250,0.0303,0.057293,0.768822,0.836224,0.801108,2076
300,0.0255,0.064482,0.735245,0.840077,0.784173,2076
350,0.0267,0.059598,0.777983,0.810212,0.793771,2076
400,0.0214,0.055715,0.768527,0.849229,0.806865,2076
450,0.0178,0.056237,0.774989,0.847784,0.809754,2076
500,0.0117,0.06086,0.809634,0.842004,0.825502,2076


The following columns in the evaluation set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: ner_tags, tokens. If ner_tags, tokens are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 954
  Batch size = 16
The following columns in the evaluation set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: ner_tags, tokens. If ner_tags, tokens are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 954
  Batch size = 16
The following columns in the evaluation set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: ner_tags, tokens. If ner_tags, tokens are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
 

Evaluating...


{'eval_loss': 0.060859911143779755,
 'eval_precision': 0.809634089856415,
 'eval_recall': 0.8420038535645472,
 'eval_f1-score': 0.8255017709563165,
 'eval_support': 2076,
 'eval_runtime': 7.7598,
 'eval_samples_per_second': 122.941,
 'eval_steps_per_second': 7.732,
 'epoch': 5.75}

In [None]:
# Keep the full prediction output in a variable for later use and display only its
# summary metrics; echoing the object itself prints the entire array of raw scores.
test_output = trainer.predict(test_dataset)
test_output.metrics

The following columns in the test set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: ner_tags, tokens. If ner_tags, tokens are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 948
  Batch size = 16


PredictionOutput(predictions=array([[[ 3.9110475 , -0.40749884, -2.5104394 ],
        [ 7.646634  , -2.8674703 , -3.6296341 ],
        [ 7.6051126 , -2.80595   , -3.5596433 ],
        ...,
        [ 6.44978   , -2.9990916 , -1.8067471 ],
        [ 6.499626  , -3.0433328 , -1.8625259 ],
        [ 3.8731773 , -0.3928484 , -2.4924996 ]],

       [[ 2.268562  ,  0.05247756, -1.7877622 ],
        [ 7.897511  , -3.5148902 , -3.9147797 ],
        [ 7.6728287 , -3.6376078 , -3.876852  ],
        ...,
        [ 5.98212   , -2.9625158 , -2.0979192 ],
        [ 5.9532804 , -2.8680558 , -2.1514614 ],
        [ 2.2664766 ,  0.05025629, -1.7852705 ]],

       [[ 2.2180457 , -0.0148789 , -1.6795976 ],
        [ 6.713145  , -2.4653203 , -2.968332  ],
        [-2.430811  ,  5.541468  , -3.1071675 ],
        ...,
        [ 5.190865  , -2.806817  , -1.5729558 ],
        [ 4.2855105 , -2.327956  , -1.375544  ],
        [ 2.2146957 , -0.01730819, -1.6752793 ]],

       ...,

       [[-0.18836343, -0.753585

In [None]:
# Write the fine-tuned model (config and weights) and the tokenizer files to ./bert,
# as listed in the save log below.
# NOTE(review): later cells load from /content/drive/MyDrive/bert rather than ./bert -
# presumably the directory was copied to Drive by hand; confirm or save there directly.
trainer.save_model('./bert')

Saving model checkpoint to ./bert
Configuration saved in ./bert/config.json
Model weights saved in ./bert/pytorch_model.bin
tokenizer config file saved in ./bert/tokenizer_config.json
Special tokens file saved in ./bert/special_tokens_map.json


In [None]:
# Authenticate with the Hugging Face Hub. The CLI prompts interactively for an access
# token and stores it on the runtime, so no token is written into the notebook.
!huggingface-cli login


        _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
        _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
        _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
        _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
        _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

        To login, `huggingface_hub` now requires a token generated from https://huggingface.co/settings/tokens .
        
Token: 
Login successful
Your token has been saved to /root/.huggingface/token
[1m[31mAuthenticated through git-credential store but this isn't the helper defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub. Run the following command in yo

In [None]:
from transformers import AutoModel

# Reload the fine-tuned checkpoint with the token-classification class. The saved config
# declares "BertForTokenClassification"; loading it through plain AutoModel builds only the
# encoder and drops the trained classification head, which is what the config of the
# uploaded model ("architectures": ["BertModel"]) shows.
model_trained = AutoModelForTokenClassification.from_pretrained("/content/drive/MyDrive/bert")

loading configuration file /content/drive/MyDrive/bert/config.json
Model config BertConfig {
  "_name_or_path": "/content/drive/MyDrive/bert",
  "architectures": [
    "BertForTokenClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "O",
    "1": "B-ORG",
    "2": "I-ORG"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "B-ORG": 1,
    "I-ORG": 2,
    "O": 0
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.24.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 31102
}

loading weights file /content/drive/MyDrive/bert/pytorch_model.bin
Some weights of the model checkpoint at /content/drive/My

In [None]:
# Download the published checkpoint from the Hub to check that it loads.
# NOTE(review): AutoModel builds a bare BertModel here (the log below reports
# "architectures": ["BertModel"]), i.e. without a token-classification head - confirm
# whether the Hub copy is meant to include the head and load it with the matching class.
# NOTE(review): this cell precedes the push_to_hub cells in the notebook - confirm the
# intended execution order.
model_load = AutoModel.from_pretrained("Pacho/bert_finetuning_sh")

Downloading:   0%|          | 0.00/767 [00:00<?, ?B/s]

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--Pacho--bert_finetuning_sh/snapshots/9a35b6fab50617528b6cc89ceb1b472fa20445f1/config.json
Model config BertConfig {
  "_name_or_path": "Pacho/bert_finetuning_sh",
  "architectures": [
    "BertModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "O",
    "1": "B-ORG",
    "2": "I-ORG"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "B-ORG": 1,
    "I-ORG": 2,
    "O": 0
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.24.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 31102
}



Downloading:   0%|          | 0.00/440M [00:00<?, ?B/s]

loading weights file pytorch_model.bin from cache at /root/.cache/huggingface/hub/models--Pacho--bert_finetuning_sh/snapshots/9a35b6fab50617528b6cc89ceb1b472fa20445f1/pytorch_model.bin
All model checkpoint weights were used when initializing BertModel.

All the weights of BertModel were initialized from the model checkpoint at Pacho/bert_finetuning_sh.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertModel for predictions without further training.


In [None]:
# Upload the weights and config of model_trained to the Hub repository
# "bert_finetuning_sh" of the logged-in account (the log shows Pacho/bert_finetuning_sh).
model_trained.push_to_hub("bert_finetuning_sh")

Configuration saved in /tmp/tmp34x3ew66/config.json
Model weights saved in /tmp/tmp34x3ew66/pytorch_model.bin
Uploading the following files to Pacho/bert_finetuning_sh: pytorch_model.bin,config.json


CommitInfo(commit_url='https://huggingface.co/Pacho/bert_finetuning_sh/commit/a8b1c9df2692c832b60010841481db214e658698', commit_message='Upload model', commit_description='', oid='a8b1c9df2692c832b60010841481db214e658698', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
# Reload the tokenizer files stored next to the fine-tuned model and upload them to the
# same Hub repository, so that repository holds both the model and its tokenizer.
tokenizer_trained = AutoTokenizer.from_pretrained("/content/drive/MyDrive/bert")
tokenizer_trained.push_to_hub("bert_finetuning_sh")

loading file vocab.txt
loading file tokenizer.json
loading file added_tokens.json
loading file special_tokens_map.json
loading file tokenizer_config.json
tokenizer config file saved in /tmp/tmpo4oumtao/tokenizer_config.json
Special tokens file saved in /tmp/tmpo4oumtao/special_tokens_map.json
Uploading the following files to Pacho/bert_finetuning_sh: vocab.txt,tokenizer_config.json,tokenizer.json,special_tokens_map.json


CommitInfo(commit_url='https://huggingface.co/Pacho/bert_finetuning_sh/commit/2d41882b233d7e5e2a5098826e58604f3d5ff870', commit_message='Upload tokenizer', commit_description='', oid='2d41882b233d7e5e2a5098826e58604f3d5ff870', pr_url=None, pr_revision=None, pr_num=None)