### Install and import required packages

In [None]:
# !pip install keras
# !pip install scikit-learn
# !pip install transformers
# !pip install torch torchvision torchaudio

In [None]:
import csv
import pickle
import pandas as pd
import numpy as np

import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

from keras.preprocessing.sequence import pad_sequences

import transformers
from transformers import BertTokenizer, BertConfig
from transformers import get_linear_schedule_with_warmup
from transformers import BertForTokenClassification, AdamW

from sklearn.model_selection import train_test_split
from sklearn.metrics import multilabel_confusion_matrix
import matplotlib.pyplot as plt

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [None]:
torch.__version__

In [None]:
device = torch.device('cuda')
n_gpu = torch.cuda.device_count()
torch.cuda.get_device_name(0)

In [None]:
transformers.__version__

### Set-up data

In [None]:
#data = pd.read_csv('New_Data.csv', sep=',')
data = pd.read_csv('Classified_data.csv', sep=',')

data = data.rename(columns={'text': 'Text', 'ner_tag': 'Tag', 'value': 'Word'})

data.head()

In [None]:
data.dtypes

### Set-up data iterator

The class **`GetSentence`** returns a list of tokenized sentence and its corresponding labels.

In [None]:
class GetSentence(object):
    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False
        agg = lambda s: [(w, t) for w, t in zip(s['Word'].values.tolist(), s['Tag'].values.tolist())]
        self.grouped = self.data.groupby('sentence_number').apply(agg)
        self.sentences = [s for s in self.grouped]

    def get_next(self):
        try:
            s = self.grouped['{}'.format(self.n_sent)]
            self.n_sent += 1
            return s
        except:
            return None

In [None]:
getter = GetSentence(data)

In [None]:
sentences = [[word[0] for word in sentence] for sentence in getter.sentences]
sentences[0]

In [None]:
labels = [[s[1] for s in sentence] for sentence in getter.sentences]
print(labels[0])

### Set of unique tags and its indices

In [None]:
tag_values = list(set(data['Tag'].values))
tag_values.append('PAD')
tag2idx = {t: i for i, t in enumerate(tag_values)}

In [None]:
tag_values

Save **`tag_values`** as it will be required for later use.

In [None]:
t_values = open("tag_values.pkl", "wb")
pickle.dump(tag_values, t_values)
t_values.close()

### Set-up BERT tokenizer from pre-trained **`bert-base-german-cased`**

In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-german-cased', do_lower_case=False)

As with `tag_values`, we will also require **`tokenizer`** for later use.

In [None]:
save_tokenizer = open("tokenizer.pkl", "wb")
pickle.dump(tokenizer, save_tokenizer)
save_tokenizer.close()

Since BERT uses **WordPiece**, we also have to make our sentences to similar format.

The following function accepts **`sentences`** and **`labels`**, and iterates through every single one of them.

Our **`tokenizer`** is applied to every single word from each sentence of **`sentences`**. While doing this, we have to make each sub-word from word has the same label.

In [None]:
def tokenize_preserve_labels(sentence, text_labels):
    tokenized_sentence = []
    labels = []

    for word, label in zip(sentence, text_labels):
      if not isinstance(word, str):
        word = str(word)  # Convert non-string word to string

      tokenized_word = tokenizer.tokenize(word)
      n_subwords = len(tokenized_word)
      tokenized_sentence.extend(tokenized_word)
      labels.extend([label] * n_subwords)

    return tokenized_sentence, labels

In [None]:
%%time
tokenized_texts_labels = [tokenize_preserve_labels(sent, labels) for sent, labels in zip(sentences, labels)]

Extract **tokens** and **labels** from **`tokenized_texts_labels`**.

In [None]:
tokenized_texts = [token_label_pair[0] for token_label_pair in tokenized_texts_labels]
labels = [token_label_pair[1] for token_label_pair in tokenized_texts_labels]

In [None]:
tokenized_texts[4]

### Apply padding and generate **`attention_mask`**

In [None]:
MAX_LEN = 100
BATCH_SIZE = 64

In [None]:
input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts], maxlen=MAX_LEN, dtype='long', value=0.0, truncating='post', padding='post')

In [None]:
tags = pad_sequences([[tag2idx.get(l) for l in lab] for lab in labels], maxlen=MAX_LEN, value=tag2idx['PAD'], padding='post', dtype='long', truncating='post')

In [None]:
type(tags)

In [None]:
attention_mask = [[float(i != 0.0) for i in ii] for ii in input_ids]

### Prepare training and testing data

Split data and attention mask.

In [None]:
X_train, X_test, y_train, y_test = train_test_split(input_ids, tags, random_state=42, test_size=0.1)
tr_mask, val_mask, _, _ = train_test_split(attention_mask, input_ids, random_state=42, test_size=0.1)

In [None]:
y_train = torch.cuda.LongTensor(y_train)


In [None]:
X_train = torch.tensor(X_train)
X_test  = torch.tensor(X_test)
y_train = y_train
y_test= torch.tensor(y_test)
tr_mask, val_mask = torch.tensor(tr_mask), torch.tensor(val_mask)

Create data-loaders.

In [None]:
train_data = TensorDataset(X_train, tr_mask, y_train)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=BATCH_SIZE)

valid_data = TensorDataset(X_test, val_mask, y_test)
valid_sampler = SequentialSampler(valid_data)
valid_dataloader = DataLoader(valid_data, sampler=valid_sampler,batch_size=BATCH_SIZE)

### Pull and fine-tune **`bert-base-german-cased`** model

In [None]:
model = BertForTokenClassification.from_pretrained('bert-base-german-cased', num_labels=len(tag2idx), output_attentions=False, output_hidden_states=False)

In [None]:
model.cuda();

In [None]:
FULL_FINETUNING = True
if FULL_FINETUNING:
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.0}
    ]
else:
    param_optimizer = list(model.classifier.named_parameters)
    optimizer_grouped_parameters = [{'params': [p for n, p in param_optimizer]}]

In [None]:
optimizer = AdamW(optimizer_grouped_parameters, lr=2e-5, eps=1e-8)

### Training and evaluation

In [None]:
class EarlyStopping:
    def __init__(self, patience=3, verbose=False, delta=0):
        self.patience = patience
        self.verbose = verbose
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        self.val_loss_min = np.Inf
        self.delta = delta

    def __call__(self, val_loss, model):
        score = -val_loss

        if self.best_score is None:
            self.best_score = score
            self.save_checkpoint(val_loss, model)
        elif score < self.best_score + self.delta:
            self.counter += 1
            if self.verbose:
                print(f'EarlyStopping counter: {self.counter} out of {self.patience}')
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = score
            self.save_checkpoint(val_loss, model)
            self.counter = 0

    def save_checkpoint(self, val_loss, model):
        if self.verbose:
            print(f'Validation loss decreased ({self.val_loss_min:.6f} --> {val_loss:.6f}). Saving model ...')
        torch.save(model.state_dict(), 'checkpoint.pt')
        self.val_loss_min = val_loss


In [None]:
EPOCHS = 15
MAX_GRAD_NORM = 1.0

total_steps = len(train_dataloader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Initialize EarlyStopping
early_stopping = EarlyStopping(patience=5, verbose=True)

In [None]:
%%time
loss_values, validation_loss_values = [], []
# Initialize lists to store per-epoch loss and accuracy
train_losses, val_losses = [], []
train_accuracies, val_accuracies = [], []

for e in range(EPOCHS):
    print(f'- Epoch 0{e+1} -')
    model.train()
    total_loss = 0

    for step, batch in enumerate(train_dataloader):
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        model.zero_grad()
        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
        loss = outputs[0]
        loss.backward()
        total_loss += loss.item()
        torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=MAX_GRAD_NORM)
        optimizer.step()
        scheduler.step()

    avg_train_loss = total_loss / len(train_dataloader)
    print('Average train loss:\t{:.5f}'.format(avg_train_loss))
    loss_values.append(avg_train_loss)
    train_losses.append(avg_train_loss)


    model.eval()
    eval_loss, eval_accuracy = 0, 0
    predictions, true_labels = [], []

    for batch in valid_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch

        with torch.no_grad():
            outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)

        logits = outputs[1].detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        eval_loss += outputs[0].mean().item()
        predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
        true_labels.extend(label_ids)

    eval_loss = eval_loss / len(valid_dataloader)
    validation_loss_values.append(eval_loss)
    print('Validation loss:\t{:.5f}'.format(eval_loss))
    val_accuracies.append(eval_accuracy)

    print(f'Epoch {e+1}: Validation Loss: {eval_loss:.5f}, Accuracy: {eval_accuracy:.5f}')

    # # Early Stopping call
    # early_stopping(eval_loss, model)
    # if early_stopping.early_stop:
    #   print("Early stopping")
    #   break

    pred_tags = [tag_values[p_i] for p, l in zip(predictions, true_labels) for p_i, l_i in zip(p, l) if tag_values[l_i] != 'PAD']
    valid_tags = [tag_values[l_i] for l in true_labels for l_i in l if tag_values[l_i] != 'PAD']

    print('Validation accuracy:\t{:.5f}'.format(accuracy_score(pred_tags, valid_tags)))
    print('Validation precision:\t{:.5f}'.format(precision_score(pred_tags, valid_tags, average='micro')))
    print('Validation recall:\t{:.5f}'.format(recall_score(pred_tags, valid_tags, average='micro')))
    print('Validation f1-score:\t{:.5f}\n'.format(f1_score(pred_tags, valid_tags, average='micro')))

Calculate confusion matrix to identify **TP**, **TN**, **FP**, and **FN**. This is required to calculate **Micro- precision**, **recall**, and **F1-Score**.

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report


In [None]:
tags = list(set(valid_tags))

In [None]:
# Print classification report
print(classification_report(valid_tags, pred_tags))

In [None]:
true_labels = valid_tags
predicted_labels = pred_tags


# Calculate confusion matrix
conf_matrix = confusion_matrix(true_labels, predicted_labels, labels=tag_values)

# Convert the confusion matrix to a DataFrame for better visualization
conf_matrix_df = pd.DataFrame(conf_matrix, index=tag_values, columns=tag_values)

# Plot the confusion matrix
plt.figure(figsize=(12, 12))
sns.heatmap(conf_matrix_df, annot=True, fmt='g')
plt.title('Confusion Matrix for NER Entities')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.show()

Finally, save our model for later use.

In [None]:
torch.save(model.state_dict(), "model.pt")

In [120]:
import torch
from transformers import BertTokenizer, BertForTokenClassification
from keras.preprocessing.sequence import pad_sequences
import pickle

# Load the tokenizer and the model
model_path = "/content/model.pt"
tokenizer_path = "/content/tokenizer.pkl"
tag_values_path = "/content/tag_values.pkl"

with open(tokenizer_path, 'rb') as f:
    tokenizer = pickle.load(f)

model = BertForTokenClassification.from_pretrained(
    'bert-base-german-cased',
    num_labels=len(tag2idx),  # tag2idx should be known or loaded as well
    output_attentions=False,
    output_hidden_states=False
)
ner = model.load_state_dict(torch.load(model_path, map_location=torch.device('cpu')))
model.load_state_dict(torch.load(model_path, map_location=torch.device('cpu')))

model.eval()

with open(tag_values_path, 'rb') as f:
    tag_values = pickle.load(f)
tag2idx = {t: i for i, t in enumerate(tag_values)}

# Prepare the text for entity prediction
text = "A1 zwischen AS Munsbach AS Flaxweiler Verkehrsbehinderung 02.12.2015 ACL_A1."
tokenized_sentence = tokenizer.encode(text)
input_ids = pad_sequences([tokenized_sentence], maxlen=100, dtype="long", value=0.0,
                          truncating="post", padding="post")

# Create attention masks
attention_masks = [[float(i != 0.0) for i in ii] for ii in input_ids]

# Convert to tensors
input_ids = torch.tensor(input_ids)
attention_masks = torch.tensor(attention_masks)

# Predict entities
with torch.no_grad():
    outputs = model(input_ids, token_type_ids=None, attention_mask=attention_masks)
    logits = outputs[0]

# Move logits and labels to CPU
logits = logits.detach().cpu().numpy()

# Find predicted tags
predictions = [list(p) for p in np.argmax(logits, axis=2)]

# Convert predictions to tag names
predicted_tags = [tag_values[p_i] for p in predictions for p_i in p if tag_values[p_i] != "PAD"]

# Tokenize the text into words (tokens)
tokens = tokenizer.convert_ids_to_tokens(input_ids.to('cpu').numpy()[0])

# Align the tokens with their predicted tags
final_predictions = []
for token, label in zip(tokens, predicted_tags):
    if token.startswith("##"):
        # Combine subword tokens
        final_predictions[-1] = (final_predictions[-1][0] + token[2:], final_predictions[-1][1])
    else:
        final_predictions.append((token, label))

# Print the tokens with their labels
for token, label in final_predictions:
    print(f"{token}: {label}")


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[CLS]: LOCATION_ROUTE
A1: LOCATION_STREET
zwischen: LOCATION_ROUTE
AS: LOCATION_ROUTE
Munsbach: LOCATION_CITY
AS: LOCATION_STOP
Flaxweiler: LOCATION_CITY
Verkehrsbehinderung: TRIGGER
02: DATE
.: DATE
12: DATE
.: DATE
2015: DATE
ACL: LOCATION_ROUTE
_: LOCATION_ROUTE
A1: LOCATION_STREET
