<a href="https://colab.research.google.com/github/ReAlex1902/Innoscripta_task/blob/main/Analysis/test_innoscripta.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install necessary libraries

In [78]:
!pip install transformers==4.9.1
!pip install -U spacy==3.1.0
!pip install seqeval==1.2.2
!python -m spacy download de_core_news_lg

2021-08-02 17:56:32.092528: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0
Collecting de-core-news-lg==3.1.0
  Downloading https://github.com/explosion/spacy-models/releases/download/de_core_news_lg-3.1.0/de_core_news_lg-3.1.0-py3-none-any.whl (571.2 MB)
[K     |████████████████████████████████| 571.2 MB 11 kB/s 
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('de_core_news_lg')


# Import necessary libraries

In [None]:
import numpy as np
import pandas as pd
import json
from tqdm import trange

import torch
import torch.nn.functional as F
from torch.optim import Adam
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

from transformers import BertTokenizer, BertForTokenClassification, AdamW

from seqeval.metrics import classification_report, accuracy_score, f1_score

import spacy
from spacy.training import offsets_to_biluo_tags
nlp = spacy.load("de_core_news_lg")

In [None]:
# Adding '\n' to the default spacy tokenizer

prefixes = ['\\n', ] + nlp.Defaults.prefixes
prefix_regex = spacy.util.compile_prefix_regex(prefixes)
nlp.tokenizer.prefix_search = prefix_regex.search

# Data preprocessing

In [None]:
PATH = '/content/JSON.json'
df = pd.read_json(PATH)
df.head()

Unnamed: 0,text,labels
0,Öffentliche Bekanntmachung AUREG\n\n\n\nAmtsge...,"[[103, 113, PUBDATE], [349, 359, STATUS], [360..."
1,Öffentliche Bekanntmachung RegisSTAR\n\n\n\nAm...,"[[104, 114, PUBDATE], [343, 379, STATUS], [380..."
2,Öffentliche Bekanntmachung RegisSTAR\n\n\n\nAm...,"[[108, 118, PUBDATE], [340, 347, POSITION], [3..."
3,Öffentliche Bekanntmachung RegisSTAR\n\n\n\nAm...,"[[110, 120, PUBDATE], [342, 349, POSITION], [3..."
4,Öffentliche Bekanntmachung RegisSTAR\n\n\n\nAm...,"[[111, 121, PUBDATE], [316, 323, POSITION], [3..."


In [79]:
def get_sents_and_tags(df):
    '''
    The function returns sentences with tags for each token

    in: df, pd.DataFrame - pandas datasframe with texts and labels
    out: sentences, list - tokenized sentences
         tags, list - tags for each token
    '''
    tags = []
    sentences = []

    for i in range(df.shape[0]):
        text = df['text'][i]
        entities = df['labels'][i]
    
        doc = nlp(text)
    
        tag = offsets_to_biluo_tags(doc, entities)
        tmp = pd.DataFrame([list(doc), tag]).T
 
        ## Look for text borders
        sent_borders = []
        for i in range(tmp.shape[0]):
            if tmp[0][i].text is '.' and tmp[1][i] is 'O':
                sent_borders.append(i)
        sent_borders.append(len(doc))

        ## DIvide text to sentences
        last_border = 0
        data = []
        for current_border in sent_borders:
            data.append([list(doc)[last_border:current_border], tag[last_border:current_border]])
            last_border = current_border
    
        ## Change '-' to 'O'
        for d in data:
            tag = ['O' if t is '-' else t for t in d[1]]
            if len(set(tag)) > 1:
                sentences.append(d[0])
                tags.append(tag)
    
    return sentences, tags

In [80]:
sentences, tags = get_sents_and_tags(df)
len(sentences), len(tags)

(443, 443)

In [None]:
# tag_vals = set(['X', '[CLS]', '[SEP]'])
# for i in range(len(tags)):
#     tag_vals = tag_vals.union(tags[i])

# tag2idx = {t: i for i, t in enumerate(tag_vals)}
# idx2tag = {tag2idx[key] : key for key in tag2idx.keys()}

# with open('idx2tag.json', 'w') as fp:
#     json.dump(idx2tag, fp)

In [None]:
with open('idx2tag.json') as json_file:
    idx2tag_str = json.load(json_file)

tag2idx = {idx2tag_str[key]: int(key) for key in idx2tag_str.keys()}
idx2tag = {int(key): idx2tag_str[key] for key in idx2tag_str.keys()}

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-german-cased', do_lower_case = False)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=254728.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=29.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=485115.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




In [84]:
def get_tokenized_data(sentences, tags):
    '''
    Tokenize data given by get_sents_and_tags function

    in: sentences, list - list of sentences
        tags, list - list of tags
    out: tokenized_texts, list - tokenized texts
         word_piece_labels, list - labels for each token
    '''

    tokenized_texts = []
    word_piece_labels = []

    for word_list, label in zip(sentences, tags):
    
        # Add [CLS] at the front
        labels = ['[CLS]']
        tokens = ['[CLS]']
    
        ## Токенайзер ставит лейбл только первому кусочку, остальным Х
        ## Зачем???
        for word, lab in zip(word_list, label):
            token_list = tokenizer.tokenize(word.text)
            for m, token in enumerate(token_list):
                tokens.append(token)
                # if m == 0:
                labels.append(lab)
                # else:
                    # labels.append('X')  
                
        # Add [SEP] at the end
        labels.append('[SEP]')
        tokens.append('[SEP]')
    
        tokenized_texts.append(tokens)
        word_piece_labels.append(labels)
    
    return tokenized_texts, word_piece_labels

In [None]:
tokenized_texts, word_piece_labels = get_tokenized_data(sentences, tags)

In [None]:
print(tokenized_texts[1])
print(word_piece_labels[1])

['[CLS]', '.', 'Nicht', 'mehr', 'Pro', '##kur', '##ist', ':', '1', '.', 'Gust', '##af', '##sson', ',', 'Christine', '[SEP]']
['[CLS]', 'O', 'B-STATUS', 'L-STATUS', 'U-POSITION', 'U-POSITION', 'U-POSITION', 'O', 'O', 'O', 'U-SURNAME', 'U-SURNAME', 'U-SURNAME', 'O', 'U-NAME', '[SEP]']


In [None]:
class Config:
    '''
    Configuration class
    '''

    max_len = 512
    batch_size = 4

In [None]:
texts = pad_sequences([tokenizer.convert_tokens_to_ids(text) for text in tokenized_texts],
                          maxlen = Config.max_len, dtype = "long", truncating = "post", padding = "post")
print(len(texts[0]))
print(texts[0])

512
[    3  4624   322 19118    32 14960  4980  7201  6517  5816  4000 26964
    84 10220   810  6822 26961    84 26925  5287  2119   235 26964 10346
 26914  4163 26914  2216   148 26964 15644  1971   173 26954 26955  6117
 26897  2428    21 25666    42    91  4182 24392  6660   935 16395 26964
  8922  9737 26914  4163 26914  2216  4960    21  3770 26964    84 10220
   810  6822 26961    84 26925 26964  5169  3081     4     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     

In [None]:
tags = pad_sequences([[tag2idx.get(l) for l in label] for label in word_piece_labels], maxlen=Config.max_len, value = tag2idx["O"], 
                     padding = "post", dtype = "long", truncating = "post")
print(len(tags[0]))
print(tags[0])

512
[ 7  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1 10
 10 10 10 10  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1 30  1  1  1
  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  

In [None]:
attention_masks = [[float(sent > 0) for sent in text] for text in texts]
print(attention_masks[0])

[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,

# Creating TensorDatasets and DataLoaders

In [None]:
train_texts, val_texts, train_tags, val_tags, train_masks, val_masks = \
    train_test_split(texts, tags, attention_masks, random_state = 11, test_size = 0.3)

In [None]:
train_texts = torch.tensor(train_texts)
val_texts = torch.tensor(val_texts)
train_tags = torch.tensor(train_tags)
val_tags = torch.tensor(val_tags)
train_masks = torch.tensor(train_masks)
val_masks = torch.tensor(val_masks)

In [None]:
train_data = TensorDataset(train_texts, train_masks, train_tags)
train_sampler = RandomSampler(train_data)
train_loader = DataLoader(train_data, sampler = train_sampler, batch_size = Config.batch_size)

val_data = TensorDataset(val_texts, val_masks, val_tags)
val_sampler = SequentialSampler(val_data)
val_loader = DataLoader(val_data, sampler = val_sampler, batch_size = Config.batch_size)

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

In [81]:
model = BertForTokenClassification.from_pretrained("bert-base-german-cased", num_labels = len(tag2idx))
model.to(device)

PATH_TO_MODEL = '/content/drive/MyDrive/torch_models/HAWK_3.0.pth'
model.load_state_dict(torch.load(PATH_TO_MODEL, map_location = device))

Some weights of the model checkpoint at bert-base-german-cased were not used when initializing BertForTokenClassification: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-b

<All keys matched successfully>

In [None]:
import random

seed_val = 11

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

def train_model(model, train_loader, val_loader, optimizer, num_epochs, scheduler = None):
    '''
    Training function.

    in: model - Bert model to train
        train_loader - train DataLoader
        val_loader - validation DataLoader
        optimizer - AdamW optimizer
        num_epochs - number of epochs
        scheduler - scheduler to apply while training
    out: train_loss_history, list - loss history on training dataset
    '''
    train_loss_history = []

    for epoch in trange(num_epochs, desc = 'EPOCHS'):
        model.train() # Enter train mode
        
        train_loss_accum = 0

        for index, (sentence, attention_mask, label) in enumerate(train_loader):
            model.zero_grad()

            sentence = sentence.to(device)
            attention_mask = attention_mask.to(device)
            label = label.to(device)

            output = model(sentence, token_type_ids = None, attention_mask = attention_mask, labels = label)
            loss_value, logits = output[0], output[1]
            train_loss_accum += loss_value.item()

            loss_value.backward()
            optimizer.step()
            if scheduler:
                scheduler.step()
            
        avg_loss = train_loss_accum / index
        train_loss_history.append(avg_loss)

        print(f'Train_loss: {avg_loss}')
        if avg_loss < 0.001:
            return train_loss_history

    return train_loss_history

def eval_model(model, loader):
    '''
    Evaluating function. 

    in: model - trained model to evaluate
        loader - validation loader to use for evaluating model
    out: accuracy, int
         F1 score, int
         classification report, str
    '''
    model.eval()

    y_true = []
    y_pred = []
    eval_loss, eval_accuracy = 0, 0
    num_eval_steps, num_eval_examples = 0, 0

    for sentence, attention_mask, label in loader:
        sentence = sentence.to(device)
        attention_mask = attention_mask.to(device)
        label = label.to(device)

        with torch.no_grad():
            logits = model(sentence, token_type_ids = None, attention_mask = attention_mask)[0]

        logits = logits.detach().cpu().numpy()
        logits = [list(p) for p in np.argmax(logits, axis = 2)]
        
        label = label.to('cpu').numpy()
        attention_mask = attention_mask.to('cpu').numpy()
        
        for i, mask in enumerate(attention_mask):
            ground_truth = []
            prediction = []
            
            for j, mark in enumerate(mask):
                # mark = 0, meaning its a pad word, dont compare
                if mark:
                    if idx2tag[label[i][j]] != "X" and idx2tag[label[i][j]] != "[CLS]" and idx2tag[label[i][j]] != "[SEP]" : # Exclude the X label
                        ground_truth.append(idx2tag[label[i][j]])
                        prediction.append(idx2tag[logits[i][j]])
                else:
                    break
            
            y_true.append(ground_truth)
            y_pred.append(prediction)

    return accuracy_score(y_true, y_pred), f1_score(y_true, y_pred), classification_report(y_true, y_pred)

In [None]:
EPOCHS = 5
optimizer = AdamW(model.parameters(), lr = 1e-5, eps = 1e-8)

train_loss_history = train_model(model, train_loader, val_loader, optimizer, EPOCHS)

EPOCHS:  20%|██        | 1/5 [30:53<2:03:34, 1853.69s/it]

Train_loss: 0.02483141229657287


EPOCHS:  40%|████      | 2/5 [1:02:32<1:33:22, 1867.34s/it]

Train_loss: 0.017115386783118153


EPOCHS:  60%|██████    | 3/5 [1:35:28<1:03:19, 1899.89s/it]

Train_loss: 0.014158982931720939


EPOCHS:  80%|████████  | 4/5 [2:08:18<32:00, 1920.73s/it]  

Train_loss: 0.008856059799678382


EPOCHS: 100%|██████████| 5/5 [2:42:08<00:00, 1945.77s/it]

Train_loss: 0.007932643854367705





In [None]:
EPOCHS = 5
optimizer = AdamW(model.parameters(), lr = 1e-5, eps = 1e-8)

train_loss_history = train_model(model, train_loader, val_loader, optimizer, EPOCHS)

EPOCHS:  20%|██        | 1/5 [32:21<2:09:25, 1941.37s/it]

Train_loss: 0.006330939796534004


EPOCHS:  40%|████      | 2/5 [1:05:20<1:37:37, 1952.59s/it]

Train_loss: 0.00451297173607269


EPOCHS:  60%|██████    | 3/5 [1:37:39<1:04:57, 1948.73s/it]

Train_loss: 0.0054425038764889765


EPOCHS:  80%|████████  | 4/5 [2:10:03<32:27, 1947.20s/it]  

Train_loss: 0.0035000090360113832


EPOCHS: 100%|██████████| 5/5 [2:42:26<00:00, 1949.21s/it]

Train_loss: 0.0017309317490208423





In [82]:
acc, f1, report = eval_model(model, val_loader)

In [None]:
# torch.save(model.state_dict(), '/content/innoscripta_bert_de_3.0.pth')

In [None]:
print(f'Accuracy = {acc}, F1 score = {f1}')
print()
print(report)

Accuracy = 0.9898530970770861, F1 score = 0.9604772557792692

              precision    recall  f1-score   support

    BIRTHDAY       0.94      1.00      0.97        33
        CITY       0.91      0.95      0.93       117
     COUNTRY       1.00      0.71      0.83         7
        NAME       0.96      0.97      0.96       117
        NOTE       1.00      1.00      1.00         9
    POSITION       0.97      0.98      0.97        92
     PUBDATE       0.98      1.00      0.99        65
      STATUS       0.97      0.99      0.98        92
     SURNAME       0.97      0.97      0.97       116
       TITLE       0.88      0.74      0.80        19

   micro avg       0.96      0.97      0.96       667
   macro avg       0.96      0.93      0.94       667
weighted avg       0.96      0.97      0.96       667



# Predict on random sentence

In [None]:
def predict(text, model = model, tokenizer = tokenizer):
    '''
    Function for token classification.

    in: text, str - text to use for token classification
        model, bert model - model to apply for a text
        tokenizer, bert tokenizer - tokenizer for sentence encoding
    '''
    sentence = tokenizer.encode(text, add_special_tokens = False)
    sentence = torch.tensor([sentence]).to(device)

    with torch.no_grad():
        logits = model(sentence)
    
    labels = np.argmax(logits[0].to('cpu').numpy(), axis = 2)

    tokens = tokenizer.convert_ids_to_tokens(sentence.to('cpu').numpy()[0])
    new_tokens, new_labels = [], []
    for token, label_idx in zip(tokens, labels[0]):
        if token.startswith("##"):
            new_tokens[-1] = new_tokens[-1] + token[2:]
        else:
            new_labels.append(idx2tag[label_idx][2:])
            new_tokens.append(token)

    for token, label in zip(new_tokens, new_labels):
        print("{}\t\t\t{}".format(label, token))

num = 40
predict(df.loc[num, 'text'])

			Öffentliche
			Bekanntmachung
			RegisSTAR
			Amtsgericht
			Zweibrücken
			Aktenzeichen
			:
			HRA
			1266
			Bekannt
			gemacht
			am
			:
PUBDATE			08
PUBDATE			.
PUBDATE			08
PUBDATE			.
PUBDATE			2005
			13
			:
			56
			Uhr
			Die
			in
			(
			)
			gesetzten
			Angaben
			der
			Geschäftsanschrift
			und
			des
			Unternehmensgegenstandes
			erfolgen
			ohne
			Gewähr
			.
			Veränderungen
			28
			.
			06
			.
			2005
			Malerbetrieb
			Helga
			Anderie
			,
			vormals
			Irmgard
			Hautz
			e
			.
			K
			.
			,
			Zweibrücken
			(
			Hofenfelsstraße
			57
			,
			66482
			Zweibrücken
			)
			.
			Malerbetrieb
			Helga
			Anderie
			Inh
			.
			Siegfried
			Wagner
			e
			.
			K
			.
			Der
			Inhaber
			/
			die
			Inhaberin
			handelt
			allein
			.
			Nicht
STATUS			mehr
POSITION			Inhaber
			:
SURNAME			Anderie
			,
NAME			Helga
			,
CITY			Zweibrücken
			,
			*
BIRTHDAY			04
BIRTHDAY			.
BIRTHDAY			01
BIRTHDAY			.
BIRTHDAY			1940
			;
SURNAME			Wagner
			,
NAME			Sieg