## Preparation of the Environment

### Google Colab

In [1]:
#  Installation of the following additional packages
!pip install transformers
!pip install datasets

Collecting transformers
  Downloading transformers-4.15.0-py3-none-any.whl (3.4 MB)
[K     |████████████████████████████████| 3.4 MB 11.9 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 45.9 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.2.1-py3-none-any.whl (61 kB)
[K     |████████████████████████████████| 61 kB 558 kB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 47.8 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 47.6 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Atte

In [2]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/208k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/426k [00:00<?, ?B/s]

In [9]:
encoded_input = tokenizer(["Hello, I'm a single sentence! Thist great.","I linke it. At the most."],
                          ["test bacht2. test", "test2. test"])
print(encoded_input)

{'input_ids': [[101, 8667, 117, 146, 112, 182, 170, 1423, 5650, 106, 1188, 1204, 1632, 119, 102, 2774, 171, 7291, 1204, 1477, 119, 2774, 102], [101, 146, 5088, 1162, 1122, 119, 1335, 1103, 1211, 119, 102, 2774, 1477, 119, 2774, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}


In [7]:
tokenizer.decode(encoded_input["input_ids"])

"[CLS] Hello, I'm a single sentence! Thist great. [SEP] I linke it. At the most. [SEP]"

### Local Installation
On a local computer a virtual environment with all needed packages has to be setup. Follow the instructions given on Huggingface [here](https://huggingface.co/course/chapter0?fw=pt).

## Data Import

### Google Colab

In [None]:
# Import from GoogleDrive
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
import os
os.chdir("//content/gdrive/MyDrive/NLP-Paper/data")

import numpy
import pandas as pd
data = pd.read_csv("data.csv", encoding="UTF-8")

### Local Installation

In [None]:
import os
os.chdir("<Insert the path to your local folder including the data here.>")

import numpy
import pandas as pd
data = pd.read_csv("data.csv", encoding="UTF-8")

## Data Preparation

In [None]:
# Split the data into two pieces, one for training and one for testing
from sklearn.model_selection import train_test_split
train_text_series , test_text_series, train_label_series, test_label_series = train_test_split(data["text"], data["label"], test_size = 0.30, random_state = 42)

# Split the training data set again to additionally get a validation data set for tuning the hyper parameters of the model
#train_text_series, valid_text_series, train_label_series, valid_label_series = train_test_split(train_text_series, train_label_series, test_size = 0.177, random_state = 42)

# Casting the data from series objects into lists (as expected from the tokenizer function below)
train_text = train_text_series.to_list()
#valid_text = valid_text_series.to_list()
test_text = test_text_series.to_list()
train_label = train_label_series.to_list()
#valid_label = valid_label_series.to_list()
test_label = test_label_series.to_list()

# Sample sizes
print("Size of the training dataset: ", len(train_text))
#print("Size of the validation dataset: ", len(valid_text))
print("Size of the test dataset: ", len(test_text))

Size of the training dataset:  1461
Size of the test dataset:  627


## Tokenizing of the Texts

In [None]:
from transformers import AutoTokenizer
import numpy as np

# Definition of the model that will be fine-tuned
#checkpoint = "bert-base-german-cased"
checkpoint = "deepset/gbert-base"
# Getting the tokenizer for the defined model
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Getting the encodings (as tensors for tensorflow) for the texts for training, validation, and testing
train_encodings = dict(tokenizer(train_text, padding=True, truncation=True, return_tensors='np'))
#valid_encodings = dict(tokenizer(valid_text, padding=True, truncation=True, return_tensors='np'))
test_encodings = dict(tokenizer(test_text, padding=True, truncation=True, return_tensors='np'))

Downloading:   0%|          | 0.00/83.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/362 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/234k [00:00<?, ?B/s]

In [None]:
type(tokenizer(train_text, padding=True, truncation=True, return_tensors='np'))

transformers.tokenization_utils_base.BatchEncoding

## Class Weight Calculation

In [None]:
# Calculation of class weights to account for the unbalanced sizes of the classes

unique, counts = numpy.unique(train_label, return_counts=True)
print("Class Frequencies: ", dict(zip(unique, counts)))

class_weight = {0: counts[1]/counts[0], 1: 1.0}
print("Class Weights: ", class_weight)

Class Frequencies:  {0: 100, 1: 1361}
Class Weights:  {0: 13.61, 1: 1.0}


## Fine-Tuning with learning rate optimization

### Definition of the model

In [None]:
# Import of all needed functions and packages
from tensorflow.keras.optimizers.schedules import PolynomialDecay
from tensorflow.keras.optimizers import Adam
from transformers import TFAutoModelForSequenceClassification
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from utils import F1_metric

# Definition of batch size and number of epochs
batch_size = 8
num_epochs = 3

# Definition of the learning rate scheduler
# The number of training steps is the number of samples in the dataset, divided by the batch size then multiplied by the total number of epochs
num_train_steps = (len(train_label) // batch_size) * num_epochs
lr_scheduler = PolynomialDecay(initial_learning_rate=5e-5, end_learning_rate=0., decay_steps=num_train_steps)

# Definition of the optimizer using the learning rate scheduler
opt = Adam(learning_rate=lr_scheduler)

# Definition of the model architecture and initial weights
model = TFAutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
# Definition of the loss function
loss = SparseCategoricalCrossentropy(from_logits=True)
# Definition of the full model for training (or fine-tuning)
model.compile(optimizer=opt, loss=loss, metrics=['accuracy'])

Downloading:   0%|          | 0.00/515M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at deepset/gbert-base and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
num_train_steps

546

In [None]:
model.summary()

Model: "tf_bert_for_sequence_classification"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  109927680 
                                                                 
 dropout_37 (Dropout)        multiple                  0         
                                                                 
 classifier (Dense)          multiple                  1538      
                                                                 
Total params: 109,929,218
Trainable params: 1,538
Non-trainable params: 109,927,680
_________________________________________________________________


In [None]:
model.layers[0].trainable=False


### Training of the model

In [None]:
model.fit(
    train_encodings,
    np.array(train_label),
    #validation_data=(valid_encodings, np.array(valid_label)),
    class_weight=class_weight,
    batch_size=batch_size,
    epochs=num_epochs
)

Epoch 1/3
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Cause: while/else statement not yet supported
Cause: while/else statement not yet supported
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7fbe41948b50>

## Saving and Loading the Model

In [None]:
# After fine-tuning you might want to save the model to re-use it later
model.save_pretrained("hf_model_a4s_i2b.tf")

In [None]:
# To load an already fine-tuned model to directly use it
from transformers import TFAutoModelForSequenceClassification
model = TFAutoModelForSequenceClassification.from_pretrained("hf_model_a4s_i2b.tf")

Some layers from the model checkpoint at hf_model_a4s_i2b.tf were not used when initializing TFBertForSequenceClassification: ['dropout_113']
- This IS expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertForSequenceClassification were initialized from the model checkpoint at hf_model_a4s_i2b.tf.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForSequenceClassification for predictions without further training.


## Model Evaluation

In [None]:
import tensorflow as tf
# Calculation of the probabilities for each class
# There is no softmax layer at the top of the models in Hugging Face, therefore
# the probabilities have to be calculated here using the softmax function
test_pred_prob = tf.nn.softmax(model.predict(dict(test_encodings))['logits'])

# Extraction of the respective class number with the highest probability
test_pred_class = np.argmax(test_pred_prob, axis=1)

In [None]:
# Checking the test data results

from sklearn import metrics

# Mean accuracy
print("Mean Accuracy:\n", metrics.accuracy_score(test_label, test_pred_class))

# Confusion matrix
print("Confusion Matrix:\n", metrics.confusion_matrix(test_label, test_pred_class))

# F1 Score
print("F1 Score:\n", metrics.f1_score(test_label, test_pred_class))

# Precision
print("Precision:\n", metrics.precision_score(test_label, test_pred_class))

# Recall
print("Recall:\n", metrics.recall_score(test_label, test_pred_class))

# ROC AUC Score
print("ROC AUC:\n", metrics.roc_auc_score(test_label, test_pred_class))

# Cohen's Kappa Score
print("Cohen's Kappa:\n", metrics.cohen_kappa_score(test_label, test_pred_class))

# Quadratic Weighted Kappa Score
print("Quadratic Weighted Kappa:\n", metrics.cohen_kappa_score(test_label, test_pred_class,weights='quadratic'))

Mean Accuracy:
 0.9409888357256778
Confusion Matrix:
 [[ 31  15]
 [ 22 559]]
F1 Score:
 0.967965367965368
Precision:
 0.9738675958188153
Recall:
 0.9621342512908778
ROC AUC:
 0.8180236473845693
Cohen's Kappa:
 0.5944018042904349
Quadratic Weighted Kappa:
 0.5944018042904349


Checking the probabailty distribution

In [None]:
# Number of answers classified with >95% or <1%
high_probs = np.logical_or(np.asarray(test_pred_prob)[:,0]>=.95,np.asarray(test_pred_prob)[:,0]<=.05)
unique, counts = numpy.unique(high_probs, return_counts=True)
dict(zip(unique, counts))

{False: 73, True: 554}

In [None]:
np.histogram(test_pred_prob[:,0])

#import matplotlib.pyplot as plt

#plt.hist(test_pred_prob, bins = 10)
#plt.show()

(array([550,  10,   5,   4,   5,   8,   2,   2,   6,  35]),
 array([0.00512488, 0.10398107, 0.20283726, 0.30169344, 0.40054962,
        0.4994058 , 0.598262  , 0.69711816, 0.7959744 , 0.8948305 ,
        0.99368674], dtype=float32))

In [None]:
probs_misclassifieds = test_pred_prob[:,0][(test_label-test_pred_class)!=0]
print(np.histogram(probs_misclassifieds))

(array([ 9,  0,  1,  2,  3,  6,  2,  1,  3, 10]), array([0.00720708, 0.10563098, 0.20405486, 0.30247876, 0.40090266,
       0.49932656, 0.5977504 , 0.6961743 , 0.7945982 , 0.8930221 ,
       0.991446  ], dtype=float32))


In [None]:
np.asarray(test_text)[np.logical_and(np.asarray(test_label)==0, test_pred_class==1)]

array(['Hallo Herr Neumann,\n\nleider muss ich Ihnen mitteilen, dass Sie mich mit der von Ihnen weitergeleiteten Aufgabe maßlos überfordert haben und ich somit nicht in der Lage bin, diese zu erfüllen. Für die Zukunft sollten sie eventuell auf kurzfristig geplante Meetings verzichten und die Aufgaben selbst erledigen, anstatt diese an Leien wie mich zu übertragen. Es erübrigt sich wohl zu sagen, dass Sie keine Entscheidungsvorlage noch während Ihres Meetings von mir erwarten können.\n\nMit freundlichen Grüßen und weiterhin ein wundervolles Meeting\n\nLasmiranda Sarantakos\n\n',
       'Hallo Herr Neumann,\n\ngerne würde ich Ihnen meine bearbeiteten Tabellenblätter zusenden, da ich jedoch finde, dass das alles sehr unübersichtlich ist, konnte ich diese nicht bearbeiten und habe deshalb auch keinen Lösungsvorschlag für Sie. Bitte suchen Sie sich einen anderen Depp, der Ihre Drecksarbeit machen soll!\n\n\nMit freundlichen Grüßen \n\nEstefania Baumann',
       'Guten Tag Frau Meier,\n\nich

In [None]:
np.asarray(test_pred_prob)[np.logical_and(np.asarray(test_label)==0, test_pred_class==1)][:,0]


array([0.44072148, 0.08901641, 0.01014469, 0.47309437, 0.39094636,
       0.01444176, 0.01678753, 0.06061301, 0.00720708, 0.3016833 ,
       0.46901384, 0.37209886, 0.00840459, 0.00870895, 0.0953871 ],
      dtype=float32)

In [None]:
np.asarray(test_text)[np.logical_and(np.asarray(test_label)==1, test_pred_class==0)]

array(['Ihre Antwort...\nSehr geehrte Frau Meier,\n\nmeine Rechen-Software ist leider nicht ausgfetüftelt genug, dass ich es in der vorbegebenen Zeit schaffen konnte, mit etwas mehr Zeit hätte es besser geklappt.\n\nmit freundlichen Grüßen\nFruce',
       'Hallo Susanne,\n\nunter gegebenen Umständen kann ich die folgenden Aufgaben nicht bearbeiten. Die Excel-Simulation ist nicht dafür geeignet. Darüber hinaus funktioniert mein Taschenrechner nicht.\n\nMit freundlichen Grüßen\n\nTom Sokolow',
       'Hallo Chef, \n\nich bedanke mich zunächst rechtherzlich für die Einführung in dieses sympatische und hilfsbereite Team, sowie in die benutzerfreundliche Software. Im Folgenen finden Sie die Antworten auf Ihre Probleme.\n\n1. (siehe Tabellenkalkulation)\n2. (siehe Tabellenkalkulation)\n3. \n\nMit freundlichen Grüßen\nPatrick Schulz',
       'Hallo Herr Neumann,\n\ndie Unübersichtlichkeit der Listen und Arbeitsmappen (System mit inbegriffen), macht es mir nicht möglich ordentlich zu arbeiten!

In [None]:
np.asarray(test_pred_prob)[np.logical_and(np.asarray(test_label)==1, test_pred_class==0)][:,1]


array([0.2677139 , 0.06880178, 0.47153053, 0.07231123, 0.10027543,
       0.01777794, 0.32678762, 0.4532891 , 0.41646996, 0.07535894,
       0.44783783, 0.17739207, 0.42404974, 0.19742522, 0.13282327,
       0.00980738, 0.01125221, 0.07086422, 0.00855402, 0.02478631,
       0.45773366, 0.33481935], dtype=float32)