### German BERT model

This notebook focuses on training and testing the BERT model that was proposed in this paper. The model was implemented using TensorFlow and Hugging Face Transformers.

Please keep in mind that these notebooks are primarily used for conducting experiments, live coding, and implementing and evaluating the approaches presented in the thesis. As a result, the code in this notebook may not strictly adhere to best practice coding standards.

In [2]:
# ONLY IF USED ON LOCAL VIEW
# Safe to execute more than once: the working directory is only changed while
# the project's data folder is not visible from it.
import os


def enter_project_root(marker="Experiment", levels_up=2):
    """Move the working directory up to the project root, at most once.

    The data-loading cell reads ``./Experiment/...`` relative to the working
    directory. Climbing unconditionally would walk out of the project as soon
    as this cell is executed a second time, so the climb is skipped when the
    ``marker`` directory is already present.

    :param marker: Directory expected to exist in the project root.
    :param levels_up: How many parent directories to climb when ``marker`` is missing.
    :return: The working directory after the call.
    """
    if not os.path.isdir(marker):
        # Getting the parent directory (one level per iteration)
        for _ in range(levels_up):
            os.chdir(os.pardir)
    return os.getcwd()


# Assigned (rather than left as a bare expression) so the absolute path is not
# echoed into the cell output.
PROJECT_ROOT = enter_project_root()

In [3]:
import pandas as pd
from numpy import array, argmax
import numpy as np

import tensorflow as tf
from tensorflow.keras.utils import to_categorical

from sklearn.utils import shuffle
from transformers import AutoTokenizer, TFAutoModel

def import_test_train(local):
  """
  Load the test and train splits and return them as ``(df_test, df_train)``.

  :param local: If set to True, the CSV files are read from ``./Experiment/`` relative to the
  current working directory. If set to False, Google Drive is mounted (Colab only) and the files
  are read from ``/content/gdrive/MyDrive/Experiment/``.
  :return: Tuple ``(df_test, df_train)`` - note the order, the test set comes first.
  """

  assert isinstance(local, bool), f"Type is not valid. Expected boolean, recieved: {type(local)}"

  if local:
    data_dir = './Experiment'
  else:
    # Imported lazily: this module is only needed for the Drive-mount branch.
    from google.colab import drive
    drive.mount('/content/gdrive')
    data_dir = '/content/gdrive/MyDrive/Experiment'

  # Both branches read the same two files; only the folder differs.
  df_test = pd.read_csv(f'{data_dir}/testset_DE_Trigger.csv')
  df_train = pd.read_csv(f'{data_dir}/trainset_DE_Trigger.csv')

  return df_test, df_train

# importing test and trainset
# Local run: make sure to execute the notebook from the root directory of this project.
df_test, df_train = import_test_train(True)

# If you want to use it on Google Colab instead, comment out the line above and uncomment the following line:
# df_test, df_train = import_test_train(False)

In [4]:
# Tokenizer and encoder are loaded from the same Hugging Face checkpoint name.
BERT_CHECKPOINT = "dbmdz/bert-base-german-uncased"

tokenizer = AutoTokenizer.from_pretrained(BERT_CHECKPOINT)
bert = TFAutoModel.from_pretrained(BERT_CHECKPOINT)

Metal device set to: Apple M1 Max


Some layers from the model checkpoint at dbmdz/bert-base-german-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at dbmdz/bert-base-german-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [6]:
# max length of berttokenizer  is 512
max_length=100

# Preallocate one row per training example for token ids and attention masks.
# int32 instead of the float64 default of np.zeros: token ids and mask flags
# are integers, and the model's Input layers are declared with dtype int32.
Xids_train=np.zeros((df_train.shape[0],max_length),dtype='int32')
Xmask_train=np.zeros((df_train.shape[0],max_length),dtype='int32')
# One label id per training example (converted to one-hot later on).
y_train=np.zeros((df_train.shape[0],1))

# Same preallocation for the test set (no labels needed here).
Xids_test=np.zeros((df_test.shape[0],max_length),dtype='int32')
Xmask_test=np.zeros((df_test.shape[0],max_length),dtype='int32')

In [7]:
def encode_into(texts, ids_out, mask_out):
    """Tokenize ``texts`` and write one row per text into preallocated arrays.

    Every text is padded or truncated to ``max_length`` tokens, so each
    encoding fills exactly one row of ``ids_out`` / ``mask_out``.

    :param texts: Iterable of strings; its length must match the number of rows of both arrays.
    :param ids_out: 2-D array receiving the token ids (modified in place).
    :param mask_out: 2-D array receiving the attention mask (modified in place).
    """
    for row, text in enumerate(texts):
        tokens=tokenizer.encode_plus(text,max_length=max_length,padding='max_length',add_special_tokens=True,
                               truncation=True,return_token_type_ids=False,return_attention_mask=True,
                               return_tensors='tf')

        ids_out[row,:] = tokens['input_ids']
        mask_out[row,:] = tokens['attention_mask']


encode_into(df_train['content'], Xids_train, Xmask_train)

# Labels are taken positionally from the column. Looking them up with
# .loc[i, ...] only works while the frame carries a default 0..n-1 index.
# Building y_train from df_train (instead of from its own previous value)
# keeps this cell safe to re-run: it never one-hot encodes an array that is
# already one-hot.
y_train = to_categorical(df_train['label_id'].to_numpy())

encode_into(df_test['content'], Xids_test, Xmask_test)

In [8]:
dataset=tf.data.Dataset.from_tensor_slices((Xids_train,Xmask_train,y_train))

def map_func(input_ids,mask,labels):
    """Pack one training example as (feature dict keyed by input name, label)."""
    return {'input_ids':input_ids,'attention_mask':mask},labels

dataset=dataset.map(map_func)
# Shuffle ONCE with a frozen order. By default shuffle() draws a new order on
# every iteration, so take()/skip() below would cut a different train/val
# partition each epoch and validation examples would end up in training.
dataset=dataset.shuffle(100000,seed=42,reshuffle_each_iteration=False).batch(64)

# Number of batches, read from the dataset's metadata instead of iterating
# over (and materialising) every batch.
DS_size=int(dataset.cardinality().numpy())

# 90 % of the batches for training, the rest for validation.
train_batches=round(DS_size*0.90)
# Only the training part is reshuffled per epoch (batch order); the split
# itself stays fixed.
train=dataset.take(train_batches).shuffle(max(train_batches,1)).prefetch(1000)
val=dataset.skip(train_batches).prefetch(1000)

In [9]:
def map_func(input_ids,mask):
    """Wrap one unlabeled example in a feature dict keyed by the model's input names."""
    return dict(input_ids=input_ids,attention_mask=mask)

# The dataset has to be batched before it is passed to predict(); per the
# original author's note, the predictions are otherwise multiplied by the
# input shape.
dataset_test=(
    tf.data.Dataset.from_tensor_slices((Xids_test,Xmask_test))
    .map(map_func)
    .batch(64)
    .prefetch(1000)
)

In [10]:
def emotion_model(num_classes=5):
  """
  Build and compile the emotion classifier on top of the pretrained BERT encoder.

  Architecture: BERT last hidden state -> global max pooling over the token axis ->
  batch normalisation -> Dense(256, relu) -> Dropout(0.2) -> Dense(num_classes, softmax).
  The BERT encoder is frozen, so only the classification head is trained.

  :param num_classes: Size of the softmax output layer (defaults to the 5 emotion classes).
  :return: A compiled ``tf.keras.Model`` taking the inputs ``input_ids`` and ``attention_mask``.
  """
  input_ids=tf.keras.layers.Input(shape=(max_length,),name='input_ids',dtype='int32')
  input_mask=tf.keras.layers.Input(shape=(max_length,),name='attention_mask',dtype='int32')

  # Element [0] of the BERT output is the last hidden state (one vector per token).
  embedding=bert(input_ids,attention_mask=input_mask)[0]
  x=tf.keras.layers.GlobalMaxPool1D()(embedding)
  x=tf.keras.layers.BatchNormalization()(x)
  x=tf.keras.layers.Dense(256,activation='relu')(x)
  x=tf.keras.layers.Dropout(0.2)(x)
  output=tf.keras.layers.Dense(num_classes,activation='softmax')(x)

  model=tf.keras.Model(inputs=[input_ids,input_mask],outputs=output)

  # Freeze the encoder by reference instead of by its position in model.layers,
  # which silently targets another layer as soon as the graph changes.
  bert.trainable=False

  # The metric is named explicitly: an unnamed AUC() is auto-numbered
  # ('auc_1', 'auc_2', ...) when the model is built again in the same kernel,
  # and a callback monitoring 'val_auc' would then no longer find it.
  model.compile(loss=tf.keras.losses.CategoricalCrossentropy(),
              optimizer='adam',metrics=[tf.keras.metrics.AUC(name='auc')])

  return model

## Build the model

In [11]:
# Build the compiled classifier and print its layer table (shapes and parameter counts).
model = emotion_model()
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_ids (InputLayer)      [(None, 100)]                0         []                            
                                                                                                  
 attention_mask (InputLayer  [(None, 100)]                0         []                            
 )                                                                                                
                                                                                                  
 tf_bert_model (TFBertModel  TFBaseModelOutputWithPooli   1099276   ['input_ids[0][0]',           
 )                           ngAndCrossAttentions(last_   80         'attention_mask[0][0]']      
                             hidden_state=(None, 100, 7                                       

## Define early stopping and train the model

In [12]:
from tensorflow.keras.callbacks import EarlyStopping

# Stop training once the validation AUC has not improved by at least 0.001
# for two consecutive epochs.
es = EarlyStopping(
    monitor='val_auc',
    patience=2,
    min_delta=0.0010,
    mode='max',
    # Without this the model keeps the weights of the LAST epoch, i.e. the ones
    # from `patience` epochs after the best validation score.
    restore_best_weights=True
)

In [None]:
# Train for at most 30 epochs; the `es` callback may end the run earlier.
model.fit(
    x=train,
    validation_data=val,
    epochs=30,
    callbacks=[es],
)

# Evaluation

In [None]:
# One row of class probabilities per test example.
y_pred = model.predict(dataset_test)
# Predicted class id = column with the highest probability.
y_pred_new = y_pred.argmax(axis=1)
# Ground-truth class ids, in the same row order as the predictions.
y_true = df_test['label_id'].values

In [None]:
from sklearn import metrics
# Text report comparing y_true with y_pred_new over the complete test set.
print(metrics.classification_report(y_true, y_pred_new))

# Create evaluation for each subset

In [None]:
def create_dataset(df, batch_size=64):
  """
  Turn the ``content`` column of a DataFrame into a batched, unlabeled tf.data pipeline.

  Every text is tokenized and padded/truncated to ``max_length`` tokens. The resulting
  dataset yields feature dicts with the keys ``input_ids`` and ``attention_mask`` and can
  be passed directly to ``model.predict``.

  :param df: DataFrame with a ``content`` column holding the texts.
  :param batch_size: Number of examples per batch (default 64, as used for the full test set).
  :return: A batched ``tf.data.Dataset`` in the row order of ``df``.
  """
  n_rows=df.shape[0]
  # int32 instead of the float64 default: token ids and mask flags are integers
  # and the model's Input layers are declared with dtype int32.
  id_matrix=np.zeros((n_rows,max_length),dtype='int32')
  mask_matrix=np.zeros((n_rows,max_length),dtype='int32')

  # enumerate() gives positions, so this also works for filtered frames whose
  # index is no longer 0..n-1.
  for i,sequence in enumerate(df['content']):
    tokens=tokenizer.encode_plus(sequence,max_length=max_length,padding='max_length',add_special_tokens=True,
                           truncation=True,return_token_type_ids=False,return_attention_mask=True,
                           return_tensors='tf')

    id_matrix[i,:] = tokens['input_ids']
    mask_matrix[i,:] = tokens['attention_mask']

  def map_func(input_ids,mask):
      # Keys must match the names of the model's Input layers.
      return {'input_ids':input_ids,'attention_mask':mask}

  dataset_test=tf.data.Dataset.from_tensor_slices((id_matrix,mask_matrix))
  dataset_test=dataset_test.map(map_func)
  dataset_test=dataset_test.batch(batch_size).prefetch(1000)

  return dataset_test

In [None]:
def evaluate_source(source_name):
  """
  Evaluate the trained model on the test rows that come from one source corpus.

  :param source_name: Value of the ``source`` column to filter ``df_test`` by.
  :return: The classification report as text, or ``None`` when the test set has no rows
  for ``source_name``.
  """
  subset = df_test[df_test.source == source_name]
  if subset.empty:
    # Nothing to predict on; return instead of failing inside model.predict.
    return None

  y_true = subset['label_id'].values
  # The DataFrame and the tf.data pipeline get separate names instead of
  # rebinding one variable from one type to the other.
  subset_dataset = create_dataset(subset)

  y_pred = model.predict(subset_dataset)
  y_pred_new = np.argmax(y_pred,axis=1)

  return metrics.classification_report(y_true, y_pred_new)


# One report per source corpus contained in the test set.
for source_name in ("Tails", "DailyDialog", "GoEmotions", "Isear", "Emotion-stimulus"):
  report = evaluate_source(source_name)
  print(f"=== {source_name} ===")
  print(report if report is not None else "no test rows for this source")

# Test predictions

_______________________


In [None]:
def predict_label(sentence):
  """
  Predict the emotion distribution for a single sentence.

  :param sentence: Text to classify; it is padded/truncated to ``max_length`` tokens.
  :return: Dict mapping each emotion label to its predicted probability, rounded to
  4 decimals and stored as a plain Python float.
  :raises ValueError: If the model does not output exactly one probability per label.
  """
  # Order must correspond to the class ids the model was trained on.
  labels = ('anger', 'fear', 'joy', 'neutral', 'sadness')

  seq = tokenizer.encode_plus(sentence,max_length=max_length,padding='max_length',add_special_tokens=True,
                           truncation=True,return_token_type_ids=False,return_attention_mask=True,
                           return_tensors='tf')

  # Inputs are passed by name so the call does not depend on the order in
  # which the model's Input layers were declared.
  inputs = {'input_ids': seq['input_ids'], 'attention_mask': seq['attention_mask']}

  # predict() returns one row per input; there is exactly one sentence here.
  result = model.predict(inputs)[0]

  if len(result) != len(labels):
    raise ValueError(f"Model returned {len(result)} classes, expected {len(labels)}")

  # float() turns the NumPy scalars into plain Python floats, so the returned
  # dict prints cleanly and can be serialised (e.g. to JSON).
  dict_res = {label: round(float(prob), 4) for label, prob in zip(labels, result)}

  return dict_res


In [None]:
# Smoke test: plainly positive sentence ("That made me happy").
predict_label('Das hat mich gefreut')



{'anger': 0.0024,
 'fear': 0.0005,
 'joy': 0.9183,
 'neutral': 0.0019,
 'sadness': 0.0768}

In [None]:
# Smoke test: negated variant ("That did not make me happy.").
predict_label('Das hat mich nicht gefreut.')



{'anger': 0.0971,
 'fear': 0.0058,
 'joy': 0.1583,
 'neutral': 0.0027,
 'sadness': 0.7361}

In [None]:
# Smoke test: headline-style input ("Tragic death in the city centre").
predict_label('Tragischer Tod in der Innenstadt')



{'anger': 0.0031,
 'fear': 0.063,
 'joy': 0.0053,
 'neutral': 0.1133,
 'sadness': 0.8153}

In [None]:
# Smoke test: all-caps headline ("Germans in Afghanistan fear for their lives").
predict_label('DEUTSCHE BANGEN IN AFGHANISTAN UM IHR LEBEN')



{'anger': 0.0562,
 'fear': 0.2647,
 'joy': 0.0037,
 'neutral': 0.3792,
 'sadness': 0.2961}

In [None]:
# Smoke test: emotionally neutral phrase ("Today's weather report").
predict_label('Wetterbericht von heute')



{'anger': 0.017,
 'fear': 0.0202,
 'joy': 0.0798,
 'neutral': 0.7249,
 'sadness': 0.1581}