## Classification German News with BERT


In [1]:
!pip install -U transformers==4.9.2

Requirement already up-to-date: transformers==4.9.2 in /usr/local/lib/python3.6/dist-packages (4.9.2)
You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.[0m


In [2]:
!pip install 'fhnw-nlp-utils>=0.1.6'
!pip install pyarrow

from fhnw.nlp.utils.storage import load_dataframe
from fhnw.nlp.utils.storage import download


import numpy as np
import pandas as pd

You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.[0m
You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.[0m


In [3]:
file = "data/german_news_articles_original_train_and_test_tokenized.parq"
data_all = load_dataframe(file)

In [4]:
data_all.sample(3)

Unnamed: 0,text_original,label,split,text_clean,token_clean,token_lemma,token_stem,token_clean_stopwords
579,US-Außenminister traf Palästinenserpräsident A...,International,test,US Außenminister traf Palästinenserpräsident A...,"[us, außenminister, traf, palästinenserpräside...","[us-amerikanischen, außenminister, treffen, pa...","[us, aussenminist, traf, palastinenserprasiden...","[us, außenminister, traf, palästinenserpräside..."
3922,Sachwalterschaften werden meist von Verwandten...,Panorama,train,Sachwalterschaften werden meist von Verwandten...,"[sachwalterschaften, meist, verwandten, bekann...","[sachwalterschaften, meist, verwenden, bekenne...","[sachwalterschaft, meist, verwandt, bekannt, b...","[sachwalterschaften, meist, verwandten, bekann..."
1199,"5. August, 10:00 Uhr: Der GameStandard zeigt d...",Web,train,August Uhr Der GameStandard zeigt die Übertra...,"[august, uhr, gamestandard, zeigt, übertragung...","[august, uhr, gamestandard, zeigen, übertragun...","[august, uhr, gamestandard, zeigt, ubertrag, l...","[august, uhr, gamestandard, zeigt, übertragung..."


In [5]:
from transformers import BertTokenizer, TFBertModel

In [6]:
tokenizer = BertTokenizer.from_pretrained("bert-base-german-cased")

In [7]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()

data_all.drop(columns=['label_encoded', 'label_onehot'], errors='ignore', inplace=True)
data_all['label_encoded'] = encoder.fit_transform(data_all['label'])

num_labels = len(encoder.classes_)
print(f'Number of classes {num_labels}')

Number of classes 9


In [8]:
data_train_orig = data_all.loc[(data_all["split"] == "train")]
data_test_orig = data_all.loc[(data_all["split"] == "test")]

In [9]:
from sklearn.model_selection import train_test_split

train_ids, test_ids, train_labels, test_labels = train_test_split(
    data_train_orig, 
    data_train_orig["label_encoded"], 
    random_state=1, 
    test_size=0.25, 
    shuffle=True
)


In [10]:
MAXLEN = 512

def tokenize(review):
  encoded = tokenizer.encode_plus(
      text=review,
      add_special_tokens=True,  # Add `[CLS]` and `[SEP]`
      max_length=MAXLEN,  # Max length to truncate/pad
      padding='max_length',  # Pad sentence to max length
      return_attention_mask=False,  # attention mask not needed for our task
      return_token_type_ids=False,
      truncation=True, )
    
  return encoded['input_ids']

In [11]:
import tqdm
train_input_ids = np.array([tokenize(review) for review in tqdm.tqdm(train_ids['text_clean'])])
test_input_ids = np.array([tokenize(review) for review in tqdm.tqdm(test_ids['text_clean'])])

100%|██████████| 6933/6933 [00:38<00:00, 181.27it/s]
100%|██████████| 2312/2312 [00:13<00:00, 176.93it/s]


In [12]:
BATCH_SIZE = 4
EPOCHS = 2
LEARNING_RATE = 1e-5

In [13]:
import tensorflow as tf


train_dataset = (tf.data.Dataset.from_tensor_slices((train_input_ids, train_labels))
                    .shuffle(buffer_size=len(train_input_ids), reshuffle_each_iteration=True)
                    .repeat(EPOCHS)
                    .batch(BATCH_SIZE))

test_dataset = (tf.data.Dataset.from_tensor_slices((test_input_ids, test_labels))
                    .batch(BATCH_SIZE))

In [14]:
def build_model(max_len=MAXLEN):
    """ add multi class classification to pretrained model
    """

    input_word_ids = tf.keras.layers.Input(
        shape=(max_len,), dtype=tf.int32, name="input_word_ids"
    )

    bert_model = TFBertModel.from_pretrained("bert-base-german-cased")
    encoder_outputs = bert_model(input_word_ids)

    ##########################
    ## YOUR CODE HERE START ##
    ##########################
    
    # build a binary classification stack 
    # on top of the sequence embeddings

    # Either use last hidden states
    # or use the pooler output directly
    
    #cls_embedding = ...

    # Add a FFNN that takes in the embeddings and outputs a structure suitable
    # for a binary classification 
    #stack = ...
    #output = ...

    pooler_output = encoder_outputs[1]
    cls_embedding = pooler_output
    
    # x = tf.keras.layers.Dense(1024, activation='relu')(cls_embedding)

    no_classes = len(encoder.classes_)
    stack = tf.keras.layers.Dense(no_classes)(cls_embedding)
    output = tf.keras.layers.Activation('softmax')(stack)

    ##########################
    ## YOUR CODE HERE END ##
    ##########################

    model = tf.keras.models.Model(inputs=input_word_ids, outputs=output)
    
    return model

In [15]:
model = build_model(max_len=MAXLEN)
#model.layers[1].trainable = False
model.summary()

Some layers from the model checkpoint at bert-base-german-cased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-german-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: <cyfunction Socket.send at 0x7f08221331d8> is not a module, class, method, function, traceback, frame, or code object
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: <cyfunction Socket.send at 0x7f08221331d8> is not a module, class, method, function, traceback, frame, or code object

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_word_ids (InputLayer)  [(None, 512)]             0         
_________________________________________________________________
tf_bert_model (TFBertModel)  TFBaseModelOutputWithPool 109081344 
_________________________________________________________________
dense (Dense)     

In [17]:
optimizer = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE)
# See https://stackoverflow.com/questions/62148508/how-can-i-overcome-valueerror-shapes-none-1-and-none-7-are-incompatible
loss = "sparse_categorical_crossentropy" # "categorical_crossentropy"
model.compile(optimizer, loss=loss, metrics=["accuracy"])


In [None]:
from datetime import datetime
import os

checkpoint_path = "training_berts_final/cp.ckpt"
checkpoint_dir = os.path.dirname(checkpoint_path)

# Create a callback that saves the model's weights
cp_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_path,
                                                 save_weights_only=True,
                                                 verbose=1)


hist = model.fit(
    train_dataset,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    steps_per_epoch=int(np.floor((len(train_input_ids) / BATCH_SIZE))),
    validation_data=test_dataset,
    callbacks=[cp_callback],
    verbose=1
)

Epoch 1/2
 380/1733 [=====>........................] - ETA: 7:39 - loss: 1.0056 - accuracy: 0.6776

In [None]:
!rm -rf saved_model2 || true
!mkdir -p saved_model2

tf.keras.models.save_model(model, 'saved_model2/my_model')

In [None]:
history = pd.DataFrame({'epoch': hist.epoch, **hist.history}).set_index('epoch')
history

In [None]:
import tensorflow as tf

new_model = tf.keras.models.load_model('saved_model2/my_model')
new_model.summary()

#model_loaded = build_model(max_len=MAXLEN)

In [None]:
latest = tf.train.latest_checkpoint('training_berts')
model_loaded.load_weights(latest)

In [None]:
optimizer = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE)
loss = "categorical_crossentropy"

model_loaded.compile(optimizer, loss=loss, metrics=["accuracy"])
model_loaded.summary()

In [None]:
model.evaluate(test_input_ids, test_labels, verbose=2)

In [None]:
import tqdm

data_test_orig_ids = np.array([tokenize(review) for review in tqdm.tqdm(data_test_orig['text_clean'])])
data_test_orig_labels = data_test_orig["label_encoded"]

In [None]:
data_test_orig_labels.values.fill(5)
data_test_orig_labels

In [None]:
model.evaluate(data_test_orig_ids, data_test_orig_labels)

In [None]:
data_test_orig_dataset = (tf.data.Dataset.from_tensor_slices((data_test_orig_ids, data_test_orig_labels))
                    .batch(BATCH_SIZE))
model_loaded.evaluate(data_test_orig_dataset, verbose=2)

In [None]:
predictions = model.predict(data_test_orig_ids, batch_size=BATCH_SIZE, verbose=2, use_multiprocessing=True)

In [None]:
predictions.shape

In [None]:
classes = np.argmax(predictions, axis = 1)
classes, data_test_orig["label_encoded"]

In [None]:
from sklearn.metrics import classification_report
print(classification_report(data_test_orig_labels, classes))