In [1]:
import os

import numpy as np
from keras.callbacks import Callback
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
import transformers
from transformers import TFAutoModel, AutoTokenizer, TFBertForSequenceClassification
from tqdm.notebook import tqdm
import numpy as np
from keras.utils import np_utils
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
import tensorflow.keras.backend as K
from tensorflow.keras.metrics import Recall
from tokenizers import Tokenizer, models, pre_tokenizers, decoders, processors
from numba import cuda 
import mlflow
import mlflow.tensorflow
import ast

mlflow.set_tracking_uri("http://webengineering.ins.hs-anhalt.de:41004")
mlflow.set_experiment("SMART21: Resource Classifier")
mlflow.tensorflow.autolog()
    
strategy = tf.distribute.get_strategy()
device = cuda.get_current_device()

2021-08-11 18:35:19.765991: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0


In [2]:
def regular_encode(texts, tokenizer, maxlen=512):
    """
    encodes text for a model
    """
    enc_di = tokenizer.batch_encode_plus(
        texts,
        return_token_type_ids=False,
        pad_to_max_length=True,
        max_length=maxlen
    )
    
    return np.array(enc_di['input_ids'])

In [3]:
def build_model(transformer, max_len=512, hidden_dim=32, n_classes=1):
    """
    builds a model
    """
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    sequence_output = transformer(input_word_ids)[0]
    cls_token = sequence_output[:, 0, :]
    
    if n_classes == 2: # binary classification
        out = Dense(1, activation='sigmoid')(cls_token)
    else:
        out = Dense(n_classes, activation='sigmoid')(cls_token)
    
    model = Model(inputs=input_word_ids, outputs=out)
    
    if n_classes > 2:
        model.compile(
            Adam(lr=1e-5),
            loss='categorical_crossentropy',
            metrics=['categorical_accuracy']
        )
    else:
        model.compile(Adam(lr=1e-5), loss='binary_crossentropy', metrics=['accuracy'])
    
    return model

In [4]:
AUTO = tf.data.experimental.AUTOTUNE

# Configuration
EPOCHS = 10
BATCH_SIZE = 16 * strategy.num_replicas_in_sync
MODEL = 'bert-base-cased' # use any appropriate model (e.g. bert-base-cased) from https://huggingface.co/models

In [5]:
# First load the real tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL)

## Prepare Data

In [6]:
def convert_str_to_resource_class(x):
    lst = ast.literal_eval(x)
    
    if len(lst) > 0:
        return lst[0]
    else:
        return "NULL"
    
def convert_df_for_resource_training(data_frame):
    data_frame = pd.DataFrame.copy(data_frame)
    data_frame = data_frame[data_frame.category == 'resource']
    data_frame.type = data_frame.type.apply(lambda x: convert_str_to_resource_class(x))
    data_frame = data_frame[data_frame.type != 'NULL']
    
    return data_frame

In [7]:
data_path = "../../data/at/dbpedia/task1_dbpedia_train_cleaned.csv"
df = pd.read_csv(data_path, sep='$')

In [8]:
max_len = list()

for q in df.question.values:
    max_len.append(len(tokenizer.encode(q)))
    
MAX_LEN = np.array(max_len).max()
del max_len

In [9]:
n_splits = 3
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
skf.get_n_splits(df.id, df.category)

3

In [10]:
train_list = []
test_list = []

for train_index, test_index in skf.split(df.id, df.category):
    # print("TRAIN:", train_index, "TEST:", test_index)
    train_list.append(convert_df_for_resource_training(df.iloc[train_index]))
    test_list.append(convert_df_for_resource_training(df.iloc[test_index]))

In [11]:
train_list[0].head()

Unnamed: 0,id,question,category,type
5,6,What is the federated state located in the Wei...,resource,dbo:State
7,9,What are the opera which start with the letter z,resource,dbo:Opera
8,11,Which is the state and country of the Watergat...,resource,dbo:Country
11,14,What is the god worshipped by Jehovah's Witnes...,resource,dbo:Name
17,20,What are the ethnic group which start with the...,resource,dbo:EthnicGroup


In [12]:
tf_train_list, tf_test_list, lens, y_test_list = list(), list(), list(), list()

encoder = LabelEncoder()
encoder.fit(train_list[0].type.values.tolist() + test_list[0].type.values.tolist())
    
for i in range(n_splits):
    x_train = regular_encode(train_list[i].question.values.tolist(), tokenizer, maxlen=MAX_LEN)
    x_test = regular_encode(test_list[i].question.values.tolist(), tokenizer, maxlen=MAX_LEN)

    y_train = train_list[i].type.values
    y_test = test_list[i].type.values

    # encode textual labels into corresponding numbers
    
    encoded_y_train = encoder.transform(y_train) 
    encoded_y_test = encoder.transform(y_test)
    dummy_y_train = np_utils.to_categorical(encoded_y_train, num_classes=encoder.classes_.shape[0])
    dummy_y_test = np_utils.to_categorical(encoded_y_test, num_classes=encoder.classes_.shape[0]) # one hot encoded
    
    train_dataset = (
        tf.data.Dataset
        .from_tensor_slices((x_train, dummy_y_train))
        .repeat()
        .shuffle(2048)
        .batch(BATCH_SIZE)
        .prefetch(AUTO)
    )

    test_dataset = (
        tf.data.Dataset
        .from_tensor_slices((x_test, dummy_y_test))
        .batch(BATCH_SIZE)
    )
    
    tf_train_list.append(train_dataset)
    tf_test_list.append(test_dataset)
    lens.append(x_train.shape)
    y_test_list.append(encoded_y_test)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
2021-08-11 18:35:30.275557: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcuda.so.1
2021-08-11 18:35:30.275691: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-08-11 18:35:30.276148: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: 
pciBusID: 0000:01:00.0 name: NVIDIA GeForce RTX 2080 Super with Max-Q Design computeCapability: 7.5
coreClock: 1.08GHz coreCount: 48 deviceMemorySiz

## Train Model

In [13]:
class Metrics(Callback):
    def __init__(self, val_data, average='micro', mlflow=None, **kwargs):
        super(Metrics, self).__init__(**kwargs)
        self.validation_data = val_data
        self.average = average
        self.mlflow = mlflow
        
    def on_train_begin(self, logs={}):
        self.val_f1s = []
        self.val_recalls = []
        self.val_precisions = []

    def on_epoch_end(self, epoch, logs={}):
        batches = len(self.validation_data)
        total = batches * BATCH_SIZE

        y_pred = np.array([])
        y_true = np.array([])

        for xVal, yVal in self.validation_data:
            y_pred = np.append(y_pred, np.argmax(self.model.predict(xVal), axis=1)).astype('int32')
            y_true = np.append(y_true, np.argmax(yVal, axis=1)).astype('int32')
        
        _val_f1 = f1_score(y_true, y_pred, average=self.average)
        _val_recall = recall_score(y_true, y_pred, average=self.average)
        _val_precision = precision_score(y_true, y_pred, average=self.average)
        
        if self.mlflow:
            mlflow.log_metric("val_f1", _val_f1)
            mlflow.log_metric("val_precision", _val_recall)
            mlflow.log_metric("val_recall", _val_precision)
            
        self.val_f1s.append(_val_f1)
        self.val_recalls.append(_val_recall)
        self.val_precisions.append(_val_precision)
        
        print(" — val_f1: %f — val_precision: %f — val_recall %f" % (_val_f1, _val_precision, _val_recall))
        return

In [14]:
os.environ["N_SPLIT"] = "0"
i = int(os.environ.get('N_SPLIT'))

with strategy.scope():
    transformer_layer = TFAutoModel.from_pretrained(MODEL)
    model = build_model(transformer_layer, max_len=MAX_LEN, n_classes=encoder.classes_.shape[0])

print(model.summary())

early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    mode='min',
    verbose=1,
    patience=1,
    min_delta=0.001,
    restore_best_weights=True
)

with mlflow.start_run():
    mlflow.log_param("KG", "DBpedia")
    mlflow.log_param("EPOCHS", EPOCHS)
    mlflow.log_param("BATCH_SIZE", BATCH_SIZE)
    mlflow.log_param("MAX_LEN", MAX_LEN)
    mlflow.log_param("MODEL", MODEL)
    mlflow.log_param("DATA", data_path)
    mlflow.log_param("n_split_idx", i)

    n_steps = lens[0][0] // BATCH_SIZE # determine number of steps per epoch
    
    train_history = model.fit(
        tf_train_list[i],
        steps_per_epoch=n_steps,
        validation_data=tf_test_list[i],
        callbacks=[early_stopping, Metrics(tf_test_list[i], mlflow=mlflow)],
        epochs=EPOCHS
    )

2021-08-11 18:35:34.086422: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublas.so.11
2021-08-11 18:35:34.619088: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublasLt.so.11
Some layers from the model checkpoint at bert-base-cased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-cased.
If your task is similar to the 

Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Instructions for updating:
The `validate_indices` argument has no effect. Indices are always validated on CPU and never validated on GPU.




Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_word_ids (InputLayer)  [(None, 145)]             0         
_________________________________________________________________
tf_bert_model (TFBertModel)  TFBaseModelOutputWithPool 108310272 
_________________________________________________________________
tf.__operators__.getitem (Sl (None, 768)               0         
_________________________________________________________________
dense (Dense)                (None, 287)               220703    
Total params: 108,530,975
Trainable params: 108,530,975
Non-trainable params: 0
_________________________________________________________________
None
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment vari

2021-08-11 18:35:40.231098: I tensorflow/core/profiler/lib/profiler_session.cc:126] Profiler session initializing.
2021-08-11 18:35:40.231120: I tensorflow/core/profiler/lib/profiler_session.cc:141] Profiler session started.
2021-08-11 18:35:40.231152: I tensorflow/core/profiler/internal/gpu/cupti_tracer.cc:1611] Profiler found 1 GPUs
2021-08-11 18:35:40.231384: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcupti.so.11.2'; dlerror: libcupti.so.11.2: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/lib/cuda/include:/usr/lib/cuda/lib64:
2021-08-11 18:35:40.231963: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcupti.so




2021-08-11 18:35:40.496103: I tensorflow/core/profiler/lib/profiler_session.cc:159] Profiler session tear down.
2021-08-11 18:35:40.496336: I tensorflow/core/profiler/internal/gpu/cupti_tracer.cc:1743] CUPTI activity buffer flushed


Epoch 1/10


2021-08-11 18:35:49.893746: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:176] None of the MLIR Optimization Passes are enabled (registered 2)
2021-08-11 18:35:50.005445: I tensorflow/core/platform/profile_utils/cpu_utils.cc:114] CPU Frequency: 2699905000 Hz


   1/1257 [..............................] - ETA: 4:36:06 - loss: 5.9601 - categorical_accuracy: 0.0000e+00

2021-08-11 18:35:54.277696: I tensorflow/core/profiler/lib/profiler_session.cc:126] Profiler session initializing.
2021-08-11 18:35:54.277724: I tensorflow/core/profiler/lib/profiler_session.cc:141] Profiler session started.


   2/1257 [..............................] - ETA: 21:01 - loss: 5.8324 - categorical_accuracy: 0.0000e+00  

2021-08-11 18:35:54.864260: I tensorflow/core/profiler/lib/profiler_session.cc:66] Profiler session collecting data.
2021-08-11 18:35:54.865741: I tensorflow/core/profiler/internal/gpu/cupti_tracer.cc:1743] CUPTI activity buffer flushed
2021-08-11 18:35:54.937088: I tensorflow/core/profiler/internal/gpu/cupti_collector.cc:673]  GpuTracer has collected 2833 callback api events and 2858 activity events. 
2021-08-11 18:35:55.000269: I tensorflow/core/profiler/lib/profiler_session.cc:159] Profiler session tear down.
2021-08-11 18:35:55.085836: I tensorflow/core/profiler/rpc/client/save_profile.cc:137] Creating directory: /tmp/tmp091h0nwf/train/plugins/profile/2021_08_11_18_35_55
2021-08-11 18:35:55.130915: I tensorflow/core/profiler/rpc/client/save_profile.cc:143] Dumped gzipped tool data for trace.json.gz to /tmp/tmp091h0nwf/train/plugins/profile/2021_08_11_18_35_55/ins-alex-ThinkPad-T15g-Gen-1.trace.json.gz
2021-08-11 18:35:55.253495: I tensorflow/core/profiler/rpc/client/save_profile.cc

 — val_f1: 0.688478 — val_precision: 0.688478 — val_recall 0.688478
Epoch 2/10
 — val_f1: 0.737577 — val_precision: 0.737577 — val_recall 0.737577
Epoch 3/10
 — val_f1: 0.748070 — val_precision: 0.748070 — val_recall 0.748070
Epoch 4/10
 — val_f1: 0.753712 — val_precision: 0.753712 — val_recall 0.753712
Epoch 5/10
Restoring model weights from the end of the best epoch.
 — val_f1: 0.753712 — val_precision: 0.753712 — val_recall 0.753712
Epoch 00005: early stopping


2021-08-11 19:27:56.140688: W tensorflow/python/util/util.cc:348] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.






INFO:tensorflow:Assets written to: /tmp/tmpz1lqnwke/model/data/model/assets


INFO:tensorflow:Assets written to: /tmp/tmpz1lqnwke/model/data/model/assets
