In [1]:
import os

import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
import transformers
from transformers import TFAutoModel, AutoTokenizer, TFBertForSequenceClassification
from tqdm.notebook import tqdm
import numpy as np
from keras.utils import np_utils
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
from tensorflow.keras.callbacks import Callback 
from tokenizers import Tokenizer, models, pre_tokenizers, decoders, processors
from tensorflow.keras.metrics import FalseNegatives, FalsePositives, TrueNegatives, TruePositives
from numba import cuda 
import mlflow
import mlflow.tensorflow

mlflow.set_tracking_uri("http://webengineering.ins.hs-anhalt.de:41004")
mlflow.set_experiment("SMART21: Category Classifier")
mlflow.tensorflow.autolog()
    
strategy = tf.distribute.get_strategy()
device = cuda.get_current_device()

2021-08-11 13:02:34.869663: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0


In [2]:
def regular_encode(texts, tokenizer, maxlen=512):
    """
    encodes text for a model
    """
    enc_di = tokenizer.batch_encode_plus(
        texts,
        return_token_type_ids=False,
        pad_to_max_length=True,
        max_length=maxlen
    )
    
    return np.array(enc_di['input_ids'])

In [3]:
def build_model(transformer, max_len=512, hidden_dim=32, n_classes=1):
    """
    builds a model
    """
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    sequence_output = transformer(input_word_ids)[0]
    cls_token = sequence_output[:, 0, :]
    
    if n_classes == 2: # binary classification
        out = Dense(1, activation='sigmoid')(cls_token)
    else:
        out = Dense(n_classes, activation='sigmoid')(cls_token)
    
    model = Model(inputs=input_word_ids, outputs=out)
    
    if n_classes > 2:
        model.compile(
            Adam(lr=1e-5),
            loss='categorical_crossentropy',
            metrics=['categorical_accuracy', FalseNegatives(), FalsePositives(), TrueNegatives(), TruePositives()]
        )
    else:
        model.compile(
            Adam(lr=1e-5),
            loss='binary_crossentropy',
            metrics=['accuracy', FalseNegatives(), FalsePositives(), TrueNegatives(), TruePositives()]
        )
    
    return model

In [4]:
AUTO = tf.data.experimental.AUTOTUNE

# Configuration
EPOCHS = 10
BATCH_SIZE = 16 * strategy.num_replicas_in_sync
MODEL = 'bert-base-cased' # use any appropriate model (e.g. bert-base-cased) from https://huggingface.co/models

In [5]:
# First load the real tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL)

## Prepare Data

In [6]:
data_path = "../../data/at/dbpedia/task1_dbpedia_train_cleaned.csv"
df = pd.read_csv(data_path, sep='$')

In [7]:
max_len = list()

for q in df.question.values:
    max_len.append(len(tokenizer.encode(q)))
    
MAX_LEN = np.array(max_len).max()
del max_len

In [8]:
n_splits = 3
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
skf.get_n_splits(df.id, df.category)

3

In [9]:
train_list = []
test_list = []

for train_index, test_index in skf.split(df.id, df.category):
    # print("TRAIN:", train_index, "TEST:", test_index)
    train_list.append(df.iloc[train_index])
    test_list.append(df.iloc[test_index])

In [10]:
tf_train_list, tf_test_list, lens, y_test_list = list(), list(), list(), list()

for i in range(n_splits):
    x_train = regular_encode(train_list[i].question.values.tolist(), tokenizer, maxlen=MAX_LEN)
    x_test = regular_encode(test_list[i].question.values.tolist(), tokenizer, maxlen=MAX_LEN)

    y_train = train_list[i].category.values
    y_test = test_list[i].category.values

    # encode textual labels into corresponding numbers
    encoder = LabelEncoder()
    encoder.fit(y_train)
    encoded_y_train = encoder.transform(y_train) 
    encoded_y_test = encoder.transform(y_test)
    dummy_y_train = np_utils.to_categorical(encoded_y_train) # convert integers to dummy variables (i.e. one hot encoded)
    dummy_y_test = np_utils.to_categorical(encoded_y_test) # convert integers to dummy variables (i.e. one hot encoded)
    
    train_dataset = (
        tf.data.Dataset
        .from_tensor_slices((x_train, dummy_y_train))
        .repeat()
        .shuffle(2048)
        .batch(BATCH_SIZE)
        .prefetch(AUTO)
    )

    test_dataset = (
        tf.data.Dataset
        .from_tensor_slices((x_test, dummy_y_test))
        .batch(BATCH_SIZE)
    )
    
    tf_train_list.append(train_dataset)
    tf_test_list.append(test_dataset)
    lens.append(x_train.shape)
    y_test_list.append(encoded_y_test)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
2021-08-11 13:02:43.705526: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcuda.so.1
2021-08-11 13:02:43.705652: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-08-11 13:02:43.706066: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: 
pciBusID: 0000:01:00.0 name: NVIDIA GeForce RTX 2080 Super with Max-Q Design computeCapability: 7.5
coreClock: 1.08GHz coreCount: 48 deviceMemorySiz

## Train Model

In [11]:
os.environ["N_SPLIT"] = "0"
i = int(os.environ.get('N_SPLIT'))

with strategy.scope():
    transformer_layer = TFAutoModel.from_pretrained(MODEL)
    model = build_model(transformer_layer, max_len=MAX_LEN, n_classes=df.category.nunique())

print(model.summary())

early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    mode='min',
    verbose=1,
    patience=1,
    restore_best_weights=True
)

with mlflow.start_run():
    mlflow.log_param("KG", "DBpedia")
    mlflow.log_param("EPOCHS", EPOCHS)
    mlflow.log_param("BATCH_SIZE", BATCH_SIZE)
    mlflow.log_param("MAX_LEN", MAX_LEN)
    mlflow.log_param("MODEL", MODEL)
    mlflow.log_param("DATA", data_path)
    mlflow.log_param("n_split_idx", i)

    n_steps = lens[0][0] // BATCH_SIZE # determine number of steps per epoch

    train_history = model.fit(
        tf_train_list[i],
        steps_per_epoch=n_steps,
        validation_data=tf_test_list[i],
        callbacks=[early_stopping],
        epochs=EPOCHS
    )

2021-08-11 13:02:47.373905: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublas.so.11
2021-08-11 13:02:47.794187: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublasLt.so.11
Some layers from the model checkpoint at bert-base-cased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-cased.
If your task is similar to the 

Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Instructions for updating:
The `validate_indices` argument has no effect. Indices are always validated on CPU and never validated on GPU.




Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_word_ids (InputLayer)  [(None, 145)]             0         
_________________________________________________________________
tf_bert_model (TFBertModel)  TFBaseModelOutputWithPool 108310272 
_________________________________________________________________
tf.__operators__.getitem (Sl (None, 768)               0         
_________________________________________________________________
dense (Dense)                (None, 3)                 2307      
Total params: 108,312,579
Trainable params: 108,312,579
Non-trainable params: 0
_________________________________________________________________
None
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment vari

2021-08-11 13:02:52.355059: I tensorflow/core/profiler/lib/profiler_session.cc:126] Profiler session initializing.
2021-08-11 13:02:52.355123: I tensorflow/core/profiler/lib/profiler_session.cc:141] Profiler session started.
2021-08-11 13:02:52.355206: I tensorflow/core/profiler/internal/gpu/cupti_tracer.cc:1611] Profiler found 1 GPUs
2021-08-11 13:02:52.355784: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcupti.so.11.2'; dlerror: libcupti.so.11.2: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/lib/cuda/include:/usr/lib/cuda/lib64:
2021-08-11 13:02:52.357305: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcupti.so




2021-08-11 13:02:52.589028: I tensorflow/core/profiler/lib/profiler_session.cc:159] Profiler session tear down.
2021-08-11 13:02:52.589202: I tensorflow/core/profiler/internal/gpu/cupti_tracer.cc:1743] CUPTI activity buffer flushed


Epoch 1/10


2021-08-11 13:03:00.254795: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:176] None of the MLIR Optimization Passes are enabled (registered 2)
2021-08-11 13:03:00.348505: I tensorflow/core/platform/profile_utils/cpu_utils.cc:114] CPU Frequency: 2699905000 Hz


   1/1544 [..............................] - ETA: 4:35:08 - loss: 1.7218 - categorical_accuracy: 0.1875 - false_negatives: 10.0000 - false_positives: 27.0000 - true_negatives: 5.0000 - true_positives: 6.0000

2021-08-11 13:03:03.848915: I tensorflow/core/profiler/lib/profiler_session.cc:126] Profiler session initializing.
2021-08-11 13:03:03.848938: I tensorflow/core/profiler/lib/profiler_session.cc:141] Profiler session started.


   2/1544 [..............................] - ETA: 23:31 - loss: 1.3097 - categorical_accuracy: 0.4062 - false_negatives: 11.0000 - false_positives: 51.0000 - true_negatives: 13.0000 - true_positives: 21.0000

2021-08-11 13:03:04.394724: I tensorflow/core/profiler/lib/profiler_session.cc:66] Profiler session collecting data.
2021-08-11 13:03:04.395484: I tensorflow/core/profiler/internal/gpu/cupti_tracer.cc:1743] CUPTI activity buffer flushed
2021-08-11 13:03:04.450263: I tensorflow/core/profiler/internal/gpu/cupti_collector.cc:673]  GpuTracer has collected 2875 callback api events and 2896 activity events. 
2021-08-11 13:03:04.502438: I tensorflow/core/profiler/lib/profiler_session.cc:159] Profiler session tear down.
2021-08-11 13:03:04.575254: I tensorflow/core/profiler/rpc/client/save_profile.cc:137] Creating directory: /tmp/tmpfex8mi3g/train/plugins/profile/2021_08_11_13_03_04
2021-08-11 13:03:04.621339: I tensorflow/core/profiler/rpc/client/save_profile.cc:143] Dumped gzipped tool data for trace.json.gz to /tmp/tmpfex8mi3g/train/plugins/profile/2021_08_11_13_03_04/ins-alex-ThinkPad-T15g-Gen-1.trace.json.gz
2021-08-11 13:03:04.714688: I tensorflow/core/profiler/rpc/client/save_profile.cc

Epoch 2/10
Restoring model weights from the end of the best epoch.
Epoch 00002: early stopping


2021-08-11 13:24:26.880664: W tensorflow/python/util/util.cc:348] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.






INFO:tensorflow:Assets written to: /tmp/tmpdx4hhwvo/model/data/model/assets


INFO:tensorflow:Assets written to: /tmp/tmpdx4hhwvo/model/data/model/assets
