In [1]:
import os

import ipywidgets as widgets
from IPython.display import display

import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
import transformers
from transformers import TFAutoModel, AutoTokenizer, TFBertForSequenceClassification
from tqdm.notebook import tqdm
import numpy as np
from keras.utils import np_utils
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
from tensorflow.keras.callbacks import Callback 
from tokenizers import Tokenizer, models, pre_tokenizers, decoders, processors
from tensorflow.keras.metrics import FalseNegatives, FalsePositives, TrueNegatives, TruePositives
import ast
from sklearn.preprocessing import MultiLabelBinarizer
from numba import cuda 
import mlflow
import mlflow.tensorflow

# Point MLflow at the shared tracking server and select the experiment for this notebook.
mlflow.set_tracking_uri("http://webengineering.ins.hs-anhalt.de:41004")
mlflow.set_experiment("SMART21: Resource Classifier")
# Turn on MLflow's TensorFlow autologging for the training run below.
mlflow.tensorflow.autolog()

# Current tf.distribute strategy; its replica count scales BATCH_SIZE in a later cell.
strategy = tf.distribute.get_strategy()
# Handle to the current CUDA device obtained through numba.
# NOTE(review): `device` is not referenced again in the cells visible here — confirm it is still needed.
device = cuda.get_current_device()

2021-10-05 12:11:56.718129: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0


In [2]:
# Radio selector for the dataset that is split into cross-validation folds.
test_data_name = widgets.RadioButtons(
    description='Test dataset:',
    options=['Wikidata'],
    disabled=False,
)
display(test_data_name)

RadioButtons(description='Test dataset:', options=('Wikidata',), value='Wikidata')

In [3]:
# Multi-select of the corpora that may be added to the training folds.
external_data = widgets.SelectMultiple(
    description='External Data:',
    options=['Wikidata', 'LC-QuAD2', 'WikidataTranslated'],
    value=['Wikidata'],
    disabled=False,
)
display(external_data)

SelectMultiple(description='External Data:', index=(0,), options=('Wikidata', 'LC-QuAD2', 'WikidataTranslated'…

In [4]:
# Radio selector for the pretrained checkpoint name (read into MODEL in a later cell).
# NOTE: the name `model` is rebound to the Keras model in the training cell.
model = widgets.RadioButtons(
    description='Model:',
    options=['bert-base-cased', 'bert-base-multilingual-cased'],
    disabled=False,
)
display(model)

RadioButtons(description='Model:', options=('bert-base-cased', 'bert-base-multilingual-cased'), value='bert-ba…

In [5]:
# Slider for the maximum number of training epochs (1..20, default 10).
epochs = widgets.IntSlider(
    description='EPOCHS:',
    value=10,
    min=1,
    max=20,
    step=1,
    orientation='horizontal',
    readout=True,
    readout_format='d',
    continuous_update=False,
    disabled=False,
)
display(epochs)

IntSlider(value=10, continuous_update=False, description='EPOCHS:', max=20, min=1)

In [6]:
# Snapshot the current widget selections into plain constants used by the cells below.
MODEL = model.value
EPOCHS = epochs.value

In [7]:
# Autotune sentinel, passed to Dataset.prefetch() when the training datasets are built.
AUTO = tf.data.experimental.AUTOTUNE
# Global batch size: 16 examples per replica of the active strategy.
BATCH_SIZE = 16 * strategy.num_replicas_in_sync

In [8]:
def regular_encode(texts, tokenizer, maxlen=512):
    """
    Tokenize a batch of texts into an array of input ids.

    Args:
        texts: list of strings to encode.
        tokenizer: object exposing ``batch_encode_plus`` (e.g. a HuggingFace
            tokenizer) that returns a mapping with an ``'input_ids'`` entry.
        maxlen: length every sequence is padded or truncated to.

    Returns:
        ``np.ndarray`` built from the tokenizer's ``'input_ids'``.
    """
    enc_di = tokenizer.batch_encode_plus(
        texts,
        return_token_type_ids=False,
        # `pad_to_max_length=True` is deprecated; this is its explicit replacement.
        padding='max_length',
        # Request truncation explicitly instead of relying on the implicit
        # fallback (and its warning) when only `max_length` is given.
        truncation=True,
        max_length=maxlen
    )

    return np.array(enc_di['input_ids'])

In [9]:
def build_model(transformer, max_len=512, hidden_dim=32, n_classes=1):
    """
    Build and compile a sigmoid classification head on top of a transformer.

    The first output of ``transformer`` is indexed as ``[:, 0, :]``, i.e. the
    vector at sequence position 0 feeds a single Dense layer.

    Args:
        transformer: callable Keras layer/model applied to the token-id input.
        max_len: fixed input sequence length.
        hidden_dim: unused; kept so existing callers keep working.
        n_classes: number of target classes. ``2`` gives one sigmoid unit
            (binary); any other value gives ``n_classes`` sigmoid units.

    Returns:
        A compiled ``tf.keras`` ``Model`` using binary cross-entropy loss.
    """
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    sequence_output = transformer(input_word_ids)[0]
    cls_token = sequence_output[:, 0, :]

    # Binary classification needs a single probability; otherwise one unit per class.
    n_outputs = 1 if n_classes == 2 else n_classes
    out = Dense(n_outputs, activation='sigmoid')(cls_token)

    model = Model(inputs=input_word_ids, outputs=out)

    if n_classes > 2:  # multilabel
        metrics = ['categorical_accuracy']
    else:
        metrics = ['accuracy', FalseNegatives(), FalsePositives(), TrueNegatives(), TruePositives()]

    model.compile(
        # `learning_rate` is the supported keyword; `lr` is a deprecated alias.
        Adam(learning_rate=1e-5),
        loss='binary_crossentropy',
        metrics=metrics
    )

    return model

In [10]:
# First load the real tokenizer
# (the pretrained tokenizer for the checkpoint name held in MODEL).
tokenizer = AutoTokenizer.from_pretrained(MODEL)

## Prepare Data

In [11]:
def convert_str_to_resource_class(x):
    """
    Turn the string form of a label collection into a canonical class key.

    The literal in ``x`` is parsed, every element is stringified, and the
    sorted list is rendered back to a string. An empty collection yields
    the sentinel ``"NULL"``.
    """
    labels = sorted(str(item) for item in ast.literal_eval(x))
    return str(labels) if labels else "NULL"
    
def sort_type(data_frame):
    """
    Keep the 'resource' rows and canonicalise their ``type`` column.

    Each ``type`` value is passed through ``convert_str_to_resource_class``;
    rows whose type comes back as ``'NULL'`` are dropped. The input frame is
    left unmodified.

    Args:
        data_frame: DataFrame with ``category`` and ``type`` columns.

    Returns:
        A new DataFrame with the filtered rows and canonicalised ``type``.
    """
    # Copy after filtering, so the column assignment below writes to an
    # independent frame and does not raise SettingWithCopyWarning.
    resources = data_frame[data_frame.category == 'resource'].copy()
    resources['type'] = resources['type'].apply(convert_str_to_resource_class)

    return resources[resources['type'] != 'NULL']

def prepare_wdt_df(df, random_state=None):
    """
    Reduce a raw question frame to its most frequent resource-type classes.

    Steps:
      1. Filter and canonicalise through ``sort_type``.
      2. Keep only type combinations whose count is above the 95th
         percentile of all per-type counts.
      3. Replace the 'natural person' class by a 30% sample of its rows.
      4. Shuffle the rows and parse ``type`` back into a Python list.

    Args:
        df: raw DataFrame with ``category`` and ``type`` columns.
        random_state: optional seed forwarded to both ``sample`` calls; the
            default ``None`` keeps the original unseeded behaviour.

    Returns:
        A shuffled DataFrame whose ``type`` column holds lists of labels.
    """
    person_type = "['natural person', 'omnivore', 'person']"

    # sort_type is deterministic, so one call serves both the counts and the rows.
    df = sort_type(df)

    type_counts = df['type'].value_counts()
    cutoff_rate = np.percentile(type_counts.values, 95)
    included_types = type_counts[type_counts > cutoff_rate].index

    df_person = df[df['type'] == person_type].sample(frac=0.3, random_state=random_state)
    df_frequent = df[df['type'].isin(included_types) & (df['type'] != person_type)]

    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    df = pd.concat([df_frequent, df_person]).sample(frac=1, random_state=random_state)

    return df.assign(type=df['type'].apply(ast.literal_eval))

In [22]:
# Raw training corpora; the CSV files use '$' as the field separator.
wdt_df = pd.read_csv("../../data/at/wikidata/lcquad2_anstype_wikidata_train_cleaned.csv", sep='$')
lcquad2_df = pd.read_csv("../../data/at/common/lcquad2_train_types-1.csv", sep='$')
wdt_translated_df = pd.read_csv("../../data/at/wikidata/lcquad2_anstype_wikidata_train_translated.csv", sep='$')

wdt_df = prepare_wdt_df(wdt_df)
lcquad2_df = prepare_wdt_df(lcquad2_df)
wdt_translated_df = prepare_wdt_df(wdt_translated_df)

langs = ['de', 'es', 'zh', 'it', 'ro', 'vi', 'ru', 'fr', 'cs', 'jap']

# The original rows plus one copy per language, with `question` replaced by
# that language's `question_<lang>` column. Building the list once and
# concatenating avoids DataFrame.append in a loop (removed in pandas 2.0,
# and quadratic because every call copies the accumulated frame).
translated_frames = [wdt_translated_df] + [
    wdt_translated_df.assign(question=wdt_translated_df[f'question_{lang}'])
    for lang in langs
]

# Align columns with the Wikidata frame and keep a random half of the rows.
wdt_translated_df = pd.concat(translated_frames)[wdt_df.columns].sample(frac=0.5)

# wdt_df.id = wdt_df.id.apply(lambda x: str(x) + "-wdt")
# wdt_df = wdt_df[wdt_df.category == 'resource']
# wdt_df.type = wdt_df.type.apply(lambda x: list(ast.literal_eval(x)))
# wdt_df_columns = wdt_df.columns
# wdt_df_columns = list(wdt_df_columns)
# wdt_df_columns[2] = '_category_'

In [23]:
# Fit the binarizer on the type lists of both the Wikidata and LC-QuAD2
# frames, then save the learned class order to disk.
all_type_lists = np.concatenate([wdt_df.type.values, lcquad2_df.type.values])
mlb = MultiLabelBinarizer().fit(all_type_lists)
np.save('../../data/bin/label_encoders/wdt_lcquad_encoder.npy', mlb.classes_)

# wdt_df_dummies = pd.DataFrame(mlb.transform(wdt_df.type), columns=mlb.classes_)
# wdt_df_dummies = pd.get_dummies(wdt_df.type.apply(pd.Series).stack()).sum(level=0)
# wdt_df = pd.concat([wdt_df.reset_index(drop=True), wdt_df_dummies.reset_index(drop=True)], axis=1, ignore_index=True)

In [24]:
# Number of distinct type labels learned by the binarizer.
len(mlb.classes_)

305

In [25]:
# wdt_df.columns = list(wdt_df_columns) + list(wdt_df_dummies.columns)
# Cast the bookkeeping columns of the Wikidata frame to fixed dtypes.
wdt_df['id'] = wdt_df['id'].astype(int)
wdt_df['category'] = wdt_df['category'].astype(str)

# Lookup from widget option label to the prepared frame.
data_dict = {
    'Wikidata': wdt_df,
    'WikidataTranslated': wdt_translated_df,
    'LC-QuAD2': lcquad2_df,
}
# wdt_df_dummies.shape, wdt_df.shape

In [26]:
def add_external_data(df):
    """
    Append the selected external corpora to ``df``.

    Every dataset chosen in the ``external_data`` widget is looked up in
    ``data_dict`` and concatenated after ``df``, except the dataset currently
    selected as the test set (``test_data_name``).

    Args:
        df: base DataFrame to extend.

    Returns:
        ``df`` itself when nothing is added, otherwise a new concatenated
        DataFrame. Original indices are preserved.
    """
    extra_frames = [
        data_dict[name]
        for name in external_data.value
        if name != test_data_name.value
    ]
    if not extra_frames:
        return df

    # A single pd.concat replaces repeated DataFrame.append (removed in pandas 2.0).
    return pd.concat([df, *extra_frames])

In [27]:
# Longest tokenised question over the test dataset plus the selected external data.
MAX_LEN = np.array([
    len(tokenizer.encode(question))
    for question in add_external_data(data_dict[test_data_name.value]).question.values
]).max()

In [28]:
# Hard-coded sequence length; overrides the value measured from the data in the previous cell.
MAX_LEN = 145

In [29]:
# Number of cross-validation folds.
n_splits = 3
# Shuffled stratified K-fold with a fixed seed.
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
# NOTE(review): the stratification target is `category`, which sort_type() has
# already filtered to the single value 'resource'. The folds are therefore
# presumably not stratified by answer type — confirm this is intended.
skf.get_n_splits(data_dict[test_data_name.value].id, data_dict[test_data_name.value].category)

3

In [32]:
train_list, test_list = [], []

# Frame that is split into folds; external data is only added to the training side.
base_df = data_dict[test_data_name.value]

for train_index, test_index in skf.split(base_df.id, base_df.category):
    test_df = base_df.iloc[test_index]
    train_df = add_external_data(base_df.iloc[train_index])
    # Drop training rows whose question also appears in this fold's test set.
    train_df = train_df[~train_df.question.isin(test_df.question)]
    train_list.append(train_df)
    test_list.append(test_df)

In [33]:
# Shapes of the first fold's test and training frames.
test_list[0].shape, train_list[0].shape

((1539, 4), (3251, 4))

In [34]:
tf_train_list, tf_test_list, lens = [], [], []

for i in range(n_splits):
    fold_train, fold_test = train_list[i], test_list[i]

    # Token ids for the questions, encoded with maxlen=MAX_LEN.
    x_train = regular_encode(fold_train.question.values.tolist(), tokenizer, maxlen=MAX_LEN)
    x_test = regular_encode(fold_test.question.values.tolist(), tokenizer, maxlen=MAX_LEN)

    # Binarised label matrices from the fitted MultiLabelBinarizer.
    dummy_y_train = mlb.transform(fold_train.type)
    dummy_y_test = mlb.transform(fold_test.type)

    # Training stream repeats indefinitely; the number of steps per epoch is set at fit time.
    train_dataset = tf.data.Dataset.from_tensor_slices((x_train, dummy_y_train))
    train_dataset = train_dataset.repeat().shuffle(2048).batch(BATCH_SIZE).prefetch(AUTO)

    test_dataset = tf.data.Dataset.from_tensor_slices((x_test, dummy_y_test)).batch(BATCH_SIZE)

    tf_train_list.append(train_dataset)
    tf_test_list.append(test_dataset)
    # Shape of the encoded training inputs, used later to derive steps per epoch.
    lens.append(x_train.shape)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
2021-10-05 12:16:38.992541: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcuda.so.1
2021-10-05 12:16:38.993288: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-10-05 12:16:38.993674: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: 
pciBusID: 0000:01:00.0 name: NVIDIA GeForce RTX 2080 Super with Max-Q Design computeCapability: 7.5
coreClock: 1.08GHz coreCount: 48 deviceMemorySiz

## Train Model

In [35]:
# Slider choosing which cross-validation fold the training cell uses.
split_idx = widgets.IntSlider(
    description='N_SPLIT:',
    value=0,
    min=0,
    max=n_splits - 1,
    step=1,
    orientation='horizontal',
    readout=True,
    readout_format='d',
    continuous_update=False,
    disabled=False,
)
display(split_idx)

IntSlider(value=0, continuous_update=False, description='N_SPLIT:', max=2)

In [36]:
# Index of the fold selected in the slider.
i = split_idx.value

with strategy.scope():
    transformer_layer = TFAutoModel.from_pretrained(MODEL)
    # Size the output layer from the fitted binarizer instead of a loop
    # variable left over from the dataset-building cell.
    model = build_model(transformer_layer, max_len=MAX_LEN, n_classes=len(mlb.classes_))

# summary() prints the table itself; wrapping it in print() also emits a stray "None".
model.summary()

# Stop once val_loss fails to improve by at least 0.005 for one epoch, and
# restore the weights of the best epoch.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    mode='min',
    verbose=1,
    patience=1,
    min_delta=0.005,
    restore_best_weights=True
)

with mlflow.start_run():
    mlflow.log_param("EPOCHS", EPOCHS)
    mlflow.log_param("BATCH_SIZE", BATCH_SIZE)
    mlflow.log_param("MAX_LEN", MAX_LEN)
    mlflow.log_param("MODEL", MODEL)
    mlflow.log_param("TEST_DATA", test_data_name.value)
    mlflow.log_param("EXTERNAL_DATA", '+'.join(external_data.value))
    mlflow.log_param("n_split_idx", i)

    # Steps per epoch come from the size of the SELECTED fold's training set
    # (the original always used fold 0).
    n_steps = lens[i][0] // BATCH_SIZE

    train_history = model.fit(
        tf_train_list[i],
        steps_per_epoch=n_steps,
        validation_data=tf_test_list[i],
        callbacks=[early_stopping],
        epochs=EPOCHS
    )

2021-10-05 12:16:44.949216: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublas.so.11
2021-10-05 12:16:45.746830: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublasLt.so.11
Some layers from the model checkpoint at bert-base-cased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-cased.
If your task is similar to the 

Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Instructions for updating:
The `validate_indices` argument has no effect. Indices are always validated on CPU and never validated on GPU.




Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_word_ids (InputLayer)  [(None, 145)]             0         
_________________________________________________________________
tf_bert_model (TFBertModel)  TFBaseModelOutputWithPool 108310272 
_________________________________________________________________
tf.__operators__.getitem (Sl (None, 768)               0         
_________________________________________________________________
dense (Dense)                (None, 305)               234545    
Total params: 108,544,817
Trainable params: 108,544,817
Non-trainable params: 0
_________________________________________________________________
None
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment vari

2021-10-05 12:16:51.420939: I tensorflow/core/profiler/lib/profiler_session.cc:126] Profiler session initializing.
2021-10-05 12:16:51.420993: I tensorflow/core/profiler/lib/profiler_session.cc:141] Profiler session started.
2021-10-05 12:16:51.422968: I tensorflow/core/profiler/internal/gpu/cupti_tracer.cc:1611] Profiler found 1 GPUs
2021-10-05 12:16:51.423432: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcupti.so.11.2'; dlerror: libcupti.so.11.2: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/lib/cuda/include:/usr/lib/cuda/lib64:
2021-10-05 12:16:51.442183: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcupti.so




2021-10-05 12:16:51.665216: I tensorflow/core/profiler/lib/profiler_session.cc:159] Profiler session tear down.
2021-10-05 12:16:51.665405: I tensorflow/core/profiler/internal/gpu/cupti_tracer.cc:1743] CUPTI activity buffer flushed


Epoch 1/10


2021-10-05 12:17:00.072197: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:176] None of the MLIR Optimization Passes are enabled (registered 2)
2021-10-05 12:17:00.172147: I tensorflow/core/platform/profile_utils/cpu_utils.cc:114] CPU Frequency: 2699905000 Hz


  1/203 [..............................] - ETA: 38:13 - loss: 0.7650 - categorical_accuracy: 0.0625

2021-10-05 12:17:05.091192: I tensorflow/core/profiler/lib/profiler_session.cc:126] Profiler session initializing.
2021-10-05 12:17:05.091280: I tensorflow/core/profiler/lib/profiler_session.cc:141] Profiler session started.


  2/203 [..............................] - ETA: 7:56 - loss: 0.7516 - categorical_accuracy: 0.0312 

2021-10-05 12:17:06.120050: I tensorflow/core/profiler/lib/profiler_session.cc:66] Profiler session collecting data.
2021-10-05 12:17:06.120929: I tensorflow/core/profiler/internal/gpu/cupti_tracer.cc:1743] CUPTI activity buffer flushed
2021-10-05 12:17:06.197529: I tensorflow/core/profiler/internal/gpu/cupti_collector.cc:673]  GpuTracer has collected 2857 callback api events and 2882 activity events. 
2021-10-05 12:17:06.255763: I tensorflow/core/profiler/lib/profiler_session.cc:159] Profiler session tear down.
2021-10-05 12:17:06.337638: I tensorflow/core/profiler/rpc/client/save_profile.cc:137] Creating directory: /tmp/tmp67y7zc4r/train/plugins/profile/2021_10_05_12_17_06
2021-10-05 12:17:06.380648: I tensorflow/core/profiler/rpc/client/save_profile.cc:143] Dumped gzipped tool data for trace.json.gz to /tmp/tmp67y7zc4r/train/plugins/profile/2021_10_05_12_17_06/ins-alex-ThinkPad-T15g-Gen-1.trace.json.gz
2021-10-05 12:17:06.500018: I tensorflow/core/profiler/rpc/client/save_profile.cc

Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Restoring model weights from the end of the best epoch.
Epoch 00007: early stopping


2021-10-05 12:27:40.945346: W tensorflow/python/util/util.cc:348] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.






INFO:tensorflow:Assets written to: /tmp/tmppxjfs2y0/model/data/model/assets


INFO:tensorflow:Assets written to: /tmp/tmppxjfs2y0/model/data/model/assets


In [39]:
x_pred = regular_encode(["in what newspaper did obama printed"], tokenizer, maxlen=MAX_LEN)
res = model.predict(x_pred)

# Binarise the predicted scores in place: above 0.1 becomes 1, at or below becomes 0.
is_low = res <= 0.1
is_high = res > 0.1
res[is_low] = 0
res[is_high] = 1

mlb.inverse_transform(res)



[('natural person', 'omnivore', 'person')]

In [38]:
# Save the trained model to the shared binaries directory.
model.save('../../data/bin/resource_wikidata_base_2020_lcquad')









































































































INFO:tensorflow:Assets written to: ../../data/bin/resource_wikidata_base_2020_lcquad/assets


INFO:tensorflow:Assets written to: ../../data/bin/resource_wikidata_base_2020_lcquad/assets
