In [None]:
import os

import numpy as np
from sklearn.preprocessing import LabelEncoder
from keras.utils import np_utils
import gc
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
from kaggle_datasets import KaggleDatasets
import transformers
from transformers import TFAutoModel, AutoTokenizer
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split

from sklearn.metrics import roc_auc_score, accuracy_score, f1_score
from sklearn.metrics import confusion_matrix
from tensorflow.keras.callbacks import Callback 
from tokenizers import Tokenizer, models, pre_tokenizers, decoders, processors

## Helper Functions

In [None]:
def fast_encode(texts, tokenizer, chunk_size=256, maxlen=512):
    """
    Encode ``texts`` chunk by chunk and return the collected token ids.

    Adapted from
    https://www.kaggle.com/xhlulu/jigsaw-tpu-distilbert-with-huggingface-and-keras

    Args:
        texts: sliceable collection whose slices expose ``.tolist()``.
        tokenizer: object providing ``enable_truncation``, ``enable_padding``
            and ``encode_batch``; each returned encoding must expose ``.ids``.
        chunk_size: number of texts handed to ``encode_batch`` per call.
        maxlen: value passed as ``max_length`` to both truncation and padding.

    Returns:
        ``np.ndarray`` built from the ``ids`` of every encoding, in input order.
    """
    # Note: this configures the tokenizer object that was passed in.
    tokenizer.enable_truncation(max_length=maxlen)
    tokenizer.enable_padding(max_length=maxlen)

    collected_ids = []
    chunk_starts = range(0, len(texts), chunk_size)
    for start in tqdm(chunk_starts):
        batch = texts[start:start + chunk_size].tolist()
        for encoding in tokenizer.encode_batch(batch):
            collected_ids.append(encoding.ids)

    return np.array(collected_ids)

In [None]:
def regular_encode(texts, tokenizer, maxlen=512):
    """
    Encode ``texts`` with ``tokenizer.batch_encode_plus`` and return the ids.

    Args:
        texts: batch of texts, forwarded unchanged to the tokenizer.
        tokenizer: object providing ``batch_encode_plus``.
        maxlen: value forwarded as ``max_length``.

    Returns:
        ``np.ndarray`` built from the ``'input_ids'`` entry of the encoding.
    """
    # NOTE(review): these keyword spellings (`return_attention_masks`,
    # `pad_to_max_length`) look like an older transformers API -- confirm
    # against the installed version before renaming them.
    encode_options = dict(
        return_attention_masks=False,
        return_token_type_ids=False,
        pad_to_max_length=True,
        max_length=maxlen,
    )
    encoded = tokenizer.batch_encode_plus(texts, **encode_options)
    input_ids = encoded['input_ids']
    return np.array(input_ids)

In [None]:
def build_model(transformer, max_len=512, hidden_dim=32, num_of_classes=3):
    """
    Build and compile a single-label classifier on top of ``transformer``.

    The model feeds token ids through ``transformer``, takes the first element
    of its output, keeps the vector at sequence position 0 and maps it to
    ``num_of_classes`` class probabilities with one dense layer.

    Args:
        transformer: callable Keras-compatible model; its output is indexed
            with ``[0]`` and then sliced as ``[:, 0, :]``.
        max_len: length of the integer input sequence.
        hidden_dim: unused; kept so existing callers keep working.
        num_of_classes: number of output units.

    Returns:
        The compiled ``Model``.
    """
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    sequence_output = transformer(input_word_ids)[0]
    cls_token = sequence_output[:, 0, :]

    # The targets are one-hot vectors and the loss is categorical
    # cross-entropy, so the head must output a probability distribution over
    # the classes: softmax, not independent sigmoids.
    out = Dense(num_of_classes, activation='softmax')(cls_token)

    model = Model(inputs=input_word_ids, outputs=out)
    model.compile(
        # `learning_rate` is the supported spelling; `lr` is a deprecated alias.
        Adam(learning_rate=1e-5),
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )

    return model

## TPU Configs

In [None]:
# Detect hardware, return appropriate distribution strategy
# After this cell `tpu` is a cluster resolver (or None) and `strategy` is the
# tf.distribute strategy used by later cells for `strategy.scope()` and for
# scaling BATCH_SIZE.
try:
    # TPU detection. No parameters necessary if TPU_NAME environment variable is
    # set: this is always the case on Kaggle.
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    # A ValueError from the resolver is treated as "no TPU available".
    tpu = None

if tpu:
    # Order matters: connect to the cluster, initialise the TPU system, then
    # build the strategy from the resolver.
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
else:
    # Default distribution strategy in Tensorflow. Works on CPU and single GPU.
    strategy = tf.distribute.get_strategy()

print("REPLICAS: ", strategy.num_replicas_in_sync)

In [None]:
# Passed to .prefetch() so tf.data picks the buffer size itself.
AUTO = tf.data.experimental.AUTOTUNE

# Configuration
# EPOCHS: number of epochs passed to every model.fit call.
EPOCHS = 2
# BATCH_SIZE: 16 scaled by the number of replicas of the active strategy.
BATCH_SIZE = 16 * strategy.num_replicas_in_sync
# MAX_LEN: sequence length used for both tokenisation and the model input.
MAX_LEN = 96
# MODEL: checkpoint name used for both the tokenizer and the transformer.
MODEL = 'bert-base-multilingual-cased'

## Load the tokenizer

In [None]:
# Load the pretrained tokenizer that matches MODEL.
# The same instance is reused by every regular_encode call in this notebook.
tokenizer = AutoTokenizer.from_pretrained(MODEL)

## Load text data into memory

In [None]:
# Both files are pipe-separated and share the columns used below
# (question, category, type_1).
dbpedia_df = pd.read_csv("/kaggle/input/test-06366/dbpedia.csv", sep="|")
# fillna returns a new Series, so its result has to be assigned back;
# otherwise missing types survive and astype(str) turns them into "nan".
dbpedia_df["type_1"] = dbpedia_df["type_1"].fillna("NoneType").astype(str)

wikidata_df = pd.read_csv("/kaggle/input/test-06366/wikidata.csv", sep="|")
wikidata_df["type_1"] = wikidata_df["type_1"].fillna("NoneType").astype(str)

# DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows the
# same way and, like append, keeps the original row indices.
common_df = pd.concat([dbpedia_df, wikidata_df])
literal_df = common_df[common_df.category == "literal"]

# Resource-only subsets, one per knowledge base.
dbpedia_resource_df = dbpedia_df[dbpedia_df.category == "resource"]
wikidata_resource_df = wikidata_df[wikidata_df.category == "resource"]

# One 67/33 split per classifier, all with the same seed for reproducibility.
X_cat_train, X_cat_test, y_cat_train, y_cat_test = train_test_split(
    common_df.question.astype(str), common_df.category, test_size=0.33, random_state=42)
X_lit_train, X_lit_test, y_lit_train, y_lit_test = train_test_split(
    literal_df.question.astype(str), literal_df.type_1, test_size=0.33, random_state=42)
X_db_train, X_db_test, y_db_train, y_db_test = train_test_split(
    dbpedia_resource_df.question.astype(str), dbpedia_resource_df.type_1, test_size=0.33, random_state=42)
X_wiki_train, X_wiki_test, y_wiki_train, y_wiki_test = train_test_split(
    wikidata_resource_df.question.astype(str), wikidata_resource_df.type_1, test_size=0.33, random_state=42)

## Category classifier

In [None]:
# Fit on the full frame so train and test share one label vocabulary.
encoder_cat = LabelEncoder()
encoder_cat.fit(common_df.category)
encoded_Y_cat_train = encoder_cat.transform(y_cat_train)
encoded_Y_cat_test = encoder_cat.transform(y_cat_test)
# One-hot encode the training labels. num_classes is pinned to the encoder's
# vocabulary: without it the width is max(train label) + 1, which is too
# narrow for the model output when the highest class id is missing from the
# training split.
dummy_y_cat_train = np_utils.to_categorical(
    encoded_Y_cat_train, num_classes=len(encoder_cat.classes_)
)

In [None]:
%%time 

# Tokenise the category train/test questions with regular_encode (maxlen=MAX_LEN).
x_cat_train = regular_encode(X_cat_train.values.astype(str), tokenizer, maxlen=MAX_LEN)
x_cat_test = regular_encode(X_cat_test.values.astype(str), tokenizer, maxlen=MAX_LEN)

In [None]:
# Training pipeline: (token ids, one-hot label) pairs, repeated indefinitely,
# shuffled through a 2048-element buffer, batched and prefetched.
# Because of .repeat() this dataset never ends, so fit() needs steps_per_epoch.
train_dataset_cat = (
    tf.data.Dataset
    .from_tensor_slices((x_cat_train, dummy_y_cat_train))
    .repeat()
    .shuffle(2048)
    .batch(BATCH_SIZE)
    .prefetch(AUTO)
)

# Inference pipeline: inputs only, not shuffled, so predictions stay aligned
# with the order of x_cat_test.
test_dataset_cat = (
    tf.data.Dataset
    .from_tensor_slices(x_cat_test)
    .batch(BATCH_SIZE)
)

In [None]:
%%time
# Create the pretrained transformer and the category classifier inside the
# distribution strategy's scope.
with strategy.scope():
    transformer_layer = TFAutoModel.from_pretrained(MODEL)
    # One output unit per distinct category value.
    model_cat = build_model(transformer_layer, max_len=MAX_LEN, num_of_classes=common_df.category.nunique())

In [None]:
# Shared by every fit() call in this notebook. Those calls pass no validation
# data, so 'val_loss' is never logged and a callback monitoring it can never
# trigger or restore weights; the training loss is the metric that exists.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='loss',
    verbose=1,
    patience=1,
    mode='min',
    restore_best_weights=True)

In [None]:
# The training dataset repeats forever, so an epoch is defined here as the
# number of full batches in the training array.
n_steps = x_cat_train.shape[0] // BATCH_SIZE
# Note: no validation_data is passed to this fit call.
train_history = model_cat.fit(
    train_dataset_cat,
    steps_per_epoch=n_steps,
    epochs=EPOCHS,
    callbacks=[early_stopping]
)

In [None]:
# Predicted class id per test question: argmax over the model's class scores.
y_cat_test_pred = np.argmax(model_cat.predict(test_dataset_cat, verbose=1), axis=1)

In [None]:
# Compare predicted class ids with the label-encoded ground truth of the test split.
acc = accuracy_score(encoded_Y_cat_test, y_cat_test_pred)
print('Accuracy: {0}'.format(acc))

## Literal classifier

In [None]:
# Fit on every literal row so train and test share one label vocabulary.
encoder_lit = LabelEncoder()
encoder_lit.fit(literal_df.type_1)
encoded_Y_lit_train = encoder_lit.transform(y_lit_train)
encoded_Y_lit_test = encoder_lit.transform(y_lit_test)
# One-hot encode the training labels. num_classes is pinned to the encoder's
# vocabulary: without it the width is max(train label) + 1, which is too
# narrow for the model output when the highest class id is missing from the
# training split.
dummy_y_lit_train = np_utils.to_categorical(
    encoded_Y_lit_train, num_classes=len(encoder_lit.classes_)
)

In [None]:
%%time 

# Tokenise the literal train/test questions with regular_encode (maxlen=MAX_LEN).
x_lit_train = regular_encode(X_lit_train.values.astype(str), tokenizer, maxlen=MAX_LEN)
x_lit_test = regular_encode(X_lit_test.values.astype(str), tokenizer, maxlen=MAX_LEN)

In [None]:
# Training pipeline: (token ids, one-hot label) pairs, repeated indefinitely,
# shuffled through a 2048-element buffer, batched and prefetched.
# Because of .repeat() this dataset never ends, so fit() needs steps_per_epoch.
train_dataset_lit = (
    tf.data.Dataset
    .from_tensor_slices((x_lit_train, dummy_y_lit_train))
    .repeat()
    .shuffle(2048)
    .batch(BATCH_SIZE)
    .prefetch(AUTO)
)

# Inference pipeline: inputs only, not shuffled, so predictions stay aligned
# with the order of x_lit_test.
test_dataset_lit = (
    tf.data.Dataset
    .from_tensor_slices(x_lit_test)
    .batch(BATCH_SIZE)
)

In [None]:
%%time
# A fresh pretrained transformer is loaded for the literal classifier, inside
# the distribution strategy's scope.
with strategy.scope():
    transformer_layer = TFAutoModel.from_pretrained(MODEL)
    # One output unit per distinct literal type.
    model_lit = build_model(transformer_layer, max_len=MAX_LEN, num_of_classes=literal_df.type_1.nunique())

In [None]:
# The training dataset repeats forever, so an epoch is defined here as the
# number of full batches in the training array.
n_steps = x_lit_train.shape[0] // BATCH_SIZE
# Note: no validation_data is passed to this fit call.
train_history = model_lit.fit(
    train_dataset_lit,
    steps_per_epoch=n_steps,
    epochs=EPOCHS,
    callbacks=[early_stopping]
)

In [None]:
# Predicted class id per test question: argmax over the model's class scores.
y_lit_test_pred = np.argmax(model_lit.predict(test_dataset_lit, verbose=1), axis=1)

In [None]:
# Compare predicted class ids with the label-encoded ground truth of the test split.
acc = accuracy_score(encoded_Y_lit_test, y_lit_test_pred)
print('Accuracy: {0}'.format(acc))

## DBpedia Resource classifier

In [None]:
# Fit on every DBpedia resource row so train and test share one label vocabulary.
encoder_db = LabelEncoder()
encoder_db.fit(dbpedia_df[dbpedia_df.category == "resource"].type_1)
encoded_Y_db_train = encoder_db.transform(y_db_train)
encoded_Y_db_test = encoder_db.transform(y_db_test)
# One-hot encode the training labels. num_classes is pinned to the encoder's
# vocabulary: without it the width is max(train label) + 1, which is too
# narrow for the model output when the highest class id is missing from the
# training split.
dummy_y_db_train = np_utils.to_categorical(
    encoded_Y_db_train, num_classes=len(encoder_db.classes_)
)

In [None]:
%%time 

# Tokenise the DBpedia resource train/test questions with regular_encode (maxlen=MAX_LEN).
x_db_train = regular_encode(X_db_train.values.astype(str), tokenizer, maxlen=MAX_LEN)
x_db_test = regular_encode(X_db_test.values.astype(str), tokenizer, maxlen=MAX_LEN)

In [None]:
# Training pipeline: (token ids, one-hot label) pairs, repeated indefinitely,
# shuffled through a 2048-element buffer, batched and prefetched.
# Because of .repeat() this dataset never ends, so fit() needs steps_per_epoch.
train_dataset_db = (
    tf.data.Dataset
    .from_tensor_slices((x_db_train, dummy_y_db_train))
    .repeat()
    .shuffle(2048)
    .batch(BATCH_SIZE)
    .prefetch(AUTO)
)

# Inference pipeline: inputs only, not shuffled, so predictions stay aligned
# with the order of x_db_test.
test_dataset_db = (
    tf.data.Dataset
    .from_tensor_slices(x_db_test)
    .batch(BATCH_SIZE)
)

In [None]:
%%time
# A fresh pretrained transformer is loaded for the DBpedia resource
# classifier, inside the distribution strategy's scope.
with strategy.scope():
    transformer_layer = TFAutoModel.from_pretrained(MODEL)
    # One output unit per distinct DBpedia resource type.
    model_db = build_model(transformer_layer, max_len=MAX_LEN, num_of_classes=dbpedia_df[dbpedia_df.category == "resource"].type_1.nunique())

In [None]:
# The training dataset repeats forever, so an epoch is defined here as the
# number of full batches in the training array.
n_steps = x_db_train.shape[0] // BATCH_SIZE
# Note: no validation_data is passed to this fit call.
train_history = model_db.fit(
    train_dataset_db,
    steps_per_epoch=n_steps,
    epochs=EPOCHS,
    callbacks=[early_stopping]
)

In [None]:
# Predicted class id per test question: argmax over the model's class scores.
y_db_test_pred = np.argmax(model_db.predict(test_dataset_db, verbose=1), axis=1)

In [None]:
# Weighted F1 of the predicted class ids against the label-encoded test labels.
f1 = f1_score(encoded_Y_db_test, y_db_test_pred, average='weighted')
# The value is an F1 score, so it is labelled as such rather than "Accuracy".
print('Weighted F1: {0}'.format(f1))

## Wikidata Resource classifier

In [None]:
# Fit on every Wikidata resource row so train and test share one label vocabulary.
encoder_wiki = LabelEncoder()
encoder_wiki.fit(wikidata_df[wikidata_df.category == "resource"].type_1)
encoded_Y_wiki_train = encoder_wiki.transform(y_wiki_train)
encoded_Y_wiki_test = encoder_wiki.transform(y_wiki_test)
# One-hot encode the training labels. num_classes is pinned to the encoder's
# vocabulary: without it the width is max(train label) + 1, which is too
# narrow for the model output when the highest class id is missing from the
# training split.
dummy_y_wiki_train = np_utils.to_categorical(
    encoded_Y_wiki_train, num_classes=len(encoder_wiki.classes_)
)

In [None]:
%%time 

# Tokenise the Wikidata resource train/test questions with regular_encode (maxlen=MAX_LEN).
x_wiki_train = regular_encode(X_wiki_train.values.astype(str), tokenizer, maxlen=MAX_LEN)
x_wiki_test = regular_encode(X_wiki_test.values.astype(str), tokenizer, maxlen=MAX_LEN)

In [None]:
# Training pipeline: (token ids, one-hot label) pairs, repeated indefinitely,
# shuffled through a 2048-element buffer, batched and prefetched.
# Because of .repeat() this dataset never ends, so fit() needs steps_per_epoch.
train_dataset_wiki = (
    tf.data.Dataset
    .from_tensor_slices((x_wiki_train, dummy_y_wiki_train))
    .repeat()
    .shuffle(2048)
    .batch(BATCH_SIZE)
    .prefetch(AUTO)
)

# Inference pipeline: inputs only, not shuffled, so predictions stay aligned
# with the order of x_wiki_test.
test_dataset_wiki = (
    tf.data.Dataset
    .from_tensor_slices(x_wiki_test)
    .batch(BATCH_SIZE)
)

In [None]:
%%time
# A fresh pretrained transformer is loaded for the Wikidata resource
# classifier, inside the distribution strategy's scope.
with strategy.scope():
    transformer_layer = TFAutoModel.from_pretrained(MODEL)
    # One output unit per distinct Wikidata resource type.
    model_wiki = build_model(transformer_layer, max_len=MAX_LEN, num_of_classes=wikidata_df[wikidata_df.category == "resource"].type_1.nunique())

In [None]:
# The training dataset repeats forever, so an epoch is defined here as the
# number of full batches in the training array.
n_steps = x_wiki_train.shape[0] // BATCH_SIZE
# Note: no validation_data is passed to this fit call.
train_history = model_wiki.fit(
    train_dataset_wiki,
    steps_per_epoch=n_steps,
    epochs=EPOCHS,
    callbacks=[early_stopping]
)

In [None]:
# Predicted class id per test question: argmax over the model's class scores.
y_wiki_test_pred = np.argmax(model_wiki.predict(test_dataset_wiki, verbose=1), axis=1)

In [None]:
# Weighted F1 of the predicted class ids against the label-encoded test labels.
f1 = f1_score(encoded_Y_wiki_test, y_wiki_test_pred, average='weighted')
# The value is an F1 score, so it is labelled as such rather than "Accuracy".
print('Weighted F1: {0}'.format(f1))

In [None]:
def get_top_n_args(M, n=10):
    """
    Return, for every row of ``M``, the column indices of its largest values.

    Args:
        M: 2-D array; each row is ranked independently along axis 1.
        n: number of indices to keep per row (fewer if ``M`` has fewer columns).

    Returns:
        Integer array of column indices, ordered from highest to lowest value.
    """
    ascending_order = M.argsort(axis=1)
    # Reverse each row so the index of the largest value comes first.
    descending_order = ascending_order[:, ::-1]
    return descending_order[:, :n]

## Submission preparation (for DBpedia)

```python
output_example = [
    {
        "id" : "",
        "category" : "",
        "type" : []
    },
]
```

In [None]:
# Stand-in "test" set: the first 150 rows of the DBpedia frame.
# These rows come from the same frame the training splits were drawn from,
# so this is not a held-out set.
dbpedia_faketest_df = dbpedia_df[:150]

# Tokenise the questions the same way as the training data.
x_fake_test = regular_encode(dbpedia_faketest_df.question.values.astype(str), tokenizer, maxlen=MAX_LEN)

# Inputs only and unshuffled, so predictions line up with dbpedia_faketest_df rows.
faketest_dataset_db = (
    tf.data.Dataset
    .from_tensor_slices(x_fake_test)
    .batch(BATCH_SIZE)
)

In [None]:
# Predict categories for the fake test set.
# Stored under its own name so the held-out predictions in y_cat_test_pred
# (used by the category accuracy cell) are not overwritten.
y_cat_fake_pred = np.argmax(model_cat.predict(faketest_dataset_db, verbose=1), axis=1)
# Map class ids back to the original category strings.
y_pred_cat_labels = encoder_cat.inverse_transform(y_cat_fake_pred)

In [None]:
# Predict literal types for the fake test set (not all of them will be used).
# Stored under its own name so the held-out predictions in y_lit_test_pred
# (used by the literal accuracy cell) are not overwritten.
y_lit_fake_pred = np.argmax(model_lit.predict(faketest_dataset_db, verbose=1), axis=1)
# Map class ids back to the original literal type strings.
y_pred_lit_labels = encoder_lit.inverse_transform(y_lit_fake_pred)

In [None]:
# Predict resource types for the fake test set (not all of them will be used).
# Keeps up to 10 class ids per question (the default n of get_top_n_args),
# ordered from highest to lowest score.
y_res_test_pred = get_top_n_args(model_db.predict(faketest_dataset_db, verbose=1))

In [None]:
# Assemble one submission record per fake-test question:
#   boolean  -> the type is the category itself
#   literal  -> the single predicted literal type
#   resource -> the ranked list of predicted DBpedia types
preds_array = []

for row_idx, category in enumerate(y_pred_cat_labels):
    question_id = dbpedia_faketest_df.iloc[row_idx]["id"]

    if category == "boolean":
        predicted_types = [category]
    elif category == "literal":
        predicted_types = [y_pred_lit_labels[row_idx]]
    elif category == "resource":
        # Map the ranked class ids back to their type strings.
        predicted_types = list(encoder_db.inverse_transform(y_res_test_pred[row_idx]))
    else:
        # Any other category is unexpected here.
        raise AssertionError()

    preds_array.append({"id": question_id, "category": category, "type": predicted_types})