In [None]:
import os, sys

import numpy as np
from sklearn.preprocessing import LabelEncoder
from keras.utils import np_utils
import gc
import pickle
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
from kaggle_datasets import KaggleDatasets
import transformers
from transformers import TFAutoModel, AutoTokenizer
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split

from sklearn.metrics import roc_auc_score, accuracy_score, f1_score
from sklearn.metrics import confusion_matrix
from tensorflow.keras.callbacks import Callback 
from tokenizers import Tokenizer, models, pre_tokenizers, decoders, processors

## Helper Functions

In [None]:
def fast_encode(texts, tokenizer, chunk_size=256, maxlen=512):
    """
    Encode `texts` in chunks and return the token ids of every encoding.

    Adapted from
    https://www.kaggle.com/xhlulu/jigsaw-tpu-distilbert-with-huggingface-and-keras

    texts      -- sliceable sequence whose slices provide `.tolist()`
    tokenizer  -- object exposing `enable_truncation`, `enable_padding` and
                  `encode_batch`; truncation and padding are both enabled on it
                  with `max_length=maxlen` (the tokenizer is modified in place)
    chunk_size -- number of texts handed to each `encode_batch` call
    maxlen     -- value passed as `max_length` to truncation and padding

    Returns `np.array` built from the `.ids` of all encodings, in input order.
    """
    tokenizer.enable_truncation(max_length=maxlen)
    tokenizer.enable_padding(max_length=maxlen)

    token_ids = []
    for start in tqdm(range(0, len(texts), chunk_size)):
        batch = texts[start:start + chunk_size].tolist()
        token_ids += [encoding.ids for encoding in tokenizer.encode_batch(batch)]

    return np.array(token_ids)

In [None]:
def regular_encode(texts, tokenizer, maxlen=512):
    """
    Encode `texts` with `tokenizer.batch_encode_plus` and return the input ids.

    texts     -- batch of texts forwarded unchanged to `batch_encode_plus`
    tokenizer -- object exposing `batch_encode_plus`
    maxlen    -- forwarded as `max_length`; padding to that length is requested

    Attention masks and token type ids are explicitly not requested.
    Returns `np.array` of the `'input_ids'` entry of the encoder output.
    """
    encode_options = dict(
        return_attention_masks=False,
        return_token_type_ids=False,
        pad_to_max_length=True,
        max_length=maxlen,
    )
    encoded = tokenizer.batch_encode_plus(texts, **encode_options)
    input_ids = encoded['input_ids']
    return np.array(input_ids)

In [None]:
def build_model(transformer, max_len=512, hidden_dim=32, num_of_classes=3):
    """
    Build and compile a single-label classifier on top of a transformer encoder.

    The first element of the transformer's output is taken as the sequence
    output, and the vector at sequence position 0 is fed to one Dense layer
    with `num_of_classes` units.

    Args:
        transformer: callable model applied to the int32 token-id input; its
            output must be indexable, with element 0 a rank-3 tensor.
        max_len: length of the token-id input (the Input shape is `(max_len,)`).
        hidden_dim: unused; kept only so existing callers keep working.
        num_of_classes: number of output units / classes.

    Returns:
        A compiled `Model` mapping token ids to class probabilities.
    """
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    sequence_output = transformer(input_word_ids)[0]
    cls_token = sequence_output[:, 0, :]

    # Softmax rather than sigmoid: the model is trained with one-hot targets and
    # categorical cross-entropy, which expects the outputs to form a probability
    # distribution over mutually exclusive classes.
    out = Dense(num_of_classes, activation='softmax')(cls_token)

    model = Model(inputs=input_word_ids, outputs=out)
    model.compile(
        # `learning_rate` is the supported keyword; `lr` is a deprecated alias.
        Adam(learning_rate=1e-5),
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )

    return model

## TPU Configs

In [None]:
# Detect hardware, return appropriate distribution strategy
try:
    # TPU detection. No parameters necessary if TPU_NAME environment variable is
    # set: this is always the case on Kaggle.
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    # A ValueError raised above is treated as "no TPU available".
    tpu = None

if tpu:
    # Connect to the TPU cluster and initialise it before creating the strategy.
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
else:
    # Default distribution strategy in Tensorflow. Works on CPU and single GPU.
    strategy = tf.distribute.get_strategy()

# The replica count is reused by the configuration cell to scale BATCH_SIZE.
print("REPLICAS: ", strategy.num_replicas_in_sync)

In [None]:
# Passed to Dataset.prefetch() so tf.data chooses the prefetch buffer size.
AUTO = tf.data.experimental.AUTOTUNE

# Configuration
EPOCHS = 2  # passed as `epochs` to model.fit
BATCH_SIZE = 16 * strategy.num_replicas_in_sync  # 16 times the number of replicas in sync
MAX_LEN = 96  # passed as maxlen / max_len to regular_encode and build_model
MODEL = 'bert-base-multilingual-cased'  # name given to AutoTokenizer / TFAutoModel.from_pretrained

## Load tokenizer

In [None]:
# Load the pretrained tokenizer that matches MODEL; it is the tokenizer passed
# to regular_encode() in the cells below.
tokenizer = AutoTokenizer.from_pretrained(MODEL)

## Load text data into memory

In [None]:
dbpedia_df = pd.read_csv("/kaggle/input/test-06366/dbpedia.csv", sep="|")

# Cast the five type columns to str. Missing values therefore become the literal
# string 'nan', which is exactly the placeholder the submission cell filters out.
# The earlier `fillna(value="NoneType")` call discarded its result (fillna is not
# in-place by default), so it never changed the data; it is dropped here instead
# of being "activated" so that the labels stay what downstream code expects.
for i in range(5):
    type_col = "type_{0}".format(i+1)
    dbpedia_df[type_col] = dbpedia_df[type_col].astype(str)


# NOTE(review): the Wikidata section further down reads `wikidata_df`,
# `X_wiki_*` and `y_wiki_*`, which are only assigned in the commented-out
# lines below. Restore them (or remove that section) before a full re-run.
#wikidata_df = pd.read_csv("/kaggle/input/test-06366/wikidata.csv", sep="|")
#wikidata_df.type_1.fillna(value="NoneType")
#wikidata_df.type_1 = wikidata_df.type_1.astype(str)

#common_df = dbpedia_df.append(wikidata_df)
#literal_df = common_df[common_df.category == "literal"]

#X_cat_train, X_cat_test, y_cat_train, y_cat_test = train_test_split(common_df.question.astype(str), common_df.category, test_size=0.33, random_state=42)
#X_lit_train, X_lit_test, y_lit_train, y_lit_test = train_test_split(literal_df.question.astype(str), literal_df.type_1, test_size=0.33, random_state=42)

# Select the "resource" rows once instead of re-filtering for every column.
dbpedia_resource_df = dbpedia_df[dbpedia_df.category == "resource"]

# One shared 67/33 split of the questions and all five type columns, so the
# rows stay aligned across every returned pair.
(X_db_train, X_db_test,
 y1_db_train, y1_db_test,
 y2_db_train, y2_db_test,
 y3_db_train, y3_db_test,
 y4_db_train, y4_db_test,
 y5_db_train, y5_db_test) = train_test_split(
    dbpedia_resource_df.question.astype(str),
    dbpedia_resource_df.type_1,
    dbpedia_resource_df.type_2,
    dbpedia_resource_df.type_3,
    dbpedia_resource_df.type_4,
    dbpedia_resource_df.type_5,
    test_size=0.33, random_state=42,
)

#X_wiki_train, X_wiki_test, y_wiki_train, y_wiki_test = train_test_split(wikidata_df[wikidata_df.category == "resource"].question.astype(str), wikidata_df[wikidata_df.category == "resource"].type_1, test_size=0.33, random_state=42)

In [None]:
# Preview the first rows of the loaded DBpedia frame (type columns already cast to str).
dbpedia_df.head()

## Wikidata Resource classifier

In [None]:
# NOTE(review): `wikidata_df`, `y_wiki_train` and `y_wiki_test` are only assigned
# in commented-out lines of the data-loading cell; this cell cannot run on a
# fresh kernel until they are restored.
wiki_resource_types = wikidata_df[wikidata_df.category == "resource"].type_1

# Fit on the labels of ALL resource rows so train and test share one id space.
encoder_wiki = LabelEncoder()
encoder_wiki.fit(wiki_resource_types)
encoded_Y_wiki_train = encoder_wiki.transform(y_wiki_train)
encoded_Y_wiki_test = encoder_wiki.transform(y_wiki_test)

# One-hot encode the training labels. The width is pinned to the full label set:
# without `num_classes`, to_categorical sizes the matrix from the largest id
# present in the training split, which is too narrow for the model output
# whenever the highest-numbered classes occur only in the test split.
dummy_y_wiki_train = np_utils.to_categorical(
    encoded_Y_wiki_train,
    num_classes=len(encoder_wiki.classes_),
)

In [None]:
%%time 

# Tokenise the train / test questions with regular_encode (padding requested, max_length=MAX_LEN).
# NOTE(review): X_wiki_train / X_wiki_test are only assigned in a commented-out
# line of the data-loading cell, so this cell fails on a fresh kernel.
x_wiki_train = regular_encode(X_wiki_train.values.astype(str), tokenizer, maxlen=MAX_LEN)
x_wiki_test = regular_encode(X_wiki_test.values.astype(str), tokenizer, maxlen=MAX_LEN)

In [None]:
# Training pipeline: (token ids, one-hot labels) pairs, repeated indefinitely,
# shuffled through a 2048-element buffer, batched and prefetched.
train_dataset_wiki = tf.data.Dataset.from_tensor_slices((x_wiki_train, dummy_y_wiki_train))
train_dataset_wiki = train_dataset_wiki.repeat().shuffle(2048)
train_dataset_wiki = train_dataset_wiki.batch(BATCH_SIZE).prefetch(AUTO)

# Test pipeline: token ids only, batched in their original order.
test_dataset_wiki = tf.data.Dataset.from_tensor_slices(x_wiki_test).batch(BATCH_SIZE)

In [None]:
%%time
# Output width of the classifier = number of distinct type_1 labels among the
# Wikidata "resource" rows.
wiki_resource_rows = wikidata_df[wikidata_df.category == "resource"]
n_wiki_classes = wiki_resource_rows.type_1.nunique()

# Create the transformer and the classifier inside the distribution strategy scope.
with strategy.scope():
    transformer_layer = TFAutoModel.from_pretrained(MODEL)
    model_wiki = build_model(
        transformer_layer,
        max_len=MAX_LEN,
        num_of_classes=n_wiki_classes,
    )

In [None]:
# `early_stopping` was passed to fit() but is not defined anywhere in the visible
# notebook, so this cell raised NameError on a fresh kernel. It is created here.
# fit() receives no validation data, so the training loss is the only quantity
# available to monitor.
# NOTE(review): monitor/patience are a conservative guess — adjust if the
# original callback used different settings.
early_stopping = tf.keras.callbacks.EarlyStopping(monitor="loss", patience=1)

# The training dataset repeats indefinitely, so the epoch length is set
# explicitly to one pass over the training rows.
n_steps = x_wiki_train.shape[0] // BATCH_SIZE
train_history = model_wiki.fit(
    train_dataset_wiki,
    steps_per_epoch=n_steps,
    epochs=EPOCHS,
    callbacks=[early_stopping]
)

In [None]:
# Predicted class id per test row: index of the largest model output along axis 1.
y_wiki_test_pred = np.argmax(model_wiki.predict(test_dataset_wiki, verbose=1), axis=1)

In [None]:
# Weighted F1 of the predicted class ids against the encoded test labels.
f1 = f1_score(encoded_Y_wiki_test, y_wiki_test_pred, average='weighted')
# The value is an F1 score, so label it as such (it was previously printed as "Accuracy").
print('Weighted F1: {0}'.format(f1))

In [None]:
def get_top_n_args(M, n=10):
    """
    Return, for every row of `M`, the column indices of its `n` largest values.

    M -- 2-D array-like object providing `.argsort(axis=1)` (e.g. a NumPy array)
    n -- number of indices kept per row (default 10)

    The indices in each row are ordered from the largest value to the smallest.
    """
    ascending = M.argsort(axis=1)
    descending = ascending[:, ::-1]
    return descending[:, :n]

## Submission preparation (for DBpedia)

```python
output_example = [
    {
        "id" : "",
        "category" : "",
        "type" : []
    },
]
```

In [None]:
# Use the first 150 DBpedia rows as a stand-in test set.
dbpedia_faketest_df = dbpedia_df.iloc[:150]

# Tokenise their questions the same way as the training data.
fake_test_questions = dbpedia_faketest_df.question.values.astype(str)
x_fake_test = regular_encode(fake_test_questions, tokenizer, maxlen=MAX_LEN)

# Batched, unshuffled dataset, so predictions stay aligned with the rows above.
faketest_dataset_db = tf.data.Dataset.from_tensor_slices(x_fake_test).batch(BATCH_SIZE)

In [None]:
# Load the saved category classifier and the label classes it was trained with.
model_cat = tf.keras.models.load_model("/kaggle/input/iswc-cat-and-lit-classifiers/model_cat")
encoder_cat = LabelEncoder()
# NOTE: allow_pickle=True unpickles arbitrary objects; only load files you trust.
encoder_cat.classes_ = np.load('/kaggle/input/iswc-cat-and-lit-classifiers/encoder_cat.npy', allow_pickle=True)

#predicting categories
y_cat_test_pred = np.argmax(model_cat.predict(faketest_dataset_db, verbose=1), axis=1)
y_pred_cat_labels = encoder_cat.inverse_transform(y_cat_test_pred)

# Release the model before the next one is loaded. The former `%reset model_cat`
# line is gone: `%reset` only accepts in/out/dhist/array as targets, so it asked
# for confirmation and then freed nothing. `del` + gc.collect() does the work.
del model_cat
gc.collect()

In [None]:
# Load the saved literal-type classifier and the label classes it was trained with.
model_lit = tf.keras.models.load_model("/kaggle/input/iswc-cat-and-lit-classifiers/model_lit")
encoder_lit = LabelEncoder()
# NOTE: allow_pickle=True unpickles arbitrary objects; only load files you trust.
encoder_lit.classes_ = np.load('/kaggle/input/iswc-cat-and-lit-classifiers/encoder_lit.npy', allow_pickle=True)

#predicting literals (not all of them will be used)
y_lit_test_pred = np.argmax(model_lit.predict(faketest_dataset_db, verbose=1), axis=1)
y_pred_lit_labels = encoder_lit.inverse_transform(y_lit_test_pred)

# Debug output: how many references still point at the model.
print(sys.getrefcount(model_lit))

# Release the model before the next one is loaded. The former `%reset model_lit`
# line is gone: `%reset` only accepts in/out/dhist/array as targets, so it asked
# for confirmation and then freed nothing. `del` + gc.collect() does the work.
del model_lit
gc.collect()

In [None]:
# Reset Keras' global state after the category / literal models have been
# deleted, before the per-type resource models are loaded.
tf.keras.backend.clear_session()

In [None]:
# Type-1 resource classifier: load the saved model and its label classes,
# predict a class id for every fake-test row, then release the model.
model_1 = tf.keras.models.load_model("/kaggle/input/../input/iswc-dbpedia-res-classifier-type-1-2/model_1")
encoder_1 = LabelEncoder()
# NOTE: allow_pickle=True unpickles arbitrary objects; only load files you trust.
encoder_1.classes_ = np.load('/kaggle/input/../input/iswc-dbpedia-res-classifier-type-1-2/encoder_type1.npy', allow_pickle=True)

y_type_1_test_pred = np.argmax(model_1.predict(faketest_dataset_db, verbose=1), axis=1)

# NOTE(review): leftover memory debugging — prints the model's reference count
# before and after clearing the Keras session.
print(sys.getrefcount(model_1))
tf.keras.backend.clear_session()
print(sys.getrefcount(model_1))

# encoder_1 and y_type_1_test_pred are kept for the submission cell.
del model_1
gc.collect()

In [None]:
# Type-2 resource classifier: load the saved model and its label classes,
# predict a class id for every fake-test row, then release the model.
model_2 = tf.keras.models.load_model("/kaggle/input/../input/iswc-dbpedia-res-classifier-type-1-2/model_2")
encoder_2 = LabelEncoder()
# NOTE: allow_pickle=True unpickles arbitrary objects; only load files you trust.
encoder_2.classes_ = np.load('/kaggle/input/../input/iswc-dbpedia-res-classifier-type-1-2/encoder_type2.npy', allow_pickle=True)

y_type_2_test_pred = np.argmax(model_2.predict(faketest_dataset_db, verbose=1), axis=1)

# NOTE(review): leftover memory debugging — prints the model's reference count
# before and after clearing the Keras session.
print(sys.getrefcount(model_2))
tf.keras.backend.clear_session()
print(sys.getrefcount(model_2))

# encoder_2 and y_type_2_test_pred are kept for the submission cell.
del model_2
gc.collect()

In [None]:
# Type-3 resource classifier: load the saved model and its label classes,
# predict a class id for every fake-test row, then release the model.
model_3 = tf.keras.models.load_model("/kaggle/input/../input/iswc-dbpedia-res-classifier-type-3-4/model_3")
encoder_3 = LabelEncoder()
# NOTE: allow_pickle=True unpickles arbitrary objects; only load files you trust.
encoder_3.classes_ = np.load('/kaggle/input/../input/iswc-dbpedia-res-classifier-type-3-4/encoder_type3.npy', allow_pickle=True)

y_type_3_test_pred = np.argmax(model_3.predict(faketest_dataset_db, verbose=1), axis=1)

# NOTE(review): leftover memory debugging — prints the model's reference count
# before and after clearing the Keras session.
print(sys.getrefcount(model_3))
tf.keras.backend.clear_session()
print(sys.getrefcount(model_3))

# encoder_3 and y_type_3_test_pred are kept for the submission cell.
del model_3
gc.collect()

In [None]:
# Type-4 resource classifier: load the saved model and its label classes,
# predict a class id for every fake-test row, then release the model.
model_4 = tf.keras.models.load_model("/kaggle/input/../input/iswc-dbpedia-res-classifier-type-3-4/model_4")
encoder_4 = LabelEncoder()
# NOTE: allow_pickle=True unpickles arbitrary objects; only load files you trust.
encoder_4.classes_ = np.load('/kaggle/input/../input/iswc-dbpedia-res-classifier-type-3-4/encoder_type4.npy', allow_pickle=True)

y_type_4_test_pred = np.argmax(model_4.predict(faketest_dataset_db, verbose=1), axis=1)

# NOTE(review): leftover memory debugging — prints the model's reference count
# before and after clearing the Keras session.
print(sys.getrefcount(model_4))
tf.keras.backend.clear_session()
print(sys.getrefcount(model_4))

# encoder_4 and y_type_4_test_pred are kept for the submission cell.
del model_4
gc.collect()

In [None]:
# Type-5 resource classifier: load the saved model and its label classes,
# predict a class id for every fake-test row, then release the model.
# NOTE(review): unlike the other type cells, these paths are relative
# ("../input/..."), so they depend on the current working directory.
model_5 = tf.keras.models.load_model("../input/iswc-dbpedia-res-classifier-type-5-6/model_5")
encoder_5 = LabelEncoder()
# NOTE: allow_pickle=True unpickles arbitrary objects; only load files you trust.
encoder_5.classes_ = np.load('../input/iswc-dbpedia-res-classifier-type-5-6/encoder_type5.npy', allow_pickle=True)

y_type_5_test_pred = np.argmax(model_5.predict(faketest_dataset_db, verbose=1), axis=1)

# NOTE(review): leftover memory debugging — prints the model's reference count
# before and after clearing the Keras session.
print(sys.getrefcount(model_5))
tf.keras.backend.clear_session()
print(sys.getrefcount(model_5))

# encoder_5 and y_type_5_test_pred are kept for the submission cell.
del model_5
gc.collect()

In [None]:
# Build one submission record per fake-test row:
#   {"id": ..., "category": ..., "type": [...]}
#
# Decode every type head's predicted class ids back to label strings once,
# instead of calling inverse_transform row by row inside the loop.
type_label_columns = [
    encoder.inverse_transform(class_ids)
    for encoder, class_ids in (
        (encoder_1, y_type_1_test_pred),
        (encoder_2, y_type_2_test_pred),
        (encoder_3, y_type_3_test_pred),
        (encoder_4, y_type_4_test_pred),
        (encoder_5, y_type_5_test_pred),
    )
]

# .tolist() yields plain Python values, so numeric ids stay JSON-serialisable
# when the records are dumped later.
question_ids = dbpedia_faketest_df["id"].tolist()

preds_array = []
for i, category in enumerate(y_pred_cat_labels):
    if category == "boolean":
        # Boolean questions use the category itself as their only type.
        answer_types = [category]
    elif category == "literal":
        answer_types = [y_pred_lit_labels[i]]
    elif category == "resource":
        # Keep the predicted types in head order, dropping the 'nan' placeholder
        # that stands for "no type at this level".
        answer_types = [labels[i] for labels in type_label_columns if labels[i] != 'nan']
    else:
        raise AssertionError("Unexpected category label: {0!r}".format(category))

    preds_array.append({"id": question_ids[i], "category": category, "type": answer_types})

In [None]:
# NOTE(review): this import belongs in the imports cell at the top of the notebook.
import json

# Write the prediction records to a JSON file in the current working directory.
with open('faketest_dbpedia_pred.json', 'w') as json_file:
    json.dump(preds_array, json_file)