In [1]:
import os, sys
import json

import numpy as np
from sklearn.preprocessing import LabelEncoder
from keras.utils import np_utils
import gc
import pickle
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
from kaggle_datasets import KaggleDatasets
import transformers
from transformers import TFAutoModel, AutoTokenizer
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split

from sklearn.metrics import roc_auc_score, accuracy_score, f1_score
from sklearn.metrics import confusion_matrix
from tensorflow.keras.callbacks import Callback 
from tokenizers import Tokenizer, models, pre_tokenizers, decoders, processors

Using TensorFlow backend.


## Helper Functions

In [2]:
def regular_encode(texts, tokenizer, maxlen=512):
    """Tokenize a batch of texts and return the token ids as one array.

    Args:
        texts: Batch of strings, passed positionally to
            ``tokenizer.batch_encode_plus``.
        tokenizer: Object exposing ``batch_encode_plus``.
        maxlen: Forwarded to the tokenizer as ``max_length``.

    Returns:
        ``np.ndarray`` built from the ``'input_ids'`` entry of the encoding.
    """
    # Only the token ids are used, so masks and token type ids are switched off.
    encode_options = dict(
        return_attention_masks=False,
        return_token_type_ids=False,
        pad_to_max_length=True,
        max_length=maxlen,
    )
    encoding = tokenizer.batch_encode_plus(texts, **encode_options)
    token_ids = encoding['input_ids']
    return np.array(token_ids)

## TPU Configs

In [3]:
# Choose the tf.distribute strategy that matches the available hardware.
try:
    # Called without arguments the resolver relies on the TPU_NAME environment
    # variable, which is always set on Kaggle.
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    # A ValueError here is treated as "no TPU available".
    tpu = None

if not tpu:
    # TensorFlow's default strategy; works on CPU and on a single GPU.
    strategy = tf.distribute.get_strategy()
else:
    # Connect to the TPU cluster and initialise it before creating the strategy.
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)

print("REPLICAS: ", strategy.num_replicas_in_sync)

REPLICAS:  1


In [4]:
# Configuration
MODEL = 'bert-base-multilingual-cased'  # checkpoint name given to AutoTokenizer.from_pretrained
MAX_LEN = 96  # TODO: check this against the 95th percentile of question lengths
EPOCHS = 2
BATCH_SIZE = 16 * strategy.num_replicas_in_sync  # 16 examples per replica

# Lets tf.data choose the prefetch buffer size.
AUTO = tf.data.experimental.AUTOTUNE

## Load tokenizer

In [5]:
# Load the pretrained tokenizer that belongs to the MODEL checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=625.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=995526.0, style=ProgressStyle(descripti…




## Wikidata resource classifier (commented out)

In [6]:
#encoder_wiki = LabelEncoder()
#encoder_wiki.fit(wikidata_df[wikidata_df.category == "resource"].type_1)
#encoded_Y_wiki_train = encoder_wiki.transform(y_wiki_train)
#encoded_Y_wiki_test = encoder_wiki.transform(y_wiki_test)
#dummy_y_wiki_train = np_utils.to_categorical(encoded_Y_wiki_train) # convert integers to dummy variables (i.e. one hot encoded)

In [7]:
#%%time 

#x_wiki_train = regular_encode(X_wiki_train.values.astype(str), tokenizer, maxlen=MAX_LEN)
#x_wiki_test = regular_encode(X_wiki_test.values.astype(str), tokenizer, maxlen=MAX_LEN)

In [8]:
#train_dataset_wiki = (
#    tf.data.Dataset
#    .from_tensor_slices((x_wiki_train, dummy_y_wiki_train))
#    .repeat()
#    .shuffle(2048)
#    .batch(BATCH_SIZE)
#    .prefetch(AUTO)
#)

#test_dataset_wiki = (
#    tf.data.Dataset
#    .from_tensor_slices(x_wiki_test)
#    .batch(BATCH_SIZE)
#)

In [9]:
#%%time
#with strategy.scope():
#    transformer_layer = TFAutoModel.from_pretrained(MODEL)
#    model_wiki = build_model(transformer_layer, max_len=MAX_LEN, num_of_classes=wikidata_df[wikidata_df.category == "resource"].type_1.nunique())

In [10]:
#n_steps = x_wiki_train.shape[0] // BATCH_SIZE
#train_history = model_wiki.fit(
#    train_dataset_wiki,
#    steps_per_epoch=n_steps,
#    epochs=EPOCHS,
#    callbacks=[early_stopping]
#)

In [11]:
#y_wiki_test_pred = np.argmax(model_wiki.predict(test_dataset_wiki, verbose=1), axis=1)

In [12]:
#f1 = f1_score(encoded_Y_wiki_test, y_wiki_test_pred, average='weighted')
#print('Accuracy: {0}'.format(f1))

In [13]:
#def get_top_n_args(M, n=10):
#    return M.argsort(axis=1)[:,::-1][:,:n]

## Submission preparation (for DBpedia)

```python
output_example = [
    {
        "id" : "",
        "category" : "",
        "type" : []
    },
]
```

In [14]:
# Each classifier is stored as a (saved model, label-encoder classes) pair;
# both paths are relative to /kaggle/input/. The order fixes the index of
# every model in the prediction lists.
_classifier_artifacts = (
    ("iswc-cat-and-lit-classifiers/model_cat",
     "iswc-cat-and-lit-classifiers/encoder_cat.npy"),
    ("iswc-cat-and-lit-classifiers/model_lit",
     "iswc-cat-and-lit-classifiers/encoder_lit.npy"),
    ("iswc-dbpedia-res-classifier-type-1-2/model_1",
     "iswc-dbpedia-res-classifier-type-1-2/encoder_type1.npy"),
    ("iswc-dbpedia-res-classifier-type-1-2/model_2",
     "iswc-dbpedia-res-classifier-type-1-2/encoder_type2.npy"),
    ("iswc-dbpedia-res-classifier-type-3-4/model_3",
     "iswc-dbpedia-res-classifier-type-3-4/encoder_type3.npy"),
    ("iswc-dbpedia-res-classifier-type-3-4/model_4",
     "iswc-dbpedia-res-classifier-type-3-4/encoder_type4.npy"),
    ("iswc-dbpedia-res-classifier-type-5-6/model_5",
     "iswc-dbpedia-res-classifier-type-5-6/encoder_type5.npy"),
)

model_paths_list = [model_path for model_path, _ in _classifier_artifacts]
encoder_paths_list = [encoder_path for _, encoder_path in _classifier_artifacts]

# Filled by the prediction loop: one entry per model, in the order above.
model_predictions_list = []
model_predictions_labels_list = []

In [15]:
def collect_garbage():
    """Run ``gc.collect()`` five times and print the sum of its return values."""
    removed_total = sum(gc.collect() for _ in range(5))
    print("Removed: ", removed_total)

In [16]:
# Low-confidence predictions, keyed by the index of the model that made them.
low_confident_models = {}

In [17]:
# Run every saved classifier over the same sampled questions and keep, per
# model, the predicted class indices and the decoded labels. For the resource
# type models, also record the questions the model was not confident about.

LOW_CONFIDENCE_THRESHOLD = 0.5  # a top probability below this counts as not confident
FIRST_TYPE_MODEL_IDX = 2  # entries 0 and 1 of model_paths_list are the category and literal models
NON_RESOURCE_CATEGORIES = ['literal', 'boolean']

# Reset the result containers so that re-running this cell cannot append a
# second set of predictions behind the first one.
model_predictions_list.clear()
model_predictions_labels_list.clear()
low_confident_models.clear()

# The sampled questions and their token ids are identical for every model, so
# they are read and encoded once instead of once per iteration.
dbpedia_df = pd.read_csv("/kaggle/input/test-06366/dbpedia.csv", sep="|")
dbpedia_faketest_df = dbpedia_df.sample(4400, random_state=42)
x_fake_test = regular_encode(dbpedia_faketest_df.question.values.astype(str), tokenizer, maxlen=MAX_LEN)

for i in range(len(model_paths_list)):
    try:
        # Only defined when an earlier run of this cell stopped part-way.
        del model
    except NameError:
        pass

    collect_garbage()

    # Built inside the loop because the Keras session is cleared after every model.
    faketest_dataset_db = (
        tf.data.Dataset
        .from_tensor_slices(x_fake_test)
        .batch(BATCH_SIZE)
    )

    model = tf.keras.models.load_model("/kaggle/input/{0}".format(model_paths_list[i]))
    encoder = LabelEncoder()
    # NOTE(review): allow_pickle=True unpickles objects stored in the file;
    # only acceptable because these encoder files are trusted inputs.
    encoder.classes_ = np.load('/kaggle/input/{0}'.format(encoder_paths_list[i]), allow_pickle=True)

    # Class probabilities -> top probability, class index and label per question.
    y_pred_proba = model.predict(faketest_dataset_db, verbose=1)
    y_pred_max = np.max(y_pred_proba, axis=1)
    y_pred = np.argmax(y_pred_proba, axis=1)
    y_pred_labels = encoder.inverse_transform(y_pred)

    model_predictions_list.append(y_pred)
    model_predictions_labels_list.append(y_pred_labels)

    if i >= FIRST_TYPE_MODEL_IDX:
        # Collect the questions this type model was not confident about,
        # skipping those whose category is literal or boolean.
        uncertain_records = []
        for j in np.flatnonzero(y_pred_max < LOW_CONFIDENCE_THRESHOLD):
            row = dbpedia_faketest_df.iloc[j]  # one row lookup per candidate
            if row["category"] in NON_RESOURCE_CATEGORIES:
                continue
            uncertain_records.append({
                'id': row["id"],
                'question': row["question"],
                'types': [row["type_5"], row["type_4"], row["type_3"], row["type_2"], row["type_1"]],
                'type_pred': [y_pred_labels[j]],
            })
        low_confident_models[i] = uncertain_records

    # Free the model before the next one is loaded.
    del model
    collect_garbage()

    tf.keras.backend.clear_session()

    collect_garbage()

# Later cells reload the CSV themselves, so release the shared inputs here.
del dbpedia_df, dbpedia_faketest_df, x_fake_test

Removed:  0
Removed:  63570
Removed:  11314
Removed:  0
Removed:  63541
Removed:  11314
Removed:  0
Removed:  63545
Removed:  11314
Removed:  0
Removed:  63595
Removed:  11314
Removed:  0
Removed:  65961
Removed:  11314
Removed:  0
Removed:  79709
Removed:  11437
Removed:  0
Removed:  79709
Removed:  11437


In [18]:
# Position of each classifier in the prediction lists: the category model,
# the literal model, then the models for type 1 through type 5.
(
    CAT_IDX,
    LIT_IDX,
    MODEL_1_IDX,
    MODEL_2_IDX,
    MODEL_3_IDX,
    MODEL_4_IDX,
    MODEL_5_IDX,
) = range(7)

In [19]:
# Assemble one submission record per sampled question:
#   {"id": ..., "category": ..., "type": [...]}

# Re-create the sample the classifiers were run on: the same file and the same
# random_state are used, so the rows line up with the stored predictions.
dbpedia_df = pd.read_csv("/kaggle/input/test-06366/dbpedia.csv", sep="|")
dbpedia_faketest_df = dbpedia_df.sample(4400, random_state=42)

assert len(model_predictions_list[CAT_IDX]) == dbpedia_faketest_df.shape[0], \
    "number of predictions does not match the number of sampled questions"

category_labels = model_predictions_labels_list[CAT_IDX]
literal_labels = model_predictions_labels_list[LIT_IDX]
# Reversed level order: type_5 first, type_1 last.
resource_label_columns = [
    model_predictions_labels_list[idx]
    for idx in (MODEL_5_IDX, MODEL_4_IDX, MODEL_3_IDX, MODEL_2_IDX, MODEL_1_IDX)
]

# tolist() yields plain Python values, which json.dump can always serialise.
question_ids = dbpedia_faketest_df["id"].tolist()

preds_array = []

for i, (question_id, category) in enumerate(zip(question_ids, category_labels)):
    if category == "boolean":
        types = [category]
    elif category == "literal":
        types = [literal_labels[i]]
    elif category == "resource":
        candidates = [labels[i] for labels in resource_label_columns]
        # Comparing str(t) drops the 'nan' placeholder label and would also
        # drop a real float NaN label, should an encoder contain one.
        types = [t for t in candidates if str(t) != 'nan']
    else:
        # Explicit error instead of `assert False`, which is removed under -O.
        raise ValueError(
            "unexpected category {0!r} predicted for question {1!r}".format(category, question_id)
        )

    preds_array.append({"id": question_id, "category": category, "type": types})

In [20]:
# Write the assembled prediction records as JSON into the working directory.
with open('faketest_dbpedia_pred.json', mode='w') as predictions_fh:
    json.dump(preds_array, fp=predictions_fh)

In [21]:
# Write the low-confidence questions collected per model as JSON.
with open('non-confident_questions.json', mode='w') as low_confidence_fh:
    json.dump(low_confident_models, fp=low_confidence_fh)