In [1]:
import os

import numpy as np
from sklearn.preprocessing import LabelEncoder
from keras.utils import np_utils
import gc
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
from kaggle_datasets import KaggleDatasets
import transformers
from transformers import TFAutoModel, AutoTokenizer
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split

from sklearn.metrics import roc_auc_score, accuracy_score, f1_score
from sklearn.metrics import confusion_matrix
from tensorflow.keras.callbacks import Callback 
from tokenizers import Tokenizer, models, pre_tokenizers, decoders, processors

Using TensorFlow backend.


## Helper Functions

In [2]:
def fast_encode(texts, tokenizer, chunk_size=256, maxlen=512):
    """
    https://www.kaggle.com/xhlulu/jigsaw-tpu-distilbert-with-huggingface-and-keras
    """
    tokenizer.enable_truncation(max_length=maxlen)
    tokenizer.enable_padding(max_length=maxlen)
    all_ids = []
    
    for i in tqdm(range(0, len(texts), chunk_size)):
        text_chunk = texts[i:i+chunk_size].tolist()
        encs = tokenizer.encode_batch(text_chunk)
        all_ids.extend([enc.ids for enc in encs])
    
    return np.array(all_ids)

In [3]:
def regular_encode(texts, tokenizer, maxlen=512):
    enc_di = tokenizer.batch_encode_plus(
        texts, 
        return_attention_masks=False, 
        return_token_type_ids=False,
        pad_to_max_length=True,
        max_length=maxlen
    )
    
    return np.array(enc_di['input_ids'])

In [4]:
def build_model(transformer, max_len=512, hidden_dim=32, num_of_classes=3):
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    sequence_output = transformer(input_word_ids)[0]
    cls_token = sequence_output[:, 0, :]
    
    out = Dense(num_of_classes, activation='sigmoid')(cls_token)
    
    model = Model(inputs=input_word_ids, outputs=out)
    model.compile(Adam(lr=1e-5), loss='categorical_crossentropy', metrics=['accuracy'])
    
    return model

## TPU Configs

In [5]:
# Detect hardware, return appropriate distribution strategy
try:
    # TPU detection. No parameters necessary if TPU_NAME environment variable is
    # set: this is always the case on Kaggle.
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    tpu = None

if tpu:
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
else:
    # Default distribution strategy in Tensorflow. Works on CPU and single GPU.
    strategy = tf.distribute.get_strategy()

print("REPLICAS: ", strategy.num_replicas_in_sync)

REPLICAS:  1


In [6]:
AUTO = tf.data.experimental.AUTOTUNE

# Configuration
EPOCHS = 2
BATCH_SIZE = 16 * strategy.num_replicas_in_sync
MAX_LEN = 96
MODEL = 'bert-base-multilingual-cased'

## Create fast tokenizer

In [7]:
# First load the real tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=625.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=995526.0, style=ProgressStyle(descripti…




## Load text data into memory

In [8]:
dbpedia_df = pd.read_csv("/kaggle/input/test-06366/dbpedia.csv", sep="|")

for i in range(5):
    dbpedia_df["type_{0}".format(i+1)].fillna(value="NoneType")
    dbpedia_df["type_{0}".format(i+1)] = dbpedia_df["type_{0}".format(i+1)].astype(str)


wikidata_df = pd.read_csv("/kaggle/input/test-06366/wikidata.csv", sep="|")
wikidata_df.type_1.fillna(value="NoneType")
wikidata_df.type_1 = wikidata_df.type_1.astype(str)

common_df = dbpedia_df.append(wikidata_df)
literal_df = common_df[common_df.category == "literal"]

X_cat_train, X_cat_test, y_cat_train, y_cat_test = train_test_split(common_df.question.astype(str), common_df.category, test_size=0.33, random_state=42)
X_lit_train, X_lit_test, y_lit_train, y_lit_test = train_test_split(literal_df.question.astype(str), literal_df.type_1, test_size=0.33, random_state=42)
X_db_train, X_db_test, y1_db_train, y1_db_test, y2_db_train, y2_db_test, y3_db_train, y3_db_test, y4_db_train, y4_db_test, y5_db_train, y5_db_test = train_test_split(dbpedia_df[dbpedia_df.category == "resource"].question.astype(str), 
                                                                dbpedia_df[dbpedia_df.category == "resource"].type_1,
                                                                dbpedia_df[dbpedia_df.category == "resource"].type_2,
                                                                dbpedia_df[dbpedia_df.category == "resource"].type_3,
                                                                dbpedia_df[dbpedia_df.category == "resource"].type_4,
                                                                dbpedia_df[dbpedia_df.category == "resource"].type_5,
                                                                test_size=0.33, random_state=42)

X_wiki_train, X_wiki_test, y_wiki_train, y_wiki_test = train_test_split(wikidata_df[wikidata_df.category == "resource"].question.astype(str), wikidata_df[wikidata_df.category == "resource"].type_1, test_size=0.33, random_state=42)

## DBpedia Resource classifier

In [9]:
y_train = [y1_db_train, y2_db_train, y3_db_train, y4_db_train, y5_db_train]
y_test = [y1_db_test, y2_db_test, y3_db_test, y4_db_test, y5_db_test]

In [10]:
%%time
#models_list = list()
le_list = list()

for i in range(4,5):
    encoder_db = LabelEncoder()
    encoder_db.fit(dbpedia_df[dbpedia_df.category == "resource"]["type_{0}".format(i+1)])
    encoded_Y_db_train = encoder_db.transform(y_train[i])
    encoded_Y_db_test = encoder_db.transform(y_test[i])
    dummy_y_db_train = np_utils.to_categorical(encoded_Y_db_train) # convert integers to dummy variables (i.e. one hot encoded)
    
    np.save('encoder_type{0}.npy'.format(i+1), encoder_db.classes_)
    
    le_list.append(encoder_db)    
    
    x_db_train = regular_encode(X_db_train.values.astype(str), tokenizer, maxlen=MAX_LEN)
    x_db_test = regular_encode(X_db_test.values.astype(str), tokenizer, maxlen=MAX_LEN)
    
    train_dataset_db = (
        tf.data.Dataset
        .from_tensor_slices((x_db_train, dummy_y_db_train))
        .repeat()
        .shuffle(2048)
        .batch(BATCH_SIZE)
        .prefetch(AUTO)
    )

    test_dataset_db = (
        tf.data.Dataset
        .from_tensor_slices(x_db_test)
        .batch(BATCH_SIZE)
    )
    
    with strategy.scope():
        transformer_layer = TFAutoModel.from_pretrained(MODEL)
        model = build_model(transformer_layer, max_len=MAX_LEN, num_of_classes=dbpedia_df[dbpedia_df.category == "resource"]["type_{0}".format(i+1)].nunique())
        
    n_steps = x_db_train.shape[0] // BATCH_SIZE
    train_history = model.fit(
        train_dataset_db,
        steps_per_epoch=n_steps,
        epochs=EPOCHS #,
        #callbacks=[early_stopping]
    )
    
    y_db_test_pred = np.argmax(model.predict(test_dataset_db, verbose=1), axis=1)
    
    #models_list.append(model)
    
    f1 = f1_score(encoded_Y_db_test, y_db_test_pred, average='weighted')
    print('F1: {0}'.format(f1))
    
    model.save("model_{0}".format(i+1))
    del model
    
    gc.collect()

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1083389348.0, style=ProgressStyle(descr…


Train for 401 steps
Epoch 1/2
Epoch 2/2
F1: 0.8187112871231605
CPU times: user 3min 12s, sys: 28.5 s, total: 3min 41s
Wall time: 5min 32s
