In [1]:
import os

import numpy as np
from sklearn.preprocessing import LabelEncoder
from keras.utils import np_utils
import gc
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
from kaggle_datasets import KaggleDatasets
import transformers
from transformers import TFAutoModel, AutoTokenizer
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split

from sklearn.metrics import roc_auc_score, accuracy_score, f1_score
from sklearn.metrics import confusion_matrix
from tensorflow.keras.callbacks import Callback 
from tokenizers import Tokenizer, models, pre_tokenizers, decoders, processors

Using TensorFlow backend.


## Helper Functions

In [2]:
def fast_encode(texts, tokenizer, chunk_size=256, maxlen=512):
    """
    https://www.kaggle.com/xhlulu/jigsaw-tpu-distilbert-with-huggingface-and-keras
    """
    tokenizer.enable_truncation(max_length=maxlen)
    tokenizer.enable_padding(max_length=maxlen)
    all_ids = []
    
    for i in tqdm(range(0, len(texts), chunk_size)):
        text_chunk = texts[i:i+chunk_size].tolist()
        encs = tokenizer.encode_batch(text_chunk)
        all_ids.extend([enc.ids for enc in encs])
    
    return np.array(all_ids)

In [3]:
def regular_encode(texts, tokenizer, maxlen=512):
    enc_di = tokenizer.batch_encode_plus(
        texts, 
        return_attention_masks=False, 
        return_token_type_ids=False,
        pad_to_max_length=True,
        max_length=maxlen
    )
    
    return np.array(enc_di['input_ids'])

In [4]:
def build_model(transformer, max_len=512, hidden_dim=32, num_of_classes=3):
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    sequence_output = transformer(input_word_ids)[0]
    cls_token = sequence_output[:, 0, :]
    
    out = Dense(num_of_classes, activation='sigmoid')(cls_token)
    
    model = Model(inputs=input_word_ids, outputs=out)
    model.compile(Adam(lr=1e-5), loss='categorical_crossentropy', metrics=['accuracy'])
    
    return model

## TPU Configs

In [5]:
# Detect hardware, return appropriate distribution strategy
try:
    # TPU detection. No parameters necessary if TPU_NAME environment variable is
    # set: this is always the case on Kaggle.
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    tpu = None

if tpu:
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
else:
    # Default distribution strategy in Tensorflow. Works on CPU and single GPU.
    strategy = tf.distribute.get_strategy()

print("REPLICAS: ", strategy.num_replicas_in_sync)

REPLICAS:  1


In [6]:
AUTO = tf.data.experimental.AUTOTUNE

# Configuration
EPOCHS = 2
BATCH_SIZE = 16 * strategy.num_replicas_in_sync
MAX_LEN = 96
MODEL = 'bert-base-multilingual-cased'

## Create fast tokenizer

In [7]:
# First load the real tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=625.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=995526.0, style=ProgressStyle(descripti…




## Load text data into memory

In [8]:
dbpedia_df = pd.read_csv("/kaggle/input/test-06366/dbpedia.csv", sep="|")
dbpedia_df.type_1.fillna(value="NoneType")
dbpedia_df.type_1 = dbpedia_df.type_1.astype(str)

wikidata_df = pd.read_csv("/kaggle/input/test-06366/wikidata.csv", sep="|")
wikidata_df.type_1.fillna(value="NoneType")
wikidata_df.type_1 = wikidata_df.type_1.astype(str)

common_df = dbpedia_df.append(wikidata_df)
literal_df = common_df[common_df.category == "literal"]

X_cat_train, X_cat_test, y_cat_train, y_cat_test = train_test_split(common_df.question.astype(str), common_df.category, test_size=0.33, random_state=42)
X_lit_train, X_lit_test, y_lit_train, y_lit_test = train_test_split(literal_df.question.astype(str), literal_df.type_1, test_size=0.33, random_state=42)
X_db_train, X_db_test, y_db_train, y_db_test = train_test_split(dbpedia_df[dbpedia_df.category == "resource"].question.astype(str), dbpedia_df[dbpedia_df.category == "resource"].type_1, test_size=0.33, random_state=42)
X_wiki_train, X_wiki_test, y_wiki_train, y_wiki_test = train_test_split(wikidata_df[wikidata_df.category == "resource"].question.astype(str), wikidata_df[wikidata_df.category == "resource"].type_1, test_size=0.33, random_state=42)

## Category classifier

In [9]:
encoder_cat = LabelEncoder()
encoder_cat.fit(common_df.category)
encoded_Y_cat_train = encoder_cat.transform(y_cat_train)
encoded_Y_cat_test = encoder_cat.transform(y_cat_test)
dummy_y_cat_train = np_utils.to_categorical(encoded_Y_cat_train) # convert integers to dummy variables (i.e. one hot encoded)

np.save('encoder_cat.npy', encoder_cat.classes_)

In [10]:
%%time 

x_cat_train = regular_encode(X_cat_train.values.astype(str), tokenizer, maxlen=MAX_LEN)
x_cat_test = regular_encode(X_cat_test.values.astype(str), tokenizer, maxlen=MAX_LEN)

CPU times: user 16.2 s, sys: 101 ms, total: 16.3 s
Wall time: 16.3 s


In [11]:
train_dataset_cat = (
    tf.data.Dataset
    .from_tensor_slices((x_cat_train, dummy_y_cat_train))
    .repeat()
    .shuffle(2048)
    .batch(BATCH_SIZE)
    .prefetch(AUTO)
)

test_dataset_cat = (
    tf.data.Dataset
    .from_tensor_slices(x_cat_test)
    .batch(BATCH_SIZE)
)

In [12]:
%%time
with strategy.scope():
    transformer_layer = TFAutoModel.from_pretrained(MODEL)
    model_cat = build_model(transformer_layer, max_len=MAX_LEN, num_of_classes=common_df.category.nunique())

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1083389348.0, style=ProgressStyle(descr…


CPU times: user 35.8 s, sys: 4.78 s, total: 40.6 s
Wall time: 1min 2s


In [13]:
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss', 
    verbose=1,
    patience=1,
    mode='min',
    restore_best_weights=True)

In [14]:
n_steps = x_cat_train.shape[0] // BATCH_SIZE
train_history = model_cat.fit(
    train_dataset_cat,
    steps_per_epoch=n_steps,
    epochs=EPOCHS,
    callbacks=[early_stopping]
)

model_cat.save("model_cat")

Train for 1500 steps
Epoch 1/2
Epoch 2/2


In [15]:
y_cat_test_pred = np.argmax(model_cat.predict(test_dataset_cat, verbose=1), axis=1)
del model_cat



In [16]:
acc = accuracy_score(encoded_Y_cat_test, y_cat_test_pred)
print('Accuracy: {0}'.format(acc))

Accuracy: 0.9650651328032481


## Literal classifier

In [17]:
encoder_lit = LabelEncoder()
encoder_lit.fit(literal_df.type_1)
encoded_Y_lit_train = encoder_lit.transform(y_lit_train)
encoded_Y_lit_test = encoder_lit.transform(y_lit_test)
dummy_y_lit_train = np_utils.to_categorical(encoded_Y_lit_train) # convert integers to dummy variables (i.e. one hot encoded)

np.save('encoder_lit.npy', encoder_lit.classes_)

In [18]:
%%time 

x_lit_train = regular_encode(X_lit_train.values.astype(str), tokenizer, maxlen=MAX_LEN)
x_lit_test = regular_encode(X_lit_test.values.astype(str), tokenizer, maxlen=MAX_LEN)

CPU times: user 3.94 s, sys: 0 ns, total: 3.94 s
Wall time: 3.95 s


In [19]:
train_dataset_lit = (
    tf.data.Dataset
    .from_tensor_slices((x_lit_train, dummy_y_lit_train))
    .repeat()
    .shuffle(2048)
    .batch(BATCH_SIZE)
    .prefetch(AUTO)
)

test_dataset_lit = (
    tf.data.Dataset
    .from_tensor_slices(x_lit_test)
    .batch(BATCH_SIZE)
)

In [20]:
%%time
with strategy.scope():
    transformer_layer = TFAutoModel.from_pretrained(MODEL)
    model_lit = build_model(transformer_layer, max_len=MAX_LEN, num_of_classes=literal_df.type_1.nunique())

CPU times: user 3.77 s, sys: 903 ms, total: 4.67 s
Wall time: 6.43 s


In [21]:
n_steps = x_lit_train.shape[0] // BATCH_SIZE
train_history = model_lit.fit(
    train_dataset_lit,
    steps_per_epoch=n_steps,
    epochs=EPOCHS,
    callbacks=[early_stopping]
)

model_lit.save("model_lit")

Train for 402 steps
Epoch 1/2
Epoch 2/2


In [22]:
y_lit_test_pred = np.argmax(model_lit.predict(test_dataset_lit, verbose=1), axis=1)
del model_lit



In [23]:
acc = accuracy_score(encoded_Y_lit_test, y_lit_test_pred)
print('Accuracy: {0}'.format(acc))

Accuracy: 0.9691241335853812
