In [1]:
!pip install mlflow

Collecting mlflow
  Downloading mlflow-1.18.0-py3-none-any.whl (14.2 MB)
[K     |████████████████████████████████| 14.2 MB 8.7 MB/s 
Collecting gunicorn; platform_system != "Windows"
  Downloading gunicorn-20.1.0-py3-none-any.whl (79 kB)
[K     |████████████████████████████████| 79 kB 4.8 MB/s 
Collecting databricks-cli>=0.8.7
  Downloading databricks-cli-0.14.3.tar.gz (54 kB)
[K     |████████████████████████████████| 54 kB 2.0 MB/s 
Collecting alembic<=1.4.1
  Downloading alembic-1.4.1.tar.gz (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 17.3 MB/s 
Collecting querystring-parser
  Downloading querystring_parser-1.2.4-py2.py3-none-any.whl (7.9 kB)
Collecting prometheus-flask-exporter
  Downloading prometheus_flask_exporter-0.18.2.tar.gz (22 kB)
Building wheels for collected packages: databricks-cli, alembic, prometheus-flask-exporter
  Building wheel for databricks-cli (setup.py) ... [?25l- \ | done
[?25h  Created wheel for databricks-cli: filen

In [2]:
import os

import numpy as np
from sklearn.preprocessing import LabelEncoder
from keras.utils import np_utils
import gc
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
from kaggle_datasets import KaggleDatasets
import transformers
from transformers import TFAutoModel, AutoTokenizer
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split

from sklearn.metrics import roc_auc_score, accuracy_score, f1_score
from sklearn.metrics import confusion_matrix
from tensorflow.keras.callbacks import Callback 
from tokenizers import Tokenizer, models, pre_tokenizers, decoders, processors

Using TensorFlow backend.


## Helper Functions

In [3]:
def regular_encode(texts, tokenizer, maxlen=512):
    enc_di = tokenizer.batch_encode_plus(
        texts, 
        return_attention_masks=False, 
        return_token_type_ids=False,
        pad_to_max_length=True,
        max_length=maxlen
    )
    
    return np.array(enc_di['input_ids'])

In [4]:
def build_model(transformer, max_len=512, hidden_dim=32, num_of_classes=3):
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    sequence_output = transformer(input_word_ids)[0]
    cls_token = sequence_output[:, 0, :]
    
    out = Dense(num_of_classes, activation='sigmoid')(cls_token)
    
    model = Model(inputs=input_word_ids, outputs=out)
    model.compile(Adam(lr=1e-5), loss='categorical_crossentropy', metrics=['accuracy'])
    
    return model

## TPU Configs

In [5]:
# Detect hardware, return appropriate distribution strategy
try:
    # TPU detection. No parameters necessary if TPU_NAME environment variable is
    # set: this is always the case on Kaggle.
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    tpu = None

if tpu:
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
else:
    # Default distribution strategy in Tensorflow. Works on CPU and single GPU.
    strategy = tf.distribute.get_strategy()

print("REPLICAS: ", strategy.num_replicas_in_sync)

REPLICAS:  1


In [6]:
AUTO = tf.data.experimental.AUTOTUNE

# Configuration
EPOCHS = 4
BATCH_SIZE = 16 * strategy.num_replicas_in_sync
MAX_LEN = 60

AUGMENT_DATA = True
IS_MULTILINGUAL = True
USE_ANNOTATED = False

if USE_ANNOTATED:
    dbpedia_path = 'dbpedia-annotated'
else:
    dbpedia_path = 'dbpedia-train-onetype'

if IS_MULTILINGUAL:
    MODEL = 'bert-base-multilingual-cased'
else:
    MODEL = 'bert-base-cased'

## Create fast tokenizer

In [7]:
# First load the real tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=625.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=995526.0, style=ProgressStyle(descripti…




## Load text data into memory

In [8]:
dbpedia_df = pd.read_csv("/kaggle/input/test-06366/{0}.csv".format(dbpedia_path), sep="|")

if USE_ANNOTATED:
    dbpedia_df['original_id'] = dbpedia_df.id.apply(lambda x: x.split('-')[0])
    dbpedia_df_tmp = pd.read_csv("/kaggle/input/test-06366/dbpedia.csv", sep="|")
    faketest_dbpedia_df = dbpedia_df_tmp.sample(4400, random_state=42) # Test set from organizers has ca. 4400 examples
else:
    faketest_dbpedia_df = dbpedia_df.sample(4400, random_state=42) # Test set from organizers has ca. 4400 examples

dbpedia_df.dropna(subset=['question'], inplace=True)   

dbpedia_df["type"].fillna(value="NoneType")
dbpedia_df["type"] = dbpedia_df["type"].astype(str)


if AUGMENT_DATA and IS_MULTILINGUAL and not USE_ANNOTATED:
    common_de_df = dbpedia_df.copy()
    common_de_df.question = dbpedia_df.question_de

    common_fr_df = dbpedia_df.copy()
    common_fr_df.question = dbpedia_df.question_fr

    common_es_df = dbpedia_df.copy()
    common_es_df.question = dbpedia_df.question_es

    common_ru_df = dbpedia_df.copy()
    common_ru_df.question = dbpedia_df.question_ru

    common_df = dbpedia_df.append(common_de_df).append(common_fr_df).append(common_es_df).append(common_ru_df)
    del common_de_df, common_fr_df, common_es_df, common_ru_df
    dbpedia_df = common_df
    
elif AUGMENT_DATA and not IS_MULTILINGUAL and not USE_ANNOTATED:
    common_r_de_df = dbpedia_df.copy()
    common_r_de_df.question = dbpedia_df.question_r_de

    common_r_ru_df = dbpedia_df.copy()
    common_r_ru_df.question = dbpedia_df.question_r_ru

    common_df = dbpedia_df.append(common_r_de_df).append(common_r_de_df)
    del common_r_ru_df, common_r_de_df
    dbpedia_df = common_df
    
if not USE_ANNOTATED:
    dbpedia_df.drop(['question_de', 'question_fr', 'question_es', 'question_ru'], axis=1, inplace=True)
    
dbpedia_df.dropna(subset=['question'], inplace=True)  

if USE_ANNOTATED:
    dbpedia_df = dbpedia_df[~dbpedia_df.original_id.isin(faketest_dbpedia_df.id.values)]
else:
    dbpedia_df = dbpedia_df[~dbpedia_df.id.isin(faketest_dbpedia_df.id.values)]


X_db_train, X_db_test, y_db_train, y_db_test = train_test_split(dbpedia_df[dbpedia_df.category == "resource"].question.astype(str), 
                                                                dbpedia_df[dbpedia_df.category == "resource"].type,
                                                                test_size=0.01, random_state=42)

In [9]:
dbpedia_df.head()

Unnamed: 0,id,question,question_r_de,question_r_ru,category,type
1,dbpedia_14427,What is the name of the opera based on Twelfth...,What is the name of the opera based on Twelfth...,What is the name of the opera based on Twelfth...,resource,dbo:Opera
2,dbpedia_16615,When did Lena Horne receive the Grammy Award f...,When did Lena Horne receive the Grammy Award f...,When did Lena Horn receive the Grammy Award fo...,literal,date
4,dbpedia_3681,What is the subsidiary company working for Leo...,What is the subsidiary that works for Leonard ...,Which subsidiary does Leonard Maltin have?,resource,dbo:EducationalInstitution
6,dbpedia_12020,What is the federated state located in the Wei...,What is the federation state in the Weimar Rep...,What is a federal state in the Weimar Republic?,resource,dbo:State
7,dbpedia_15023,Did Buddhism was named after the immigration o...,Was Buddhism Named After Immigration To The Un...,Did Buddhism was named after immigration from ...,boolean,boolean


In [10]:
dbpedia_df.shape

(64984, 6)

## DBpedia Resource classifier

In [11]:
y_train = y_db_train
y_test = y_db_test

In [12]:
%%time
le_list = list()

encoder_db = LabelEncoder()
encoder_db.fit(dbpedia_df[dbpedia_df.category == "resource"]["type"])
encoded_Y_db_train = encoder_db.transform(y_train)
encoded_Y_db_test = encoder_db.transform(y_test)
dummy_y_db_train = np_utils.to_categorical(encoded_Y_db_train) # convert integers to dummy variables (i.e. one hot encoded)

le_list.append(encoder_db)    

np.save('encoder_type.npy', encoder_db.classes_)

x_db_train = regular_encode(X_db_train.values.astype(str), tokenizer, maxlen=MAX_LEN)
x_db_test = regular_encode(X_db_test.values.astype(str), tokenizer, maxlen=MAX_LEN)

train_dataset_db = (
    tf.data.Dataset
    .from_tensor_slices((x_db_train, dummy_y_db_train))
    .repeat()
    .shuffle(2048)
    .batch(BATCH_SIZE)
    .prefetch(AUTO)
)

test_dataset_db = (
    tf.data.Dataset
    .from_tensor_slices(x_db_test)
    .batch(BATCH_SIZE)
)

CPU times: user 17.5 s, sys: 1.32 s, total: 18.8 s
Wall time: 22.9 s


In [13]:
with strategy.scope():
    transformer_layer = TFAutoModel.from_pretrained(MODEL)
    model = build_model(transformer_layer, max_len=MAX_LEN, num_of_classes=dbpedia_df[dbpedia_df.category == "resource"]["type"].nunique())

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1083389348.0, style=ProgressStyle(descr…




In [14]:
dbpedia_df[dbpedia_df.category == "resource"]["type"].nunique()

253

In [15]:
n_steps = x_db_train.shape[0] // BATCH_SIZE
train_history = model.fit(
    train_dataset_db,
    steps_per_epoch=n_steps,
    epochs=EPOCHS #,
    #callbacks=[early_stopping]
)

Train for 2189 steps
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


In [16]:
y_db_test_pred = np.argmax(model.predict(test_dataset_db, verbose=1), axis=1)

f1 = f1_score(encoded_Y_db_test, y_db_test_pred, average='weighted')
print('F1: {0}'.format(f1))
print('!!!TEST set is also multilingual now!!!')

model.save("model_resource")

gc.collect()

F1: 0.8527144611710824
!!!TEST set is also multilingual now!!!


0