In [None]:
import numpy as np
import tensorflow as tf
import pandas as pd
from keras.preprocessing.text import Tokenizer
import os
from tqdm.notebook import tqdm
import transformers
import tokenizers

## **TPU Setting**

In [None]:
# Resolve the TPU cluster, connect to it and initialise the TPU system;
# the calls are kept in this order before the strategy is constructed.
resolver = tf.distribute.cluster_resolver.TPUClusterResolver()
tf.config.experimental_connect_to_cluster(resolver)
tf.tpu.experimental.initialize_tpu_system(resolver)
# Distribution strategy; the model is built under strategy.scope() further down.
strategy = tf.distribute.experimental.TPUStrategy(resolver)
# Replica count reported by the strategy.
num_tpu = strategy.num_replicas_in_sync
print('Number of TPU:', num_tpu)

## **Load Data**

In [None]:
#Load data
TRAIN_PATH1 = '/kaggle/input/jigsaw-multilingual-toxic-comment-classification/jigsaw-toxic-comment-train.csv'
TRAIN_PATH2 = '/kaggle/input/jigsaw-multilingual-toxic-comment-classification/jigsaw-unintended-bias-train.csv'
VAL_PATH = '/kaggle/input/jigsaw-multilingual-toxic-comment-classification/validation.csv'
TEST_PATH = '/kaggle/input/jigsaw-multilingual-toxic-comment-classification/test.csv'
comment_train1 = pd.read_csv(TRAIN_PATH1)
comment_train2 = pd.read_csv(TRAIN_PATH2)
# Round the second file's 'toxic' column and cast it to int
# (presumably it holds fractional scores - confirm against the CSV).
comment_train2['toxic'] = comment_train2['toxic'].round().astype(int)

#add more data with label 1 to training data
# All rows of the first file, every toxic row of the second file, and a
# fixed-seed sample of 100000 non-toxic rows of the second file.
comment_train = pd.concat([
    comment_train1[['comment_text', 'toxic']],
    comment_train2[['comment_text', 'toxic']].query('toxic == 1'),
    comment_train2[['comment_text', 'toxic']].query('toxic == 0').sample(100000, random_state=111)
])

# Shuffle the combined rows. The seed matches the one used for the sample
# above so that a Restart & Run All reproduces the same row order.
comment_train = comment_train.sample(frac=1, random_state=111)
comment_val = pd.read_csv(VAL_PATH)
comment_test = pd.read_csv(TEST_PATH)

## **Data Processing**

In [None]:
#function to clean comments in training data
import re
def comment_clean(text):
    """Return a cleaned copy of a single comment string.

    Steps, in order:
      1. newlines become spaces;
      2. IPv4-looking addresses are removed;
      3. URLs that start with a scheme (http, https, ftp, smtp) or ``www.``
         are replaced by a space;
      4. remaining digits are removed;
      5. everything from the first ``[[User`` to the end of the text is removed;
      6. the text is stripped, and non-printable or numeric characters are
         dropped.

    Steps 2 and 3 must run before step 4: once digits are gone the IP pattern
    can no longer match and URLs are altered.

    Args:
        text: the comment as a ``str``.

    Returns:
        The cleaned string.
    """
    text = re.sub(r"\n", " ", text)
    #remove user_id and IP address (before digits are stripped)
    text = re.sub(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}", '', text)
    #remove url and links anywhere in the text, not only when the text is a single URL
    text = re.sub(r"(?:\b(?:https?|ftp|smtp)://|\bwww\.)\S+", ' ', text, flags=re.IGNORECASE)
    #remove digits
    text = re.sub(r"\d", "", text)
    #remove text starting with user
    text = re.sub(r"\[\[User.*", '', text)

    #keep printable, non-numeric characters only
    return ''.join(
        char for char in text.strip()
        if char.isprintable() and not char.isnumeric()
    )

In [None]:
# Run comment_clean over the text column of each split
# (the test split stores its text under 'content').
comment_train = comment_train.assign(comment_text=comment_train['comment_text'].map(comment_clean))
comment_val = comment_val.assign(comment_text=comment_val['comment_text'].map(comment_clean))
comment_test = comment_test.assign(content=comment_test['content'].map(comment_clean))

In [None]:
#add text tokenizer
def text_tokenize(texts, tokenizer, max_len):
    """Encode a batch of texts and return their input ids as a numpy array.

    Args:
        texts: iterable of strings (list, tuple, pandas Series, ...). A single
            string is treated as a batch of one text.
        tokenizer: object exposing ``batch_encode_plus``.
        max_len: value passed as ``max_length``; padding to that length is
            requested.

    Returns:
        ``np.array`` built from the ``'input_ids'`` entry of the encoding.
    """
    # batch_encode_plus is given a plain list: a pandas Series (which is what
    # the notebook passes) is not a list and keeps its own index.
    if isinstance(texts, str):
        texts = [texts]
    else:
        texts = list(texts)

    # NOTE(review): these keyword names are kept as written; newer
    # transformers releases spell them return_attention_mask / padding /
    # truncation - confirm against the installed version.
    encode = tokenizer.batch_encode_plus(
        texts,
        return_attention_masks=False,
        return_token_type_ids=False,
        pad_to_max_length=True,
        max_length=max_len
    )

    return np.array(encode['input_ids'])

In [None]:
# Text columns are kept as pandas Series; labels are turned into numpy arrays.
x_train = comment_train['comment_text']
y_train = np.array(comment_train['toxic'].tolist())

x_val = comment_val['comment_text']
y_val = np.array(comment_val['toxic'].tolist())

# The test split stores its text under 'content' and has no labels here.
x_test = comment_test['content']

In [None]:
#load tokenizer
# Tokenizer for the 'xlm-roberta-base' checkpoint; used by text_tokenize below.
xlm_roberta_tokenizer = transformers.AutoTokenizer.from_pretrained('xlm-roberta-base')

In [None]:
#tokenize data
# Encode the train, validation and test texts (in that order) to 128 token ids each.
x_train_tokenized, x_val_tokenized, test_tokenized = (
    text_tokenize(split_texts, xlm_roberta_tokenizer, max_len=128)
    for split_texts in (x_train, x_val, x_test)
)

In [None]:
#create tensorflow dataset
# BATCH_SIZE was used below without ever being assigned in this notebook.
# 16 examples per replica is an assumed value - TODO confirm / tune.
BATCH_SIZE = 16 * num_tpu

# Training stream: repeats forever (model.fit bounds it with steps_per_epoch),
# shuffled with a 2048-element buffer, batched and prefetched.
train_dataset = (
    tf.data.Dataset
    .from_tensor_slices((x_train_tokenized, y_train))
    .repeat()
    .shuffle(2048)
    .batch(BATCH_SIZE)
    .prefetch(tf.data.experimental.AUTOTUNE)
)

# Validation stream: single pass, cached after batching.
valid_dataset = (
    tf.data.Dataset
    .from_tensor_slices((x_val_tokenized, y_val))
    .batch(BATCH_SIZE)
    .cache()
    .prefetch(tf.data.experimental.AUTOTUNE)
)

# Test stream: built from the tokenized ids (test_tokenized), not from the raw
# strings in x_test, because the model input is an int32 id tensor.
test_dataset = (
    tf.data.Dataset
    .from_tensor_slices(test_tokenized)
    .batch(BATCH_SIZE)
)

In [None]:
from tensorflow.keras import backend as K
def focal_loss(gamma=2., alpha=.2):
    """Build a binary focal-loss function for use with ``model.compile``.

    Args:
        gamma: exponent applied to the modulating factor.
        alpha: weight of the positive-class term; the negative-class term is
            weighted by ``1 - alpha``.

    Returns:
        A function ``focal_loss_fixed(y_true, y_pred)`` returning a scalar loss.
    """
    def focal_loss_fixed(y_true, y_pred):
        # Keep predictions away from exactly 0 and 1 so the logarithms below
        # cannot produce inf/NaN when the sigmoid output saturates.
        eps = K.epsilon()
        y_pred = K.clip(y_pred, eps, 1. - eps)

        # Predictions for positive labels (others filled with 1 -> zero contribution).
        pt_1 = tf.where(tf.equal(y_true, 1), y_pred, tf.ones_like(y_pred))
        # Predictions for negative labels (others filled with 0 -> zero contribution).
        pt_0 = tf.where(tf.equal(y_true, 0), y_pred, tf.zeros_like(y_pred))

        positive_term = K.mean(alpha * K.pow(1. - pt_1, gamma) * K.log(pt_1))
        negative_term = K.mean((1 - alpha) * K.pow(pt_0, gamma) * K.log(1. - pt_0))
        return -positive_term - negative_term
    return focal_loss_fixed

In [None]:
from tensorflow.keras.callbacks import Callback, LearningRateScheduler
from sklearn.metrics import accuracy_score, roc_auc_score
class RocAucEvaluation(Callback):
    """Keras callback that prints the validation ROC-AUC at the end of epochs.

    Args:
        validation_data: pair ``(X_val, y_val)``; ``X_val`` is passed to
            ``model.predict`` and ``y_val`` to ``roc_auc_score``.
        interval: the score is computed on epochs whose (0-based) index is a
            multiple of this value.

    Raises:
        ValueError: if ``validation_data`` cannot be unpacked into two items.
    """
    def __init__(self, validation_data=(), interval=1):
        # super(Callback, self) would start the lookup *after* Callback and
        # skip Callback.__init__; the zero-argument form runs it.
        super().__init__()

        self.interval = interval
        self.X_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs=None):
        # logs is unused; None avoids sharing a mutable default between calls.
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, verbose=0)
            score = roc_auc_score(self.y_val, y_pred)
            print("\n ROC-AUC - epoch: {:d} - score: {:.6f}".format(epoch+1, score))

In [None]:
#learning rate scheduler
def build_lrfn(lr_start=0.000001, lr_max=0.000002, 
               lr_min=0.0000001, lr_rampup_epochs=7, 
               lr_sustain_epochs=0, lr_exp_decay=.87):
    """Return an ``epoch -> learning rate`` function.

    The schedule has three phases: a linear ramp from ``lr_start`` towards the
    peak over ``lr_rampup_epochs``, a plateau of ``lr_sustain_epochs`` at the
    peak, then an exponential decay (factor ``lr_exp_decay`` per epoch)
    towards ``lr_min``. The peak is ``lr_max`` multiplied by 8.
    """
    lr_max = lr_max * 8

    def lrfn(epoch):
        # Phase 1: linear warm-up.
        if epoch < lr_rampup_epochs:
            slope = (lr_max - lr_start) / lr_rampup_epochs
            return slope * epoch + lr_start
        # Phase 2: hold at the peak.
        if epoch < lr_rampup_epochs + lr_sustain_epochs:
            return lr_max
        # Phase 3: exponential decay towards lr_min.
        epochs_decaying = epoch - lr_rampup_epochs - lr_sustain_epochs
        return (lr_max - lr_min) * lr_exp_decay**epochs_decaying + lr_min

    return lrfn

In [None]:
#create customized callback
def callback():
    """Return the list of callbacks passed to ``model.fit``.

    The list contains, in order: a ``ReduceLROnPlateau`` on ``val_loss``, the
    ``RocAucEvaluation`` callback on the tokenized validation set, and a
    ``LearningRateScheduler`` driven by ``build_lrfn()``.

    NOTE(review): a scheduler that sets the learning rate every epoch likely
    overrides whatever ReduceLROnPlateau lowered - confirm both are wanted.
    """
    cb = []
    # min_delta is the current name of the argument formerly called epsilon.
    reduceLROnPlat = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss',
                                    factor=0.3, patience=3,
                                    verbose=1, mode='auto',
                                    min_delta=0.0001, cooldown=1, min_lr=0.000001)
    cb.append(reduceLROnPlat)
    RocAuc = RocAucEvaluation(validation_data=(x_val_tokenized, y_val), interval=1)
    cb.append(RocAuc)
    lrfn = build_lrfn()
    lr_callback = LearningRateScheduler(lrfn, verbose=1)
    cb.append(lr_callback)
    return cb

In [None]:
#build model
def create_model(max_len, transformer, loss):
    """Build and compile a binary classifier on top of a transformer encoder.

    Args:
        max_len: length of the int32 token-id input.
        transformer: callable model; element 0 of its output is indexed as
            ``[:, 0, :]`` to take the first token's vector.
        loss: loss passed to ``model.compile``.

    Returns:
        The compiled ``tf.keras.Model`` with a single sigmoid output unit.
    """
    inputs = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="inputs")
    bert_outputs = transformer(inputs)[0]
    # Vector at position 0 of the sequence axis is used as the sentence summary.
    class_token = bert_outputs[:,0,:]
    dropped = tf.keras.layers.Dropout(0.3)(class_token)
    outputs = tf.keras.layers.Dense(1, activation='sigmoid')(dropped)
    model = tf.keras.Model(inputs=inputs, outputs=outputs)
    # learning_rate is the supported keyword; lr is only a deprecated alias.
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5), loss=loss, metrics=['accuracy'])
    return model

In [None]:
#load xlm-roberta transformer and initiate model
# Both the pretrained encoder and the Keras model are created inside the
# distribution strategy's scope.
# NOTE(review): the tokenizer above comes from 'xlm-roberta-base' while this
# checkpoint is the large model - presumably they share a vocabulary; confirm.
with strategy.scope():
    xlmroberta_model = transformers.TFAutoModel.from_pretrained('jplu/tf-xlm-roberta-large')
    # 128 matches the max_len used when tokenizing the data.
    model = create_model(128, xlmroberta_model, loss=focal_loss())
model.summary()

In [None]:
#fit the model
# train_dataset repeats indefinitely, so steps_per_epoch fixes the epoch length
# at one pass over the training rows; callback() supplies the callback list.
history = model.fit(train_dataset, validation_data=valid_dataset, epochs=3, 
                    steps_per_epoch=x_train.shape[0] // BATCH_SIZE, callbacks=callback())

In [None]:
#fit on validation data
# Continues training the same model on the validation set; the dataset is
# repeated, so steps_per_epoch bounds each epoch. No validation_data or
# callbacks are passed for this stage.
history_on_val = model.fit(valid_dataset.repeat(), epochs=8, steps_per_epoch=x_val.shape[0] // BATCH_SIZE)