# Jigsaw Multilingual Toxic Comment Classification
This notebook brings all the packages together under one roof: it is a collection of all the modules implemented so far.

## BERT Implementation 1

In [1]:
#-----------------------------------------------------------------------------------------------#
#                                                                                               #
#   I M P O R T     L I B R A R I E S                                                           #
#                                                                                               #
#-----------------------------------------------------------------------------------------------#
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from tqdm import tqdm
import os
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
from kaggle_datasets import KaggleDatasets
import transformers

from tokenizers import BertWordPieceTokenizer

In [2]:
# Pick the distribution strategy that matches the available hardware.
try:
    # Without arguments the resolver relies on the TPU_NAME environment
    # variable, which is always set on Kaggle.
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    # The resolver could not find a TPU.
    tpu = None

if not tpu:
    # TensorFlow's default strategy; works on CPU and on a single GPU.
    strategy = tf.distribute.get_strategy()
else:
    # Connect to the TPU cluster and initialise it before creating the strategy.
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)

print("REPLICAS: ", strategy.num_replicas_in_sync)

Running on TPU  grpc://10.0.0.2:8470
REPLICAS:  8


In [3]:
#***********************************************************************************************#
#                                                                                               #
#   description:                                                                                #
#   create a tokenizer to create tokens of the text input.                                      #
#                                                                                               #
#***********************************************************************************************#
def tokenizer():
    """
    Build the tokenizers for the multilingual DistilBERT checkpoint.

    The pretrained ``DistilBertTokenizer`` is loaded and saved into the
    current working directory; a ``BertWordPieceTokenizer`` is then created
    from the ``vocab.txt`` file expected to be among the saved files.

    Returns:
        tuple: ``(slow_tokenizer, fast_tokenizer)`` - the transformers
        tokenizer and the cased ``BertWordPieceTokenizer``.
    """
    checkpoint = "distilbert-base-multilingual-cased"
    slow_tokenizer = transformers.DistilBertTokenizer.from_pretrained(checkpoint)
    # Write the tokenizer files to the working directory ...
    slow_tokenizer.save_pretrained('.')
    # ... and build the fast tokenizer from the vocabulary file (case is kept).
    fast_tokenizer = BertWordPieceTokenizer("vocab.txt", lowercase=False)
    return slow_tokenizer, fast_tokenizer

In [4]:
#***********************************************************************************************#
#                                                                                               #
#   description:                                                                                #
#   create an encoder for batch data of the competition.                                        #
#                                                                                               #
#***********************************************************************************************#
def data_encode(texts, tokenizer, chunk_size=256, maxlen=512):
    """
    Encode texts into sequences of token ids for BERT input.

    Args:
        texts: Sliceable collection of strings supporting ``len()`` - for
            example a pandas Series, a numpy array, a list or a tuple.
        tokenizer: Object exposing ``enable_truncation``, ``enable_padding``
            and ``encode_batch``. Its truncation and padding settings are
            overwritten by this call.
        chunk_size: Number of texts handed to ``encode_batch`` at a time.
        maxlen: Length passed to the tokenizer for truncation and padding.

    Returns:
        numpy.ndarray: The token ids, one row per input text, in input order.
        For empty input an empty integer array of shape ``(0, maxlen)`` is
        returned so that the result stays two-dimensional.
    """
    tokenizer.enable_truncation(max_length=maxlen)
    tokenizer.enable_padding(max_length=maxlen)
    all_ids = []

    for start in tqdm(range(0, len(texts), chunk_size)):
        chunk = texts[start:start + chunk_size]
        # pandas / numpy slices are converted with ``tolist``; plain Python
        # sequences (list, tuple) have no such method and are copied instead.
        text_chunk = chunk.tolist() if hasattr(chunk, "tolist") else list(chunk)
        encs = tokenizer.encode_batch(text_chunk)
        all_ids.extend(enc.ids for enc in encs)

    if not all_ids:
        # np.array([]) would be a 1-D float array; keep the (rows, maxlen)
        # integer layout that callers get for non-empty input.
        return np.empty((0, maxlen), dtype=int)

    return np.array(all_ids)

In [5]:
#***********************************************************************************************#
#                                                                                               #
#   description:                                                                                #
#   create a module for any sort of data pre-processing required.                               #
#                                                                                               #
#***********************************************************************************************#
def preprocess(train1, valid, test, maxlen):
    """
    Tokenize and encode the three data splits and extract the labels.

    Args:
        train1: Training frame; its ``comment_text`` and ``toxic`` columns are read.
        valid: Validation frame; its ``comment_text`` and ``toxic`` columns are read.
        test: Test frame; its ``content`` column is read.
        maxlen: Sequence length forwarded to ``data_encode``.

    Returns:
        tuple: ``(x_train, x_valid, x_test, y_train, y_valid)``.
    """
    # create an instance of the tokenizers; only the fast one is used here
    fast_tokenizer = tokenizer()[1]

    def encode(column):
        # cast every entry to str, then tokenize and encode the column
        return data_encode(column.astype(str), fast_tokenizer, maxlen=maxlen)

    x_train = encode(train1.comment_text)
    x_valid = encode(valid.comment_text)
    x_test = encode(test.content)

    return x_train, x_valid, x_test, train1.toxic.values, valid.toxic.values

In [6]:
#***********************************************************************************************#
#                                                                                               #
#   description:                                                                                #
#   create a dataloader to load the toxic comment classification data.                          #
#                                                                                               #
#***********************************************************************************************#
def dataLoad(BATCH_SIZE=32, MAX_LEN=192,
             data_dir="/kaggle/input/jigsaw-multilingual-toxic-comment-classification/"):
    """
    Load the competition CSV files and wrap them in ``tf.data`` datasets.

    Args:
        BATCH_SIZE: Batch size used for all three datasets.
        MAX_LEN: Sequence length forwarded to ``preprocess``.
        data_dir: Directory holding ``jigsaw-toxic-comment-train.csv``,
            ``validation.csv`` and ``test.csv``. Defaults to the Kaggle
            input directory of the competition.

    Returns:
        tuple: ``(train_dataset, valid_dataset, test_dataset, tr_size, vl_size)``
        where the sizes are the number of encoded training and validation rows.
    """
    # configuration variable
    AUTO = tf.data.experimental.AUTOTUNE

    # LOADING THE DATA
    train1 = pd.read_csv(os.path.join(data_dir, "jigsaw-toxic-comment-train.csv"))
    valid = pd.read_csv(os.path.join(data_dir, "validation.csv"))
    test = pd.read_csv(os.path.join(data_dir, "test.csv"))

    # apply the pre-processing step on the loaded dataset
    x_train, x_valid, x_test, y_train, y_valid = preprocess(train1, valid, test, MAX_LEN)

    # get data sizes
    tr_size = x_train.shape[0]
    vl_size = x_valid.shape[0]

    # Training dataset. It repeats without a count, so the consumer has to
    # bound each epoch itself (e.g. with ``steps_per_epoch``).
    train_dataset = (
        tf.data.Dataset
        .from_tensor_slices((x_train, y_train))
        .repeat()
        .shuffle(2048)
        .batch(BATCH_SIZE)
        .prefetch(AUTO)
    )

    # Validation dataset (batched, then cached)
    valid_dataset = (
        tf.data.Dataset
        .from_tensor_slices((x_valid, y_valid))
        .batch(BATCH_SIZE)
        .cache()
        .prefetch(AUTO)
    )

    # Testing dataset (inputs only, no labels)
    test_dataset = (
        tf.data.Dataset
        .from_tensor_slices(x_test)
        .batch(BATCH_SIZE)
    )

    # Return the newly created datasets
    return train_dataset, valid_dataset, test_dataset, tr_size, vl_size

In [7]:
#***********************************************************************************************#
#                                                                                               #
#   description:                                                                                #
#   create a BERT model.                                                                        #
#                                                                                               #
#***********************************************************************************************#
def build_model(transformer, max_len=512):
    """
    Create and compile the BERT-1 binary classifier.

    Args:
        transformer: Callable layer/model applied to the token-id input. The
            first element of its output is indexed as a rank-3 tensor
            (batch, position, features).
        max_len: Length of the integer input sequence.

    Returns:
        A compiled Keras ``Model`` mapping ``input_word_ids`` to a single
        sigmoid output, trained with binary cross-entropy and Adam (1e-5).
    """
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    sequence_output = transformer(input_word_ids)[0]
    # Only the vector at position 0 feeds the classification head
    # (presumably the [CLS] token - depends on the tokenizer used upstream).
    cls_token = sequence_output[:, 0, :]
    out = Dense(1, activation='sigmoid')(cls_token)

    model = Model(inputs=input_word_ids, outputs=out)
    # ``learning_rate`` is the supported argument name; ``lr`` is a deprecated alias.
    model.compile(Adam(learning_rate=1e-5), loss='binary_crossentropy', metrics=['accuracy'])

    return model

In [8]:
# Training configuration
MAX_LEN = 250                                      # sequence length for encoding and model input
EPOCHS = 5                                         # epochs of the main training run
BATCH_SIZE = strategy.num_replicas_in_sync * 16    # 16 times the number of replicas in sync

In [9]:
%%time
with strategy.scope():
    # Load the pretrained multilingual DistilBERT and wrap it in the
    # classifier; both steps happen inside the distribution strategy scope.
    transformer_layer = transformers.TFDistilBertModel.from_pretrained(
        'distilbert-base-multilingual-cased'
    )
    model = build_model(transformer_layer, max_len=MAX_LEN)

model.summary()

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=910749124.0, style=ProgressStyle(descri…


Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_word_ids (InputLayer)  [(None, 250)]             0         
_________________________________________________________________
tf_distil_bert_model (TFDist ((None, 250, 768),)       134734080 
_________________________________________________________________
tf_op_layer_strided_slice (T [(None, 768)]             0         
_________________________________________________________________
dense (Dense)                (None, 1)                 769       
Total params: 134,734,849
Trainable params: 134,734,849
Non-trainable params: 0
_________________________________________________________________
CPU times: user 34 s, sys: 12 s, total: 46.1 s
Wall time: 49.8 s


In [10]:
# Build the tf.data pipelines and fetch the train / validation sizes
(
    train_dataset,
    valid_dataset,
    test_dataset,
    tr_size,
    vl_size,
) = dataLoad(BATCH_SIZE=BATCH_SIZE, MAX_LEN=MAX_LEN)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=995526.0, style=ProgressStyle(descripti…




100%|██████████| 874/874 [00:31<00:00, 27.58it/s]
100%|██████████| 32/32 [00:01<00:00, 25.19it/s]
100%|██████████| 250/250 [00:10<00:00, 24.44it/s]


In [11]:
# Number of full batches in the training data; the training dataset repeats,
# so each epoch is bounded explicitly through steps_per_epoch.
n_steps = tr_size // BATCH_SIZE
train_history = model.fit(
    train_dataset,
    epochs=EPOCHS,
    steps_per_epoch=n_steps,
    validation_data=valid_dataset,
)

Train for 1746 steps, validate for 63 steps
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [12]:
# Continue training on the (repeated) validation dataset for twice as many
# epochs; no validation data is passed to this run.
n_steps = vl_size // BATCH_SIZE
train_history_2 = model.fit(
    valid_dataset.repeat(),
    epochs=EPOCHS * 2,
    steps_per_epoch=n_steps,
)

Train for 62 steps
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [13]:
# Fill the sample submission with this model's predictions and save it
sub = pd.read_csv(
    "/kaggle/input/jigsaw-multilingual-toxic-comment-classification/sample_submission.csv"
)
sub['toxic'] = model.predict(test_dataset, verbose=1)
sub.to_csv(path_or_buf='submission_1.csv', index=False)



In [14]:
# Load a submission file from another dataset and store a local copy of it
sub_2 = pd.read_csv(
    filepath_or_buffer="/kaggle/input/toxic-submissions-data-for-ensemble/sub-1.csv"
)
sub_2.to_csv(path_or_buf='submission_2.csv', index=False)

In [15]:
# Create an ensemble: weighted blend of this notebook's predictions (0.4)
# and the borrowed submission (0.6).
# NOTE(review): sub_2 is overwritten in place, so running this cell a second
# time blends the already-blended values again. The two frames are combined
# by row index - presumably both files list the same rows in the same order.
sub_2['toxic'] = 0.4 * sub['toxic'] + 0.6 * sub_2['toxic']
sub_2.to_csv(path_or_buf='submission.csv', index=False)