# Importing required Libraries

In [None]:
!pip install ipython-autotime
!pip install --upgrade transformers
%matplotlib inline
%load_ext autotime

In [None]:
import os,re
import unicodedata
import gc
import time
import numpy as np 
import pandas as pd

import matplotlib.pyplot as plt

import nltk
from nltk.corpus import stopwords

import tensorflow as tf
from tensorflow.keras.layers import Dense, Input, BatchNormalization, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.losses import BinaryCrossentropy

import transformers
from transformers import TFAutoModel, AutoTokenizer
from transformers import T5Tokenizer, TFT5Model
from transformers import TFRobertaModel, RobertaTokenizerFast, RobertaConfig
from tokenizers import BertWordPieceTokenizer
from tokenizers import Tokenizer, models, pre_tokenizers, decoders, processors

from tqdm.notebook import tqdm
from numba import jit, cuda 

# TPU Configuration

In [None]:
# Probe for a TPU. The resolver raises ValueError when none can be found, in which
# case `tpu` is left as None and the default distribution strategy is used.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    tpu = None

if not tpu:
    # No TPU: fall back to whatever strategy TensorFlow currently has in scope.
    strategy = tf.distribute.get_strategy()
else:
    # Connect to the TPU cluster and initialise it before building the strategy.
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)

print("REPLICAS: ", strategy.num_replicas_in_sync)

# Reading Data

In [None]:
# Directories holding the competition CSVs and the English-translated
# validation/test CSVs (absolute Kaggle input paths).
data_path = '/kaggle/input/jigsaw-multilingual-toxic-comment-classification/'
translated_data_path = '/kaggle/input/jigsaw-multilingual-toxic-test-translated/'

# Training sets: only the two raw (un-tokenised) CSVs are loaded; the
# "processed-seqlen128" variants are left disabled.

train_data1 = pd.read_csv(data_path + 'jigsaw-toxic-comment-train.csv')
# train_data2 = pd.read_csv(data_path + 'jigsaw-toxic-comment-train-processed-seqlen128.csv')
train_data3 = pd.read_csv(data_path + 'jigsaw-unintended-bias-train.csv')
# train_data4 = pd.read_csv(data_path + 'jigsaw-unintended-bias-train-processed-seqlen128.csv')

# Validation and test sets: the translated CSVs are used instead of the
# original validation.csv / test.csv, which are left disabled.

# validation_data1 = pd.read_csv(data_path + 'validation.csv')
# validation_data2 = pd.read_csv(data_path + 'validation-processed-seqlen128.csv')
valid_translated = pd.read_csv(translated_data_path + 'jigsaw_miltilingual_valid_translated.csv')

# test_data1 = pd.read_csv(data_path + 'test.csv')
# test_data2 = pd.read_csv(data_path + 'test-processed-seqlen128.csv')
test_translated = pd.read_csv(translated_data_path + 'jigsaw_miltilingual_test_translated.csv')

# Exploratory Data Analysis

*Link to the notebook: TODO — add the URL of the exploratory data analysis notebook.*


# Helper Functions

Data cleaning Functions

In [None]:
# Compiled regex that matches any NLTK English stopword as a whole word
# (\b ... \b) together with the whitespace that follows it, so that
# substituting '' removes the stopword and its trailing spaces in one pass.
pattern = re.compile(r'\b('+r'|'.join(stopwords.words('english'))+r')\b\s*')

# @cuda.jit(device=True)
def unicode_to_ascii(s):
    """Return ``s`` with combining marks (accents, diacritics) removed.

    The string is decomposed with NFD normalisation so that every accented
    character becomes a base character followed by its combining marks; all
    characters in Unicode category ``Mn`` (nonspacing mark) are then dropped.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)

# @tf.function()
def clean_text(text):
    """Normalise one comment into lower-case, space-separated ASCII words.

    Steps, in order: lower-case and strip accents, blank out e-mail addresses
    and URLs, remove English stopwords (via the module-level ``pattern``),
    replace every run of non-letters with a single space, and trim.

    Args:
        text: the raw comment. Anything that is not a ``str`` (for example the
            NaN pandas uses for an empty cell) is treated as an empty comment.

    Returns:
        The cleaned string; ``''`` when nothing is left.
    """
    if not isinstance(text, str):
        # Missing cells arrive as float NaN / None and have no .lower();
        # map them to an empty comment instead of raising mid-dataset.
        return ''

    text = unicode_to_ascii(text.lower().strip())

    # replacing email addresses with blank space
    text = re.sub(r"[a-zA-Z0-9_\-\.]+@[a-zA-Z0-9_\-\.]+\.[a-zA-Z]{2,5}", " ", text)

    # replacing urls with blank space: the scheme followed by everything up to
    # the next whitespace. The previous pattern needed a '/' after the host
    # (so "http://example.com" survived) and its host part could run across
    # spaces, deleting ordinary words up to the next '/' in the comment.
    text = re.sub(r"\bhttps?://\S+", " ", text)

    # creating a space between a word and the punctuation following it
    text = re.sub(r"([?.!,¿])", r" \1 ", text)
    text = re.sub(r'[" "]+', " ", text)

    # replacing all the stopwords
    text = pattern.sub('', text)

    # removes all the punctuations (every run of non-letters becomes one space)
    text = re.sub(r"[^a-zA-Z]+", " ", text)

    text = text.strip()

    # Start/end tokens are intentionally not added here (kept for reference):
#     text = '<start> ' + text + ' <end>'

    return text

# Element-wise wrapper so clean_text can be applied to a whole NumPy array of
# comments in one call (np.vectorize is a convenience loop, not a speed-up).
clean_text_vect = np.vectorize(clean_text)

In [None]:
def chunk_clean(array,chunk_size=256):
    """Clean every text in ``array`` chunk by chunk, showing a progress bar.

    Args:
        array: sliceable sequence of raw texts.
        chunk_size: number of texts passed to ``clean_text_vect`` per step.

    Returns:
        A NumPy array holding the cleaned texts in their original order.
    """
    cleaned_array = []
    chunk_starts = range(0, len(array), chunk_size)

    for start in tqdm(chunk_starts):
        stop = start + chunk_size
        cleaned_array += list(clean_text_vect(array[start:stop]))

    return np.array(cleaned_array)

Tokenizing and Encoding Functions

In [None]:
def regular_encode(texts, tokenizer, maxlen=512):
    """Tokenise ``texts`` into a fixed-width array of token ids.

    Every text is padded or truncated to exactly ``maxlen`` ids so that the
    result is rectangular.

    Args:
        texts: iterable of strings to encode.
        tokenizer: object exposing a Hugging Face style ``batch_encode_plus``.
        maxlen: length every encoded sequence is padded / truncated to.

    Returns:
        ``np.ndarray`` built from the ``'input_ids'`` entry of the encoding.
    """
    # The former `tokenizer.pad_token = tokenizer.pad_token` style lines were
    # self-assignments with no effect and have been dropped.
    enc_di = tokenizer.batch_encode_plus(
        list(texts),
        return_attention_mask=False,   # was misspelled `return_attention_masks`
        return_token_type_ids=False,
        padding='max_length',          # replaces deprecated pad_to_max_length=True
        truncation=True,               # make the cut-off at max_length explicit
        max_length=maxlen,
        add_special_tokens=True
    )

    return np.array(enc_di['input_ids'])

In [None]:
def chunk_encode(texts,tokenizer,maxlen=512,chunk_size=256):
    """Encode ``texts`` with ``regular_encode`` in chunks, with a progress bar.

    Args:
        texts: sliceable sequence of strings.
        tokenizer: tokenizer forwarded to ``regular_encode``.
        maxlen: sequence length forwarded to ``regular_encode``.
        chunk_size: number of texts encoded per step.

    Returns:
        A NumPy array stacking the encoded rows of every chunk, in order.
    """
    all_enc = []
    for start in tqdm(range(0, len(texts), chunk_size)):
        batch = texts[start:start + chunk_size]
        encoded = regular_encode(batch, tokenizer, maxlen=maxlen)
        all_enc += list(encoded)

    return np.array(all_enc)

In [None]:
def fast_encode(texts, tokenizer, chunk_size=256, maxlen=512):
    """Encode ``texts`` with a ``tokenizers``-style tokenizer, chunk by chunk.

    Truncation and padding to ``maxlen`` are switched on on the tokenizer
    itself (this mutates the tokenizer that is passed in).

    Args:
        texts: array of strings; each slice must support ``.tolist()``.
        tokenizer: object exposing ``enable_truncation``, ``enable_padding``
            and ``encode_batch``.
        chunk_size: number of texts encoded per step.
        maxlen: length passed to the truncation and padding settings.

    Returns:
        A NumPy array of the ``ids`` of every encoding, in input order.
    """
    tokenizer.enable_truncation(max_length=maxlen)
    tokenizer.enable_padding(max_length=maxlen)

    all_ids = []
    for start in tqdm(range(0, len(texts), chunk_size)):
        batch = texts[start:start + chunk_size].tolist()
        for encoding in tokenizer.encode_batch(batch):
            all_ids.append(encoding.ids)

    return np.array(all_ids)

# Defining the Tokenizer

Pre-trained models

In [None]:
# Hugging Face checkpoint identifiers. Only MODEL4 is referenced by the other
# cells visible in this notebook.
# NOTE(review): MODEL, MODEL2 and MODEL3 look like leftovers from earlier
# experiments — confirm before removing.
MODEL = 'google/electra-large-generator'
MODEL2 = 'google/electra-large-discriminator'
MODEL3 = 'gpt2-medium'
MODEL4 = 'roberta-large'

Initializing the Tokenizers

In [None]:
# Load the fast RoBERTa tokenizer that belongs to the MODEL4 checkpoint.
tokenizer = RobertaTokenizerFast.from_pretrained(MODEL4)
# Save the tokenizer files into the current working directory and show what
# save_pretrained returned.
print(tokenizer.save_pretrained('.'))
print(tokenizer)

In [None]:
# bert_tokenizer = transformers.BertTokenizerFast.from_pretrained('bert-large-uncased')
# bert_tokenizer.save_pretrained('.')
# fast_tokenizer = BertWordPieceTokenizer('./vocab.json', lowercase=False)
# fast_tokenizer


# Data Preparation

Data splitting

In [None]:
# The unintended-bias set has fractional toxicity scores: round them to 0/1.
train_data3.toxic = train_data3.toxic.round().astype(int)
# Use the English translation as the comment text of the validation set.
valid_translated['comment_text'] = valid_translated['translated']

# Every source contributes the same two columns.
text_label_cols = ['comment_text', 'toxic']

# Training pool = all of train_data1 + every toxic row of train_data3 +
# a fixed random sample of 200000 non-toxic rows of train_data3 +
# the translated validation set.
data = pd.concat([
    train_data1[text_label_cols],
    train_data3[text_label_cols].query('toxic==1'),
    train_data3[text_label_cols].query('toxic==0').sample(n=200000, random_state=0),
    valid_translated[text_label_cols],
])
data.toxic = data.toxic.round().astype(int)
data.drop_duplicates(inplace=True)

# Translated competition test comments, kept aside for the final prediction.
final_test_data = test_translated.translated.values

print('Number of toxic comments = ', list(data.toxic).count(1))
print('Number of non-toxic comments = ', list(data.toxic).count(0))

In [None]:
# Shuffle the pool once with a fixed seed, then cut it at 94% and 97% into
# train / validation / test parts.
np.random.seed(2048)
shuffled = data.sample(frac=1)
train_end = int(.94*len(data))
valid_end = int(.97*len(data))

# Positional slicing replaces np.split(DataFrame, ...): np.split reaches the
# frame through DataFrame.swapaxes, which newer pandas releases deprecate.
# The three slices contain exactly the rows np.split returned.
train = shuffled.iloc[:train_end]
valid = shuffled.iloc[train_end:valid_end]
test = shuffled.iloc[valid_end:]
del shuffled  # the three slices are the only names this cell is meant to leave behind

print("Train rows = ", train.shape[0])
print("validate rows = ", valid.shape[0])
print("Test rows = ", test.shape[0])
print("\nFinal Test Data rows = ",len(final_test_data))

In [None]:
# Drop the names of the source frames that have already been merged and split,
# then ask the garbage collector to reclaim what it can.
# NOTE(review): after this cell the loading and splitting cells must be re-run
# before any cell that reads these names can run again.
del train_data1
del train_data3
del valid_translated
del data
gc.collect()

Cleaning & Tokenizing Input data and Preparing Labels

In [None]:
# Number of token ids per encoded comment; passed to chunk_encode as `maxlen`
# and to build_model as `max_len`, so both always agree.
MAX_LEN = 128

In [None]:
# Clean the raw comment text of each split.
x_train = chunk_clean(train.comment_text.values)
x_valid = chunk_clean(valid.comment_text.values)
x_test = chunk_clean(test.comment_text.values)

# NOTE(review): final_test_data is overwritten with its cleaned version, so
# this cell reads its own previous output when it is re-run.
final_test_data = chunk_clean(final_test_data)

In [None]:
# Replace the cleaned texts with MAX_LEN-wide arrays of token ids.
# NOTE(review): every name is overwritten in place, so re-running this cell
# without re-running the cleaning cell would pass token ids to the tokenizer.
x_train = chunk_encode(x_train, tokenizer, maxlen=MAX_LEN)
x_valid = chunk_encode(x_valid, tokenizer, maxlen=MAX_LEN)
x_test = chunk_encode(x_test, tokenizer, maxlen=MAX_LEN)

final_test_data = chunk_encode(final_test_data, tokenizer, maxlen=MAX_LEN)

In [None]:
# x_train = fast_encode(x_train, fast_tokenizer, maxlen=MAX_LEN)
# x_valid = fast_encode(x_valid, fast_tokenizer, maxlen=MAX_LEN)
# x_test = fast_encode(x_test, fast_tokenizer, maxlen=MAX_LEN)

# final_test_data = fast_encode(final_test_data, fast_tokenizer, maxlen=MAX_LEN)

In [None]:
# Labels as (n_samples, 1) column vectors.
# reshape(-1, 1) replaces the in-place ndarray.resize: it gives the same shape
# and values, but cannot fail when something else holds a reference to the array.
y_train = np.array(train.toxic.values).reshape(-1, 1)

y_valid = np.array(valid.toxic.values).reshape(-1, 1)

y_test = np.array(test.toxic.values).reshape(-1, 1)

In [None]:
# Sanity report: shapes of the encoded inputs, of the labels, and of the final
# prediction set.
print('New shape of comments and labels after TOKENIZATION and PROCESSING:-')
print('-'*50)
print('Data for Training and Evaluation:\n')
for label, features in (
    ('x_train shape = ', x_train),
    ('x_valid shape = ', x_valid),
    ('x_test shape = ', x_test),
):
    print(label, features.shape)
print('-'*30)
print('Labels shapes:\n')
for label, targets in (
    ('y_train shape = ', y_train),
    ('y_valid shape = ', y_valid),
    ('y_test shape = ', y_test),
):
    print(label, targets.shape)
print('-'*50)
print('The Final data for Predication:\n')
print('final_test_data shape = ', final_test_data.shape)

# Converting to Tensorflow dataset

In [None]:
# Let tf.data choose the prefetch buffer size.
AUTO = tf.data.experimental.AUTOTUNE
# Global batch size: 16 samples for each replica of the distribution strategy.
BATCH_SIZE = 16 * strategy.num_replicas_in_sync

In [None]:
# NOTE: `train`, `valid`, `test` and `final_test_data` are rebound here from
# DataFrames / arrays to tf.data pipelines.

# Training pipeline: endless stream, shuffled through a 2048-element buffer.
train = tf.data.Dataset.from_tensor_slices((x_train, y_train))
train = train.repeat().shuffle(2048).batch(BATCH_SIZE).prefetch(AUTO)

# Validation pipeline: one finite pass, batches cached after the first read
# (no repeat, no shuffle).
valid = tf.data.Dataset.from_tensor_slices((x_valid, y_valid))
valid = valid.batch(BATCH_SIZE).cache().prefetch(AUTO)

# Held-out test pipeline: same shape as the validation pipeline.
test = tf.data.Dataset.from_tensor_slices((x_test, y_test))
test = test.batch(BATCH_SIZE).cache().prefetch(AUTO)

# Prediction pipeline: inputs only, just batched.
final_test_data = tf.data.Dataset.from_tensor_slices(final_test_data).batch(BATCH_SIZE)

# Loading the Model onto the TPU

Creating the model

In [None]:
# Exponentially decaying learning rate: starts at 1e-5 and is multiplied by
# 0.9 every 1000 steps.
# NOTE(review): no visible cell passes this schedule to an optimizer —
# confirm whether it is meant to be used.
lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate=1e-5,
    decay_steps=1000,
    decay_rate=0.9)

In [None]:
def build_model(transformer, max_len=512, learning_rate=1e-5):
    """Build and compile a binary classifier on top of ``transformer``.

    The model feeds ``max_len`` token ids to ``transformer``, takes position 0
    of its first output (presumably the start-of-sequence / CLS vector — TODO
    confirm for the checkpoint in use), and passes it through two swish Dense
    layers (64 and 16 units) and a single sigmoid unit.

    Args:
        transformer: callable Keras-compatible transformer whose first output
            is indexable as ``[batch, position, hidden]``.
        max_len: number of token ids per input row.
        learning_rate: value handed to ``Adam``; defaults to the 1e-5 that was
            previously hard-coded.

    Returns:
        The compiled ``Model`` (binary cross-entropy loss, accuracy metric).
    """
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    sequence_output = transformer(input_word_ids)[0]
    cls_token = sequence_output[:, 0, :]

    out = Dense(64,activation=tf.nn.swish)(cls_token)
    out = Dense(16,activation=tf.nn.swish)(out)
    out = Dense(1, activation='sigmoid')(out)

    model = Model(inputs=input_word_ids, outputs=out)

    # `learning_rate` is the supported keyword; `lr` is a deprecated alias that
    # newer Keras optimizers reject.
    model.compile(Adam(learning_rate=learning_rate),
                  loss='binary_crossentropy',
                  metrics=['accuracy'])

    return model

In [None]:
# Hand-written RoBERTa configuration (24 layers, hidden size 1024, 16 heads),
# with hidden_dropout_prob set to 0.2.
# NOTE(review): `configuration` is only referenced by a commented-out line in
# the model-loading cell; the active code loads the pretrained checkpoint with
# its own configuration instead — confirm whether this cell is still needed.
configs = {
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.2,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "pad_token_id": 1,
  "type_vocab_size": 1,
  "vocab_size": 50265
}

# Build the config object and display it as the cell output.
configuration = RobertaConfig.from_dict(configs)
configuration

In [None]:
# Create the transformer and the classifier inside the strategy scope so that
# their variables are created under the chosen distribution strategy.
with strategy.scope():
    # Pretrained weights for the MODEL4 checkpoint.
    transformer_layer = transformers.TFAutoModel.from_pretrained(MODEL4)
    # Alternative kept for reference: an untrained model built from `configuration`.
#     transformer_layer = TFRobertaModel(configuration)
    model = build_model(transformer_layer, max_len=MAX_LEN)
model.summary()

# Training the Model

In [None]:
# Number of passes over the training set; also the x-axis length of the
# training-curve plots.
EPOCHS = 3

Stage 1

In [None]:
# The training dataset repeats forever, so an "epoch" is defined explicitly as
# the number of full batches in x_train (the remainder is dropped by //).
n_steps = x_train.shape[0] // BATCH_SIZE
# Validation runs over the whole (finite) validation dataset after each epoch.
train_history = model.fit(
    train,
    steps_per_epoch=n_steps,
    validation_data=valid,
#     validation_steps=100,
    epochs=EPOCHS
)

In [None]:
# Training curves: accuracy on the left, loss on the right, each showing the
# training and validation series recorded by model.fit.
epochs_range = range(EPOCHS)

fig, axes = plt.subplots(1, 2, figsize=(16, 5))

# (axes, training key, validation key, y-axis label, fixed y-axis limits)
panels = (
    (axes[0], 'accuracy', 'val_accuracy', 'Accuracy', (0.75, 1)),
    (axes[1], 'loss', 'val_loss', 'Loss', (0.1, 0.35)),
)

for ax, train_key, val_key, y_label, y_limits in panels:
    ax.plot(epochs_range, train_history.history[train_key], label=train_key)
    ax.plot(epochs_range, train_history.history[val_key], label=val_key)
    ax.set_ylim(*y_limits)
    ax.set_xlabel('Epoch')
    ax.set_ylabel(y_label)
    ax.legend(loc='center right')

In [None]:
# Score the trained model on the held-out test split.
loss, accuracy = model.evaluate(test, verbose=1)
# Binary cross-entropy is an unbounded quantity, not a fraction, so it is
# reported as-is rather than multiplied by 100 and labelled "%".
print('Loss = ', loss)
print('Accuracy = ',accuracy*100,'%')

Stage 2

In [None]:
# n_steps = x_valid.shape[0] // BATCH_SIZE
# train_history_2 = model.fit(
#     valid_dataset.repeat(),
#     steps_per_epoch=n_steps,
#     epochs=EPOCHS*2
# )

Creating submission file

In [None]:
# Load the sample submission and overwrite its 'toxic' column with the model's
# predictions for the final test set.
sub1 = pd.read_csv(data_path + 'sample_submission.csv')
# NOTE(review): assumes predict yields one value per sample-submission row, in
# the same order as the translated test file — confirm.
sub1['toxic'] = model.predict(final_test_data, verbose=1)

In [None]:
# Write the submission without the DataFrame index, then preview the first rows.
sub1.to_csv('submission.csv', index=False)
sub1.head(15)

# THE END