In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud, STOPWORDS
import string
import re
import nltk
import spacy
import random
import missingno as msno
import tensorflow as tf
from transformers import TFAutoModel, AutoTokenizer
import os
from tqdm.notebook import tqdm
import tensorflow_hub as hub
from sklearn.model_selection import StratifiedKFold
from sklearn.utils import shuffle
import tensorflow.keras.backend as K
%matplotlib inline

In [12]:
# Directory holding the Google-translated copies of the training set.
TRANSLATED_TRAIN_DIR = "../input/jigsaw-train-multilingual-coments-google-api"


def load_translated_train(lang, n_non_toxic = 30000, seed = 1):
    """Load one translated training file and down-sample its non-toxic rows.

    Parameters
    ----------
    lang : str
        Language code that appears in the file name (e.g. "tr", "ru").
    n_non_toxic : int
        Number of rows with ``toxic == 0`` to keep.
    seed : int
        Seed used for the non-toxic sample and for the final shuffle, so the
        returned frame is the same on every run.

    Returns
    -------
    pandas.DataFrame
        Every ``toxic == 1`` row plus ``n_non_toxic`` sampled ``toxic == 0``
        rows, shuffled, with only the ``comment_text`` and ``toxic`` columns.
    """
    path = f"{TRANSLATED_TRAIN_DIR}/jigsaw-toxic-comment-train-google-{lang}-cleaned.csv"
    labelled = pd.read_csv(path)[["comment_text", "toxic"]]
    toxic_rows = labelled.query("toxic == 1")
    non_toxic_rows = labelled.query("toxic == 0").sample(n_non_toxic, random_state = seed)
    return shuffle(pd.concat([toxic_rows, non_toxic_rows]), random_state = seed)


train_tr = load_translated_train("tr")
train_ru = load_translated_train("ru")
train_it = load_translated_train("it")
train_fr = load_translated_train("fr")
train_pt = load_translated_train("pt")
train_es = load_translated_train("es")

In [13]:
# Official competition files: labelled validation set, unlabelled test set and
# the submission template, read in that order.
_competition_files = (
    '/kaggle/input/jigsaw-multilingual-toxic-comment-classification/validation.csv',
    '/kaggle/input/jigsaw-multilingual-toxic-comment-classification/test.csv',
    '/kaggle/input/jigsaw-multilingual-toxic-comment-classification/sample_submission.csv',
)
valid, test, sub = (pd.read_csv(csv_path) for csv_path in _competition_files)

In [14]:
def make_balanced_valid(train_lang, lang, n_per_class = 1250, seed = 2):
    """Build a class-balanced frame for one translated language.

    Parameters
    ----------
    train_lang : pandas.DataFrame
        Training frame of the language; its ``toxic == 1`` rows are sampled.
    lang : str
        Language code used to locate the translated CSV from which the
        ``toxic == 0`` rows are sampled. It must be the same language as
        ``train_lang`` (the previous code mixed Turkish non-toxic rows into
        the Russian frame).
    n_per_class : int
        Number of rows drawn for each class.
    seed : int
        Seed for both draws and the shuffle. It deliberately differs from the
        ``random_state = 1`` used for the 30000-row training sample so this
        draw is not tied to the same random stream.

    Returns
    -------
    pandas.DataFrame
        ``2 * n_per_class`` shuffled rows with ``comment_text`` and ``toxic``.
    """
    source_path = (
        "../input/jigsaw-train-multilingual-coments-google-api/"
        f"jigsaw-toxic-comment-train-google-{lang}-cleaned.csv"
    )
    toxic_rows = train_lang.query("toxic == 1").sample(n_per_class, random_state = seed)
    non_toxic_rows = (
        pd.read_csv(source_path)[["comment_text", "toxic"]]
        .query("toxic == 0")
        .sample(n_per_class, random_state = seed)
    )
    return shuffle(pd.concat([toxic_rows, non_toxic_rows]), random_state = seed)


# ru / fr / pt: balanced frames built from the translated training data.
valid_ru = make_balanced_valid(train_ru, "ru")
valid_fr = make_balanced_valid(train_fr, "fr")
valid_pt = make_balanced_valid(train_pt, "pt")

# it / es / tr: taken directly from the official validation file by language.
official_valid = valid[["comment_text", "lang", "toxic"]]
valid_it = official_valid.query("lang == 'it'")
valid_es = official_valid.query("lang == 'es'")
valid_tr = official_valid.query("lang == 'tr'")

In [15]:
# Feed all six languages (ru, es, fr, it, pt, tr) to XLM-RoBERTa

In [16]:
# Pool the six per-language training frames. The shuffle is seeded so that
# later seeded sampling of `train` selects the same rows on every run.
train = shuffle(
    pd.concat([train_ru, train_es, train_fr, train_it, train_pt, train_tr]),
    random_state = 1,
)

# Pool the six per-language validation frames.
# NOTE: this rebinds `valid`, which previously held the official validation
# file; cells that query `valid` by language must run before this one.
valid = shuffle(
    pd.concat([valid_ru, valid_es, valid_fr, valid_it, valid_pt, valid_tr]),
    random_state = 1,
)

In [17]:
# Append the validation rows to the training pool, then keep a seeded
# 120000-row subset as the final training set.
train_and_valid = pd.concat([train, valid])
train = train_and_valid.sample(n = 120000, random_state = 1)

In [18]:
# Name of the pretrained checkpoint the tokenizer is loaded from.
PRETRAINED_NAME = "jplu/tf-xlm-roberta-large"
tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_NAME)

In [19]:
# Fixed sequence length, in tokens and including the two special tokens, that
# every text is truncated or padded to; also the model's input width.
MAX_LEN = 192

In [20]:
def preprocess(data, max_seq_length = MAX_LEN, tokenizer = tokenizer):
    """Convert raw texts into fixed-length id / mask / segment arrays.

    Each text is tokenized, truncated so that it fits together with one
    leading and one trailing special token, converted to vocabulary ids and
    right-padded to ``max_seq_length``.

    Parameters
    ----------
    data : sequence of str
        Texts to encode; iterated in order.
    max_seq_length : int
        Length of every output row, special tokens included.
    tokenizer : object
        Tokenizer providing ``tokenize`` and ``convert_tokens_to_ids``. Its
        ``cls_token``, ``sep_token`` and ``pad_token_id`` attributes are used
        when present.

    Returns
    -------
    tuple of numpy.ndarray
        ``(ids, masks, segment)`` with one row of ``max_seq_length`` entries
        per input text. ``masks`` is 1 on real tokens and 0 on padding;
        ``segment`` is all zeros.
    """
    # Take the special tokens from the tokenizer itself. XLM-RoBERTa's
    # vocabulary uses <s> / </s> / <pad> (see the Transformers tokenizer
    # docs), so the BERT literals "[CLS]" / "[SEP]" would be looked up as
    # unknown tokens and 0 is not its padding id. The old literals are kept
    # only as a fallback for tokenizers that lack these attributes.
    start_token = getattr(tokenizer, "cls_token", None) or "[CLS]"
    end_token = getattr(tokenizer, "sep_token", None) or "[SEP]"
    pad_id = getattr(tokenizer, "pad_token_id", None)
    if pad_id is None:
        pad_id = 0

    ids = []
    masks = []
    segment = []
    for text in tqdm(data):
        # Leave room for the two special tokens.
        tokens = tokenizer.tokenize(text)[: max_seq_length - 2]

        # Converting tokens to ids
        input_ids = tokenizer.convert_tokens_to_ids([start_token] + tokens + [end_token])

        # Attention mask: 1 for every real token (special tokens included).
        input_masks = [1] * len(input_ids)

        # Right-pad ids and mask up to the fixed length.
        padding = max_seq_length - len(input_ids)
        input_ids.extend([pad_id] * padding)
        input_masks.extend([0] * padding)

        ids.append(input_ids)
        masks.append(input_masks)
        # Single-segment input: the segment ids are all zero.
        segment.append([0] * max_seq_length)

    return (np.array(ids), np.array(masks), np.array(segment))

In [21]:
# Encode the training comments into fixed-length id / mask / segment arrays.
train_texts = train["comment_text"].values
train_ids, train_masks, train_segment = preprocess(train_texts)

HBox(children=(FloatProgress(value=0.0, max=120000.0), HTML(value='')))




In [22]:
# Encode the test comments (column `content`) the same way as the training set.
test_texts = test["content"].values
test_ids, test_masks, test_segment = preprocess(test_texts)

HBox(children=(FloatProgress(value=0.0, max=63812.0), HTML(value='')))




In [23]:
# Labels from the `toxic` column of `train`, in the same row order as the
# arrays produced by preprocess(train["comment_text"].values).
y_train = train["toxic"].values

In [24]:
# Look for a TPU. The resolver needs no arguments when the TPU_NAME
# environment variable is set, which is always the case on Kaggle; a
# ValueError is treated as "no TPU available".
tpu = None
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    tpu = None

if not tpu:
    # Default TensorFlow distribution strategy; works on CPU and single GPU.
    strategy = tf.distribute.get_strategy()
else:
    # Connect to the TPU cluster and initialise it before building the strategy.
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)

print("REPLICAS: ", strategy.num_replicas_in_sync)

Running on TPU  grpc://10.0.0.2:8470
REPLICAS:  8


In [25]:
# Global batch size: 16 examples for each replica of the active strategy.
PER_REPLICA_BATCH = 16
BATCH_SIZE = PER_REPLICA_BATCH * strategy.num_replicas_in_sync

In [26]:
def model(roberta_layer, max_len = MAX_LEN):
    """Build the binary toxicity classifier on top of a transformer encoder.

    Parameters
    ----------
    roberta_layer : callable
        Encoder called with the list ``[input_word_ids, input_mask,
        segment_ids]``. Its first output must be the per-token hidden states,
        a 3-D tensor ``(batch, max_len, hidden)``, because the head applies
        ``Conv1D`` to it.
    max_len : int
        Length of each of the three integer input sequences.

    Returns
    -------
    tf.keras.Model
        Model mapping the three inputs to one sigmoid probability per example.
    """
    input_word_ids = tf.keras.layers.Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    input_mask = tf.keras.layers.Input(shape=(max_len,), dtype=tf.int32, name="input_mask")
    segment_ids = tf.keras.layers.Input(shape=(max_len,), dtype=tf.int32, name="segment_ids")

    encoder_outputs = roberta_layer([input_word_ids, input_mask, segment_ids])
    # The first encoder output is the per-token sequence output
    # (batch, max_len, hidden); the pooled vector, if any, comes second.
    # Indexing rather than tuple-unpacking also works when the encoder
    # returns a dict-like output object instead of a plain tuple.
    sequence_output = encoder_outputs[0]

    # Convolutional head over the token dimension.
    x = tf.keras.layers.Dropout(0.1)(sequence_output)
    x = tf.keras.layers.Conv1D(128, 2, padding = "same")(x)
    x = tf.keras.layers.ReLU()(x)
    x = tf.keras.layers.Conv1D(16, 2, padding = "same")(x)
    x = tf.keras.layers.ReLU()(x)
    # One score per token, flattened to (batch, max_len) and then reduced to
    # a single probability.
    x = tf.keras.layers.Dense(1)(x)
    x = tf.keras.layers.Flatten()(x)
    output = tf.keras.layers.Dense(1, activation = "sigmoid")(x)

    return tf.keras.Model(inputs = [input_word_ids, input_mask, segment_ids], outputs = output)

In [27]:
# Create and compile the model inside the strategy scope so that its
# variables are created under the active distribution strategy.
with strategy.scope():
    roberta_layer = TFAutoModel.from_pretrained("jplu/tf-xlm-roberta-large", trainable = True)
    # NOTE: this rebinds `model` from the builder function to the built Keras
    # model, so this cell can only be re-run after re-running the cell that
    # defines `model(...)`.
    model = model(roberta_layer)
    # `learning_rate` is the supported argument name; `lr` is a deprecated alias.
    model.compile(
        tf.keras.optimizers.Adam(learning_rate = 1e-5),
        loss = "binary_crossentropy",
        metrics = ["accuracy"],
    )
model.summary()

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=3271420488.0, style=ProgressStyle(descr…


Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_word_ids (InputLayer)     [(None, 192)]        0                                            
__________________________________________________________________________________________________
input_mask (InputLayer)         [(None, 192)]        0                                            
__________________________________________________________________________________________________
segment_ids (InputLayer)        [(None, 192)]        0                                            
__________________________________________________________________________________________________
tf_roberta_model (TFRobertaMode ((None, 192, 1024),  559890432   input_word_ids[0][0]             
                                                                 input_mask[0][0]            

In [28]:
# Stratified 3-fold training; test-set predictions are collected per fold.
# The splitter is seeded so that the folds are the same on every run.
#
# NOTE(review): the same `model` instance is fitted in every fold and is never
# re-initialised, so folds 2 and 3 continue training from the previous fold's
# weights and their validation rows were already trained on. The reported
# validation metrics for those folds are therefore optimistic.
skf = StratifiedKFold(n_splits = 3, shuffle = True, random_state = 1)

preds = []
for fold_no, (train_index, test_index) in enumerate(skf.split(train_ids, y_train), start = 1):
    print("\n")
    print("#" * 20)
    print(f"FOLD No {fold_no}")
    print("#" * 20)

    # Training part of the fold.
    tr_ids = train_ids[train_index]
    tr_masks = train_masks[train_index]
    tr_segment = train_segment[train_index]
    y_tr = y_train[train_index]

    # Held-out part of the fold.
    vd_ids = train_ids[test_index]
    vd_masks = train_masks[test_index]
    vd_segment = train_segment[test_index]
    y_vd = y_train[test_index]

    history = model.fit(
        (tr_ids, tr_masks, tr_segment), y_tr,
        epochs = 2,
        batch_size = BATCH_SIZE,
        validation_data = ((vd_ids, vd_masks, vd_segment), y_vd),
        steps_per_epoch = len(tr_ids) // BATCH_SIZE,
    )

    # Predict with the training batch size instead of Keras' default of 32,
    # so that inference uses the same per-replica batch as training.
    predictions = model.predict((test_ids, test_masks, test_segment), batch_size = BATCH_SIZE)
    preds.append(predictions)

    # Kept from the original; `model` is still referenced and reused afterwards.
    K.clear_session()




####################
FOLD No 1
####################
Epoch 1/2


  num_elements)


Epoch 2/2


####################
FOLD No 2
####################
Epoch 1/2
Epoch 2/2


####################
FOLD No 3
####################
Epoch 1/2
Epoch 2/2


In [31]:
# Weighted blend of the three per-fold test predictions; later folds get
# larger weights.
FOLD_WEIGHTS = (0.2, 0.3, 0.5)
predictions = sum(weight * preds[fold] for fold, weight in enumerate(FOLD_WEIGHTS))

In [34]:
# Write the blended predictions to the submission file. This works on a copy
# and avoids `inplace = True`, so the cell can be re-run: `sub` keeps its `id`
# column. The (n, 1) prediction array is flattened before it is assigned to
# the column.
submission = sub.copy()
submission["toxic"] = np.asarray(predictions).ravel()
submission.set_index("id").to_csv("submission.csv")