<a href="https://colab.research.google.com/github/TapasKumarDutta1/multilingial/blob/master/multilingual.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import os
from transformers import AutoTokenizer
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
from kaggle_datasets import KaggleDatasets
import transformers
from transformers import TFAutoModel, AutoTokenizer
from tqdm.notebook import tqdm
from tokenizers import Tokenizer, models, pre_tokenizers, decoders, processors
from tensorflow.keras.layers import *
import re
import tensorflow as tf
from nltk import sent_tokenize


def regular_encode(texts, tokenizer, maxlen=512):
    """Tokenize a batch of texts into a 2-D array of token ids.

    Args:
        texts: Iterable of strings (list, NumPy array, pandas values, ...).
        tokenizer: Object exposing ``batch_encode_plus`` that returns a
            mapping with an ``"input_ids"`` entry.
        maxlen: Forwarded to the tokenizer as ``max_length``.

    Returns:
        ``np.ndarray`` built from the ``"input_ids"`` of the encoding.
    """
    # Materialise the batch as a plain list: callers pass ``Series.values``
    # (an ndarray), which list-only tokenizer implementations reject.
    batch_texts = list(texts)

    # NOTE(review): ``return_attention_masks`` and ``pad_to_max_length`` are
    # older keyword spellings; newer transformers releases document
    # ``return_attention_mask``, ``padding="max_length"`` and
    # ``truncation=True`` -- confirm against the installed version before
    # changing them.
    enc_di = tokenizer.batch_encode_plus(
        batch_texts,
        return_attention_masks=False,
        return_token_type_ids=False,
        pad_to_max_length=True,
        max_length=maxlen,
    )

    return np.array(enc_di["input_ids"])


def build_model(transformer, max_len=512):
    """Build and compile a binary classifier on top of ``transformer``.

    The hidden state at sequence position 0 of the transformer's first
    output feeds a single sigmoid unit.

    NOTE(review): a second ``build_model`` is defined later in this file and
    rebinds the name, so this version is not the one used by the training
    code further down -- confirm whether it should be kept.

    Args:
        transformer: Callable Keras-compatible transformer whose first
            output is indexed as ``[:, 0, :]``.
        max_len: Length of the integer token-id input.

    Returns:
        The compiled Keras ``Model``.
    """
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    sequence_output = transformer(input_word_ids)[0]
    cls_token = sequence_output[:, 0, :]
    out = Dense(1, activation="sigmoid")(cls_token)
    model = Model(inputs=input_word_ids, outputs=out)
    # ``learning_rate`` is the supported keyword; ``lr`` is a deprecated alias
    # that newer Keras releases no longer accept.
    model.compile(
        Adam(learning_rate=1e-5), loss="binary_crossentropy", metrics=["accuracy"]
    )

    return model


# Try to locate a TPU cluster; a ValueError from the resolver (or from
# reading its master address) is treated as "no TPU available".
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print("Running on TPU ", tpu.master())
except ValueError:
    tpu = None

if not tpu:
    # No TPU: fall back to whatever strategy TensorFlow currently has active.
    strategy = tf.distribute.get_strategy()
else:
    # Connect to the cluster and initialise it before creating the strategy.
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)

print("REPLICAS: ", strategy.num_replicas_in_sync)


# tf.data autotuning sentinel.
AUTO = tf.data.experimental.AUTOTUNE

# GCS path returned by the Kaggle datasets helper.
GCS_DS_PATH = KaggleDatasets().get_gcs_path()
# Batch size applied to the test dataset that is fed to prediction.
batch = 2048
# NOTE(review): the fit calls in this file pass literal epoch counts --
# confirm whether this constant is meant to drive them.
EPOCHS = 2
# Global training batch size: 16 examples per replica in sync.
BATCH_SIZE = 16 * strategy.num_replicas_in_sync
# Presumably the padded token sequence length -- TODO confirm it is wired to
# the encoding and model-building calls.
MAX_LEN = 192
# Model identifier used to load both the tokenizer and the transformer.
MODEL = "jplu/tf-xlm-roberta-large"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
# When True, load_data() appends test rows, labelled from a saved
# submission file, to the training frame.
include_test = False


def load_data():
    """Read the Jigsaw CSV files and assemble the training frame.

    The training frame is the toxic-comment train file plus, from the
    unintended-bias train file (labels rounded to 0/1), every toxic row and
    a random sample of 100000 non-toxic rows. The sample is unseeded, so it
    differs between runs.

    When the module-level ``include_test`` flag is true, test rows labelled
    with the ``toxic`` column of a saved submission file are appended to the
    training frame as well.

    Returns:
        Tuple ``(train, tst, val)`` of DataFrames: the assembled training
        frame, the test frame and the validation frame.
    """
    data_dir = "/kaggle/input/jigsaw-multilingual-toxic-comment-classification"

    trn = pd.read_csv(
        f"{data_dir}/jigsaw-unintended-bias-train.csv",
        usecols=["toxic", "comment_text"],
    )
    val = pd.read_csv(
        f"{data_dir}/validation.csv",
        usecols=["toxic", "comment_text", "lang"],
    )
    tst = pd.read_csv(
        f"{data_dir}/test.csv",
        usecols=["lang", "content"],
    )
    trn1 = pd.read_csv(
        f"{data_dir}/jigsaw-toxic-comment-train.csv",
        usecols=["toxic", "comment_text"],
    )

    # Labels in this file are fractional; round them to hard 0/1 labels.
    trn["toxic"] = trn["toxic"].round().astype(int)

    # ``axis`` must be passed by keyword: the positional form is deprecated
    # and rejected by recent pandas releases.
    train = pd.concat(
        [trn1, trn.query("toxic==1"), trn.query("toxic==0").sample(100000)],
        axis=0,
    ).reset_index(drop=True)

    if include_test:
        sub = pd.read_csv("../input/multilingual1/submission.csv")
        tst["toxic"] = sub["toxic"]
        # The test text lives in ``content``; rename it so the appended rows
        # carry their text in ``comment_text`` like the rest of the training
        # frame instead of arriving with NaN text.
        pseudo_labelled = tst.rename(columns={"content": "comment_text"})[
            ["comment_text", "toxic"]
        ]
        train = pd.concat([train, pseudo_labelled], axis=0).reset_index(drop=True)
    return train, tst, val


# Materialise the training, test and validation frames.
train, test, valid = load_data()

# Language code -> language name handed to the sentence tokenizer.
LANGS = dict(
    en="english",
    it="italian",
    fr="french",
    es="spanish",
    tr="turkish",
    ru="russian",
    pt="portuguese",
)


def get_sentences(text, lang="en"):
    """Split ``text`` into sentences with ``sent_tokenize``.

    ``lang`` is looked up in ``LANGS``; codes that are not listed there fall
    back to ``"english"``.
    """
    language_name = LANGS.get(lang, "english")
    return sent_tokenize(text, language_name)


def exclude_duplicate_sentences(text, lang="en"):
    """Return ``text`` with repeated sentences removed.

    Sentences come from ``get_sentences``; each one is stripped of
    surrounding whitespace, only its first occurrence is kept, and the
    survivors are joined with single spaces in their original order.

    Args:
        text: Text to de-duplicate.
        lang: Language code forwarded to ``get_sentences``.

    Returns:
        The de-duplicated text as a single string.
    """
    stripped = (sentence.strip() for sentence in get_sentences(text, lang))
    # dict.fromkeys keeps first-seen order and gives constant-time duplicate
    # checks, instead of scanning a growing list for every sentence.
    return " ".join(dict.fromkeys(stripped))


# Ordered (pattern, replacement) pairs applied by clean_text().
_CLEAN_SUBSTITUTIONS = (
    (r'[0-9"]', ""),  # digits and double quotes
    (r"#[\S]+\b", ""),  # "#" followed by a run of non-whitespace
    (r"@[\S]+\b", ""),  # "@" followed by a run of non-whitespace
    (r"https?\S+", ""),  # "http"/"https" followed by non-whitespace
    (r"\s+", " "),  # collapse whitespace runs into one space
)


def clean_text(text, lang="en"):
    """Normalise one comment before tokenization.

    The input is converted with ``str()``, each substitution in
    ``_CLEAN_SUBSTITUTIONS`` is applied in order, repeated sentences are
    dropped via ``exclude_duplicate_sentences`` and the result is stripped.

    Args:
        text: Raw comment; any value is accepted because it goes through
            ``str()`` first.
        lang: Language code forwarded to ``exclude_duplicate_sentences``.

    Returns:
        The cleaned text.
    """
    cleaned = str(text)
    for pattern, replacement in _CLEAN_SUBSTITUTIONS:
        cleaned = re.sub(pattern, replacement, cleaned)
    return exclude_duplicate_sentences(cleaned, lang).strip()


# Clean every split. The training frame has no language column, so it uses
# clean_text's default language; validation and test pass their own "lang".
train["comment_text"] = train.apply(lambda x: clean_text(x["comment_text"]), axis=1)
valid["comment_text"] = valid.apply(
    lambda x: clean_text(x["comment_text"], x["lang"]), axis=1
)
test["comment_text"] = test.apply(lambda x: clean_text(x["content"], x["lang"]), axis=1)


x_train = regular_encode(train.comment_text.values, tokenizer, maxlen=192)
x_valid = regular_encode(valid.comment_text.values, tokenizer, maxlen=192)
# Encode the cleaned test text rather than the raw "content" column, so the
# test inputs get the same preprocessing as the training and validation data.
x_test = regular_encode(test.comment_text.values, tokenizer, maxlen=192)

y_train = train.toxic.values
y_valid = valid.toxic.values


train_set = (
    tf.data.Dataset.from_tensor_slices((x_train, y_train))
    .shuffle(2048)
    .batch(BATCH_SIZE)
)
validation_set = (
    tf.data.Dataset.from_tensor_slices((x_valid, y_valid))
    .shuffle(2048)
    .batch(BATCH_SIZE)
    .cache()
)
# Prediction input: token ids only, batched with the separate `batch` size.
tst_set = tf.data.Dataset.from_tensor_slices((x_test)).batch(batch)

# Row counts, used to derive steps per epoch.
trn_ln = train.shape[0]
val_len = valid.shape[0]


def build_model(transformer, max_len=512):
    """Build and compile a binary classifier with pooled transformer outputs.

    The transformer's first output is max-pooled and average-pooled over the
    whole sequence; the two results are concatenated and passed through a
    100-unit ReLU layer, dropout (0.3), a flatten and a final sigmoid unit.
    The model summary is printed before compilation.

    Args:
        transformer: Callable Keras-compatible transformer; its first output
            is used as the sequence of hidden states.
        max_len: Length of the integer token-id input and the pooling window.
            Previously this argument was ignored and 192 was hard-coded, so
            pass 192 explicitly to get the old shapes.

    Returns:
        The compiled Keras ``Model``.
    """
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    sequence_output = transformer(input_word_ids)[0]
    # A pooling window equal to the sequence length reduces each sequence to
    # a single step.
    avg = AveragePooling1D(pool_size=max_len)(sequence_output)
    tot = MaxPooling1D(pool_size=max_len)(sequence_output)
    conc = Concatenate()([tot, avg])
    out = Dense(100, activation="relu")(conc)
    drp = Dropout(0.3)(out)
    conc = Flatten()(drp)
    out_1 = Dense(1, activation="sigmoid")(conc)

    model = Model(inputs=input_word_ids, outputs=out_1)
    model.summary()
    # ``learning_rate`` is the supported keyword; ``lr`` is a deprecated alias
    # that newer Keras releases no longer accept.
    model.compile(
        Adam(learning_rate=1e-5), loss="binary_crossentropy", metrics=["accuracy"]
    )

    return model


# Create the transformer and the classifier inside the distribution scope.
with strategy.scope():
    transformer_layer = TFAutoModel.from_pretrained(MODEL)
    mod = build_model(transformer_layer, 192)
mod.summary()

# With steps_per_epoch set, Keras keeps drawing from one iterator across
# epochs, so a finite dataset runs out after the first epoch and training is
# cut short. Repeating the dataset lets every requested epoch run.
mod.fit(
    train_set.repeat(),
    steps_per_epoch=trn_ln // BATCH_SIZE,
    validation_data=validation_set,
    epochs=3,
)

# Continue training on the validation data (same repeat reasoning as above).
mod.fit(validation_set.repeat(), steps_per_epoch=val_len // BATCH_SIZE, epochs=2)

sub = pd.read_csv(
    "/kaggle/input/jigsaw-multilingual-toxic-comment-classification/sample_submission.csv"
)
pre = mod.predict(tst_set, verbose=1)
# Flatten the predictions to 1-D before assigning them as a single column.
sub["toxic"] = pre.ravel()
sub.to_csv("submission.csv", index=False)
