In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk /kaggle/input recursively and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import tensorflow as tf
# Try to resolve a TPU; a ValueError from the resolver is treated as "no TPU".
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    tpu = None
if tpu:
    # Connect to the TPU cluster, initialise it, then build a TPU strategy on it.
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
else:
    # No TPU resolved: use whatever strategy tf.distribute.get_strategy() returns.
    strategy = tf.distribute.get_strategy()
# BATCH_SIZE is later scaled by this replica count.
print("REPLICAS: ", strategy.num_replicas_in_sync)

In [None]:
import os
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
from transformers import TFAutoModel, AutoTokenizer
import transformers
from tokenizers import Tokenizer, models, pre_tokenizers, decoders, processors
from tokenizers import BertWordPieceTokenizer
from tqdm import tqdm

In [None]:
# Load the jigsaw-unintended-bias training CSV into a DataFrame.
df = pd.read_csv("../input/jigsaw-multilingual-toxic-comment-classification/jigsaw-unintended-bias-train.csv")

In [None]:
# Display (rows, columns) of the loaded frame.
df.shape

In [None]:
# Display the column names of the loaded frame.
df.columns

In [None]:
# Preview the first 10 rows.
df.head(10)

In [None]:
# For every column, print its name, its count of missing values and its count of distinct values.
for i in df:
    print(i,df[i].isna().sum(),df[i].nunique())

In [None]:
# Remove rows whose comment_text occurs more than once. keep=False drops every
# copy of a duplicated text (none is kept), and inplace=True mutates df directly.
df.drop_duplicates(subset ="comment_text", 
                     keep = False, inplace = True)

In [None]:
# Collect the columns with more than 1,000,000 missing values, then drop them
# from df in a single in-place call.
sparse_columns = [column for column in df if df[column].isna().sum() > 1000000]
df.drop(columns=sparse_columns, inplace=True)

In [None]:
# Mean of the 'toxic' column per publication_id, highest first.
df.groupby('publication_id')['toxic'].mean().sort_values(ascending=False)

In [None]:
# Mean of the 'toxic' column per distinct 'funny' value, highest first.
df.groupby('funny')['toxic'].mean().sort_values(ascending=False)

In [None]:
# Mean of the 'toxic' column per distinct 'wow' value, highest first.
df.groupby('wow')['toxic'].mean().sort_values(ascending=False)

In [None]:
# Mean of the 'toxic' column per distinct 'sexual_explicit' value, highest first.
df.groupby('sexual_explicit')['toxic'].mean().sort_values(ascending=False)

In [None]:
# Mean of the 'toxic' column per identity_annotator_count, highest first.
df.groupby('identity_annotator_count')['toxic'].mean().sort_values(ascending=False)

In [None]:
# Mean of the 'toxic' column per toxicity_annotator_count, highest first.
df.groupby('toxicity_annotator_count')['toxic'].mean().sort_values(ascending=False)

In [None]:
# Show the sexual_explicit score and the text of the row whose index label is 5.
# The lookup is by label, so it raises KeyError if that row was dropped earlier.
print(df.loc[5, 'sexual_explicit'], df.loc[5, 'comment_text'])

In [None]:
# Score columns that decide whether a comment counts as toxic.
label_cols = ['toxic', 'severe_toxicity', 'obscene', 'threat', 'insult', 'identity_attack', 'sexual_explicit']
# Non-toxic: every score column equals 0.
df_nontoxic = df[(df[label_cols] == 0).all(axis=1)]
# Toxic: at least one score column differs from 0.
df_toxic = df[(df[label_cols] != 0).any(axis=1)]

In [None]:
# Keep only id, text and label for each group; copies avoid writing into views of df.
new1 = df_nontoxic.loc[:, ['id', 'comment_text', 'toxic']].copy()
new2 = df_toxic.loc[:, ['id', 'comment_text', 'toxic']].copy()
# Every row in the toxic group gets the hard label 1.
new2['toxic'] = 1

In [None]:
# Stack the non-toxic and toxic frames into one frame with a fresh 0..n-1 index.
chakri = pd.concat([new1, new2], ignore_index=True)

In [None]:
import matplotlib.pyplot as plt
# Character count of every comment, then a histogram of those counts.
length = chakri['comment_text'].map(len)
plt.hist(length)

In [None]:
# Attach the per-comment lengths as a temporary column (dropped again in a later cell).
chakri['length']=length

In [None]:
# Mean of the 'toxic' label per distinct comment length, highest first.
chakri.groupby('length')['toxic'].mean().sort_values(ascending=False)

In [None]:
 # Remove the temporary 'length' column in place; re-running this cell alone raises because the column is gone.
 chakri.drop(columns='length',inplace=True)

In [None]:
# Drop the names of the intermediate frames; only `chakri` is used from here on.
del df
del df_toxic
del df_nontoxic
del new1
del new2

In [None]:
# Count how many rows carry each value of the 'toxic' label.
chakri['toxic'].value_counts()

In [None]:
from sklearn.model_selection import train_test_split
# First split: hold out 20% as validation (X_val / y_valid); the remaining 80% is X_train_test / Y_train_test.
X_train_test, X_val, Y_train_test, y_valid = train_test_split(chakri["comment_text"], chakri['toxic'], test_size=0.2,random_state=44)
# Second split: hold out 7% of that remainder as the test set (X_test / Y_test); the rest is X_train / y_train.
X_train ,X_test ,y_train,Y_test = train_test_split(X_train_test, Y_train_test, test_size=0.07,random_state=44)

In [None]:
# Print the sizes of the train, validation and test text splits.
print(X_train.shape)
print(X_val.shape)
print(X_test.shape)

In [None]:
def fast_encode(texts, tokenizer, chunk_size=256, maxlen=512):
    """
    Encode texts into fixed-length integer id sequences with a `tokenizers` tokenizer.

    Args:
        texts: sliceable sequence of strings (pandas Series, numpy array or list).
        tokenizer: object exposing enable_truncation, enable_padding and
            encode_batch, whose encodings carry an `.ids` attribute.
        chunk_size: number of texts passed to encode_batch per call.
        maxlen: length every sequence is truncated and padded to.

    Returns:
        np.ndarray built from the list of per-text id lists.
    """
    tokenizer.enable_truncation(max_length=maxlen)
    # The padding keyword was renamed from `max_length` to `length` in later
    # `tokenizers` releases; try the new name first and fall back to the old one.
    try:
        tokenizer.enable_padding(length=maxlen)
    except TypeError:
        tokenizer.enable_padding(max_length=maxlen)

    all_ids = []
    for start in tqdm(range(0, len(texts), chunk_size)):
        chunk = texts[start:start + chunk_size]
        # Series and arrays expose .tolist(); plain lists and tuples do not.
        text_chunk = chunk.tolist() if hasattr(chunk, 'tolist') else list(chunk)
        encs = tokenizer.encode_batch(text_chunk)
        all_ids.extend(enc.ids for enc in encs)

    return np.array(all_ids)

In [None]:
def regular_encode(texts, tokenizer, maxlen=512):
    """
    Encode texts through the tokenizer's batch_encode_plus and return the ids.

    Attention masks and token type ids are not requested; padding to
    `maxlen` is requested. Only the 'input_ids' entry of the result is
    used, converted to a numpy array.
    """
    encode_options = {
        'return_attention_masks': False,
        'return_token_type_ids': False,
        'pad_to_max_length': True,
        'max_length': maxlen,
    }
    encoded = tokenizer.batch_encode_plus(texts, **encode_options)
    input_ids = encoded['input_ids']
    return np.array(input_ids)

In [None]:
# Training configuration.
AUTO = tf.data.experimental.AUTOTUNE
EPOCHS = 3
# Global batch size: 16 examples per replica.
BATCH_SIZE = 16 * strategy.num_replicas_in_sync
# Sequence length passed to the encoder and to build_model.
MAX_LEN = 192
# NOTE(review): MODEL is not referenced by any visible cell; the tokenizer and
# model cells load 'distilbert-base-multilingual-cased' instead.
MODEL = 'jplu/tf-xlm-roberta-large'

In [None]:
# Load the multilingual DistilBERT tokenizer and save its files to the working directory.
tokenizer = transformers.DistilBertTokenizer.from_pretrained('distilbert-base-multilingual-cased')
tokenizer.save_pretrained('.')
# Build the fast WordPiece tokenizer from 'vocab.txt' (presumably the file written by
# save_pretrained above -- verify), keeping the original casing.
fast_tokenizer = BertWordPieceTokenizer('vocab.txt', lowercase=False)
fast_tokenizer

In [None]:
# Encode each text split to id arrays of length MAX_LEN; astype(str) casts every value to a string first.
x_train = fast_encode(X_train.astype(str), fast_tokenizer, maxlen=MAX_LEN)
x_valid = fast_encode(X_val.astype(str), fast_tokenizer, maxlen=MAX_LEN)
x_test = fast_encode(X_test.astype(str), fast_tokenizer, maxlen=MAX_LEN)

In [None]:
# Training pipeline: repeats indefinitely (so fit needs steps_per_epoch), shuffles
# with a 2048-element buffer, batches to BATCH_SIZE and prefetches.
train_dataset = (tf.data.Dataset.from_tensor_slices((x_train, y_train)).repeat()
                .shuffle(2048)
                .batch(BATCH_SIZE)
                .prefetch(AUTO))

In [None]:
# Validation pipeline: batched, cached after the first pass, and prefetched; not shuffled or repeated.
valid_dataset = (tf.data.Dataset.from_tensor_slices((x_valid, y_valid))
                .batch(BATCH_SIZE).cache()
                .prefetch(AUTO))

In [None]:
# Test pipeline: inputs only (no labels), batched in the original order.
test_dataset = (tf.data.Dataset.from_tensor_slices(x_test).batch(BATCH_SIZE))

In [None]:
# class MyModel(Model):
#     def __init__(self, **kwargs):
#         super().__init__(**kwargs)
#     def predict1(self, x, threshold=0.32):
#         proba = super().predict(x)
#         print(proba)
#         return proba[proba>threshold].astype(int)

In [None]:
def build_model(transformer, max_len=512):
    """
    Wrap a transformer in a compiled binary classifier.

    Args:
        transformer: callable layer/model applied to an int32 id tensor; the
            first element of its output is used as the sequence output.
        max_len: length of the input id sequences.

    Returns:
        A compiled Keras Model mapping `input_word_ids` to one sigmoid output.
    """
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    sequence_output = transformer(input_word_ids)[0]
    # Classify from the vector at sequence position 0 only.
    cls_token = sequence_output[:, 0, :]
    out = Dense(1, activation='sigmoid')(cls_token)

    model = Model(inputs=input_word_ids, outputs=out)
    # `learning_rate` is the supported keyword; `lr` is a deprecated alias that
    # newer Keras releases reject.
    model.compile(Adam(learning_rate=1e-5), loss='binary_crossentropy', metrics=['accuracy'])

    return model

In [None]:
%%time
# Create the pretrained transformer and the classifier inside the distribution
# strategy's scope.
with strategy.scope():
    transformer_layer = (
        transformers.TFDistilBertModel
        .from_pretrained('distilbert-base-multilingual-cased')
    )
    model = build_model(transformer_layer, max_len=MAX_LEN)
model.summary()

In [None]:
# Number of encoded training and validation examples.
print(x_train.shape[0])
print(x_valid.shape[0])

In [None]:
# train_dataset repeats forever and yields BATCH_SIZE examples per step, so one
# pass over the training data is (examples // BATCH_SIZE) steps. Using the raw
# example count here would make every "epoch" BATCH_SIZE passes over the data.
n_steps = max(1, x_train.shape[0] // BATCH_SIZE)
train_history = model.fit(
    train_dataset,
    steps_per_epoch=n_steps,
    validation_data=valid_dataset,
    epochs=EPOCHS
)

In [None]:
# Second stage: continue training on the validation set. valid_dataset is already
# batched, so one pass is (examples // BATCH_SIZE) steps, not one step per example.
# NOTE(review): after this cell the validation data is no longer held out.
n_steps = max(1, x_valid.shape[0] // BATCH_SIZE)
train_history_2 = model.fit(
    valid_dataset.repeat(),
    steps_per_epoch=n_steps,
    epochs=EPOCHS*2
)

In [None]:
# Train vs. validation accuracy per epoch of the first fit.
hist = train_history.history
# Take the x-axis from the recorded history instead of a hard-coded epoch count.
epochs_ran = range(len(hist['accuracy']))
plt.plot(epochs_ran, hist['accuracy'], label='acc')
plt.plot(epochs_ran, hist['val_accuracy'], label='val acc')
plt.legend()
plt.title('Train Vs Val Accuracy')
plt.show()

In [None]:
# Train vs. validation loss per epoch of the first fit.
hist = train_history.history
# Take the x-axis from the recorded history instead of a hard-coded epoch count.
epochs_ran = range(len(hist['loss']))
plt.plot(epochs_ran, hist['loss'], label='train loss')
plt.plot(epochs_ran, hist['val_loss'], label='val loss')
plt.legend()
plt.show()

In [None]:
# Sigmoid outputs of the model for the test batches.
y = model.predict(test_dataset)

In [None]:
# Shape of the prediction array.
y.shape

In [None]:
# Frequency of each predicted score. The model's output layer has one unit, so the
# predictions are 2-D (n, 1); flatten them because a DataFrame column must be 1-D.
pd.DataFrame({"y": np.ravel(y)}).y.value_counts()

In [None]:
# Pair each test comment with its predicted score. The frame takes its index from
# X_test, and the flattened scores are placed positionally.
output = pd.DataFrame({
    'comment_text': X_test,
    'toxic': np.ravel(y),
})

In [None]:
# First 10 test comments with their predicted scores.
output.head(10)

In [None]:
# Last 10 test comments with their predicted scores.
output.tail(10)

### ROC Score

In [None]:
# Ground-truth test labels and predicted scores as plain arrays.
true_y = Y_test.values
pred_y = output.toxic.values

In [None]:
# Compute the ROC curve on the test split, pick the threshold with the largest
# geometric mean of TPR and (1 - FPR), and plot the curve with that point marked.
from numpy import sqrt, argmax
from matplotlib import pyplot
from sklearn.metrics import roc_curve, auc
fpr, tpr, thresholds = roc_curve(Y_test.values, output.toxic.values)
# calculate the g-mean for each threshold
gmeans = sqrt(tpr * (1-fpr))
# locate the index of the largest g-mean
ix = argmax(gmeans)
print('Best Threshold=%f, G-Mean=%.3f' % (thresholds[ix], gmeans[ix]))
# Diagonal reference line, then the model's curve and the selected operating point.
pyplot.plot([0,1], [0,1], linestyle='--', label='No Skill')
pyplot.plot(fpr, tpr, marker='.', label='Model')
pyplot.scatter(fpr[ix], tpr[ix], marker='o', color='black', label='Best')
pyplot.xlabel('False Positive Rate')
pyplot.ylabel('True Positive Rate')
pyplot.legend()
pyplot.show()

In [None]:
# Use the threshold the ROC cell just selected (largest G-mean) rather than a
# number copied from an earlier run, which goes stale whenever the split or the
# trained weights change.
bestThreshold = float(thresholds[ix])

In [None]:
def getPredictions(pred_y,threshold):
    """
    Binarise scores: values below `threshold` become 0, values at or above it become 1.

    Args:
        pred_y: array-like of scores; it is copied, never modified.
        threshold: cut-off compared against each score.

    Returns:
        A new numpy array with the same shape as `pred_y` holding 0s and 1s.
    """
    temp = np.copy(pred_y)
    # Compute both masks from the original scores before writing anything. If the
    # second mask were taken after the zeros are written, those zeros would be
    # re-classified as positive whenever threshold <= 0.
    below = temp < threshold
    at_or_above = temp >= threshold
    temp[below] = 0
    temp[at_or_above] = 1
    return temp

Best threshold by ROC G-mean: 0.295179 (with a 0.1 test size)
Best threshold by ROC G-mean: 0.377759 (with a 0.07 test size)

Best threshold by accuracy: 0.68

In [None]:
def getTPR(cm):
    """
    Return the true-positive rate TP / (TP + FN) from a 2x2 confusion matrix.

    Args:
        cm: 2x2 matrix indexed as cm[row][col]; cm[1][1] is read as the true
            positives and cm[1][0] as the false negatives.

    Returns:
        The true-positive rate, or NaN when the matrix contains no positives.
    """
    TP = cm[1][1]
    FN = cm[1][0]
    positives = TP + FN
    # With no actual positives the rate is undefined; return NaN explicitly
    # instead of dividing by zero.
    if positives == 0:
        return float('nan')
    return TP / positives

In [None]:
from sklearn.metrics import confusion_matrix
# Confusion matrix and true-positive rate at bestThreshold. getPredictions needs
# the scores as its first argument; calling it with only the threshold raises TypeError.
cm = confusion_matrix(true_y, getPredictions(pred_y, bestThreshold))
print(cm)
getTPR(cm)

In [None]:
# Same evaluation at an alternative threshold; the scores must be passed explicitly.
cm = confusion_matrix(true_y, getPredictions(pred_y, 0.311984))
print(cm)
getTPR(cm)

In [None]:
# Hand-picked example comments used further down as a qualitative check of the model.
sentences = [
    "what the hell is going on here",
    "hey, you are looking sexy",
    "What a piece of shit",
    "Go kill yourself",
    "What's up buddy 🖕",
    "Holy shit! An actual unpopular opinion, take an upvote you freak",
    "Why are there so many replies from salty men (pun intended) in this thread talking about \
how wearing a condom is like infringing on their rights or some shit lmao.",
    "I'm sure women would fully support it.",
    "It hurts you that much to wear one?",
    "Men are such little bitches about condoms. \
If i have to deal with birth control, periods, and other wacky vagina stuff you can deal with a piece of rubber",
    "stopped at 'as a female'.",
    "I wonder how many simps upvoted this shit",
    "Yeah ok. Makes me vomit.",
    "Here I am, busting my ass in a restaurant when I should be bumming around giving $3 to random college chicks",
    "The amount of idiots defending this in these comments is staggering",
    "What in the fuck?",
    "the time of the week were this shit is reposted again this shits probably been on facebook already",
    "Why are people so fucking stupid.",
    "Jesus how many fucking times is this going to be reposted by karma whores?",
    "BuT tHeY'rE iNsPiRaTiOnAl",
    "Prostitutes don't get raped; they get paid. Also, how many rapes do you hear about at nudist colonies? I think I rest my case.",
    "Literally used by design to keep you bickering while both sides rob you blind.",
    "This is a prime example of why I dont get modernism. Like.... the portrai on the right looks like my \
four year old drew it and someone shaded it for them. Im sure that incredibly ignorant but I dont know why",
    "When you're a mediocre painter so you draw absolutely terrible images and call it 'art'",
    "I'm guessing people jizz themselves over the one on the right? Probably sold for a ridiculous amount of money",
    "It's trash. Looks like a fucking monkey drew it",
    "This guy was never called an asshole.",
    "Good. Stop printing money you fucking boomers.",
    "mitch McConnell is a disgrace to republicans, i say that with confidence.",
    "Literally the truth. The American system is broken when one fucking \
person can prevent whatever they want. Where are these so-called checks and balances?",
    "Can't wait until he is gone",
    "Fucking hate politics.",
    "This would be cooler if their music was better and the lead singer wasn’t a douche",
    "Just drop a really heavy object on all 3",
    "Wow they look like shit",
    "Only an American idiot would misspell Woodstock.",
    "So... everyone is just gonna ignore the obvious typo? Makes me think of of a Rick Flair convention. WOOOOOOSSTOCK",
    "Good looking to describe men and hot to describe women just to make sure everyone knows you’re not gay.",
    "Complete garbage, but super engrossing",
    "Be an ignorant fool who just gathers money for no reason while they could just help the rest of the \
world but they instead gather money that they don't even spend them for something....",
    "being cis and straight lmao",
    "Or you know you just turned to an easy crutch in a moment of weakness, fear and desperation like so many do. Open mindedness never leads to worshipping a god. You have to close your mind for that because knowledge and reason are the enemy of faith.",
    "They are disgusting, expensive, loud, smelly, selfish, and obnoxious. awful in every possible way. And then factor in the possibility of having one that is damaged or broken in some manner, its like playing russian roulette with your future.",
    
]

In [None]:
# Number of hand-written example comments.
len(sentences)

In [None]:
# Put the example comments in a one-column frame, using the same column name as the training data.
custom_df = pd.DataFrame({'comment_text': sentences})

In [None]:
# Encode the example comments with the same tokenizer and MAX_LEN as the training data.
custom_set = fast_encode(custom_df.comment_text.astype(str), fast_tokenizer, maxlen=MAX_LEN)


In [None]:
# Raw sigmoid scores for the example comments.
model.predict(custom_set)

In [None]:
# Decision threshold applied to the example comments below.
threshold=0.32

In [None]:
# Score the example comments again and store the thresholded 0/1 labels in a 'toxic' column.
custom_df['toxic']=getPredictions(model.predict(custom_set),threshold);

In [None]:
# Write the example comments and their labels to output.csv without the index column.
custom_df[['comment_text','toxic']].to_csv('output.csv', index=False)


In [None]:
# Display the labelled example comments.
custom_df

In [None]:
import pickle
import joblib
# pickle.dump(model, open('model.pkl','wb'))
# Persist the trained weights with Keras' own saver. Pickling a Keras model
# (pickle/joblib) is not a supported way to save it and commonly fails on
# unpicklable internals. To restore: rebuild the model with build_model(...) and
# call model.load_weights(filename).
filename = 'finalized_model.weights.h5'
model.save_weights(filename)

In [None]:
#numpy
#pandas
#tensorflow
# import os
# import tensorflow as tf
# from tensorflow.keras.layers import Dense, Input
# from tensorflow.keras.optimizers import Adam
# from tensorflow.keras.models import Model
# from tensorflow.keras.callbacks import ModelCheckpoint
# from kaggle_datasets import KaggleDatasets
# from transformers import TFAutoModel, AutoTokenizer
# import transformers
# from tokenizers import Tokenizer, models, pre_tokenizers, decoders, processors
# from tokenizers import BertWordPieceTokenizer
# from tqdm import tqdm
#matplotlib