In [1]:
!pip install tensorflow pandas matplotlib sklearn --quiet

In [2]:
!pip install unidecode contractions   --quiet

[K     |████████████████████████████████| 235 kB 32.1 MB/s 
[K     |████████████████████████████████| 106 kB 64.6 MB/s 
[K     |████████████████████████████████| 287 kB 68.4 MB/s 
[?25h

In [3]:
import os
import pandas as pd
import tensorflow as tf
import numpy as np
import unidecode
import contractions
import nltk
import seaborn as sns
import matplotlib.pyplot as plt
# Global plot styling: seaborn's "darkgrid" is applied first, then matplotlib's
# "fivethirtyeight" style sheet is activated.
# NOTE(review): the second call presumably overrides overlapping settings from
# the first — confirm which look is intended.
sns.set_style("darkgrid")
plt.style.use("fivethirtyeight")

In [4]:
# Download the NLTK stop-word corpus and keep the English list as a set;
# `stop_words` is used for membership tests when comments are cleaned.
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [5]:
# Shell out to nvidia-smi to show which GPU (if any) is attached to this runtime.
!nvidia-smi

Fri Jul 15 10:45:08 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   33C    P8     9W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# Mount Google Drive at /content/drive so the dataset and checkpoints below can
# be read and written.
# NOTE(review): depends on the google.colab module, so this cell presumably
# only works inside Colab — confirm.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Load the training CSV from the mounted Drive folder into `df`.
# NOTE(review): the path is hard-coded to one Drive layout — consider a
# configurable data directory.
df = pd.read_csv('/content/drive/MyDrive/Toxicity Classification/train.csv')

#  Preprocess

In [None]:
# Preview the first rows of the raw training frame.
df.head()

In [None]:
# Show the comment stored under index label 0, before any cleaning.
df.loc[0, 'comment_text']

In [None]:
# Text preprocessing steps - remove numbers, capital letters, punctuation, '\n'
import re
import string

# Patterns are compiled once, from raw strings where backslashes are involved
# (the non-raw '\w*\d\w*' is an invalid escape sequence and triggers a warning).
# Binding the compiled patterns here also means the helpers below do not look up
# the global name `re` at call time; that name is rebound to a metric object
# further down in this notebook.
_ALPHANUMERIC_RE = re.compile(r'\w*\d\w*')
_PUNCTUATION_RE = re.compile('[%s]' % re.escape(string.punctuation))
_NEWLINE_RE = re.compile("\n")
_NON_ASCII_RE = re.compile(r'[^\x00-\x7f]')


def alphanumeric(x):
    """Replace every token that contains a digit (e.g. 'abc123', '42') with a space."""
    return _ALPHANUMERIC_RE.sub(' ', x)


def punc_lower(x):
    """Lower-case the text and replace each punctuation character with a space."""
    return _PUNCTUATION_RE.sub(' ', x.lower())


def remove_n(x):
    """Replace every newline character with a space."""
    return _NEWLINE_RE.sub(' ', x)


def remove_non_ascii(x):
    """Replace every non-ASCII character with a space."""
    return _NON_ASCII_RE.sub(' ', x)


def remove_stop_words(x):
    """Drop whitespace-separated words found in `stop_words` and re-join with single spaces."""
    return ' '.join(word for word in x.split() if word not in stop_words)


# Transliterate to ASCII and expand contractions first (both need the original
# casing/apostrophes), then run the regex helpers and strip stop words.
# NOTE: this overwrites df['comment_text'] in place, so the raw text is no
# longer available from `df` after this cell has run.
df['comment_text'] = [contractions.fix(unidecode.unidecode(text)) for text in df['comment_text']]

df['comment_text'] = (
    df['comment_text']
    .map(alphanumeric)
    .map(punc_lower)
    .map(remove_n)
    .map(remove_non_ascii)
    .map(remove_stop_words)
)

#### Before Preprocess

/nExplanation
Why the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27

#### After Preprocess

explanation edits made username hardcore metallica fan reverted vandalisms closure gas voted new york dolls fac please remove template talk page since retired

In [None]:
import tensorflow as tf
from tensorflow.keras.layers import TextVectorization

In [None]:
# Features: the cleaned comment text. Labels: every column from position 2
# onwards, as a NumPy array.
X = df['comment_text']
label_columns = df.columns[2:]
y = df[label_columns].values

In [None]:
# Upper bound on the vocabulary size; passed to TextVectorization as max_tokens
# and used (+1) as the Embedding input dimension.
MAX_FEATURES = 200000 # number of words in the vocab

In [None]:
# IPython introspection left over from development (`??` shows the object's source).
# NOTE(review): consider removing this cell from the final notebook.
TextVectorization??

In [None]:
# Map each comment to a fixed-length sequence of 3000 integer token ids,
# with the vocabulary capped at MAX_FEATURES.
vectorizer = TextVectorization(
    max_tokens=MAX_FEATURES,
    output_sequence_length=3000,
    output_mode='int',
)

In [None]:
# Build the vectorizer's vocabulary from the cleaned comments.
vectorizer.adapt(X.values)

In [None]:
# Vectorize every comment in one call.
# NOTE(review): this materialises one 3000-wide integer row per comment in
# memory — confirm it fits for the full dataset, or vectorize inside the
# tf.data pipeline instead.
vectorized_text = vectorizer(X.values)

In [None]:
# Inspect the vectorized comments.
vectorized_text

In [None]:
# MCSHBAP - map, cache, shuffle, batch, prefetch  from_tensor_slices, list_files

dataset = tf.data.Dataset.from_tensor_slices((vectorized_text, y))
dataset = dataset.cache()
# Shuffle ONCE and keep that order for every later iteration. With the default
# reshuffle_each_iteration=True the order changes each time the dataset is
# iterated, so splits carved out afterwards with take()/skip() would contain
# different examples on every epoch and training rows would leak into the
# validation and test splits.
# Trade-off: the order is now fixed across epochs; if per-epoch shuffling of the
# training data is wanted, shuffle the training split itself after splitting.
dataset = dataset.shuffle(160000, reshuffle_each_iteration=False)
dataset = dataset.batch(16)
dataset = dataset.prefetch(8)

In [None]:
# Pull a single (features, labels) batch as NumPy arrays to eyeball the pipeline output.
next(dataset.as_numpy_iterator())

In [None]:
# Number of batches in the pipeline.
len(dataset)

In [None]:
# Batches x batch size (16, as set in the pipeline): an upper bound on the
# number of examples, since the final batch may be smaller.
len(dataset)*16

In [None]:
# Split by batch count: first 70% for training, next 20% for validation,
# and 10% taken after skipping the first 90% for testing.
total_batches = len(dataset)
train_batches = int(total_batches * .7)
val_batches = int(total_batches * .2)
test_offset = int(total_batches * .9)
test_batches = int(total_batches * .1)

train = dataset.take(train_batches)
val = dataset.skip(train_batches).take(val_batches)
test = dataset.skip(test_offset).take(test_batches)

# Create Sequential Model

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout , Bidirectional, LSTM , Embedding

In [None]:
# Embedding -> bidirectional LSTM -> dense stack -> 6 independent sigmoid outputs
# (one per label column).
model = Sequential([
    Embedding(MAX_FEATURES + 1, 32),             # 32-dim vector per token id
    Bidirectional(LSTM(32, activation='tanh')),
    Dense(128, activation='relu'),
    Dense(256, activation='relu'),
    Dropout(0.3),
    Dense(128, activation='relu'),
    Dense(6, activation='sigmoid'),
])

In [None]:
# Multi-label setup: one binary cross-entropy term per output unit.
# The metric is spelled out as 'binary_accuracy' because the generic 'accuracy'
# string is resolved from the target/output shapes; with a 6-wide output it
# becomes categorical (argmax) accuracy, which is misleading for independent labels.
model.compile(loss='BinaryCrossentropy', optimizer='Adam', metrics=['binary_accuracy'])

In [None]:
# Print the layer-by-layer summary of the compiled model.
model.summary()

In [None]:
# Stop training after 5 epochs without improvement and roll the model back to
# its best weights.
early_stopping_cb = tf.keras.callbacks.EarlyStopping(
    patience=5,
    restore_best_weights=True,
)

In [None]:
import time
def checkpoint_path(checkpoint_dir="/content/drive/MyDrive/jigsaw-toxic-comment-classification-challenge/Checkpoint"):
    """Build a timestamped ``.h5`` checkpoint path inside ``checkpoint_dir``.

    The directory is created if it does not exist yet. The file name embeds the
    current local time, so each call made in a different second yields a new name.
    The chosen location is also printed.

    Args:
        checkpoint_dir: Directory that should hold the checkpoint file.

    Returns:
        The full path (directory + file name) as a string. The file itself is
        not created here.
    """
    os.makedirs(checkpoint_dir, exist_ok=True)
    ckpt_name = time.strftime("Checkpoint_%Y_%m_%d_%H_%M_%S_.h5")
    full_path = os.path.join(checkpoint_dir, ckpt_name)
    print(f" {ckpt_name} will be saved at {full_path}")
    return full_path

In [None]:
# Checkpoint file for this run; only the best model seen so far is written to it.
CKPT_path = checkpoint_path()
checkpointing_cb = tf.keras.callbacks.ModelCheckpoint(CKPT_path, save_best_only=True)

# Restrict TensorFlow to the first GPU when one is present.
# Guarded because (a) indexing an empty device list raises IndexError on a
# CPU-only runtime, and (b) TensorFlow raises RuntimeError when visible devices
# are changed after the runtime has been initialised, which is likely here
# since tensors and a model have already been created above.
physical_devices = tf.config.list_physical_devices('GPU')
if physical_devices:
    try:
        tf.config.experimental.set_visible_devices(physical_devices[0], device_type='GPU')
    except RuntimeError as err:
        print(f"Could not change visible GPU devices: {err}")
else:
    print("No GPU detected; training will run on CPU.")

In [None]:
# Train for up to EPOCHS epochs; early stopping may end the run sooner and the
# checkpoint callback saves the best model along the way.
EPOCHS = 20
history = model.fit(
    train,
    validation_data=val,
    epochs=EPOCHS,
    callbacks=[early_stopping_cb, checkpointing_cb],
    verbose=1,
)

In [None]:
# Reload the model saved at CKPT_path and train it for up to 10 more epochs,
# reusing the same callbacks. `EPOCHS` and `history` are overwritten here, so the
# plot below reflects this second run only.
# NOTE(review): the cells that follow call `model`, not `ckpt_model`, so the
# extra training done here does not appear to be used for the predictions
# below — confirm which model is intended.
ckpt_model = tf.keras.models.load_model(CKPT_path)
EPOCHS = 10
history = ckpt_model.fit(train,validation_data=val , epochs=EPOCHS, callbacks=[early_stopping_cb,checkpointing_cb],verbose =1)

In [None]:
# Plot every metric recorded in `history`, one line per metric.
# DataFrame.plot() opens its own figure, so a preceding plt.figure(figsize=...)
# only produces an empty extra figure and its size is ignored; the size is
# passed to plot() directly and the returned Axes is styled explicitly.
ax = pd.DataFrame(history.history).plot(figsize=(8, 5))
ax.grid(True)
ax.set_ylim(0, 1)
ax.set_xlabel("epoch")
plt.show()

In [None]:
# Vectorize a hand-written sample comment.
# NOTE(review): the sample is passed in raw, while the vectorizer was adapted on
# text that had gone through the cleaning cell (contractions, digits, stop
# words, ...) — confirm whether the same cleaning should be applied here.
input_text = vectorizer('You freaking suck! I am going to hit you.')

In [None]:
# Add a leading batch axis to the single vectorized comment and score it.
model.predict(np.expand_dims(input_text,0))

In [None]:
# Grab the first batch of the test split as NumPy arrays.
batchX, batchY = next(test.as_numpy_iterator())

In [None]:
# Turn the predicted scores into 0/1 labels using a 0.5 cut-off.
# NOTE(review): score_comment further down uses 0.4 instead — confirm which
# threshold is intended.
(model.predict(batchX)>0.5).astype(int)

# Evaluating the Model

In [None]:
# Replace the in-memory `model` with one loaded from Drive.
# NOTE(review): no cell visible in this notebook writes toxicity.h5 (the
# checkpoints above go to a different folder) — confirm where this file comes
# from and that it matches the vectorizer adapted above.
model = tf.keras.models.load_model('/content/drive/MyDrive/Toxicity Classification/toxicity.h5')

In [None]:
from tensorflow.keras.metrics import Precision, Recall, CategoricalAccuracy

In [None]:
# Streaming metrics that are updated batch by batch in the evaluation loop.
# NOTE(review): the name `re` shadows the regex module imported in the
# preprocessing cell; after this cell runs, the regex lambdas defined there
# resolve `re` to this Recall object. Consider a different name.
# NOTE(review): CategoricalAccuracy looks questionable for multi-label outputs —
# confirm whether a binary accuracy metric was intended.
pre = Precision()
re = Recall()
acc = CategoricalAccuracy()

In [None]:
# Start from a clean slate so re-running this cell does not accumulate on top of
# the counts from a previous run.
for metric in (pre, re, acc):
    metric.reset_state()

for batch in test.as_numpy_iterator():
    # Unpack the batch
    X_true, y_true = batch
    # Score exactly this one batch. predict() is meant for large inputs and sets
    # up its own batching/progress machinery on every call, which is wasteful
    # (and noisy) inside a per-batch loop; predict_on_batch returns the same
    # NumPy predictions for a single batch.
    yhat = model.predict_on_batch(X_true)

    # Flatten labels and predictions so every (example, label) pair is one entry
    y_true = y_true.flatten()
    yhat = yhat.flatten()

    pre.update_state(y_true, yhat)
    re.update_state(y_true, yhat)
    acc.update_state(y_true, yhat)

In [None]:
# Report the precision and recall accumulated over the test batches.
precision_value = pre.result().numpy()
recall_value = re.result().numpy()
print(f'Precision: {precision_value}, Recall:{recall_value}')

# Gradio

In [None]:
!pip install gradio jinja2 --quiet

In [None]:
import gradio as gr

In [None]:
def score_comment(comment):
    """Return one ``label: True/False`` line per label column for ``comment``.

    The comment is vectorized with the notebook-level ``vectorizer`` and scored
    with ``model``; each score is compared against a 0.4 cut-off. Label names
    come from ``df.columns[2:]``.

    NOTE(review): the raw comment goes straight to the vectorizer, whereas the
    text the vectorizer was adapted on had been cleaned first — confirm whether
    the same cleaning should be applied here.
    NOTE(review): 0.4 differs from the 0.5 cut-off applied to the test batch
    earlier in the notebook — confirm which is intended.

    Args:
        comment: The text to score.

    Returns:
        A string with one newline-terminated line per label.
    """
    batch = vectorizer([comment])
    scores = model.predict(batch)[0]

    return ''.join(
        '{}: {}\n'.format(label, scores[position] > 0.4)
        for position, label in enumerate(df.columns[2:])
    )

In [None]:
# Minimal text-in / text-out UI around score_comment.
# gr.Textbox is used instead of gr.inputs.Textbox: the gr.inputs namespace was
# deprecated and later removed from Gradio, and the install above is unpinned,
# so the old spelling fails on current releases.
interface = gr.Interface(
    fn=score_comment,
    inputs=gr.Textbox(lines=2, placeholder='Comment to score'),
    outputs='text',
)

In [None]:
# Start the Gradio app.
# NOTE(review): share=True presumably exposes the app through a public link —
# confirm that is acceptable for this model and data.
interface.launch(share=True)

In [None]:
# Look at the cleaned comment stored under index label 12.
df.loc[12, 'comment_text']

In [None]:
# Rows flagged toxic, shown without the first column.
df[df['toxic'] ==1].iloc[:,1:]

In [None]:
# Look at the cleaned comment stored under index label 159566.
df.loc[159566, 'comment_text']

In [None]:
# Rows not flagged toxic, shown without the first column.
df[df['toxic'] ==0].iloc[:,1:]