In [1]:
import os
import pandas as pd 
import tensorflow as tf 
import numpy as np




In [2]:
from tensorflow.keras.layers import TextVectorization

In [3]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Bidirectional, Dense, Embedding

In [4]:
ds_path = os.path.join('jigsaw-toxic-comment-classification-challenge', 'train.csv', 'train.csv')

In [5]:
df = pd.read_csv(ds_path)

In [6]:
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [7]:
df.iloc[6]['comment_text']

'COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK'

In [8]:
df[df.columns[2:]].iloc[6]

toxic            1
severe_toxic     1
obscene          1
threat           0
insult           1
identity_hate    0
Name: 6, dtype: int64

In [9]:
X = df['comment_text']
y = df[df.columns[2:]].values   # To numpy array

In [10]:
MAX_VOCAB = 200000

In [11]:
vectorizer = TextVectorization(
    max_tokens=MAX_VOCAB,
    output_sequence_length=1800,
    output_mode='int' 
)




In [12]:
vectorizer.adapt(X.values)




In [13]:
vectorizer.get_vocabulary()[:10]    # Vocabulary table; 1st 10 entries

['', '[UNK]', 'the', 'to', 'of', 'and', 'a', 'you', 'i', 'is']

In [14]:
vectorized_text = vectorizer(X.values)

In [15]:
# map, cache, shuffle, batch, prefetch from tensor_sloces or list_file
ds = tf.data.Dataset.from_tensor_slices((vectorized_text, y))
ds = ds.cache()
ds = ds.shuffle(160000)
ds = ds.batch(16)
ds = ds.prefetch(8)

In [16]:
train = ds.take(int(len(ds)*0.7))
val = ds.skip(int(len(ds)*0.7)).take(int(len(ds)*0.2))
test = ds.skip(int(len(ds)*0.9)).take(int(len(ds)*0.1))

In [17]:
int(len(ds)*0.7)

6981

In [18]:
len(val)

1994

In [19]:
model = Sequential()
model.add(Embedding(MAX_VOCAB+1, 32))
model.add(Bidirectional(LSTM(32, activation='tanh')))
model.add(Dense(128, activation='relu'))
model.add(Dense(256, activation='relu'))
model.add(Dense(128, activation='relu'))
model.add(Dense(6, activation='relu'))

In [20]:
model.compile(loss='BinaryCrossentropy', optimizer='Adam')




In [21]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 32)          6400032   
                                                                 
 bidirectional (Bidirection  (None, 64)                16640     
 al)                                                             
                                                                 
 dense (Dense)               (None, 128)               8320      
                                                                 
 dense_1 (Dense)             (None, 256)               33024     
                                                                 
 dense_2 (Dense)             (None, 128)               32896     
                                                                 
 dense_3 (Dense)             (None, 6)                 774       
                                                        

In [22]:
history = model.fit(train, epochs=1, validation_data=val)



In [23]:
model.save('toxicity.h5')

  saving_api.save_model(


In [None]:
# model = tf.keras.models.load_model('toxicity.h5')

> Single prediction

In [29]:
input_text = vectorizer("You are a fucking idiot!")

In [30]:
model.predict(np.expand_dims(input_text,0))



array([[2.2814603, 0.       , 1.172041 , 0.       , 1.018754 , 0.       ]],
      dtype=float32)

> Evaluate on test set

In [31]:
from tensorflow.keras.metrics import Precision, Recall, CategoricalAccuracy

In [32]:
precision = Precision()
recall = Recall()
accuracy = CategoricalAccuracy()

In [None]:
for batch in test.as_numpy_iterator():
    X_test, y_test = batch
    pred = model.predict(X_test)

    y_test = y_test.flatten()
    pred = pred.flatten()

    precision.update_state(y_test, pred)
    recall.update_state(y_test, pred)
    accuracy.update_state(y_test, pred)

In [36]:
print(f'Precision: {precision.result().numpy()}, Recall: {recall.result().numpy()}, Accuracy: {accuracy.result().numpy()}')

Precision: 0.46442195773124695, Recall: 0.7389812469482422, Accuracy: 0.44954127073287964


> Gradio

In [49]:
import gradio as gr

In [79]:
def score_text(toxic_text):
    vectorized_ = vectorizer([toxic_text])
    result = model.predict(vectorized_)

    text = ''
    for idx, col in enumerate(df.columns[2:]):
        text += '{}: {}\n'.format(col, result[0][idx]>0.5)

    return text

In [84]:
interface = gr.Interface(
    fn=score_text,
    inputs=gr.Textbox(lines=2, placeholder='Text to predict'),
    outputs='text'
)

In [85]:
interface.launch()

Running on local URL:  http://127.0.0.1:7866

To create a public link, set `share=True` in `launch()`.


