# 0. Imports

In [2]:
import tensorflow as tf
import numpy as np
import pandas as pd
import os
from matplotlib import pyplot as plt




In [None]:
df = pd.read_csv(os.path.join('data', 'train', 'train.csv'))

# 1. Preprocess

In [7]:
from tensorflow.keras.layers import TextVectorization

In [9]:
X = df['comment_text']
y = df[df.columns[2:]].values


In [12]:
vectorizer = TextVectorization(max_tokens= 200000, output_sequence_length= 1800, output_mode='int')
vectorizer.adapt(X.values)
vectorized_text = vectorizer(X.values)




In [24]:
dataset = tf.data.Dataset.from_tensor_slices((vectorized_text, y))
dataset = dataset.cache()
dataset = dataset.shuffle(160000)
dataset = dataset.batch(16)
dataset = dataset.prefetch(8)

In [25]:
batch_X, batch_y = dataset.as_numpy_iterator().next()

In [26]:
train = dataset.take(int(len(dataset)*.7))
val = dataset.skip(int(len(dataset)*.7)).take(int(len(dataset)*.2))
test = dataset.skip(int(len(dataset)*.9)).take(int(len(dataset)*.1))

In [33]:
train_generator= train.as_numpy_iterator()
train_generator.next()

# 2. Create Sequencial Model

In [41]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dropout, Bidirectional, Dense, Embedding

In [48]:
model = Sequential()
# Create Embedding Layer
model.add(Embedding(200001, 32))
# Bidirectional LSTM Layer
model.add(Bidirectional(LSTM(32, activation='tanh')))
# Feature extractor fully connected layers
model.add(Dense(128, activation='relu'))
model.add(Dense(256, activation='relu'))
model.add(Dense(128, activation='relu'))
# Final Layer
model.add(Dense(6,   activation= 'sigmoid'))

In [49]:
model.compile(loss='BinaryCrossentropy', optimizer='Adam')

In [50]:
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, None, 32)          6400032   
                                                                 
 bidirectional_1 (Bidirecti  (None, 64)                16640     
 onal)                                                           
                                                                 
 dense_8 (Dense)             (None, 128)               8320      
                                                                 
 dense_9 (Dense)             (None, 256)               33024     
                                                                 
 dense_10 (Dense)            (None, 128)               32896     
                                                                 
 dense_11 (Dense)            (None, 6)                 774       
                                                      

In [51]:
hist = model.fit(train, epochs=1, validation_data=val)



# 3. Make Predictions

In [65]:
input_text = vectorizer('You freaking suck!')

model.predict(np.expand_dims(input_text,0))

# Evaluate Model

In [67]:
from tensorflow.keras.metrics import Precision, Recall, CategoricalAccuracy

pre = Precision()
re = Recall()
acc = CategoricalAccuracy()

In [68]:
for batch in test.as_numpy_iterator():
    # Unpack the batch
    X_true, y_true = batch
    # Make a prediction
    yhat = model.predict(X_true)

    # Flatten the predictions
    y_true = y_true.flatten()
    yhat = yhat.flatten()

    pre.update_state(y_true, yhat)
    re.update_state(y_true, yhat)
    acc.update_state(y_true, yhat)



In [69]:
print(f'Precision: {pre.result().numpy()}, Recall: {re.result().numpy()}, Accuracy: {acc.result().numpy()}')

Precision: 0.8640965819358826, Recall: 0.6254227757453918, Accuracy: 0.46740221977233887


# Test and Gradio

In [70]:
import gradio as gr

In [71]:
model.save('toxicity.h5')

  saving_api.save_model(


In [72]:
model = tf.keras.models.load_model('toxicity.h5')
input_str = vectorizer('hey i freaken hate you!')

res = model.predict(np.expand_dims(input_str,0))
res

In [82]:
def score_comment(comment):
    vectorized_comment = vectorizer([comment])
    results = model.predict(vectorized_comment)

    text = ''
    for idx, col in enumerate(df.columns[2:]):
        text += "{}: {}\n".format(col, results[0][idx]>0.5)

    return text

In [None]:
interface = gr.Interface(fn= score_comment, inputs= gr.components.Textbox(lines=2, placeholder= 'Comment to score'), outputs= 'text')

interface.launch(share=True)