# Importing Starter Libraries

In [118]:
import os
import pandas as pd
import numpy as np
import tensorflow as tf

# Connecting to my Google Drive

In [119]:
# Load the Drive helper and mount
from google.colab import drive

# This will prompt for authorization.
drive.mount('/content/drive')
# !ls "/content/drive/My Drive/YOUTUBE/Nicholas/Toxic_Comments/train.csv/train.csv"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Viewing the Dataset

In [120]:
df = pd.read_csv(
    os.path.join('/content','drive','My Drive','YOUTUBE','Deep_Learning','Toxic_Comments','train.csv','train.csv')
)

In [121]:
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [122]:
df.iloc[2]['comment_text']

"Hey man, I'm really not trying to edit war. It's just that this guy is constantly removing relevant information and talking to me through edits instead of my talk page. He seems to care more about the formatting than the actual info."

In [123]:
df[df.columns[2:]].iloc[5]

toxic            0
severe_toxic     0
obscene          0
threat           0
insult           0
identity_hate    0
Name: 5, dtype: int64

# Data-Pre-Processing


In [124]:
from tensorflow.keras.layers import TextVectorization

In [125]:
# TextVectorization??

In [126]:
X = df['comment_text']
y = df[df.columns[2:]].values

In [127]:
X

0         Explanation\nWhy the edits made under my usern...
1         D'aww! He matches this background colour I'm s...
2         Hey man, I'm really not trying to edit war. It...
3         "\nMore\nI can't make any real suggestions on ...
4         You, sir, are my hero. Any chance you remember...
                                ...                        
159566    ":::::And for the second time of asking, when ...
159567    You should be ashamed of yourself \n\nThat is ...
159568    Spitzer \n\nUmm, theres no actual article for ...
159569    And it looks like it was actually you who put ...
159570    "\nAnd ... I really don't think you understand...
Name: comment_text, Length: 159571, dtype: object

In [128]:
y

array([[0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       ...,
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0]])

In [129]:
MAX_WORDS = 200000

In [130]:
vectorizer = TextVectorization(max_tokens = MAX_WORDS,
                              output_sequence_length = 1800,
                              output_mode = 'int')

In [131]:
vectorizer.adapt(X.values) #train the vocabulary

In [132]:
vectorizer("Hello world, life is great")[:5]

<tf.Tensor: shape=(5,), dtype=int64, numpy=array([288, 263, 306,   9, 275])>

In [133]:
vectorizer.get_vocabulary()

['',
 '[UNK]',
 'the',
 'to',
 'of',
 'and',
 'a',
 'you',
 'i',
 'is',
 'that',
 'in',
 'it',
 'for',
 'this',
 'not',
 'on',
 'be',
 'as',
 'have',
 'are',
 'your',
 'with',
 'if',
 'article',
 'was',
 'or',
 'but',
 'page',
 'my',
 'an',
 'from',
 'by',
 'do',
 'at',
 'about',
 'me',
 'so',
 'wikipedia',
 'can',
 'what',
 'there',
 'all',
 'has',
 'will',
 'talk',
 'please',
 'would',
 'its',
 'no',
 'one',
 'just',
 'like',
 'they',
 'he',
 'dont',
 'which',
 'any',
 'been',
 'should',
 'more',
 'we',
 'some',
 'other',
 'who',
 'see',
 'here',
 'also',
 'his',
 'think',
 'im',
 'because',
 'know',
 'how',
 'am',
 'people',
 'why',
 'edit',
 'articles',
 'only',
 'out',
 'up',
 'when',
 'were',
 'use',
 'then',
 'may',
 'time',
 'did',
 'them',
 'now',
 'being',
 'their',
 'than',
 'thanks',
 'even',
 'get',
 'make',
 'good',
 'had',
 'very',
 'information',
 'does',
 'could',
 'well',
 'want',
 'such',
 'sources',
 'way',
 'name',
 'these',
 'deletion',
 'pages',
 'first',
 'help'

In [134]:
vectorized_text = vectorizer(X.values) #Tokenizing the dataset

In [135]:
vectorized_text

<tf.Tensor: shape=(159571, 1800), dtype=int64, numpy=
array([[  645,    76,     2, ...,     0,     0,     0],
       [    1,    54,  2489, ...,     0,     0,     0],
       [  425,   441,    70, ...,     0,     0,     0],
       ...,
       [32445,  7392,   383, ...,     0,     0,     0],
       [    5,    12,   534, ...,     0,     0,     0],
       [    5,     8,   130, ...,     0,     0,     0]])>

In [136]:
dataset = tf.data.Dataset.from_tensor_slices((vectorized_text,y))
dataset = dataset.cache()
dataset = dataset.shuffle(160000)
dataset = dataset.batch(16)
dataset = dataset.prefetch(8) #prevents bottlenecks

In [137]:
dataset.as_numpy_iterator().next()

(array([[   48,   361,    37, ...,     0,     0,     0],
        [   46, 20154,    33, ...,     0,     0,     0],
        [  176,   527,    81, ...,     0,     0,     0],
        ...,
        [  266,    12,  1220, ...,     0,     0,     0],
        [  206,   168,  2893, ...,     0,     0,     0],
        [   48,    15,   381, ...,     0,     0,     0]]),
 array([[0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [1, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [1, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0]]))

In [138]:
batch_X,batch_y = dataset.as_numpy_iterator().next()

In [139]:
batch_X

array([[  312,     7,   130, ...,     0,     0,     0],
       [  171,  7475,   520, ...,     0,     0,     0],
       [   46,  1377,    31, ...,     0,     0,     0],
       ...,
       [  626,  6062,   260, ...,     0,     0,     0],
       [12624,    11,  2068, ...,     0,     0,     0],
       [ 1506,    36,    64, ...,     0,     0,     0]])

In [140]:
batch_y

array([[0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0]])

In [141]:
batch_X.shape, batch_y.shape

((16, 1800), (16, 6))

In [142]:
train = dataset.take(int(len(dataset)*.7))
val = dataset.take(int(len(dataset)*.7)).take(int(len(dataset)*.2))
test = dataset.take(int(len(dataset)*.9)).take(int(len(dataset)*.1))

In [143]:
len(train), len(val), len(test)

(6981, 1994, 997)

# Build Deep Learning Model

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dropout, Bidirectional, Dense, Embedding

In [None]:
model = Sequential() #connects with Sequential API
model.add(Embedding(MAX_WORDS+1,32)) # builds an embedding layer
model.add(Bidirectional(LSTM(32, activation='tanh'))) # builds Bidirectional LSTME layer

# Hidden Dense layers
model.add(Dense(128,activation='relu'))
model.add(Dense(256,activation='relu'))
model.add(Dense(128,activation='relu'))

# Final Layer
model.add(Dense(6,activation='sigmoid'))

In [None]:
model.compile(loss='BinaryCrossentropy',optimizer='Adam')

In [None]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 32)          6400032   
                                                                 
 bidirectional (Bidirection  (None, 64)                16640     
 al)                                                             
                                                                 
 dense (Dense)               (None, 128)               8320      
                                                                 
 dense_1 (Dense)             (None, 256)               33024     
                                                                 
 dense_2 (Dense)             (None, 128)               32896     
                                                                 
 dense_3 (Dense)             (None, 6)                 774       
                                                        

In [None]:
history = model.fit(train,epochs = 1, validation_data=val)



In [None]:
history.history

{'loss': [0.06202995777130127], 'val_loss': [0.045904748141765594]}

# Making Predictions

In [None]:
input_txt = vectorizer('You freaking suck!')

In [None]:
model.predict(np.expand_dims(input_txt,0))



array([[0.9980577 , 0.27051795, 0.9598281 , 0.03966446, 0.86352515,
        0.12269929]], dtype=float32)

In [None]:
input_txt1 = vectorizer('i love you')

In [None]:
model.predict(np.expand_dims(input_txt1,0))



array([[1.53829185e-02, 1.77391084e-05, 2.26276577e-03, 4.85309400e-04,
        3.14603187e-03, 5.90398326e-04]], dtype=float32)

In [None]:
df.columns[2:]

Index(['toxic', 'severe_toxic', 'obscene', 'threat', 'insult',
       'identity_hate'],
      dtype='object')

In [None]:
batch_X, batch_y = test.as_numpy_iterator().next()

In [None]:
(model.predict(batch_X) > 0.5).astype(int)



array([[0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [1, 0, 1, 0, 1, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0]])

In [None]:
batch_y

array([[0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0],
       [0, 0, 1, 0, 1, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [1, 0, 1, 0, 1, 0]])

# Evaluation of the Model

In [None]:
from tensorflow.keras.metrics import Precision, Recall, CategoricalAccuracy

In [None]:
precision = Precision()
recall = Recall()
accuracy = CategoricalAccuracy()

In [None]:
for batch in test.as_numpy_iterator():
  X_true, y_true = batch
  yhat = model.predict(X_true)
  y_true = y_true.flatten()
  yhat = yhat.flatten()

  precision.update_state(y_true,yhat)
  recall.update_state(y_true, yhat)
  accuracy.update_state(y_true,yhat)




In [None]:
print(f'''
Precision:{precision.result().numpy()}
Recall:{recall.result().numpy()},
Accuracy:{accuracy.result().numpy()},
''')


Precision:0.7637920379638672
Recall:0.7139893174171448,
Accuracy:0.48946839570999146,



# Gradio App

In [144]:
!pip install gradio jinja2




In [145]:
!pip install --upgrade gradio



In [146]:
import tensorflow as tf
import gradio as gr

In [147]:
model = tf.keras.models.load_model('/content/drive/My Drive/YOUTUBE/Deep_Learning/Toxic_Comments/toxic.h5')

In [148]:
model

<keras.src.engine.sequential.Sequential at 0x7e14b6ef0d30>

In [149]:
import numpy as np

In [150]:
result = model.predict(np.expand_dims(vectorizer("i hate you and I will kill you"),0))



In [151]:
result > 0.5

array([[ True, False,  True, False,  True, False]])

In [152]:
result

array([[0.95137745, 0.09861078, 0.7192829 , 0.04625104, 0.61729527,
        0.10847434]], dtype=float32)

In [153]:
df.columns[2:]

Index(['toxic', 'severe_toxic', 'obscene', 'threat', 'insult',
       'identity_hate'],
      dtype='object')

In [154]:
def score_comment(comment):
  vectorized_comment = vectorizer([comment])
  results = model.predict(vectorized_comment)

  text =''
  for idx,col in enumerate(df.columns[2:]):
    text += '{}: {}\n'.format(col, results[0][idx]>0.5)

  return text


In [155]:
interface = gr.Interface(
    fn=score_comment,
    inputs=gr.Textbox(lines=2, placeholder='Comment to score'),
    outputs='text'
)

In [156]:
interface.launch(share = True)


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://aac51e306a87bda363.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


