In [None]:
# Based upon the 'Build a Comment Toxicity Model with Deep Learning and Python' tutorial by Nicholas Renotte
# Link: https://www.youtube.com/watch?v=ZUqB-luawZg

In [None]:
# Upgrading Pip Installer
#!pip3 install --upgrade pip # Only for non 'mlp' environment

In [None]:
# Installing Dependencies
#!pip3 install tensorflow # Only for non 'mlp' environment
#!pip3 install tensorflow-gpu # Only for non 'mlp' environment
#!pip3 install pandas
#!pip3 install matplotlib
#!pip3 install scikit-learn
#!pip3 install gradio jinja2

In [None]:
# Available Pip Libraries
#!pip3 list

In [None]:
# Available Conda Libraries
#!conda list

In [None]:
# Importing Dependencies
import gradio as gr
import os
import pandas as pd
import tensorflow as tf
import numpy as np

# For graphical representations
from matplotlib import pyplot as plt

# Layers used to build up a deep neural network
from tensorflow.keras.layers import Bidirectional
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import TextVectorization

# For determining the accuracy of the model
from tensorflow.keras.metrics import CategoricalAccuracy
from tensorflow.keras.metrics import Precision
from tensorflow.keras.metrics import Recall

# One of many ways to create a model, easiest and quickest in this scenario
from tensorflow.keras.models import Sequential


In [None]:
# Importing Data

# Relative location of the training CSV, resolved against the notebook's working directory
## An 'os.path.join' based path can be substituted here instead, for example:
## os.path.join('/Users', 'username', 'train.csv')
trainCsvPath = '../../Data/train.csv'

# Reading the CSV into a pandas dataframe
dfComments = pd.read_csv(trainCsvPath)

In [None]:
# Verifying that the data has been properly loaded in
dfComments.head()

In [None]:
# Tail end of the dataframe
dfComments.tail()

In [None]:
# Text of the very first comment: select the column first, then the first row by position
dfComments['comment_text'].iloc[0]

In [None]:
# Previewing the first few rows whose 'toxic' flag is set
isToxic = dfComments['toxic'] == 1
dfComments.loc[isToxic].head()

In [None]:
# Toxicity values of the row at position 6
## Only the columns from position 2 onward are kept before the row is picked
dfComments.iloc[:, 2:].iloc[6]

In [None]:
# Pre-Processing The Data

# Columns from position 2 onward hold the per-category toxicity values of each comment
labelColumns = dfComments.columns[2:]

# The raw comment text, kept as a pandas series
commentsText = dfComments.loc[:, 'comment_text']

# One row per comment, one column per toxicity category, as a numpy array
## These are the targets the model is trained to predict
commentsFeatures = dfComments.loc[:, labelColumns].to_numpy()

In [None]:
print(dfComments.columns)

In [None]:
print(commentsText.head())

In [None]:
print(commentsFeatures[0:5])

In [None]:
# Upper bound on the number of distinct words kept in the vectorizer's vocabulary
## A larger vocabulary means a larger embedding table and therefore a larger model
## Every word inside the vocabulary is mapped to its own integer token
MAXWORDS = 200_000

In [None]:
# Bringing up documentation
## 'help' is plain Python, so this cell also survives an export of the notebook to a script
## (the IPython-only 'TextVectorization??' form is a syntax error outside of IPython)
help(TextVectorization)

In [None]:
# Settings for the text vectorization layer
## With its default standardization the layer lower-cases the text and strips punctuation
## 'max_tokens': Maximum vocabulary size
## 'output_sequence_length': Every sentence is padded or truncated to this many tokens
## 'output_mode': 'int' maps each word to an integer token
vectorizerOptions = {
    'max_tokens': MAXWORDS,
    'output_sequence_length': 1800,
    'output_mode': 'int',
}

# Initializing the text vectorization layer
vectorizer = TextVectorization(**vectorizerOptions)

In [None]:
# The pandas series, essentially a single column of data, is exposed as a numpy array via '.values'
commentsArray = commentsText.values

# Showing the container types before and after the conversion
print(type(commentsText))
print(type(commentsArray))

# The vectorizer builds its vocabulary, up to the pre-set maximum, from the comments' text
vectorizer.adapt(commentsArray)

In [None]:
# The vectorizer can now transform a passed in sentence passed upon its now established dictionary
print(vectorizer('Today was a great day.')[:5])

In [None]:
# Creating a vectorized version of all of the comments' text data with the now adapted vectorizer
vectorizedCommentsText = vectorizer(commentsText.values)

In [None]:
# Checking the lengths of the original and vectorized comments' text data
print(len(commentsText))

# If a sentence was shorter than the 1800 limit word length, than the 
## unused columns are set to a value of 0 via padding
vectorizedCommentsText

In [None]:
# Creating a TensorFlow data pipeline
## Particularly useful when you have a quantity of data which cannot simply
### be simultaneously brought into memory
## MCSHBAP Acronym: Map, Cache, Shuffle, Batch, Prefetch
### Basic data pipeline generation steps

### Initiated either by the 'from_tensor_slices' or 'list_files' method
## 'vectorizedCommentsText': The 'x' value or input features
## 'commentsFeatures': The 'y' value or target
dataset = tf.data.Dataset.from_tensor_slices((vectorizedCommentsText, commentsFeatures))

# Caching the data
dataset = dataset.cache()

# Shuffling the data with a passed in buffer size
## 'reshuffle_each_iteration = False' and a fixed 'seed' make every pass over the
## dataset yield the samples in the same order. This matters because the train,
## validation and test partitions are later carved out with 'take' / 'skip':
## if the order changed on each iteration (the default), samples would migrate
## between partitions from epoch to epoch and leak into the training data
dataset = dataset.shuffle(160000, seed = 42, reshuffle_each_iteration = False)

# Batching the dataset as a series of 16 samples
dataset = dataset.batch(16)

# Helping to prevent bottlenecking
dataset = dataset.prefetch(8)

In [None]:
# Pulling a single batch out of the data pipeline
## 'batchX' holds the vectorized comments, 'batchY' their associated labels
batchIterator = dataset.as_numpy_iterator()
batchX, batchY = next(batchIterator)

In [None]:
batchX

In [None]:
# Vector of value sets
batchX.shape

In [None]:
batchY

In [None]:
batchY.shape

In [None]:
# The length of the dataset in batches
print(len(dataset))

# Partitioning the dataset into the training, validation and testing datasets
## The partitions are extracted directly from the data pipeline as 70% / 20% / 10%
## Batch counts are rounded down to integers, any leftover batches are left unused
datasetBatches = len(dataset)
trainBatches = int(datasetBatches * 0.7)
validationBatches = int(datasetBatches * 0.2)
testBatches = int(datasetBatches * 0.1)

# Each partition skips over everything the previous partitions already consumed
## The test partition must 'skip', not 'take', the training and validation batches,
## otherwise it would simply be the first 10% of the data, which is training data
# NOTE(review): the partitions are only disjoint if 'dataset' yields the same order on
## every iteration - confirm the upstream shuffle does not reshuffle per iteration
train = dataset.take(trainBatches)
validation = dataset.skip(trainBatches).take(validationBatches)
test = dataset.skip(trainBatches + validationBatches).take(testBatches)

print('Batches Per Partition: ')
print(f'Training: {len(train)}')
print(f'Validation: {len(validation)}')
print(f'Testing: {len(test)}')

In [None]:
# Creating A Sequential Model
## The layers are handed to the Sequential API as one ordered list
model = Sequential([

    # Embedding layer: one learned 32 value vector per token, trained together with
    ## the rest of the network, so no pre-existing embedding is required
    ## The table holds MAXWORDS + 1 entries
    Embedding(MAXWORDS + 1, 32),

    # Bidirectional LSTM: the sequence is read in both directions instead of only one,
    ## which matters in Natural Language Processing (NLP) because surrounding words
    ## change the meaning of a word, e.g. the "don't" in "I don't hate you."
    ## The 'tanh' activation is the one TensorFlow requires for a GPU accelerated LSTM
    Bidirectional(LSTM(32, activation = 'tanh')),

    # Fully connected feature extractor layers
    Dense(128, activation = 'relu'),
    Dense(256, activation = 'relu'),
    Dense(128, activation = 'relu'),

    # Output layer: one sigmoid unit for each of the 6 toxicity targets
    Dense(6, activation = 'sigmoid'),
])

In [None]:
print(commentsFeatures.shape)
print(commentsFeatures[0])

In [None]:
# Compilation settings
## Binary, not categorical, crossentropy: every output is its own independent
## yes / no decision rather than one share of a whole (a multi output model)
compileOptions = {
    'optimizer': 'Adam',
    'loss': 'BinaryCrossentropy',
}

# Compiling the model
model.compile(**compileOptions)

In [None]:
# Bidirectional doubles the number of units within the LSTM layer
model.summary()

In [None]:
# Training the deep neural network model
## Passing in the training data, the number of epochs to train for and the validation data
## Loss should be progressively decreasing as training nears completion
## The returned History object is kept so that the per epoch losses can be read from
## 'history.history' instead of having to be copied by hand from the training log
history = model.fit(train, epochs = 10, validation_data = validation)

# Computer Specifications:
# MacOS Monterey: Version 12.4
# Chip: Apple M1 Max, 10 Cores (8 Performance & 2 Efficiency), GPU 32 Cores
# Memory: 64 GB

# Epoch Training:
# Epoch 1: Time = 7949s, Loss = 0.0619, Validation Loss = 0.0443
# Epoch 2: Time = 7913s, Loss = 0.0456, Validation Loss = 0.0388
# Epoch 3: Time = 7862s, Loss = 0.0408, Validation Loss = 0.0356
# Epoch 4: Time = 7982s, Loss = 0.0357, Validation Loss = 0.0320
# Epoch 5: Time = 7901s, Loss = 0.0326, Validation Loss = 0.0283
# Epoch 6: Time = 7873s, Loss = 0.0297, Validation Loss = 0.0256
# Epoch 7: Time = 7899s, Loss = 0.0267, Validation Loss = 0.0232
# Epoch 8: Time = 7909s, Loss = 0.0238, Validation Loss = 0.0202
# Epoch 9: Time = 7964s, Loss = 0.0215, Validation Loss = 0.0201
# Epoch 10: Time = 8056s, Loss = 0.0199, Validation Loss = 0.0171

# Total Training Time: 79308s =  22 Hours,  1 Minutes,  48 Seconds

In [None]:
# Loss metrics of model training
## Transcribed from the recorded training run, one (training loss, validation loss)
## pair per epoch, in epoch order ('history.history' is the live equivalent)
lossLog = [
    (0.0619, 0.0443),
    (0.0456, 0.0388),
    (0.0408, 0.0356),
    (0.0357, 0.0320),
    (0.0326, 0.0283),
    (0.0297, 0.0256),
    (0.0267, 0.0232),
    (0.0238, 0.0202),
    (0.0215, 0.0201),
    (0.0199, 0.0171),
]

# Epochs are numbered from 1
epochs = list(range(1, len(lossLog) + 1))
training_losses = [trainingLoss for trainingLoss, _ in lossLog]
validation_losses = [validationLoss for _, validationLoss in lossLog]

In [None]:
# Plotting change in losses over training epochs
## Alternative when a live History object is available:
## pd.DataFrame(history.history).plot()

# One (values, colour, legend label) entry per curve, drawn in this order
lossCurves = (
    (training_losses, 'blue', 'Training Loss'),
    (validation_losses, 'black', 'Validation Loss'),
)

for curveValues, curveColor, curveLabel in lossCurves:
    plt.plot(epochs, curveValues, color = curveColor, ls = '-', marker = '.', label = curveLabel)

# Labelling the figure so that it can stand on its own
plt.xlabel('Training Epochs')
plt.ylabel('Loss')
plt.title('Model Training & Validation Loss Over Training Epochs')
plt.legend()
plt.show()

In [None]:
# Making Predictions
# The sample text has to be vectorized and the words tokenized for the model to be able to make sense of it
sampleInputText = vectorizer('You freaking suck.')

# A sequence of integers
print(sampleInputText)

In [None]:
# The model expects a batch of sequences rather than one single sequence, so the
## vectorized sample gets a leading batch axis of size 1
## 'np.expand_dims' is the cleaner of the two equivalent ways of adding that axis
sampleBatch = np.expand_dims(sampleInputText, 0)

# Both forms are printed for comparison
print(np.array([sampleInputText]))
print(sampleBatch)

# Predicting on the single sample batch
## Equivalent alternative: model.predict(np.array([sampleInputText]))
sampleResult = model.predict(sampleBatch)

In [None]:
# Drawing one batch from the test partition
## The pipeline is iterated only once: every new iterator has to refill the shuffle
## buffer, and two separate draws would leave 'batch' and 'batchX' / 'batchY'
## describing two different batches
batch = next(test.as_numpy_iterator())

# Unpacking the batch into the vectorized comments and their labels
batchX, batchY = batch

In [None]:
batchY

In [None]:
(model.predict(batchX) > 0.5).astype(int)

In [None]:
# Evaluating The Model
# Stateful metrics: every 'update_state' call aggregates one more batch into the running result
modelPrecision = Precision()
modelRecall = Recall()

# Accuracy for independent sigmoid outputs has to be binary accuracy
## BinaryAccuracy thresholds each prediction at 0.5 and compares it to its label
## element by element. CategoricalAccuracy instead compares the position of the
## largest value in the predictions with that of the labels, which is meaningless
## for a multi label problem
## The variable name is kept because the evaluation loop and the results printout
## further down refer to it
modelCategoricalAccuracy = tf.keras.metrics.BinaryAccuracy()

In [None]:
# Walking over every batch of the test partition
for batch in test.as_numpy_iterator():

    # A batch is a pair of (vectorized comments, labels)
    xTrue, yTrue = batch

    # Predicting the batch
    yHat = model.predict(xTrue)

    # Collapsing labels and predictions into one long 1-D vector each
    ## A (samples, 6) matrix becomes a vector of samples * 6 values
    yTrue = yTrue.flatten()
    yHat = yHat.flatten()

    # Folding the batch into each running Key Performance Indicator (KPI)
    for metric in (modelPrecision, modelRecall, modelCategoricalAccuracy):
        metric.update_state(yTrue, yHat)

In [None]:
# Printing Results
## Each metric's aggregated result is converted to a plain numpy value first
precisionValue = modelPrecision.result().numpy()
recallValue = modelRecall.result().numpy()
accuracyValue = modelCategoricalAccuracy.result().numpy()

print(f'Precision: {precisionValue}, Recall: {recallValue}, Accuracy: {accuracyValue}')

In [None]:
# Saving The Model
## The model is written through an explicit path instead of changing the working
## directory back and forth, so the cell does not depend on the name of the
## notebook's own folder and cannot leave the kernel in the wrong directory
print(os.getcwd())

modelDirectory = os.path.join('..', '..', 'Models')
modelPath = os.path.join(modelDirectory, 'commentToxicityModel.h5')

# Making sure the target directory exists before writing into it
os.makedirs(modelDirectory, exist_ok = True)
print(os.path.abspath(modelDirectory))

model.save(modelPath)
print(os.listdir(modelDirectory))

# The working directory is unchanged
print(os.getcwd())

In [None]:
# Loading The Model
## The model file lives in the '../../Models' directory, relative to the notebook's
## working directory, not in the working directory itself
model = tf.keras.models.load_model(os.path.join('..', '..', 'Models', 'commentToxicityModel.h5'))

In [None]:
# Testing & Gradio Application

In [None]:
# Function to be connected to Gradio

def scoreCommentToxicity(commentText):
    """Score a single text comment against every toxicity label.

    The comment is tokenized by the notebook's 'vectorizer', run through 'model',
    and each output is compared against a 0.5 threshold.

    Parameters:
        commentText: The comment to be scored.

    Returns:
        A string with one '<label>: <True / False>' line per label, where the
        labels are the dataframe's columns from position 2 onward.
    """

    # Wrapping the comment in a list gives the vectorizer a batch of one
    tokenBatch = vectorizer([commentText])

    # Only one comment was passed in, so only the first row of predictions is needed
    scores = model.predict(tokenBatch)[0]

    # One line per label, in the same order as the dataframe's columns
    labelNames = dfComments.columns[2:]
    lines = ['{}: {}\n'.format(labelName, scores[position] > 0.5)
             for position, labelName in enumerate(labelNames)]

    return ''.join(lines)

In [None]:
# Creating The Gradio Interface
## 'fn': The function the Gradio interface makes use of
## 'inputs': The type of input
## 'outputs': The style of output

# The 'gr.inputs' namespace is deprecated in Gradio 3 and no longer exists in Gradio 4
## The top level 'gr.Textbox' component is preferred, with a fall back to
## 'gr.inputs.Textbox' for older installations that do not have it
textboxComponent = gr.Textbox if hasattr(gr, 'Textbox') else gr.inputs.Textbox

gradioInterface = gr.Interface(fn = scoreCommentToxicity,
                               inputs = textboxComponent(lines = 2, placeholder = 'Comment to be scored.'),
                               outputs = 'text')

In [None]:
# Launching The Interface
## If the 'share' value is set to 'True', the gradio application will be made public for a limited amount of time
#gradioInterface.launch(share = False)

In [None]:
scoreCommentToxicity('I really hate you and I am going to attack you fucker.')