## Detect Toxic Comments or Statements Using a Recurrent Neural Network (RNN)
### Deep-learning AI algorithm used by TikTok and other apps to filter obscene statements or comments


### Import the Libraries

In [None]:
import pandas as pd
import numpy as np
import os
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Bidirectional, Dense, Dropout,Embedding
from tensorflow.keras.layers import TextVectorization
# tf.data.Dataset.list_files()

In [None]:
# Read the training set from the local data/ directory into a DataFrame
train_csv_path = os.path.join('data', 'train.csv')
df = pd.read_csv(train_csv_path)
# Preview the first five rows
df.head(5)



### Preprocessing

In [None]:

# Features: the raw comment text. Targets: every column from the third one onward.
X = df['comment_text'].to_numpy()
y = df.iloc[:, 2:].to_numpy()

# Upper bound on the vocabulary size; a larger vocabulary costs more
# memory and training time.
MAX_VOCAB = 200000

# Layer that turns each comment into a fixed-length (2000) sequence of integer token ids.
vectorizer = TextVectorization(
    max_tokens=MAX_VOCAB,
    output_mode='int',
    output_sequence_length=2000,
)

# Build the vocabulary from the comments ...
vectorizer.adapt(X)

# ... then encode every comment with it.
vectorized_features = vectorizer(X)



### Creating the TensorFlow data Pipeline to ease labeling of the data and training procedures

In [None]:

# Build the tf.data input pipeline from the vectorized comments and their labels.
dataset = tf.data.Dataset.from_tensor_slices((vectorized_features, y))

# Cache the elements so they are not recomputed on later passes.
dataset = dataset.cache()

# Shuffle ONCE with a fixed order. The default (reshuffle_each_iteration=True)
# produces a new order on every pass over the dataset, so position-based
# take()/skip() splits taken from this pipeline would pick different examples
# each time and the train/validation/test subsets would overlap.
# Trade-off: the order is also the same across training epochs.
dataset = dataset.shuffle(160000, reshuffle_each_iteration=False)

# Batch size: keep it small (8 or 16) on machines without a GPU.
dataset = dataset.batch(16)

# Prepare upcoming batches in the background while the current one is consumed.
dataset = dataset.prefetch(8)

### Batches for X and y

In [None]:

# Pull a single batch out of the pipeline: features first, labels second
batch_X, batch_y = next(dataset.as_numpy_iterator())

# shape of the feature batch
batch_X.shape

# shape of the label batch
batch_y.shape


### Allocating the dataset for Training (70%), Validation(20%) and Testing(10%)

In [None]:

# Split the batched dataset by position into three DISJOINT parts.
# Sizes are computed once so that each subset starts exactly where the
# previous one ends; rounding can never make two subsets share a batch.
# NOTE: positional splits are only stable if the upstream shuffle yields the
# same order on every iteration of `dataset`.
total_batches = len(dataset)
train_batches = int(total_batches * 0.7)
validation_batches = int(total_batches * 0.2)

# training data 70%
training_data = dataset.take(train_batches)

# validation data size 20%
validation_data = dataset.skip(train_batches).take(validation_batches)

# testing data: everything that is left (about 10%)
testing_data = dataset.skip(train_batches + validation_batches)




### Training the Model Using a Long Short-Term Memory (LSTM) RNN

In [None]:
# Sequential model: embedding -> bidirectional LSTM -> dense stack -> 6 sigmoid outputs
model = Sequential()

# Embedding table with one row per vocabulary id (MAX_VOCAB + 1 rows), 32 dimensions each
model.add(Embedding(MAX_VOCAB + 1, 32))

# Bidirectional LSTM reads each sequence in both directions
model.add(Bidirectional(LSTM(32, activation='tanh')))

# Fully connected feature extractor on top of the recurrent output
model.add(Dense(128, activation='relu'))
model.add(Dense(256, activation='relu'))
model.add(Dense(128, activation='relu'))

# One independent sigmoid unit per label (6 labels, each 0 or 1)
model.add(Dense(6, activation='sigmoid'))

# Compile for multi-label classification.
# The metric is given explicitly: the generic 'accuracy' alias may resolve to
# categorical accuracy for a multi-unit output, which does not measure
# per-label correctness. It keeps the name 'accuracy' so logs and history
# keys are unchanged.
model.compile(
    optimizer='Adam',
    loss='BinaryCrossentropy',
    metrics=[tf.keras.metrics.BinaryAccuracy(name='accuracy')],
)

In [None]:
# Print a textual summary of the model's layers
model.summary()

### Actual Training

In [None]:
# Directory where TensorBoard event files are written during training
log_dir = 'logs'

# TensorBoard callback that logs training progress to log_dir
call_backs_dir = tf.keras.callbacks.TensorBoard(log_dir=log_dir)

# Train for one epoch and validate on the held-out validation batches.
# `callbacks` is documented as a list of callbacks, so the single callback is wrapped.
ai_model_output = model.fit(
    training_data,
    epochs=1,
    callbacks=[call_backs_dir],
    validation_data=validation_data,
)

### Model Evaluation

In [None]:
from tensorflow.keras.metrics import BinaryAccuracy, CategoricalAccuracy, Recall, Precision

# Streaming metrics that are updated batch by batch during evaluation.
recall = Recall()
# Per-label accuracy at a 0.5 threshold. The predictions are flattened
# multi-label probabilities, for which CategoricalAccuracy (argmax comparison)
# is not meaningful. The variable keeps the name `categorical` because later
# cells refer to it.
categorical = BinaryAccuracy()
precision = Precision()


In [None]:
# Walk over the test batches and accumulate the evaluation metrics
for batch in testing_data.as_numpy_iterator():
    x_true, y_true = batch

    # Score one batch. predict_on_batch is used because Model.predict() is
    # meant for large inputs, not for being called once per batch in a loop.
    y_hat = model.predict_on_batch(x_true)

    # Flatten labels and predictions to 1-D so every (sample, label) pair
    # is treated as one binary decision
    y_true = y_true.flatten()
    y_hat = y_hat.flatten()

    # Update the running metrics with this batch
    recall.update_state(y_true, y_hat)
    categorical.update_state(y_true, y_hat)
    precision.update_state(y_true, y_hat)
    
    

In [None]:
# Print the accumulated results of the three metrics: precision, accuracy (the `categorical` metric) and recall

print(f'\nPrecison:{precision.result().numpy()}\n\n Accuracy:{categorical.result().numpy()}\n\n Recall:{recall.result().numpy()}')

### save the model for future use

In [None]:
# Save the trained model in two formats: a pickle file and a Keras .h5 file
import pickle

# Create the output directory if it is missing so that neither save fails on a fresh checkout
os.makedirs('models', exist_ok=True)

# The context manager closes the file even if pickling raises
with open(os.path.join('models','RNN_Toxic_Comment_Model_Pickle.pkl'), mode='wb') as pickle_file:
    pickle.dump(model, pickle_file)
print('model saved as pickel file successfully\n')


# Save with the Keras/TensorFlow serializer

model.save(os.path.join('models','RNN_Toxic_Comment_tf_model.h5'))
print('model saved as tensorflow model successfully\n')




### Loading the models

In [None]:
# Load the model that was saved with pickle.
# NOTE: unpickling executes code stored in the file, so only load pickle
# files that you created yourself.
# The context manager closes the file even if unpickling raises.
with open(os.path.join('models','RNN_Toxic_Comment_Model_Pickle.pkl'), mode='rb') as pickle_f:
    pickle_model = pickle.load(pickle_f)

print('pickle model loaded successfully\n')


# Load the model that was saved with the Keras/TensorFlow serializer
model_tf = tf.keras.models.load_model(os.path.join('models','RNN_Toxic_Comment_tf_model.h5'))

print('tensorflow model loaded successfully')



In [None]:
# Text to classify
test_comment = 'you are a very dumb child'

# Encode the text with the same vectorizer that was used for training
vectorized_test_comment = vectorizer(test_comment)

# Add a leading batch dimension, since predict() expects a batch of samples
model_input = np.expand_dims(vectorized_test_comment, 0)

# Predict with BOTH reloaded models: the pickle one and the .h5 one.
# (The .h5 model was previously never used: both calls went to pickle_model.)
model_pickle_prediction_results = pickle_model.predict(model_input)
model_tf_prediction_results = model_tf.predict(model_input)


# Show each model's own result, thresholded at 0.5 into per-label booleans
print('\nmodel pickle predictdion results: {}'.format(model_pickle_prediction_results>0.5))

print('\nmodel tensorflow predictdion results: {}'.format(model_tf_prediction_results>0.5))

# One row of the label columns, displayed as a reference for the column order
df_columns_predictions = df[df.columns[2:]].head(n=1)
df_columns_predictions
