In [None]:
import os
import pandas as pd
import tensorflow as tf
import numpy as np

In [None]:
# Mount Google Drive at /content/drive so files stored there can be read by path.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Folder on the mounted Drive that holds the Jigsaw toxic-comment dataset.
# 'MyDrive' (no space) is used so this agrees with the path the CSV is actually read from.
data_dir = '/content/drive/MyDrive/Colab Notebooks/jigsaw-toxic-comment-classification'

# Full path to the train.csv entry inside the dataset folder
train_file_path = os.path.join(data_dir, 'train.csv')
print(train_file_path)


In [None]:
# List the entries of the dataset folder.
print(os.listdir(data_dir))

In [None]:
# Overrides train_file_path: the path points at train.csv inside a folder that is itself named train.csv.
train_file_path = '/content/drive/MyDrive/Colab Notebooks/jigsaw-toxic-comment-classification/train.csv/train.csv'
print(os.path.isfile(train_file_path))  # This should print True if the file exists


Read csv file data to pandas DataFrame

In [None]:
df=pd.read_csv(train_file_path)

Display the first 5 rows of the DataFrame (`df.head()` returns 5 rows by default)

In [None]:
df.head()

Display the last 5 rows of the DataFrame (`df.tail()` returns 5 rows by default)

In [None]:
df.tail()

Access a single entry of the DataFrame by position:
the `comment_text` of the row at integer position 159489

In [None]:
df.iloc[159489]['comment_text']

Select the row at position 6 from a subset of columns (every column from index 2 onward)

In [None]:
df[df.columns[2:]].iloc[6]

Filter the DataFrame down to the rows where the value in the `toxic` column is equal to 1

In [None]:
df[df['toxic']==1]

Calculates number of Toxic comments in the DataFrame


In [None]:
# Boolean mask of the toxic rows; summing it counts the rows that match.
is_toxic = df['toxic'] == True
total_toxic_comments = int(is_toxic.sum())
print(f"Total count of toxic comments: {total_toxic_comments}")

In [None]:
from tensorflow.keras.layers import TextVectorization

In [None]:
# Features: the raw comment text. Labels: every column from index 2 onward, as a NumPy array.
label_columns = df.columns[2:]
x = df['comment_text']
y = df[label_columns].values
print(x)


In [None]:
print(y)

In [None]:
# Number of words in the vocab.
MAX_FEATURES = 200_000

The code below converts the text data to numerical values that can be fed to the model.
It sets the maximum number of vocabulary words, fixes the length of each sentence, and sets the output format to integers.

In [None]:
# Initialize the text vectorization layer.
vectorizer = TextVectorization(
    max_tokens=MAX_FEATURES,      # number of words in the vocab
    output_sequence_length=1800,  # length of a sentence
    output_mode='int',            # output mode must be integer
)

`x` below is a pandas Series object. It contains the text data from the `comment_text` column.


In [None]:
type(x)

The `.values` attribute returns the data as a NumPy array.


---

`numpy.ndarray` is NumPy's N-dimensional array type



In [None]:
type(x.values)

In [None]:
vectorizer.adapt(x.values)

In [None]:
vectorizer("Hello World, How are you ? hope you are fine")[:10]

In [None]:
# Show only the first entries: the vocabulary can hold up to MAX_FEATURES tokens,
# which is far too much to dump into a cell output.
vectorizer.get_vocabulary()[:20]

In [None]:
# NOTE(review): repeats the adapt() call already made a few cells earlier on the same data —
# looks redundant; confirm before removing.
vectorizer.adapt(x.values)

In [None]:
vectorized_text=vectorizer(x.values)

In [None]:
len(x)

In [None]:
vectorized_text

In [None]:
# Build the input pipeline: slice (text, labels) pairs, cache, shuffle, batch, prefetch.
dataset = tf.data.Dataset.from_tensor_slices((vectorized_text, y))
dataset = dataset.cache()
# Shuffle once into a fixed order. By default shuffle() reshuffles every time the dataset
# is iterated, so splits carved out afterwards with take()/skip() would contain different
# samples on each pass and held-out samples would leak into training.
dataset = dataset.shuffle(160000, seed=42, reshuffle_each_iteration=False)  # buffer size 160000
dataset = dataset.batch(16)
dataset = dataset.prefetch(8)  # helps prevent bottlenecks

In [None]:
# Pull one (features, labels) batch out of the pipeline as NumPy arrays.
batch_x, batch_y = next(dataset.as_numpy_iterator())

16 comments each of length 1800


In [None]:
batch_x.shape

16 corresponding labels for the comments in `batch_x`:
16 samples in the batch, each with 6 label categories

In [None]:
batch_y.shape


Total number of batches in the dataset

In [None]:
len(dataset) #length in batches

In [None]:
# Split by batch count: first 70% for training, next 20% for validation, last 10% for testing.
total_batches = len(dataset)
train_batches = int(total_batches * 0.7)
val_batches = int(total_batches * 0.2)
test_batches = int(total_batches * 0.1)

train = dataset.take(train_batches)
val = dataset.skip(train_batches).take(val_batches)  # skip the training share, then take the next 20%
test = dataset.skip(int(total_batches * 0.9)).take(test_batches)

In [None]:
len(train)

In [None]:
len(val)

In [None]:
len(test)

Converts the TensorFlow dataset `train` into a NumPy iterator

In [None]:
train_generator = train.as_numpy_iterator()

In [None]:
# Fetch the next batch from the NumPy iterator.
next(train_generator)

In [None]:
#create a Sequential Model

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dropout, Bidirectional, Dense, Embedding #layers used to build deep learning layers

In [None]:
# Build the network as a single Sequential stack of layers.
model = Sequential([
    # Embedding layer: one 32-dimensional embedding per word; +1 is for unknown words.
    Embedding(MAX_FEATURES + 1, 32),
    # Bidirectional passes the info in both directions, backward and forward (32 LSTM units).
    Bidirectional(LSTM(32, activation='tanh')),
    # Feature extractor of fully connected layers.
    Dense(128, activation='relu'),
    Dense(256, activation='relu'),
    Dense(128, activation='relu'),
    # Final output layer: sigmoid converts each of the 6 outputs to a value between 0 and 1.
    Dense(6, activation='sigmoid'),
])

In [None]:
# Configure the learning process.
# 'binary_accuracy' is requested explicitly instead of the generic 'accuracy' string:
# Keras picks the concrete metric behind 'accuracy' from the target/output shapes, and for a
# 6-wide multi-label output that is not the per-label accuracy this model needs.
model.compile(loss='BinaryCrossentropy', optimizer='adam', metrics=['binary_accuracy'])

In [None]:
model.summary()

In [None]:
history = model.fit(train, epochs=3, validation_data=val)  #train the model

Epoch 1/3
[1m6981/6981[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11885s[0m 2s/step - accuracy: 0.9426 - loss: 0.0837 - val_accuracy: 0.9933 - val_loss: 0.0449
Epoch 2/3
[1m6110/6981[0m [32m━━━━━━━━━━━━━━━━━[0m[37m━━━[0m [1m24:01[0m 2s/step - accuracy: 0.9862 - loss: 0.0455

In [None]:
from matplotlib import pyplot as plt

In [None]:
# Plot every metric recorded in the training history, one line per metric.
# figsize goes to DataFrame.plot() because pandas opens its own figure; a separate
# plt.figure(...) call beforehand only leaves an empty extra figure behind.
pd.DataFrame(history.history).plot(figsize=(8, 5))
plt.show()

In [None]:
input_text=vectorizer('You freaking suck')

In [None]:
batch = test.as_numpy_iterator().next()

In [None]:
test.as_numpy_iterator().next()

In [None]:
model.predict(np.expand_dims(input_text,0 ))

In [None]:
res =model.predict(np.expand_dims(input_text,0 ))

In [None]:
(res > 0.5).astype(int)

In [None]:
batch_X, batch_y = test.as_numpy_iterator().next()

In [None]:
(model.predict(batch_X) > 0.5).astype(int)

In [None]:
res.shape

In [None]:
from tensorflow.keras.metrics import Precision, Recall, CategoricalAccuracy

In [None]:
# Streaming metrics that are accumulated batch by batch during evaluation.
pre = Precision()
re = Recall()
# The labels are independent 0/1 flags, so accuracy has to be thresholded per element.
# CategoricalAccuracy compares argmax positions and is meant for one-hot targets.
acc = tf.keras.metrics.BinaryAccuracy()

In [None]:
# Evaluate on the test batches, updating the streaming metrics one batch at a time.
for batch in test.as_numpy_iterator():
    # Unpack the batch
    X_true, y_true = batch
    # Make a prediction; verbose=0 stops predict() from printing a progress bar for every batch
    yhat = model.predict(X_true, verbose=0)

    # Flatten labels and predictions so every (sample, label) pair is scored individually
    y_true = y_true.flatten()
    yhat = yhat.flatten()

    pre.update_state(y_true, yhat)
    re.update_state(y_true, yhat)
    acc.update_state(y_true, yhat)

In [None]:
print(f'Precision: {pre.result().numpy()}, Recall:{re.result().numpy()}, Accuracy:{acc.result().numpy()}')

In [None]:
!pip install gradio jinja2

In [None]:
import tensorflow as tf
import gradio as gr

In [None]:
model.save('toxicity.h5')

In [None]:
model = tf.keras.models.load_model('toxicity.h5')

In [None]:
input_str = vectorizer('hey i freaken hate you!')

In [None]:
res = model.predict(np.expand_dims(input_str,0))

In [None]:
res

In [None]:
def score_comment(comment):
    """Score a single comment and return a text report with one line per label.

    The comment is vectorized, passed through the model, and each predicted
    value is compared against 0.5. Every line of the returned string has the
    form "<label column>: <True|False>" and ends with a newline.
    """
    scores = model.predict(vectorizer([comment]))[0]
    label_names = df.columns[2:]

    report_lines = [
        '{}: {}\n'.format(name, scores[position] > 0.5)
        for position, name in enumerate(label_names)
    ]
    return ''.join(report_lines)

In [None]:
# Text in, text out. gr.Textbox is the component class itself; the older
# gr.inputs.Textbox namespace is not available in current Gradio releases.
interface = gr.Interface(fn=score_comment,
                         inputs=gr.Textbox(lines=2, placeholder='Comment to score'),
                         outputs='text')

In [None]:
interface.launch(share=True)