<a href="https://www.kaggle.com/code/skwsampath/ml-project?scriptVersionId=168012599" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# 0. Install Dependencies and Bring in Data

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from tabulate import tabulate

from tensorflow import keras
from tensorflow.keras.layers import TextVectorization, Embedding, LSTM, Dense, Bidirectional
from tensorflow.keras.models import Sequential
from tensorflow.keras.metrics import Precision, Recall, CategoricalAccuracy

import os
# Print the full path of every file found under the Kaggle input directory.
input_file_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

In [None]:
# Load the training CSV from the Kaggle input directory into a DataFrame.
df = pd.read_csv('/kaggle/input/comment-classification/train.csv')

In [None]:
# Preview the first rows of the data.
df.head()

In [None]:
# Preview the last rows of the data.
df.tail()

In [None]:
# Show the raw text of the first comment.
df.iloc[0]['comment_text']

In [None]:
# Show the first row's values for every column from index 2 onward
# (these columns are used as the label matrix `y` further down).
df[df.columns[2:]].iloc[0]

# 1. Preprocess

In [None]:
# Names of the columns from index 2 onward (the label columns used for `y` below).
df.columns[2:]

In [None]:
# The same columns as a NumPy array, one row per comment.
df[df.columns[2:]].values

In [None]:
# X: raw comment text; y: array built from the label columns (index 2 onward).
X = df['comment_text']
y = df[df.columns[2:]].values

In [None]:
MAX_FEATURES = 200000 # Maximum number of features in the vocabulary
NUM_CLASSES = 6  # Width of the model's output layer; NOTE(review): presumably equals the number of label columns in `y` — confirm

In [None]:
# Text vectorization layer
# Maps raw strings to integer token ids; the vocabulary is capped at
# MAX_FEATURES and every output sequence has length 1800.
vectorizer = TextVectorization(max_tokens=MAX_FEATURES,
                               output_sequence_length=1800,
                               output_mode='int')

In [None]:
# Build the vocabulary from the comment texts.
vectorizer.adapt(X.values)

In [None]:
# Sanity check: first five token ids for a sample sentence.
vectorizer('Hello world, life is great')[:5]

In [None]:
# Vectorize every comment up front; the bare expression displays the result.
vectorized_text = vectorizer(X.values)
vectorized_text

In [None]:
# Create a TensorFlow dataset from tensor slices
dataset = tf.data.Dataset.from_tensor_slices((vectorized_text, y))

# Apply the MCSHBAP pipeline: map, cache, shuffle, batch, prefetch
# (no map step is needed here because the text is already vectorized).
dataset = dataset.cache()
# Shuffle once, with a fixed order. The default (reshuffle_each_iteration=True)
# produces a new order on every pass over the dataset, so splits carved out
# later with take()/skip() would contain different examples each epoch and
# leak training examples into the validation/test splits.
dataset = dataset.shuffle(buffer_size=160000, reshuffle_each_iteration=False)
dataset = dataset.batch(128)
dataset = dataset.prefetch(8)  # Helps in reducing bottlenecks

In [None]:
# Pull a single (features, labels) batch out of the pipeline for inspection.
batch_X, batch_y = next(dataset.as_numpy_iterator())

In [None]:
# Shape of the feature batch pulled above.
batch_X.shape

In [None]:
# Shape of the matching label batch.
batch_y.shape

In [None]:
# Work out how many batches go to each split: 70% train, 20% validation,
# and whatever remains for test. len(dataset) counts batches, not rows,
# because the dataset has already been batched.
total_batches = len(dataset)
train_size = int(total_batches * 0.7)
val_size = int(total_batches * 0.2)
test_size = total_batches - train_size - val_size

# Carve the three splits out of the same dataset by position.
# NOTE(review): take()/skip() only yield disjoint splits if the dataset is
# iterated in the same order on every pass — confirm the upstream shuffle
# does not reshuffle on each iteration.
train_dataset = dataset.take(train_size)
after_train = dataset.skip(train_size)
val_dataset = after_train.take(val_size)
test_dataset = after_train.skip(val_size).take(test_size)

# 2. Create Sequential Model

In [None]:
# Define the model architecture:
# embedding -> bidirectional LSTM -> dense stack -> one sigmoid unit per class.

# Explicit forward/backward LSTM pair handed to the Bidirectional wrapper.
# One-line alternative kept for reference:
#   Bidirectional(LSTM(64, activation='tanh'))
forward_layer = LSTM(64)
backward_layer = LSTM(64, activation='tanh', go_backwards=True)

model = Sequential([
    Embedding(MAX_FEATURES + 1, 32),
    Bidirectional(forward_layer, backward_layer=backward_layer),
    Dense(128, activation='relu'),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(NUM_CLASSES, activation='sigmoid'),
])

In [None]:
# Compile the model
# Each of the NUM_CLASSES sigmoid outputs is an independent yes/no label, so
# pair binary cross-entropy with an explicit element-wise binary accuracy.
# The bare string 'accuracy' lets Keras pick the metric class, and for a
# multi-unit output it can resolve to categorical (argmax) accuracy, which is
# misleading for multi-label targets. The metric keeps the name 'accuracy' so
# the history keys 'accuracy' / 'val_accuracy' are unchanged.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=[tf.keras.metrics.BinaryAccuracy(name='accuracy')],
)


In [None]:
# Build the model for 2-D input with unspecified batch and sequence dimensions.
model.build(input_shape=(None, None))

In [None]:
# Print the layer-by-layer summary.
model.summary()

In [None]:
# Train the model
# Five epochs on the training split, validating on the validation split.
history = model.fit(train_dataset, epochs=5, validation_data = val_dataset)

In [None]:
def _plot_curve(epochs, train_values, val_values, metric):
    """Draw training (dots) and validation (line) values of one metric on the current figure."""
    plt.plot(epochs, train_values, 'bo', label=f'Training {metric}')
    plt.plot(epochs, val_values, 'b', label=f'Validation {metric}')
    plt.title(f'Training and validation {metric}')
    plt.xlabel('Epochs')
    plt.ylabel(metric.capitalize())
    plt.legend()


# Per-epoch values recorded in the history object returned by fit().
logs = history.history
train_loss, val_loss = logs['loss'], logs['val_loss']
train_accuracy, val_accuracy = logs['accuracy'], logs['val_accuracy']
epochs = range(1, len(train_loss) + 1)

# Loss curves go on the first figure.
_plot_curve(epochs, train_loss, val_loss, 'loss')

# Accuracy curves go on a second, separate figure.
plt.figure()
_plot_curve(epochs, train_accuracy, val_accuracy, 'accuracy')

plt.show()


# 3. Make Predictions

In [None]:
# Vectorize a sample comment with the fitted vectorizer.
input_text = vectorizer('You freaking suck! I am going to hit you.')

In [None]:
# Display the token ids.
input_text

In [None]:
# Add a leading batch axis so the single sample can be passed to predict().
np.expand_dims(input_text,0)

In [None]:
# Model outputs (one sigmoid score per class) for the sample comment.
model.predict(np.expand_dims(input_text,0))

In [None]:
# Same prediction, kept in `res`.
res = model.predict(np.expand_dims(input_text,0))

In [None]:
# Predict per-class probabilities for every batch in the test split.
predictions = model.predict(test_dataset)
# The model is multi-label (one independent sigmoid per class), so a comment
# can belong to several classes or to none. argmax would force exactly one
# class per comment; threshold each probability at 0.5 instead, giving a
# 0/1 matrix with one column per class.
predicted_labels = (predictions > 0.5).astype(int)

# 4. Evaluate Model

In [None]:
# Evaluate the model on the test dataset; the result holds the loss first,
# followed by the metric configured at compile time.
evaluation = model.evaluate(test_dataset)
test_loss, test_accuracy = evaluation[0], evaluation[1]

# Print the evaluation results
print("Test Loss:", test_loss)
print("Test Accuracy:", test_accuracy)

In [None]:
# Streaming metrics, updated batch by batch over the test split.
pre = Precision()
re = Recall()
# The labels are independent 0/1 flags and the evaluation loop flattens them,
# so accuracy must be element-wise binary accuracy (prediction thresholded at
# 0.5). CategoricalAccuracy compares argmax positions along the last axis,
# which is meaningless for flattened multi-label vectors.
acc = tf.keras.metrics.BinaryAccuracy()

In [None]:
# Accumulate precision, recall and accuracy over every batch of the test split.
for X_true, y_true in test_dataset.as_numpy_iterator():
    # Run inference on this single batch. predict_on_batch avoids the per-call
    # setup and progress-bar output of model.predict(), which is designed for
    # whole datasets rather than for calls inside a loop.
    yhat = model.predict_on_batch(X_true)

    # Flatten labels and predictions so every (sample, class) pair is scored
    # as one binary decision.
    y_true = y_true.flatten()
    yhat = yhat.flatten()

    pre.update_state(y_true, yhat)
    re.update_state(y_true, yhat)
    acc.update_state(y_true, yhat)
# Print finish message after processing all batches
print("Processing complete.")

In [None]:
# Report the metric values accumulated over the test batches.
precision_value = pre.result().numpy()
recall_value = re.result().numpy()
accuracy_value = acc.result().numpy()
print(f'Precision: {precision_value}, Recall:{recall_value}, Accuracy:{accuracy_value}')

# 5. Test and Gradio

In [None]:
# Optional cleanup: uncomment to delete a previously saved model file.
#os.remove('comment_classification.keras')

In [None]:
# Vectorize another sample comment.
input_str = vectorizer('hey i freaken hate you!')

In [None]:
# Predict on the single sample (batch axis added with expand_dims).
res = model.predict(np.expand_dims(input_str,0))

In [None]:
# Display the model outputs for the sample.
res

In [None]:
# Save the trained model to disk.
model.save('comment_classification.keras')  

In [None]:
# Uncomment to reload the saved model from disk.
#loaded_model = tf.keras.models.load_model('comment_classification.keras')

In [62]:
def score_comment(comment, threshold=0.5):
    """Classify one comment and return the result as a text table.

    Relies on the module-level ``vectorizer``, ``model`` and ``df``.

    Args:
        comment: Raw comment text to score.
        threshold: A category is reported as "True" when its predicted
            score is strictly greater than this value. Defaults to 0.5.

    Returns:
        A grid-formatted table (string) with one row per label column of
        ``df`` (columns from index 2 onward) and its "True"/"False"
        prediction.
    """
    vectorized_comment = vectorizer(comment)  # Transform the comment using TextVectorization
    # predict() expects a batch, so add a leading axis; row 0 is the only sample.
    scores = model.predict(np.expand_dims(vectorized_comment, 0))[0]

    # One [category, prediction] row per label column, in column order.
    data = [
        [col, "True" if score > threshold else "False"]
        for col, score in zip(df.columns[2:], scores)
    ]

    # Create tabular output
    return tabulate(data, headers=["Category", "Prediction"], tablefmt="grid")

# Example usage: read a comment from the user and print its prediction table.
comment = input("Enter the comment: ")
prediction_table = score_comment(comment)
print(prediction_table)

Enter the comment:  kill you


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 54ms/step
+---------------+--------------+
| Category      | Prediction   |
| toxic         | True         |
+---------------+--------------+
| severe_toxic  | False        |
+---------------+--------------+
| obscene       | False        |
+---------------+--------------+
| threat        | False        |
+---------------+--------------+
| insult        | False        |
+---------------+--------------+
| identity_hate | False        |
+---------------+--------------+
