In [None]:
# Installing the required Python packages for the model:
# - tensorflow: for deep learning
# - transformers: for using BERT models
# - scikit-learn: for preprocessing and evaluation
# - pandas: for data handling

!pip install tensorflow transformers scikit-learn pandas

In [None]:
# Optional: mount Google Drive when the notebook runs on Google Colab.
# Outside Colab the `google.colab` package is not importable, so the mount is
# skipped instead of aborting a "Run All".
try:
    from google.colab import drive
except ImportError:
    print("google.colab is not available; skipping the Google Drive mount.")
else:
    drive.mount('/content/drive')

In [None]:
# Importing essential libraries:
# - pandas: for reading and manipulating CSV data
# - tensorflow: for building the deep learning model
# - transformers: to use BERT tokenizer and model
# - train_test_split: to split the dataset into training and validation sets

import pandas as pd
import tensorflow as tf
from transformers import BertTokenizer, TFBertModel
from sklearn.model_selection import train_test_split

In [None]:
# Load the training data and preview the first rows.
# The copy on Google Drive is tried first (Colab workflow: upload train.csv to
# "My Drive"); if that file does not exist, fall back to a train.csv located in
# the current working directory.
try:
    df = pd.read_csv('/content/drive/MyDrive/train.csv')
except FileNotFoundError:
    df = pd.read_csv("train.csv")

df.head()

In [None]:
# Separate the model inputs from the targets:
# - X: the raw comment texts (column 'comment_text')
# - y: a 2-D array built from every column from the third one onward,
#      i.e. the toxicity label columns

y = df.loc[:, df.columns[2:]].values
X = df['comment_text']

In [None]:
# Fetch the pretrained 'bert-base-uncased' tokenizer (Hugging Face transformers)
tokenizer = BertTokenizer.from_pretrained(
    pretrained_model_name_or_path='bert-base-uncased'
)

In [None]:
def tokenize_texts(texts, max_len=128):
    """Encode an iterable of texts with the notebook-level BERT `tokenizer`.

    Args:
        texts: iterable of texts; it is materialized into a list before being
            handed to the tokenizer.
        max_len: length every sequence is padded or truncated to.

    Returns:
        Whatever the tokenizer returns for the batch, requested as TensorFlow
        tensors (`return_tensors="tf"`).
    """
    encode_options = dict(
        padding='max_length',
        truncation=True,
        max_length=max_len,
        return_tensors="tf",
    )
    batch = list(texts)
    return tokenizer(batch, **encode_options)

In [None]:
# Run every comment text through the tokenizer and pull out the two
# tensors the model consumes: token ids and the attention mask.
tokens = tokenize_texts(X)
input_ids, attention_mask = tokens['input_ids'], tokens['attention_mask']

In [None]:
# Turn the token tensors into NumPy arrays, then split ids, masks and labels
# together into an 80% training part and a 20% validation part (fixed seed).
X_ids, X_mask = input_ids.numpy(), attention_mask.numpy()

(
    X_ids_train, X_ids_val,
    X_mask_train, X_mask_val,
    y_train, y_val,
) = train_test_split(X_ids, X_mask, y, random_state=42, test_size=0.2)

In [None]:
# Create TensorFlow datasets for training and validation.
# Each element pairs a dictionary of input IDs and attention masks with its labels.
def make_dataset(ids, mask, labels, batch_size=16, shuffle=False, seed=42):
    """Build a batched, prefetched `tf.data.Dataset` of (features, labels).

    Args:
        ids: array of token ids, one row per example.
        mask: array of attention masks, one row per example.
        labels: array of labels, one row per example.
        batch_size: number of examples per batch.
        shuffle: when True, examples are shuffled before batching; the
            dataset reshuffles on every iteration (i.e. every epoch).
        seed: seed passed to the shuffle step.

    Returns:
        A dataset yielding `({'input_ids': ..., 'attention_mask': ...}, labels)`
        batches.
    """
    features = {
        'input_ids': ids,
        'attention_mask': mask
    }
    dataset = tf.data.Dataset.from_tensor_slices((features, labels))
    if shuffle:
        # Keras does not shuffle dataset inputs itself, so shuffling has to be
        # part of the input pipeline. The buffer spans the whole set so the
        # shuffle is uniform.
        dataset = dataset.shuffle(buffer_size=len(labels), seed=seed)
    return dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)


# Only the training data is shuffled; validation order does not matter.
train_dataset = make_dataset(X_ids_train, X_mask_train, y_train, shuffle=True)

val_dataset = make_dataset(X_ids_val, X_mask_val, y_val)

In [None]:
# Build the model:
# - Load the pretrained BERT model
# - Add input layers for BERT
# - Use BERT pooled output
# - Add dense and dropout layers
# - Final layer with sigmoid activation for multi-label classification
bert_model = TFBertModel.from_pretrained('bert-base-uncased')

# Read the sequence length and the number of labels from the prepared training
# arrays so the model always matches the tokenized data and the label matrix.
seq_len = X_ids_train.shape[1]
num_labels = y_train.shape[1]

# Build classification model.
# The symbolic inputs get their own Python names so they do not overwrite the
# `input_ids` / `attention_mask` token tensors produced by the tokenization
# cell; that cell and the NumPy conversion can then be re-run safely.
# The Keras layer names stay 'input_ids' / 'attention_mask' because they must
# match the feature keys of the datasets.
input_ids_layer = tf.keras.Input(shape=(seq_len,), dtype=tf.int32, name='input_ids')
attention_mask_layer = tf.keras.Input(shape=(seq_len,), dtype=tf.int32, name='attention_mask')

bert_output = bert_model(input_ids_layer, attention_mask=attention_mask_layer)[1]  # pooled output
x = tf.keras.layers.Dense(128, activation='relu')(bert_output)
x = tf.keras.layers.Dropout(0.3)(x)
x = tf.keras.layers.Dense(64, activation='relu')(x)
# One independent sigmoid unit per label.
output = tf.keras.layers.Dense(num_labels, activation='sigmoid')(x)

model = tf.keras.Model(inputs=[input_ids_layer, attention_mask_layer], outputs=output)

In [None]:
# Configure training, then print the layer overview:
# - loss: binary crossentropy (multi-label classification)
# - metric: accuracy
# - optimizer: Adam with a small learning rate (2e-5)

model.compile(
    loss='binary_crossentropy',
    metrics=['accuracy'],
    optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5),
)

model.summary()

In [None]:
# Fit for 2 epochs on the training dataset,
# evaluating on the validation dataset after each epoch
model.fit(x=train_dataset, epochs=2, validation_data=val_dataset)

In [None]:
# Persist the trained model in TensorFlow SavedModel format
# (written as a folder named "bert_toxic_model_saved")
model.save(filepath="bert_toxic_model_saved", save_format="tf")

In [None]:
# Optional: pack the saved-model folder into a zip archive and, when running
# on Google Colab, download it through the browser.
import shutil

# make_archive returns the name of the archive it wrote; reuse it below instead
# of repeating the file name.
archive_path = shutil.make_archive('bert_toxic_model_saved', 'zip', 'bert_toxic_model_saved')

# The download helper only exists on Colab; elsewhere just report where the
# archive was written instead of failing on the import.
try:
    from google.colab import files
except ImportError:
    print(f"google.colab is not available; archive written to {archive_path}")
else:
    files.download(archive_path)