In [1]:
import os
import warnings

# TF_CPP_MIN_LOG_LEVEL has to be in the environment before TensorFlow is imported
# for it to affect TensorFlow's native (C++) log output, so it is set ahead of the
# third-party imports below rather than after them.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split, KFold
from sklearn.utils.class_weight import compute_class_weight
from tensorflow.data import Dataset
from transformers import BertTokenizer, TFBertForSequenceClassification, create_optimizer

# Quiet the Python-side TensorFlow logger and Python warnings for the rest of the notebook.
tf.get_logger().setLevel('ERROR')
warnings.filterwarnings("ignore")

E0000 00:00:1726665551.815467      13 common_lib.cc:798] Could not set metric server port: INVALID_ARGUMENT: Could not find SliceBuilder port 8471 in any of the 0 ports provided in `tpu_process_addresses`="local"
=== Source Location Trace: ===
learning/45eac/tfrc/runtime/common_lib.cc:479
D0918 13:19:11.823516771      13 config.cc:196]                        gRPC EXPERIMENT call_status_override_on_cancellation   OFF (default:OFF)
D0918 13:19:11.823531249      13 config.cc:196]                        gRPC EXPERIMENT call_v3                                OFF (default:OFF)
D0918 13:19:11.823534656      13 config.cc:196]                        gRPC EXPERIMENT canary_client_privacy                  ON  (default:ON)
D0918 13:19:11.823537071      13 config.cc:196]                        gRPC EXPERIMENT capture_base_context                   ON  (default:ON)
D0918 13:19:11.823539426      13 config.cc:196]                        gRPC EXPERIMENT client_idleness                        ON  (defau

In [2]:
# TPU initialization
# The four statements below depend on each other and must run in this order:
# the resolver is created, the runtime is connected to the cluster it describes,
# the TPU system is initialized, and only then is the distribution strategy built.
# TPUClusterResolver() is called with no arguments, so the TPU address is
# presumably discovered from the environment — TODO confirm for non-Kaggle runs.
resolver = tf.distribute.cluster_resolver.TPUClusterResolver()
tf.config.experimental_connect_to_cluster(resolver)
tf.tpu.experimental.initialize_tpu_system(resolver)
# `strategy` is a notebook-level global: train_and_evaluate() opens strategy.scope() on it.
strategy = tf.distribute.TPUStrategy(resolver)

I0000 00:00:1726665587.497260      13 service.cc:145] XLA service 0x5bd28597a480 initialized for platform TPU (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1726665587.497324      13 service.cc:153]   StreamExecutor device (0): TPU, 2a886c8
I0000 00:00:1726665587.497328      13 service.cc:153]   StreamExecutor device (1): TPU, 2a886c8
I0000 00:00:1726665587.497331      13 service.cc:153]   StreamExecutor device (2): TPU, 2a886c8
I0000 00:00:1726665587.497334      13 service.cc:153]   StreamExecutor device (3): TPU, 2a886c8
I0000 00:00:1726665587.497336      13 service.cc:153]   StreamExecutor device (4): TPU, 2a886c8
I0000 00:00:1726665587.497339      13 service.cc:153]   StreamExecutor device (5): TPU, 2a886c8
I0000 00:00:1726665587.497342      13 service.cc:153]   StreamExecutor device (6): TPU, 2a886c8
I0000 00:00:1726665587.497344      13 service.cc:153]   StreamExecutor device (7): TPU, 2a886c8


In [3]:
# Load the training CSV into a DataFrame.
# Later cells read the 'comment_text' column and the six label columns
# ('toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate') from `data`.
file_path = '/kaggle/input/toxic-dataset/combined_train_big.csv'
data = pd.read_csv(file_path)

# Tokenizer for the 'bert-base-uncased' checkpoint; it is passed to tokenize_texts()
# and saved next to the model at the end of the notebook.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Text tokenization function
def tokenize_texts(texts, labels, tokenizer, max_length=512):
    """Encode every text with ``tokenizer.encode_plus`` and pack the results as arrays.

    Args:
        texts: Iterable of texts; each item is handed to ``tokenizer.encode_plus``.
        labels: Labels aligned with ``texts``; converted with ``np.array`` and
            otherwise returned untouched.
        tokenizer: Object exposing ``encode_plus`` that returns a mapping with
            ``'input_ids'`` and ``'attention_mask'`` entries.
        max_length: Value forwarded as ``max_length``; encoding is requested with
            ``padding='max_length'`` and ``truncation=True``.

    Returns:
        Tuple ``(input_ids, attention_masks, labels)`` of NumPy arrays, with one
        row per text in the first two arrays.
    """
    # Encoding options are identical for every text, so build them once.
    encode_options = dict(
        add_special_tokens=True,
        max_length=max_length,
        return_token_type_ids=False,
        padding='max_length',
        return_attention_mask=True,
        truncation=True,
    )

    encoded = [tokenizer.encode_plus(text, **encode_options) for text in texts]

    ids_array = np.array([item['input_ids'] for item in encoded])
    mask_array = np.array([item['attention_mask'] for item in encoded])
    return ids_array, mask_array, np.array(labels)

# Function for creating TensorFlow datasets
def create_tf_dataset(input_ids, attention_masks, labels, batch_size=32, shuffle=True):
    """Build a batched, prefetched ``tf.data`` pipeline from tokenized inputs.

    Args:
        input_ids: Token-id rows, one per example.
        attention_masks: Attention-mask rows aligned with ``input_ids``.
        labels: Targets aligned with ``input_ids``.
        batch_size: Number of examples per batch.
        shuffle: When True (the default, matching the previous behaviour) the
            examples are shuffled with a buffer covering the whole input. Pass
            False for validation or inference data, where the order carries no
            training benefit and a full-size shuffle buffer is wasted work.

    Returns:
        A dataset yielding ``({'input_ids': ..., 'attention_mask': ...}, labels)``
        batches.
    """
    features = {'input_ids': input_ids, 'attention_mask': attention_masks}
    dataset = Dataset.from_tensor_slices((features, labels))
    if shuffle:
        # The buffer spans every example, so each pass is a full shuffle.
        dataset = dataset.shuffle(buffer_size=len(input_ids))
    return dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)


In [4]:
# Build the per-label weight dictionary
def get_class_weights(y_train):
    """Return, for each label column, the 'balanced' weight of its second class.

    Args:
        y_train: 2-D array-like of labels, one column per label.

    Returns:
        Dict mapping column index to a weight. For a column with at least two
        distinct values this is element ``[1]`` of sklearn's 'balanced' class
        weights computed over the column's sorted unique values (the positive
        class for 0/1 labels). A column with a single distinct value (for example
        a rare label with no positives in this fold) has nothing to balance and
        gets the neutral weight 1.0; previously that case raised ``IndexError``.
    """
    y_train = np.asarray(y_train)
    class_weights_dict = {}
    for i in range(y_train.shape[1]):
        column = y_train[:, i]
        classes = np.unique(column)
        if classes.size < 2:
            # Only one class present: indexing [1] below would be out of range.
            class_weights_dict[i] = 1.0
            continue
        class_weights = compute_class_weight(class_weight='balanced', classes=classes, y=column)
        class_weights_dict[i] = class_weights[1]
    return class_weights_dict

# Batch size the cross-validation cell passes to create_tf_dataset() for both
# the training and the validation pipeline.
BATCH_SIZE = 32

In [5]:
# Function to train and evaluate model
@tf.autograph.experimental.do_not_convert
def train_and_evaluate(train_dataset, val_dataset, class_weights_dict, epochs=3, num_labels=6):
    """Fine-tune 'bert-base-uncased' as a multi-label classifier and return it.

    The model is created, compiled and fitted inside the notebook-level
    ``strategy.scope()``. Training stops early when ``val_loss`` fails to improve
    for one epoch, and the best weights are restored.

    Args:
        train_dataset: Batched ``tf.data`` dataset of ``(features, labels)``. Its
            cardinality (number of batches) must be known, because it sizes the
            learning-rate schedule.
        val_dataset: Batched validation dataset with the same structure.
        class_weights_dict: Mapping ``label index -> weight applied to positive
            targets of that label``. Missing indices, or a falsy value, mean 1.0.
        epochs: Maximum number of epochs (default 3).
        num_labels: Number of output labels (default 6).

    Returns:
        The fitted ``TFBertForSequenceClassification`` model. Its classifier layer
        has a sigmoid activation patched in, so its ``logits`` output holds
        per-label probabilities.

    Raises:
        ValueError: If the cardinality of ``train_dataset`` is unknown, infinite
            or zero.
    """
    # Take the step count from the dataset itself instead of a notebook global,
    # so the schedule always matches the data actually passed in.
    steps_per_epoch = int(train_dataset.cardinality().numpy())
    if steps_per_epoch <= 0:
        raise ValueError(
            "train_dataset must have a known, finite, non-zero number of batches "
            "to size the learning-rate schedule"
        )
    num_train_steps = steps_per_epoch * epochs
    num_warmup_steps = int(num_train_steps * 0.1)

    if class_weights_dict:
        positive_weights = [float(class_weights_dict.get(i, 1.0)) for i in range(num_labels)]
    else:
        positive_weights = [1.0] * num_labels

    with strategy.scope():
        model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_labels)
        # Make the head emit probabilities; the loss below therefore works on
        # probabilities, not raw logits.
        model.classifier.activation = tf.keras.activations.sigmoid

        optimizer, _ = create_optimizer(init_lr=2e-5, num_warmup_steps=num_warmup_steps, num_train_steps=num_train_steps)

        pos_weight = tf.constant(positive_weights, dtype=tf.float32)

        def weighted_binary_crossentropy(y_true, y_pred):
            """Binary cross-entropy with positive targets of label i scaled by pos_weight[i]."""
            y_true = tf.cast(y_true, y_pred.dtype)
            per_label = tf.keras.backend.binary_crossentropy(y_true, y_pred)
            # Weight is pos_weight where the target is 1 and 1.0 where it is 0.
            weights = 1.0 + y_true * (tf.cast(pos_weight, y_pred.dtype) - 1.0)
            return tf.reduce_mean(per_label * weights, axis=-1)

        # Keras' `class_weight=` argument is not used: with multi-hot targets it
        # derives one weight per sample from argmax(labels), which treats the
        # label vector as one-hot. The per-label weighting lives in the loss.
        # 'accuracy' would resolve to categorical accuracy for a 6-wide output,
        # so the binary (per-label, thresholded) metric is requested explicitly.
        model.compile(
            optimizer=optimizer,
            loss=weighted_binary_crossentropy,
            metrics=[tf.keras.metrics.BinaryAccuracy(name='accuracy')],
        )

        early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=1, restore_best_weights=True)

        model.fit(train_dataset, epochs=epochs, validation_data=val_dataset, callbacks=[early_stopping])

    return model


In [6]:
# Cross-validation
LABEL_COLUMNS = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# Each text is encoded independently of the others, so the whole corpus is
# tokenized once and the folds are cut out of the resulting arrays by index.
# Re-tokenizing inside the loop repeated the same work for every fold.
# This keeps the full arrays in memory next to the per-fold slices.
all_texts = data['comment_text'].tolist()
all_input_ids, all_attention_masks, all_labels = tokenize_texts(
    all_texts, data[LABEL_COLUMNS].values.tolist(), tokenizer
)

kf = KFold(n_splits=4)
for train_index, val_index in kf.split(data):
    # The raw text lists stay defined as notebook globals for any cell that reads them.
    train_texts = [all_texts[i] for i in train_index]
    val_texts = [all_texts[i] for i in val_index]

    train_input_ids = all_input_ids[train_index]
    train_attention_masks = all_attention_masks[train_index]
    train_labels = all_labels[train_index]

    val_input_ids = all_input_ids[val_index]
    val_attention_masks = all_attention_masks[val_index]
    val_labels = all_labels[val_index]

    train_dataset = create_tf_dataset(train_input_ids, train_attention_masks, train_labels, BATCH_SIZE)
    val_dataset = create_tf_dataset(val_input_ids, val_attention_masks, val_labels, BATCH_SIZE)

    class_weights_dict = get_class_weights(train_labels)

    # `model` and `val_dataset` are overwritten on every fold; after the loop
    # they refer to the last fold only, which is what the later cells use.
    model = train_and_evaluate(train_dataset, val_dataset, class_weights_dict)

I0000 00:00:1726665953.621150      13 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.
All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3
Cause: for/else statement not yet supported


2024-09-18 13:27:19.459808: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:961] model_pruner failed: INVALID_ARGUMENT: Graph does not contain terminal node Adam/AssignAddVariableOp.
I0000 00:00:1726666042.918943     793 tpu_compilation_cache_interface.cc:441] TPU host compilation cache miss: cache_key(4858de178e0d715e:0:0), session_name()
I0000 00:00:1726666078.573064     793 tpu_compile_op_common.cc:245] Compilation of 4858de178e0d715e:0:0 with session name  took 35.65406863s and succeeded
I0000 00:00:1726666078.661586     793 tpu_compilation_cache_interface.cc:475] TPU host compilation cache: compilation complete for cache_key(4858de178e0d715e:0:0), session_name(), subgraph_key(std::string(property.function_name) = "cluster_train_function_5122297613318733526", property.function_library_fingerprint = 17217561791339723853, property.mlir_module_fingerprint = 0, property.num_replicas = 8, topology.chip_bounds().x = 2, topology.chip_bounds().y = 2, topology.chip_bounds().z = 1, t



I0000 00:00:1726666736.340465     786 tpu_compilation_cache_interface.cc:441] TPU host compilation cache miss: cache_key(c7d75aece12fd226:0:0), session_name()
I0000 00:00:1726666767.063154     786 tpu_compile_op_common.cc:245] Compilation of c7d75aece12fd226:0:0 with session name  took 30.722646132s and succeeded
I0000 00:00:1726666767.166112     786 tpu_compilation_cache_interface.cc:475] TPU host compilation cache: compilation complete for cache_key(c7d75aece12fd226:0:0), session_name(), subgraph_key(std::string(property.function_name) = "cluster_train_function_5122297613318733526", property.function_library_fingerprint = 17217561791339723853, property.mlir_module_fingerprint = 0, property.num_replicas = 8, topology.chip_bounds().x = 2, topology.chip_bounds().y = 2, topology.chip_bounds().z = 1, topology.wrap().x = false, topology.wrap().y = false, topology.wrap().z = false, std::string(property.shapes_prefix) = "2,512,;2,512,;2,6,;2,;", property.guaranteed_constants_size = 0, embedd



2024-09-18 13:39:41.211299: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:961] model_pruner failed: INVALID_ARGUMENT: Graph does not contain terminal node AssignAddVariableOp.
I0000 00:00:1726666782.035300     752 tpu_compilation_cache_interface.cc:441] TPU host compilation cache miss: cache_key(fdf31cc3fc1b7c21:0:0), session_name()
I0000 00:00:1726666785.910481     752 tpu_compile_op_common.cc:245] Compilation of fdf31cc3fc1b7c21:0:0 with session name  took 3.875139905s and succeeded
I0000 00:00:1726666785.940368     752 tpu_compilation_cache_interface.cc:475] TPU host compilation cache: compilation complete for cache_key(fdf31cc3fc1b7c21:0:0), session_name(), subgraph_key(std::string(property.function_name) = "cluster_test_function_2009172832449519177", property.function_library_fingerprint = 17168402920806792758, property.mlir_module_fingerprint = 0, property.num_replicas = 8, topology.chip_bounds().x = 2, topology.chip_bounds().y = 2, topology.chip_bounds().z = 1, topolog

Epoch 2/3
Epoch 3/3


All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3


2024-09-18 14:11:59.052546: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:961] model_pruner failed: INVALID_ARGUMENT: Graph does not contain terminal node Adam/AssignAddVariableOp.
I0000 00:00:1726668722.213102     745 tpu_compilation_cache_interface.cc:441] TPU host compilation cache miss: cache_key(27f0953edccf7fbf:0:0), session_name()
I0000 00:00:1726668755.531585     745 tpu_compile_op_common.cc:245] Compilation of 27f0953edccf7fbf:0:0 with session name  took 33.318417005s and succeeded
I0000 00:00:1726668755.625689     745 tpu_compilation_cache_interface.cc:475] TPU host compilation cache: compilation complete for cache_key(27f0953edccf7fbf:0:0), session_name(), subgraph_key(std::string(property.function_name) = "cluster_train_function_10241426794807312982", property.function_library_fingerprint = 11486421508345242871, property.mlir_module_fingerprint = 0, property.num_replicas = 8, topology.chip_bounds().x = 2, topology.chip_bounds().y = 2, topology.chip_bounds().z = 1,



I0000 00:00:1726669410.809143     818 tpu_compilation_cache_interface.cc:441] TPU host compilation cache miss: cache_key(8516bf5ca15e9f0b:0:0), session_name()
I0000 00:00:1726669443.161667     818 tpu_compile_op_common.cc:245] Compilation of 8516bf5ca15e9f0b:0:0 with session name  took 32.352483506s and succeeded
I0000 00:00:1726669443.261911     818 tpu_compilation_cache_interface.cc:475] TPU host compilation cache: compilation complete for cache_key(8516bf5ca15e9f0b:0:0), session_name(), subgraph_key(std::string(property.function_name) = "cluster_train_function_10241426794807312982", property.function_library_fingerprint = 11486421508345242871, property.mlir_module_fingerprint = 0, property.num_replicas = 8, topology.chip_bounds().x = 2, topology.chip_bounds().y = 2, topology.chip_bounds().z = 1, topology.wrap().x = false, topology.wrap().y = false, topology.wrap().z = false, std::string(property.shapes_prefix) = "2,512,;2,512,;2,6,;2,;", property.guaranteed_constants_size = 0, embed



2024-09-18 14:24:15.266237: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:961] model_pruner failed: INVALID_ARGUMENT: Graph does not contain terminal node AssignAddVariableOp.
I0000 00:00:1726669456.053447     745 tpu_compilation_cache_interface.cc:441] TPU host compilation cache miss: cache_key(1cba89f4931f67c8:0:0), session_name()
I0000 00:00:1726669460.032569     745 tpu_compile_op_common.cc:245] Compilation of 1cba89f4931f67c8:0:0 with session name  took 3.979073768s and succeeded
I0000 00:00:1726669460.058069     745 tpu_compilation_cache_interface.cc:475] TPU host compilation cache: compilation complete for cache_key(1cba89f4931f67c8:0:0), session_name(), subgraph_key(std::string(property.function_name) = "cluster_test_function_11800034296103869966", property.function_library_fingerprint = 15581170155482090615, property.mlir_module_fingerprint = 0, property.num_replicas = 8, topology.chip_bounds().x = 2, topology.chip_bounds().y = 2, topology.chip_bounds().z = 1, topolo

Epoch 2/3
Epoch 3/3


All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3


2024-09-18 14:56:17.391155: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:961] model_pruner failed: INVALID_ARGUMENT: Graph does not contain terminal node Adam/AssignAddVariableOp.
I0000 00:00:1726671380.537177     783 tpu_compilation_cache_interface.cc:441] TPU host compilation cache miss: cache_key(79f1a9bcfe8d002b:0:0), session_name()
I0000 00:00:1726671413.553165     783 tpu_compile_op_common.cc:245] Compilation of 79f1a9bcfe8d002b:0:0 with session name  took 33.01593638s and succeeded
I0000 00:00:1726671413.640791     783 tpu_compilation_cache_interface.cc:475] TPU host compilation cache: compilation complete for cache_key(79f1a9bcfe8d002b:0:0), session_name(), subgraph_key(std::string(property.function_name) = "cluster_train_function_15571124377847101865", property.function_library_fingerprint = 6584244247188261076, property.mlir_module_fingerprint = 0, property.num_replicas = 8, topology.chip_bounds().x = 2, topology.chip_bounds().y = 2, topology.chip_bounds().z = 1, t



I0000 00:00:1726672074.704310     754 tpu_compilation_cache_interface.cc:441] TPU host compilation cache miss: cache_key(8e7f770baacd0f89:0:0), session_name()
I0000 00:00:1726672106.942216     754 tpu_compile_op_common.cc:245] Compilation of 8e7f770baacd0f89:0:0 with session name  took 32.237868053s and succeeded
I0000 00:00:1726672107.047152     754 tpu_compilation_cache_interface.cc:475] TPU host compilation cache: compilation complete for cache_key(8e7f770baacd0f89:0:0), session_name(), subgraph_key(std::string(property.function_name) = "cluster_train_function_15571124377847101865", property.function_library_fingerprint = 6584244247188261076, property.mlir_module_fingerprint = 0, property.num_replicas = 8, topology.chip_bounds().x = 2, topology.chip_bounds().y = 2, topology.chip_bounds().z = 1, topology.wrap().x = false, topology.wrap().y = false, topology.wrap().z = false, std::string(property.shapes_prefix) = "2,512,;2,512,;2,6,;2,;", property.guaranteed_constants_size = 0, embedd



2024-09-18 15:08:38.964789: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:961] model_pruner failed: INVALID_ARGUMENT: Graph does not contain terminal node AssignAddVariableOp.
I0000 00:00:1726672119.756164     793 tpu_compilation_cache_interface.cc:441] TPU host compilation cache miss: cache_key(c1246ea737cdacb8:0:0), session_name()
I0000 00:00:1726672123.854651     793 tpu_compile_op_common.cc:245] Compilation of c1246ea737cdacb8:0:0 with session name  took 4.098427319s and succeeded
I0000 00:00:1726672123.892710     793 tpu_compilation_cache_interface.cc:475] TPU host compilation cache: compilation complete for cache_key(c1246ea737cdacb8:0:0), session_name(), subgraph_key(std::string(property.function_name) = "cluster_test_function_1077735260969777257", property.function_library_fingerprint = 14383365556629852594, property.mlir_module_fingerprint = 0, property.num_replicas = 8, topology.chip_bounds().x = 2, topology.chip_bounds().y = 2, topology.chip_bounds().z = 1, topolog

Epoch 2/3
Epoch 3/3


All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3


2024-09-18 15:41:32.473402: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:961] model_pruner failed: INVALID_ARGUMENT: Graph does not contain terminal node Adam/AssignAddVariableOp.
I0000 00:00:1726674095.869344     832 tpu_compilation_cache_interface.cc:441] TPU host compilation cache miss: cache_key(690aa6400eb9f49d:0:0), session_name()
I0000 00:00:1726674130.156917     832 tpu_compile_op_common.cc:245] Compilation of 690aa6400eb9f49d:0:0 with session name  took 34.28750998s and succeeded
I0000 00:00:1726674130.267742     832 tpu_compilation_cache_interface.cc:475] TPU host compilation cache: compilation complete for cache_key(690aa6400eb9f49d:0:0), session_name(), subgraph_key(std::string(property.function_name) = "cluster_train_function_8832857086231035305", property.function_library_fingerprint = 16885022150832214614, property.mlir_module_fingerprint = 0, property.num_replicas = 8, topology.chip_bounds().x = 2, topology.chip_bounds().y = 2, topology.chip_bounds().z = 1, t



I0000 00:00:1726674815.423539     820 tpu_compilation_cache_interface.cc:441] TPU host compilation cache miss: cache_key(d5df74647f3dbd4a:0:0), session_name()
I0000 00:00:1726674848.720514     820 tpu_compile_op_common.cc:245] Compilation of d5df74647f3dbd4a:0:0 with session name  took 33.296924843s and succeeded
I0000 00:00:1726674848.823981     820 tpu_compilation_cache_interface.cc:475] TPU host compilation cache: compilation complete for cache_key(d5df74647f3dbd4a:0:0), session_name(), subgraph_key(std::string(property.function_name) = "cluster_train_function_8832857086231035305", property.function_library_fingerprint = 16885022150832214614, property.mlir_module_fingerprint = 0, property.num_replicas = 8, topology.chip_bounds().x = 2, topology.chip_bounds().y = 2, topology.chip_bounds().z = 1, topology.wrap().x = false, topology.wrap().y = false, topology.wrap().z = false, std::string(property.shapes_prefix) = "2,512,;2,512,;2,6,;2,;", property.guaranteed_constants_size = 0, embedd



2024-09-18 15:54:22.145485: E tensorflow/core/grappler/optimizers/meta_optimizer.cc:961] model_pruner failed: INVALID_ARGUMENT: Graph does not contain terminal node AssignAddVariableOp.
I0000 00:00:1726674862.954378     798 tpu_compilation_cache_interface.cc:441] TPU host compilation cache miss: cache_key(66213c2850d3f346:0:0), session_name()
I0000 00:00:1726674867.306162     798 tpu_compile_op_common.cc:245] Compilation of 66213c2850d3f346:0:0 with session name  took 4.35174442s and succeeded
I0000 00:00:1726674867.343167     798 tpu_compilation_cache_interface.cc:475] TPU host compilation cache: compilation complete for cache_key(66213c2850d3f346:0:0), session_name(), subgraph_key(std::string(property.function_name) = "cluster_test_function_8003081260919262684", property.function_library_fingerprint = 15087900916295914488, property.mlir_module_fingerprint = 0, property.num_replicas = 8, topology.chip_bounds().x = 2, topology.chip_bounds().y = 2, topology.chip_bounds().z = 1, topology

Epoch 2/3
Epoch 3/3


In [7]:
# Save the model and the tokenizer into the same directory.
# `model` is whatever the cross-validation loop assigned last, i.e. the model
# trained on the final fold; the earlier folds' models are not kept.
# NOTE(review): train_and_evaluate() patches a sigmoid onto
# model.classifier.activation at runtime. Confirm whether that survives
# save_pretrained()/from_pretrained(); if not, a reloaded model emits raw logits.
model_save_path = "/kaggle/working/model_save"
model.save_pretrained(model_save_path)
tokenizer.save_pretrained(model_save_path)

('/kaggle/working/model_save/tokenizer_config.json',
 '/kaggle/working/model_save/special_tokens_map.json',
 '/kaggle/working/model_save/vocab.txt',
 '/kaggle/working/model_save/added_tokens.json')

In [None]:
# Evaluate the current `model` on the current `val_dataset` (both left over from
# the last cross-validation fold) and print a per-label classification report.
LABEL_NAMES = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
THRESHOLD = 0.4  # a score above this counts as a positive prediction

batch_scores = []
batch_labels = []

# Scores and labels are collected from the same batch so they stay aligned even
# though the dataset is shuffled on every pass.
for features, labels in val_dataset:
    outputs = model.predict(features, verbose=0)
    # A Hugging Face TF model returns a dict-like output from predict(), not a
    # bare array; iterating it yields its keys. Pull the score matrix out by key.
    # (The classifier has a sigmoid patched in, so "logits" holds probabilities.)
    scores = outputs if isinstance(outputs, np.ndarray) else outputs["logits"]
    batch_scores.append(np.asarray(scores))
    batch_labels.append(labels.numpy())

predictions = np.concatenate(batch_scores, axis=0)
true_labels = np.concatenate(batch_labels, axis=0)

# Binarize predictions
predictions = np.where(predictions > THRESHOLD, 1, 0)

# Print classification report
report = classification_report(true_labels, predictions, target_names=LABEL_NAMES, zero_division=0)
print(report)