# Dataset

In [None]:
import pathlib
import tensorflow as tf

from datasets import load_dataset
from transformers import DistilBertTokenizerFast

In [None]:
class CivilCommentsDataset:
    """
    Loads and processes the `civil_comments` dataset: https://huggingface.co/datasets/civil_comments.

    Each split ("train", "validation", "test") is tokenized with the DistilBERT tokenizer,
    wrapped in an unbatched `tf.data.Dataset` of `(features, labels)` pairs, and cached on
    disk under `data/<split>` so later runs can skip the tokenization step.
    """

    # Columns dropped before building the label array; whatever remains becomes the label.
    _NON_LABEL_COLUMNS = [
        "text", "identity_attack", "insult", "obscene", "severe_toxicity", "sexual_explicit", "threat"
    ]

    def __init__(self, max_examples=100):
        """
        max_examples: Number of leading rows of each split to keep. Defaults to 100, the
            size that was previously hard-coded. Pass None to use the whole split.
            Only affects splits that are built in this run: a split already cached under
            `data/` is loaded as-is, whatever size it was saved with.
        """

        if max_examples is not None and max_examples < 0:
            raise ValueError(f"max_examples must be non-negative or None, got {max_examples}")

        print("Building dataset...")
        # Load tokenizer
        self.tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
        # The raw Hugging Face dataset is only downloaded if some split has to be built
        self.dataset = None

        self.data_path = pathlib.Path("data")
        self.max_examples = max_examples

        # Load/build data sets
        self.train_data = self.load_or_generate_tf_dataset("train")
        self.val_data = self.load_or_generate_tf_dataset("validation")
        self.test_data = self.load_or_generate_tf_dataset("test")

    def load_or_generate_tf_dataset(self, split):
        """
        Return the `tf.data.Dataset` for `split`, loading it from disk when a cached copy
        exists and building (then caching) it otherwise.

        split: Name of the split to load, e.g. "train", "validation" or "test".

        Returns an unbatched dataset whose elements are `(features, labels)`, where
        `features` maps each of the tokenizer's model input names to its encoded values.
        """

        split_path = self.data_path / split

        # Cached copy available: nothing to download or tokenize
        if split_path.exists():
            return tf.data.experimental.load(str(split_path))

        if self.dataset is None:
            self.dataset = load_dataset("civil_comments")

        print(f"Building {split} data...")

        split_data = self.dataset[split]
        available = len(split_data)
        keep = available if self.max_examples is None else min(self.max_examples, available)

        # Generate features: tokenize only the rows that are kept
        encodings = self.tokenizer(split_data[0:keep]["text"], truncation=True, padding=True)
        features = {name: encodings[name] for name in self.tokenizer.model_input_names}

        # Generate labels: select the rows *before* converting to pandas/NumPy so that only
        # `keep` rows are materialised instead of the entire split
        labels = split_data.select(range(keep)).remove_columns(
            self._NON_LABEL_COLUMNS
        ).to_pandas().to_numpy()

        # Build dataset and save to disk
        data = tf.data.Dataset.from_tensor_slices((features, labels))
        tf.data.experimental.save(data, str(split_path))

        return data


# Model

In [None]:
import tensorflow as tf
import transformers
from transformers import TFDistilBertForSequenceClassification

In [None]:
class CivilityModel:
    """
    Trains a civility classifier model, leveraging Hugging Face and TensorFlow.

    Wraps a pretrained DistilBERT sequence-classification model whose DistilBERT main layer
    is frozen, so only the layers on top of it are trained.
    """

    def __init__(self, num_labels=7):

        """
        num_labels: Number of units in final dense layer of network. Defaults to 7 for the 7 categories of
            incivility in the dataset. Set to 1 for a simple civil/uncivil classifier.
        """

        # NOTE(review): CivilCommentsDataset in this file drops every label column except the
        # one(s) left after its remove_columns call, so with num_labels=7 the MSE loss appears to
        # rely on broadcasting a narrower label array against 7 outputs - confirm this is intended.

        # Define model and dataset
        self.dataset = CivilCommentsDataset()

        # Mirror the model across devices only when more than one GPU is visible
        if len(tf.config.list_physical_devices("GPU")) > 1:
            strategy = tf.distribute.MirroredStrategy()
            with strategy.scope():
                self.model = TFDistilBertForSequenceClassification.from_pretrained(
                    'distilbert-base-uncased',
                    num_labels=num_labels
                )
        else:
            self.model = TFDistilBertForSequenceClassification.from_pretrained(
                'distilbert-base-uncased',
                num_labels=num_labels
            )

        # Keep only the best model seen so far, judged by training accuracy
        self.model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
            filepath="civility_model",
            save_weights_only=False,
            monitor='accuracy',
            mode='max',
            save_best_only=True
        )

        # Freeze layers
        for layer in self.model.layers:
            if isinstance(layer, transformers.models.distilbert.modeling_tf_distilbert.TFDistilBertMainLayer):
                layer.trainable = False
        self.model.summary()

        optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
        loss = tf.keras.losses.MeanSquaredError()
        self.model.compile(optimizer=optimizer, loss=loss, metrics=["accuracy"])

    def train(self, epochs, batch_size=32):
        """
        Fit the model on the training split, validating on the validation split.

        epochs: Number of passes over the training data.
        batch_size: Number of examples per batch.

        Returns the history object produced by `fit`.
        """

        print("Beginning train...")

        # The datasets are built with from_tensor_slices and never batched, and Keras does not
        # apply `batch_size` to tf.data.Dataset inputs, so batch them explicitly here.
        train_batches = self.dataset.train_data.batch(batch_size)
        val_batches = self.dataset.val_data.batch(batch_size)

        history = self.model.fit(
            train_batches,
            epochs=epochs,
            validation_data=val_batches,
            callbacks=[self.model_checkpoint_callback]
        )
        return history

    def test(self, batch_size=32):
        """
        Evaluate the model on the test split.

        batch_size: Number of examples per batch.

        Returns whatever `evaluate` returns (previously the result was discarded).
        """

        print("Beginning evaluation")

        # Same reasoning as in train(): the stored dataset is unbatched
        test_batches = self.dataset.test_data.batch(batch_size)
        return self.model.evaluate(test_batches)

    def predict(self, x, x_tokenized):
        """
        Run the model on `x` and return its predictions.

        x: Model-ready inputs when `x_tokenized` is truthy, otherwise raw text (a string or
            a list of strings) that is tokenized here first.
        x_tokenized: Whether `x` has already been tokenized.
        """

        if x_tokenized:
            return self.model.predict(x)

        # Ask the tokenizer for TensorFlow tensors and hand Keras a plain dict; without
        # `return_tensors` the encoding holds Python lists, which the model cannot consume.
        encoded = self.dataset.tokenizer(x, truncation=True, padding=True, return_tensors="tf")
        return self.model.predict(dict(encoded))


# Main Program

In [None]:
# Build the classifier with its default settings and train it for five epochs
civility_model = CivilityModel()
history = civility_model.train(epochs=5, batch_size=32)

# Save the trained model separately from the checkpoint written during training
civility_model.model.save("civility_model_final")