In [None]:
# !pip install --upgrade pip
# !pip install comet_ml
# !pip install -q pyyaml h5py
# !pip install scikit-plot

## Initial Setup

In [None]:
# Import libraries
from comet_ml import Experiment
from comet_ml import Optimizer
import numpy as np
import os
import pandas as pd
import random
import seaborn as sns
import scikitplot as skplt
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
import tensorflow as tf
from tensorflow.keras import backend as K
from tensorflow.keras import initializers
from transformers import DistilBertTokenizerFast
from transformers import TFDistilBertModel, DistilBertConfig

# Import matplotlib
pd.plotting.register_matplotlib_converters()
import matplotlib.pyplot as plt
%matplotlib inline

# Import utility functions
from src.utils.train_utils import batch_encode
from src.utils.train_utils import focal_loss



# Load the pre-split training / validation data.
# Features come from the 'comment_text' column, labels from 'isToxic'.
SPLIT_DIR = 'data/processed/unbalanced_dataset'
X_train = pd.read_csv(SPLIT_DIR + '/X_train.csv')['comment_text']
X_valid = pd.read_csv(SPLIT_DIR + '/X_valid.csv')['comment_text']
y_train = pd.read_csv(SPLIT_DIR + '/y_train.csv')['isToxic']
y_valid = pd.read_csv(SPLIT_DIR + '/y_valid.csv')['isToxic']

# The test set lives in a single merged file holding both text and label
test = pd.read_csv('data/processed/test_merged.csv')
X_test, y_test = test['comment_text'], test['isToxic']

# Sanity check: report how many rows each split contains
for label, split in (('Our training data has   ', X_train),
                     ('Our validation data has ', X_valid),
                     ('Our test data has       ', X_test)):
    print(label, len(split), ' rows.')



# Show complete comment text in DataFrame/Series output instead of truncating it
pd.set_option('display.max_colwidth', None)

In [None]:
# Set parameters:
# BATCH_SIZE is defined once so that 'BATCH_SIZE' and 'NUM_STEPS' below can
# never drift apart (NUM_STEPS previously repeated the literal 64).
BATCH_SIZE = 64

# Run configuration. Some entries are read by the code in this notebook
# (e.g. EPOCHS, LEARNING_RATE, BATCH_SIZE, NUM_STEPS, the dropout rates,
# POS_PROBA_THRESHOLD, RANDOM_STATE); others ('LOSS', 'CALLBACKS',
# 'FREEZING', 'DATASET', ...) are free-text descriptions of the run.
params = {'MAX_LENGTH': 128,
          'EPOCHS': 6,
          'LEARNING_RATE': 5e-5,
          'OPTIMIZER': 'adam',
          'LOSS': 'Focal Loss // gamma=2, alpha=.8',
          'FL_GAMMA': 2.0,
          'FL_ALPHA': 0.8,
          'BATCH_SIZE': BATCH_SIZE,
          # Number of full batches per epoch (integer division drops the remainder)
          'NUM_STEPS': X_train.shape[0] // BATCH_SIZE,
          'DISTILBERT_DROPOUT': 0.2,
          'DISTILBERT_ATT_DROPOUT': 0.2,
          'LAYER_DROPOUT': 0.2,
          'KERNEL_INITIALIZER': 'GlorotNormal',
          'BIAS_INITIALIZER': 'zeros',
          'POS_PROBA_THRESHOLD': 0.5,
          'CALLBACKS': '[early_stopping w/ patience=2]',
          'FREEZING': 'All distilBERT layers frozen',
          'DATASET': 'Unbalanced Splits',
          'RANDOM_STATE':42
         }

# Define callback: stop training once validation accuracy has not improved
# for 2 consecutive epochs, and roll the model back to its best weights.
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy',
                                                  mode='max',
                                                  min_delta=0,
                                                  patience=2,
                                                  restore_best_weights=True)

In [None]:
def build_model(experiment):
    """
    Build and compile a binary classifier on top of a frozen DistilBERT.

    The pre-trained 'distilbert-base-uncased' encoder is loaded with the
    dropout rates from `params`, all of its layers are marked untrainable,
    and a classification head is stacked on the [CLS] hidden state:
    Dropout -> Dense('Dense 1') -> Dropout -> Dense('Dense 2') -> Dropout
    -> Dense(1, sigmoid).

    Input:
      - experiment :  a Comet.ml experiment object; its 'Dense 1' and
                      'Dense 2' parameters give the number of nodes in the
                      two hidden Dense layers.

    Output:
      - model:        a compiled tf.keras.Model that takes
                      [input_ids, input_attention] and returns one sigmoid
                      probability per example.
    """

    # Sequence length of the two input layers.
    # NOTE(review): this must equal the padded length that `batch_encode`
    # produces -- confirm against src.utils.train_utils.
    max_length = params['MAX_LENGTH']

    # The bare, pretrained DistilBERT transformer model outputting raw hidden-states
    # and without any specific head on top.
    config = DistilBertConfig(dropout=params['DISTILBERT_DROPOUT'],
                              attention_dropout=params['DISTILBERT_ATT_DROPOUT'],
                              output_hidden_states=True)
    distilBERT = TFDistilBertModel.from_pretrained('distilbert-base-uncased', config=config)

    # Make DistilBERT layers untrainable
    for layer in distilBERT.layers:
        layer.trainable = False

    # Define weight initializer with a random seed to ensure reproducibility
    weight_initializer = tf.keras.initializers.GlorotNormal(seed=params['RANDOM_STATE'])
    bias_initializer = params['BIAS_INITIALIZER']

    # Define input layers
    input_ids_layer = tf.keras.layers.Input(shape=(max_length,),
                                            name='input_ids',
                                            dtype='int32')
    input_attention_layer = tf.keras.layers.Input(shape=(max_length,),
                                                  name='input_attention',
                                                  dtype='int32')

    # DistilBERT outputs a tuple where the first element at index 0
    # represents the hidden-state at the output of the model's last layer.
    # It is a tf.Tensor of shape (batch_size, sequence_length, hidden_size=768).
    last_hidden_state = distilBERT([input_ids_layer, input_attention_layer])[0]

    # We only care about DistilBERT's output for the [CLS] token, which is located
    # at index 0.  Slicing out the [CLS] tokens gives us 2D data.
    cls_token = last_hidden_state[:, 0, :]

    D1 = tf.keras.layers.Dropout(params['LAYER_DROPOUT'],
                                 seed=params['RANDOM_STATE']
                                )(cls_token)

    # First hidden layer; its width is chosen by the grid search
    X = tf.keras.layers.Dense(experiment.get_parameter('Dense 1'),
                              activation='relu',
                              kernel_initializer=weight_initializer,
                              bias_initializer=bias_initializer
                              )(D1)

    D2 = tf.keras.layers.Dropout(params['LAYER_DROPOUT'],
                                 seed=params['RANDOM_STATE']
                                )(X)

    # Second hidden layer; its width is chosen by the grid search
    X = tf.keras.layers.Dense(experiment.get_parameter('Dense 2'),
                              activation='relu',
                              kernel_initializer=weight_initializer,
                              bias_initializer=bias_initializer
                              )(D2)

    D3 = tf.keras.layers.Dropout(params['LAYER_DROPOUT'],
                                 seed=params['RANDOM_STATE']
                                )(X)

    # Define a single node that makes up the output layer (for binary classification)
    output = tf.keras.layers.Dense(1,
                                   activation='sigmoid',
                                   kernel_initializer=weight_initializer,
                                   bias_initializer=bias_initializer
                                   )(D3)

    model = tf.keras.Model([input_ids_layer, input_attention_layer], output)

    # `learning_rate` is the supported keyword; `lr` is a deprecated alias.
    # NOTE(review): focal_loss() is called with its defaults, so
    # params['FL_GAMMA'] / params['FL_ALPHA'] are not passed here -- confirm
    # the helper's defaults match those values.
    model.compile(tf.keras.optimizers.Adam(learning_rate=params['LEARNING_RATE']),
                  loss=focal_loss(),
                  metrics=['accuracy'])

    return model

In [None]:
########## Ensure reproducibility ##########


# Every random number generator below is seeded from the same configured value
SEED = params['RANDOM_STATE']

# 1. `PYTHONHASHSEED` environment variable
# NOTE(review): Python reads this variable at interpreter start-up, so setting
# it here presumably only affects processes launched afterwards -- confirm.
os.environ['PYTHONHASHSEED'] = str(SEED)

# 2. `python` built-in pseudo-random generator
random.seed(SEED)

# 3. `numpy` pseudo-random generator
np.random.seed(SEED)

# 4. `tensorflow` pseudo-random generator
tf.random.set_seed(SEED)

## Encode Datasets

In [None]:
# Load the pre-trained DistilBERT tokenizer (the "Fast" variant, chosen for runtime)
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

# Encode the train, validation and test texts, in that order.
# Each batch_encode call yields an (ids, attention) pair for one split.
((X_train_ids, X_train_attention),
 (X_valid_ids, X_valid_attention),
 (X_test_ids, X_test_attention)) = (
    batch_encode(tokenizer, texts.to_list())
    for texts in (X_train, X_valid, X_test)
)

## Run Grid-Search on Comet.ml

In [None]:
# Define Grid-search configuration:
config = {
    # We pick the Grid Search algorithm:
    'algorithm': 'grid',

    # Declare what we will be optimizing, and how:
    'spec': {
             'randomize': False,
             'maxCombo': 0,
             # NOTE(review): the training loop logs 'Accuracy' inside
             # `experiment.test()`; confirm the optimizer sees it under this
             # exact name (Comet may prefix it with the context).
             'metric': 'Accuracy',
             # Accuracy is better when higher; without this key the
             # optimizer falls back to its default of minimizing the metric.
             'objective': 'maximize',
             'gridSize': 4,
             'minSampleSize': len(X_train.index),
            },
    
    # Declare your hyperparameters in the Vizier-inspired format:
    # every combination of the two hidden-layer widths is tried.
    'parameters': {
                    'Dense 1': {'type': 'discrete', 
                                'values': [32, 64, 128, 256]},
                    'Dense 2': {'type': 'discrete',
                                'values': [32, 64, 128, 256]}
                  },
    
    # Optionally declare a name to associate with the search instance
    'name': 'Grid Search of Unbalanced Dense Layers',
    
    # The number of trials per experiment to run
    'trials': 1
}



# Next, create an optimizer, passing in the config.
# The API key is read from the COMET_API_KEY environment variable so that it
# does not have to be written into the notebook; the placeholder is only a
# fallback for when the variable is not set.
opt = Optimizer(config, api_key=os.environ.get('COMET_API_KEY', 'YOUR_API_KEY'))



# Finally, get experiments, and train your models.
# For every hyperparameter combination handed out by the optimizer this loop:
# logs the data assets, builds and trains a model, evaluates it on the test
# set, logs the metrics / ROC curve / figures, and ends the experiment.

# Make sure the directory the figures are saved into exists
os.makedirs('figures', exist_ok=True)

for experiment in opt.get_experiments(
        # Read the key from the environment rather than hard-coding it;
        # the placeholder is only a fallback when the variable is not set.
        api_key=os.environ.get('COMET_API_KEY', 'YOUR_API_KEY'),
        project_name="YOUR_PROJECT_NAME",
        workspace="YOUR_WORKSPACE",
        auto_histogram_weight_logging=True,
        auto_histogram_gradient_logging=True,
        auto_histogram_activation_logging=True,
        auto_log_co2=True,
        log_env_details=True,
        log_env_gpu=True,
        log_env_cpu=True):
    
    
    ############  Log Parameters and Assets:  ############
    
    # Log data assets
    experiment.log_asset('data/processed/test_merged.csv')
    experiment.log_asset_folder('data/processed/unbalanced_dataset')
    experiment.log_dataset_info(name='Toxic Comment (Unbalanced)')
    
    
    # Log custom parameter
    experiment.log_parameter('Other', 
                             'Grid Search of Unbalanced Dataset With Two Hidden Layers and Freezing')
    
    
    ############  Build the model:  ############
    
    # A new model is created on every iteration. Release the Keras global
    # state left behind by the previous one so memory use does not keep
    # growing over the course of the grid search.
    tf.keras.backend.clear_session()
    
    model = build_model(experiment)
    
    
    ############  Train the model:  ############
    
    train_history = model.fit(
                            x = [X_train_ids, X_train_attention],
                            y = y_train.to_numpy(),
                            epochs = params['EPOCHS'],
                            batch_size = params['BATCH_SIZE'],
                            steps_per_epoch = params['NUM_STEPS'],
                            validation_data = ([X_valid_ids, X_valid_attention], 
                                               y_valid.to_numpy()),
                            callbacks=[early_stopping],
                            verbose=2)
    

    ############  Evaluate the model  ############
    
    with experiment.test():
        # Generate predictions. The model ends in a single sigmoid node, so
        # predict() returns one column; flatten it to 1-D so the metric and
        # plotting helpers below receive plain label/score vectors.
        y_pred = model.predict([X_test_ids, X_test_attention]).ravel()
        y_pred_thresh = np.where(y_pred >= params['POS_PROBA_THRESHOLD'], 1, 0)
    
        # Get evaluation results
        accuracy = accuracy_score(y_test, y_pred_thresh)
        auc_roc = roc_auc_score(y_test, y_pred)
    
        # Log evaluation metrics
        experiment.log_metrics({'Accuracy':accuracy, 'AUC-ROC':auc_roc})
    
        # Log the ROC curve
        fpr, tpr, thresholds = roc_curve(y_test.to_numpy(), y_pred)
        experiment.log_curve('ROC curve', fpr, tpr)
        
    
    ############  Plot Train and Validation Loss  ############
        
    # Plot training and validation loss over each epoch
    history_df = pd.DataFrame(train_history.history)
    history_df.loc[:, ['loss', 'val_loss']].plot()
    plt.title(label='Training + Validation Loss Over Time', fontsize=17, pad=19)
    plt.xlabel('Epoch', labelpad=14, fontsize=14)
    plt.ylabel('Focal Loss', labelpad=16, fontsize=14)

    # Save figure (the file is overwritten on every iteration; the copy
    # logged to Comet below is what is kept per experiment)
    plt.savefig('figures/unbalanced_trainvalloss.png', dpi=300.0, transparent=True)

    # Log the figure
    experiment.log_image('figures/unbalanced_trainvalloss.png', name='Train Validation Loss')

    # Display the figure now and release it; otherwise every grid point
    # leaves another figure open until the whole loop has finished.
    plt.show()
    plt.close('all')
    
    
    ############  Plot confusion matrix  ############
    
    # Plot confusion matrix
    skplt.metrics.plot_confusion_matrix(y_test.to_list(),
                                        y_pred_thresh.tolist(),
                                        figsize=(6,6),
                                        text_fontsize=14)
    plt.title(label='Test Confusion Matrix', fontsize=20, pad=17)
    plt.xlabel('Predicted Label', labelpad=14)
    plt.ylabel('True Label', labelpad=14)

    # Save the figure
    plt.savefig('figures/unbalanced_confusionmatrix.png', dpi=300.0, transparent=True)

    # Log the confusion matrix
    experiment.log_image('figures/unbalanced_confusionmatrix.png', name='Test Confusion Matrix')

    # Display and release the confusion-matrix figure as well
    plt.show()
    plt.close('all')
    
    
    ############  End experiment  ############
        
    # End Comet.ml experiment
    experiment.end()