In [None]:
# Install / upgrade the third-party packages this notebook needs.
# The `%pip` line magic is used instead of a `!pip` shell call so the packages
# are installed into the environment of the running kernel.
# NOTE(review): versions are not pinned; pin them once known-good versions are
# identified so that a later re-run installs the same packages.
get_ipython().run_line_magic('pip', 'install --upgrade pip')
get_ipython().run_line_magic('pip', 'install comet_ml')
get_ipython().run_line_magic('pip', 'install -q pyyaml h5py')
get_ipython().run_line_magic('pip', 'install scikit-plot')

In [None]:
# Import libraries
from comet_ml import Experiment
from comet_ml import Optimizer
import numpy as np
import os
import pandas as pd
import random
import seaborn as sns
import scikitplot as skplt
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
import tensorflow as tf
from tensorflow.keras import backend as K
from tensorflow.keras import initializers
import tqdm

# Import matplotlib
# pandas' matplotlib converters are registered before pyplot is imported.
pd.plotting.register_matplotlib_converters()
import matplotlib.pyplot as plt
# Select the inline matplotlib backend through the IPython line magic
# (equivalent to writing `%matplotlib inline` in a notebook cell).
get_ipython().run_line_magic('matplotlib', 'inline')

# Training and validation splits: the text is read from the `comment_text`
# column and the label from the `isToxic` column of the respective CSV files.
X_train = pd.read_csv('../input/toxic-comment-unbalanced/X_train.csv').loc[:, 'comment_text']
X_valid = pd.read_csv('../input/toxic-comment-unbalanced/X_valid.csv').loc[:, 'comment_text']
y_train = pd.read_csv('../input/toxic-comment-unbalanced/y_train.csv').loc[:, 'isToxic']
y_valid = pd.read_csv('../input/toxic-comment-unbalanced/y_valid.csv').loc[:, 'isToxic']

# Test split: text and label come from the same merged CSV file.
test = pd.read_csv('../input/toxic-comment-test-merged/test_merged.csv')
X_test, y_test = test['comment_text'], test['isToxic']

# Sanity check: report the number of rows in each split.
print('Our training data has   ', X_train.shape[0], ' rows.')
print('Our validation data has ', X_valid.shape[0], ' rows.')
print('Our test data has       ', X_test.shape[0], ' rows.')


# Show full cell contents when displaying text columns (no truncation).
pd.set_option('display.max_colwidth', None)

In [None]:
# Set parameters:
# Numeric entries (MAX_LENGTH, EPOCHS, LEARNING_RATE, FL_*, BATCH_SIZE,
# NUM_STEPS, *_DROPOUT, POS_PROBA_THRESHOLD, RANDOM_STATE) are read by the
# code below; the remaining string entries are descriptive labels of the run.
params = {'MAX_LENGTH': 128,
          'EPOCHS': 6,
          'LEARNING_RATE': 5e-5,
          'OPTIMIZER': 'adam',
          'LOSS': 'Focal Loss // gamma=2, alpha=.8',
          'FL_GAMMA': 2.0,
          'FL_ALPHA': 0.8,
          'BATCH_SIZE': 64,
          'NUM_STEPS': None,  # placeholder; computed from BATCH_SIZE just below
          'DISTILBERT_DROPOUT': 0.2,
          'DISTILBERT_ATT_DROPOUT': 0.2,
          'LAYER_DROPOUT': 0.2,
          'KERNEL_INITIALIZER': 'GlorotNormal',
          'BIAS_INITIALIZER': 'zeros',
          'POS_PROBA_THRESHOLD': 0.5,
          'CALLBACKS': '[early_stopping]',
          
          'LR_SCHEDULE': 'None',
          'FREEZING': 'All distilBERT layers frozen',
          'OTHER': 'None',
          'DATASET': 'Unbalanced Splits',
          'RANDOM_STATE':42
         }

# Number of full batches per epoch. Derived from BATCH_SIZE (instead of a
# second hard-coded 64) so the two values cannot drift apart when the batch
# size is tuned.
params['NUM_STEPS'] = X_train.shape[0] // params['BATCH_SIZE']

# Early-stopping callback: watches validation accuracy (higher is better),
# tolerates 2 epochs without improvement, and restores the best weights seen.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_accuracy',
    min_delta=0,
    patience=2,
    mode='max',
    restore_best_weights=True,
)

In [None]:
# DistilBERT and BERT use the same tokenizer...we use the Fast version to optimize runtime
from transformers import DistilBertTokenizerFast

def batch_encode(tokenizer, sentences, max_length=None):
    """
    A function that encodes a batch of sentences and returns the sentences'
    corresponding encodings and attention masks that are ready to be fed
    into a pre-trained transformer model.

    Every sequence is truncated and padded to exactly `max_length` tokens, so
    the returned tensors always have the fixed width expected by a model whose
    input layers are declared with that length (padding to the longest
    sentence would only produce that width if some sentence reaches it).

    Input:
        - tokenizer:   tokenizer object exposing `batch_encode_plus`
                       (e.g. from the PreTrainedTokenizer Class)
        - sentences:   a list of strings where each string represents a sentence
        - max_length:  optional int, the fixed sequence length; when None the
                       notebook-level params['MAX_LENGTH'] is used
    Output:
        - input_ids:       the value the tokenizer returns under 'input_ids'
                           (a tf.Tensor, since return_tensors='tf' is requested)
        - attention_mask:  the value the tokenizer returns under 'attention_mask'
    """
    # Resolve the default lazily so callers may override the global setting.
    if max_length is None:
        max_length = params['MAX_LENGTH']

    inputs = tokenizer.batch_encode_plus(sentences,
                                         max_length=max_length,
                                         padding='max_length',  # fixed width == max_length
                                         truncation=True,
                                         return_tensors='tf',
                                         return_attention_mask=True,
                                         return_token_type_ids=False)

    input_ids = inputs['input_ids']
    attention_mask = inputs['attention_mask']

    return input_ids, attention_mask

# Load the pre-trained tokenizer for 'distilbert-base-uncased'.
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

# Each split is encoded in a single call, yielding its token ids and the
# matching attention mask.

# Training split
X_train_ids, X_train_attention = batch_encode(tokenizer=tokenizer,
                                              sentences=X_train.tolist())

# Validation split
X_valid_ids, X_valid_attention = batch_encode(tokenizer=tokenizer,
                                              sentences=X_valid.tolist())

# Test split
X_test_ids, X_test_attention = batch_encode(tokenizer=tokenizer,
                                            sentences=X_test.tolist())

In [None]:
from transformers import TFDistilBertModel, DistilBertConfig


def focal_loss(gamma=params['FL_GAMMA'], alpha=params['FL_ALPHA']):
    """
    Create a binary focal-loss function suitable for `model.compile(loss=...)`.

    Input:
        - gamma:  focusing exponent applied to (1 - p) for positives and to p
                  for negatives
        - alpha:  weight of the positive-class term; negatives get (1 - alpha)
    Output:
        - focal_loss_fixed:  a function (y_true, y_pred) -> scalar loss tensor
    """
    def focal_loss_fixed(y_true, y_pred):
        # Compare labels and probabilities in the same dtype.
        y_true = tf.cast(y_true, y_pred.dtype)

        # Keep probabilities strictly inside (0, 1): a saturated sigmoid output
        # of exactly 0 or 1 would otherwise reach K.log(0) and turn the loss
        # into inf/NaN.
        eps = K.epsilon()
        y_pred = K.clip(y_pred, eps, 1. - eps)

        # pt_1: predicted probability where the label is 1, else 1 (log(1) = 0,
        # so those positions contribute nothing to the positive term).
        pt_1 = tf.where(tf.equal(y_true, 1), y_pred, tf.ones_like(y_pred))
        # pt_0: predicted probability where the label is 0, else 0 (so those
        # positions contribute nothing to the negative term).
        pt_0 = tf.where(tf.equal(y_true, 0), y_pred, tf.zeros_like(y_pred))

        positive_term = K.mean(alpha * K.pow(1. - pt_1, gamma) * K.log(pt_1))
        negative_term = K.mean((1 - alpha) * K.pow(pt_0, gamma) * K.log(1. - pt_0))
        return -positive_term - negative_term
    return focal_loss_fixed

def build_model(experiment):
    """
    Build and compile the binary classifier: a frozen DistilBERT encoder whose
    [CLS] output feeds Dropout -> Dense(relu) -> Dropout -> Dense(relu) ->
    Dropout -> Dense(1, sigmoid).

    Input:
        - experiment:  object exposing `get_parameter(name)`; the values it
                       returns for 'Dense 1' and 'Dense 2' are used as the
                       unit counts of the two hidden Dense layers
    Output:
        - model:  a tf.keras.Model taking [input_ids, input_attention] and
                  compiled with Adam, the focal loss and the accuracy metric
    """
    
    # The bare DistilBERT encoder/transformer outputting raw hidden-states 
    # without any specific head on top.  Enumerates default settings. 
    config = DistilBertConfig(dropout=params['DISTILBERT_DROPOUT'], 
                              attention_dropout=params['DISTILBERT_ATT_DROPOUT'], 
                              output_hidden_states=True)
    distilBERT = TFDistilBertModel.from_pretrained('distilbert-base-uncased', config=config)

    # Make DistilBERT layers untrainable
    for layer in distilBERT.layers:
        layer.trainable = False
    
    # Define weight initializer with a random seed to ensure reproducibility
    weight_initializer = tf.keras.initializers.GlorotNormal(seed=params['RANDOM_STATE']) 
    
    # Define input layers.  The sequence length comes from params['MAX_LENGTH']
    # (the same setting the tokenizer call uses) rather than a second
    # hard-coded 128, so the two cannot drift apart.
    input_ids_layer = tf.keras.layers.Input(shape=(params['MAX_LENGTH'],), 
                                            name='input_ids', 
                                            dtype='int32')
    input_attention_layer = tf.keras.layers.Input(shape=(params['MAX_LENGTH'],), 
                                                  name='input_attention', 
                                                  dtype='int32')
    
    # DistilBERT outputs a tuple where the first element is tf.Tensor of shape 
    # (batch_size, sequence_length, hidden_size=768).
    # The tf.Tensor represents a sequence of hidden-states at the output of the 
    # last layer of the model.
    last_hidden_states = distilBERT([input_ids_layer, input_attention_layer])[0]
    
    # We only care about DistilBERT's output for the [CLS] token, which is located
    # at index 0.  Splicing out the [CLS] tokens gives us 2D data.
    cls_token = last_hidden_states[:, 0, :]   
    
    D1 = tf.keras.layers.Dropout(params['LAYER_DROPOUT'],
                                 seed=params['RANDOM_STATE']
                                )(cls_token)
    
    # First hidden layer; its width is supplied by the experiment.
    X = tf.keras.layers.Dense(experiment.get_parameter('Dense 1'),
                              activation='relu',
                              kernel_initializer=weight_initializer,
                              bias_initializer='zeros'
                              )(D1)
    
    D2 = tf.keras.layers.Dropout(params['LAYER_DROPOUT'],
                                 seed=params['RANDOM_STATE']
                                )(X)
    
    # Second hidden layer; its width is supplied by the experiment.
    X = tf.keras.layers.Dense(experiment.get_parameter('Dense 2'),
                              activation='relu',
                              kernel_initializer=weight_initializer,
                              bias_initializer='zeros'
                              )(D2)
    
    D3 = tf.keras.layers.Dropout(params['LAYER_DROPOUT'],
                                 seed=params['RANDOM_STATE']
                                )(X)
    
    # Define a single node that makes up the output layer (for binary classification)
    output = tf.keras.layers.Dense(1, 
                                   activation='sigmoid',
                                   kernel_initializer=weight_initializer,
                                   bias_initializer='zeros'
                                   )(D3)
    
    model = tf.keras.Model([input_ids_layer, input_attention_layer], output)
    
    # `learning_rate` is the supported keyword; `lr` is a deprecated alias.
    model.compile(tf.keras.optimizers.Adam(learning_rate=params['LEARNING_RATE']), 
                  loss=focal_loss(),
                  metrics=['accuracy'])
    
    return model

In [None]:
########## Ensure reproducibility ##########
# Every random number source used below is seeded with params['RANDOM_STATE'].

# 1. `PYTHONHASHSEED` environment variable
# NOTE(review): Python reads PYTHONHASHSEED at interpreter start-up, so
# setting it here presumably only affects child processes -- confirm.
os.environ['PYTHONHASHSEED'] = '{}'.format(params['RANDOM_STATE'])

# 2. Python's built-in pseudo-random generator
random.seed(a=params['RANDOM_STATE'])

# 3. NumPy's global pseudo-random generator
np.random.seed(seed=params['RANDOM_STATE'])

# 4. TensorFlow's global pseudo-random generator
tf.random.set_seed(params['RANDOM_STATE'])

In [None]:
# Grid-search configuration handed to the Comet Optimizer.
config = dict(
    # Search algorithm: plain grid search.
    algorithm='grid',

    # What is being optimized, and how.
    # NOTE(review): the loop below logs 'Accuracy' inside `experiment.test()`;
    # confirm the logged metric name matches the one declared here.
    spec=dict(
        randomize=False,
        maxCombo=0,
        metric='Accuracy',
        gridSize=4,
        minSampleSize=len(X_train.index),
    ),

    # Hyperparameters (Vizier-inspired format): candidate unit counts for the
    # two hidden Dense layers.
    parameters={
        'Dense 1': dict(type='discrete', values=[32, 64, 128, 256]),
        'Dense 2': dict(type='discrete', values=[32, 64, 128, 256]),
    },

    # Optional name associated with this search instance.
    name='Grid Search of Unbalanced Dense Layers',

    # Number of trials to run per experiment.
    trials=1,
)



# Next, create an optimizer, passing in the config.
# The Comet API key is read from the COMET_API_KEY environment variable so
# that no credential is stored in the notebook; a missing variable raises a
# KeyError naming it.  Any key that was previously committed in this file
# should be treated as exposed and rotated.
opt = Optimizer(config, api_key=os.environ['COMET_API_KEY'])


# Finally, get experiments, and train your models.
# One iteration = one hyperparameter combination: build the model, train it,
# evaluate it on the test split and log the results to Comet.
# The Comet API key is read from the COMET_API_KEY environment variable so
# that no credential is stored in the notebook; a missing variable raises a
# KeyError naming it.
for experiment in opt.get_experiments(
        api_key=os.environ['COMET_API_KEY'],
        project_name="jigsaw-toxic-comment",
        workspace="raywilliamcs",
        auto_histogram_weight_logging=True,
        auto_histogram_gradient_logging=True,
        auto_histogram_activation_logging=True,
        auto_log_co2=True,
        log_env_details=True,
        log_env_gpu=True,
        log_env_cpu=True):
    
    ############  Log Parameters and Assets:  ############
    
    # Log data assets
    experiment.log_asset_folder('../input/toxic-comment-test-merged')
    experiment.log_asset_folder('../input/toxic-comment-unbalanced')
    experiment.log_dataset_info(name='Toxic Comment (Unbalanced)')
    
    
    # Log parameters:
    experiment.log_parameter('Other', 
                             'Grid Search of Unbalanced Dataset With Two Hidden Layers and Freezing')
    
    ############  Build the model:  ############
    model = build_model(experiment)
    
    ############  Train the model:  ############
    train_history = model.fit(
                            x = [X_train_ids, X_train_attention],
                            y = y_train.to_numpy(),
                            epochs = params['EPOCHS'],
                            batch_size = params['BATCH_SIZE'],
                            steps_per_epoch = params['NUM_STEPS'],
                            validation_data = ([X_valid_ids, X_valid_attention], 
                                               y_valid.to_numpy()),
                            callbacks=[early_stopping],
                            verbose=2)
    
    # Plot training and validation loss over each epoch
    history_df = pd.DataFrame(train_history.history)
    history_df.loc[:, ['loss', 'val_loss']].plot()
    plt.title(label='Training + Validation Loss Over Time', fontsize=17, pad=19)
    plt.xlabel('Epoch', labelpad=14, fontsize=14)
    plt.ylabel('Focal Loss', labelpad=16, fontsize=14)

    # Save figure
    plt.savefig('./trainvalloss.png', dpi=300.0, transparent=True)

    # Log the figure
    experiment.log_image('./trainvalloss.png', name='Train Validation Loss')

    # Display the figure, then release it: every trial creates new figures and
    # they would otherwise all stay open until the whole search has finished.
    plt.show()
    plt.close()
    
    ############  Evaluate the model  ############
    with experiment.test():
        # Generate predictions.  The single-unit output layer yields one
        # column per sample; flatten it to 1-D for the metric functions.
        y_pred = model.predict([X_test_ids, X_test_attention]).ravel()
        y_pred_thresh = np.where(y_pred >= params['POS_PROBA_THRESHOLD'], 1, 0)
    
        # Get evaluation results
        accuracy = accuracy_score(y_test, y_pred_thresh)
        auc_roc = roc_auc_score(y_test, y_pred)
    
        # Log evaluation metrics
        experiment.log_metrics({'Accuracy':accuracy, 'AUC-ROC':auc_roc})
    
        # Log the ROC curve
        # NOTE(review): the curve name below contains a typo ('cuve'); it is
        # left unchanged so the logged name stays the same -- rename if desired.
        fpr, tpr, thresholds = roc_curve(y_test.to_numpy(), y_pred)
        experiment.log_curve('ROC cuve', fpr, tpr)
        
    
    ############  Plot confusion matrix  ############
    # Plot confusion matrix
    skplt.metrics.plot_confusion_matrix(y_test.to_list(),
                                        y_pred_thresh.tolist(),
                                        figsize=(6,6),
                                        text_fontsize=14)
    plt.title(label='Test Confusion Matrix', fontsize=20, pad=17)
    plt.xlabel('Predicted Label', labelpad=14)
    plt.ylabel('True Label', labelpad=14)

    # Save the figure
    plt.savefig('./confusionmatrix.png', dpi=300.0, transparent=True)

    # Log the confusion matrix
    experiment.log_image('./confusionmatrix.png', name='Test Confusion Matrix')

    # Display the figure, then release it (see the loss plot above).
    plt.show()
    plt.close()
    
    
    ############  End experiment  ############
    # End Comet.ml experiment
    experiment.end()