In [None]:
# Uncomment the next line if the transformers package is not installed
# !pip install transformers

In [None]:
import tensorflow as tf
# Pick the tf.distribute strategy that matches the hardware we are running on.
try:
    # The resolver is built without arguments: that is sufficient whenever the
    # TPU_NAME environment variable is set (always the case on Kaggle).
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    # A ValueError from the lines above is treated as "no TPU available".
    tpu = None

if not tpu:
    # TensorFlow's default strategy; works on CPU and on a single GPU.
    strategy = tf.distribute.get_strategy()
else:
    # Connect to the TPU cluster and initialise it before building the strategy.
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)

print("REPLICAS: ", strategy.num_replicas_in_sync)

In [None]:
import numpy as np
import pandas as pd
from transformers import TFAutoModel, AutoTokenizer, AutoModel
import matplotlib.pyplot as plt

# --- Configuration ---------------------------------------------------------
# Placeholders: replace both values before running the rest of the notebook.
PRETRAINED_MODEL = "Set this to the name or path of any pretrained model"
DATA = "Set this to the path of your data"

# Length (in token ids) of every encoded sample; adjust it to how long your
# data samples are.
MAX_LEN = 128

EPOCHS = 10
# Global batch size: 32 samples for each replica of the distribution strategy.
BATCH_SIZE = strategy.num_replicas_in_sync * 32

In [None]:
# Defining metrics

from keras import backend as K

def recall_m(y_true, y_pred):
    """Recall of the rounded predictions for one batch of binary labels.

    Args:
        y_true: Ground-truth labels (expected to lie in [0, 1]).
        y_pred: Predicted scores; values are clipped to [0, 1] and rounded.

    Returns:
        true positives / (actual positives + epsilon); the epsilon keeps the
        ratio defined when the batch contains no positive label.
    """
    hit_mask = K.round(K.clip(y_true * y_pred, 0, 1))
    positive_mask = K.round(K.clip(y_true, 0, 1))
    return K.sum(hit_mask) / (K.sum(positive_mask) + K.epsilon())

def precision_m(y_true, y_pred):
    """Precision of the rounded predictions for one batch of binary labels.

    Args:
        y_true: Ground-truth labels (expected to lie in [0, 1]).
        y_pred: Predicted scores; values are clipped to [0, 1] and rounded.

    Returns:
        true positives / (predicted positives + epsilon); the epsilon keeps
        the ratio defined when nothing is predicted positive.
    """
    hit_mask = K.round(K.clip(y_true * y_pred, 0, 1))
    predicted_mask = K.round(K.clip(y_pred, 0, 1))
    return K.sum(hit_mask) / (K.sum(predicted_mask) + K.epsilon())

def f1_m(y_true, y_pred):
    """F1 score: harmonic mean of `precision_m` and `recall_m`.

    Args:
        y_true: Ground-truth labels, forwarded to both helper metrics.
        y_pred: Predicted scores, forwarded to both helper metrics.

    Returns:
        2 * precision * recall / (precision + recall + epsilon); the epsilon
        avoids a division by zero when both components are zero.
    """
    p = precision_m(y_true, y_pred)
    r = recall_m(y_true, y_pred)
    harmonic_half = (p * r) / (p + r + K.epsilon())
    return 2 * harmonic_half

In [None]:
# Building CNN model.  Hyperparameters should be changed depending on your case

def build_model_cnn(transformer, max_len=512):
    """Stack a Conv1D classification head on top of a pretrained transformer.

    Args:
        transformer: Callable encoder exposing ``config.hidden_size`` and
            ``config.initializer_range``. The first element of its output is
            used as the feature tensor (presumably the per-token hidden
            states -- TODO confirm for the chosen checkpoint).
        max_len: Number of token ids in each input sequence.

    Returns:
        An uncompiled ``tf.keras.models.Model`` that maps int32 token ids of
        shape ``(max_len,)`` to a single sigmoid output.
    """
    layers = tf.keras.layers
    L1L2 = tf.keras.regularizers.L1L2

    token_ids = layers.Input(shape=(max_len,), dtype=tf.int32)
    features = transformer(token_ids)[0]

    # Head layers, listed in the order in which they are applied.
    head = [
        # tanh projection, initialised with the transformer's own settings.
        layers.Dense(
            transformer.config.hidden_size,
            kernel_initializer=tf.keras.initializers.TruncatedNormal(
                stddev=transformer.config.initializer_range
            ),
            activation="tanh",
        ),
        # 128 filters of width 3; only the kernel carries an L1 penalty.
        layers.Conv1D(
            128, 3,
            bias_regularizer=L1L2(l1=0.0, l2=0.0),
            kernel_regularizer=L1L2(l1=0.01, l2=0.0),
            activity_regularizer=L1L2(l1=0.0, l2=0.0),
        ),
        layers.MaxPool1D(2),
        layers.Dropout(0.1),
        layers.Flatten(),
        # Binary output; an L2 penalty is applied to its activity.
        layers.Dense(
            1,
            activation='sigmoid',
            bias_regularizer=L1L2(l1=0.0, l2=0.0),
            kernel_regularizer=L1L2(l1=0.0, l2=0.0),
            activity_regularizer=L1L2(l1=0.0, l2=0.01),
        ),
    ]
    for layer in head:
        features = layer(features)

    return tf.keras.models.Model(inputs=token_ids, outputs=features)

In [None]:
# building LSTM model. Hyperparameters should be changed depending on your case

def build_model_lstm(transformer, max_len=512):
    """Stack an LSTM classification head on top of a pretrained transformer.

    Args:
        transformer: Callable encoder exposing ``config.hidden_size`` and
            ``config.initializer_range``. The first element of its output is
            used as the feature tensor (presumably the per-token hidden
            states -- TODO confirm for the chosen checkpoint).
        max_len: Number of token ids in each input sequence.

    Returns:
        An uncompiled ``tf.keras.models.Model`` that maps int32 token ids of
        shape ``(max_len,)`` to a single sigmoid output.
    """
    layers = tf.keras.layers
    L1L2 = tf.keras.regularizers.L1L2

    token_ids = layers.Input(shape=(max_len,), dtype='int32')
    features = transformer(token_ids)[0]

    # Head layers, listed in the order in which they are applied.
    head = [
        # tanh projection, initialised with the transformer's own settings.
        layers.Dense(
            transformer.config.hidden_size,
            kernel_initializer=tf.keras.initializers.TruncatedNormal(
                stddev=transformer.config.initializer_range
            ),
            activation="tanh",
        ),
        # 128-unit LSTM with input dropout; only the kernel has an L1 penalty.
        layers.LSTM(
            128,
            dropout=0.2,
            bias_regularizer=L1L2(l1=0.0, l2=0.0),
            kernel_regularizer=L1L2(l1=0.01, l2=0.0),
            activity_regularizer=L1L2(l1=0.0, l2=0.0),
        ),
        layers.Dropout(0.1),
        # Binary output; an L2 penalty is applied to its activity.
        layers.Dense(
            1,
            activation='sigmoid',
            bias_regularizer=L1L2(l1=0.0, l2=0.0),
            kernel_regularizer=L1L2(l1=0.0, l2=0.0),
            activity_regularizer=L1L2(l1=0.0, l2=0.01),
        ),
    ]
    for layer in head:
        features = layer(features)

    return tf.keras.models.Model(inputs=token_ids, outputs=features)

In [None]:
# Building CNN-LSTM model. Hyperparameters should be changed depending on your case

def build_model_cnn_lstm(transformer, max_len=512):
    """Stack a Conv1D + LSTM classification head on a pretrained transformer.

    The pooled convolutional feature map is passed to the LSTM as a sequence.
    It must not be flattened first: a Keras LSTM expects a rank-3 input
    ``(batch, timesteps, features)``, whereas ``Flatten`` produces a rank-2
    tensor and the model could not be built.

    Args:
        transformer: Callable encoder exposing ``config.hidden_size`` and
            ``config.initializer_range``. The first element of its output is
            used as the feature tensor (presumably the per-token hidden
            states -- TODO confirm for the chosen checkpoint).
        max_len: Number of token ids in each input sequence.

    Returns:
        An uncompiled ``tf.keras.models.Model`` that maps int32 token ids of
        shape ``(max_len,)`` to a single sigmoid output.
    """
    inputs = tf.keras.layers.Input(shape=(max_len,), dtype=tf.int32)
    sequence_output = transformer(inputs)[0]

    # tanh projection, initialised with the transformer's own settings.
    projected = tf.keras.layers.Dense(
        transformer.config.hidden_size,
        kernel_initializer=tf.keras.initializers.TruncatedNormal(
            stddev=transformer.config.initializer_range
        ),
        activation="tanh",
    )(sequence_output)

    # Convolution over the sequence axis (128 filters of width 2), followed by
    # pooling and dropout. The sequence axis is kept for the LSTM below.
    cnn = tf.keras.layers.Conv1D(
        128, 2,
        bias_regularizer=tf.keras.regularizers.L1L2(l1=0.0, l2=0.0),
        kernel_regularizer=tf.keras.regularizers.L1L2(l1=0.01, l2=0.0),
        activity_regularizer=tf.keras.regularizers.L1L2(l1=0.0, l2=0.0),
    )(projected)
    max_pooling = tf.keras.layers.MaxPool1D(2)(cnn)
    dropout = tf.keras.layers.Dropout(0.1)(max_pooling)

    # The LSTM reads the pooled feature map step by step and returns its last
    # output (return_sequences is left at its default).
    lstm = tf.keras.layers.LSTM(
        128,
        bias_regularizer=tf.keras.regularizers.L1L2(l1=0.0, l2=0.0),
        kernel_regularizer=tf.keras.regularizers.L1L2(l1=0.01, l2=0.0),
        activity_regularizer=tf.keras.regularizers.L1L2(l1=0.0, l2=0.0),
    )(dropout)
    dropout2 = tf.keras.layers.Dropout(0.2)(lstm)

    # Binary output; an L2 penalty is applied to its activity.
    out = tf.keras.layers.Dense(
        1,
        activation='sigmoid',
        bias_regularizer=tf.keras.regularizers.L1L2(l1=0.0, l2=0.0),
        kernel_regularizer=tf.keras.regularizers.L1L2(l1=0.0, l2=0.0),
        activity_regularizer=tf.keras.regularizers.L1L2(l1=0.0, l2=0.01),
    )(dropout2)

    return tf.keras.models.Model(inputs=inputs, outputs=out)

In [None]:
# Load the labelled samples and discard rows that contain missing values.
# `DataFrame.dropna()` returns a new frame instead of modifying the existing
# one, so its result has to be kept for the rows to actually be removed.
off_data = pd.read_excel(DATA).dropna()

# Show a small preview instead of the whole table.
off_data.head()

In [None]:
from sklearn.model_selection import train_test_split

# Features are the raw texts, targets are the labels.
X, y = off_data['text'], off_data['label']

# Hold out 15% of the samples; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.15,
    random_state=42,
)

In [None]:
def regular_encode(texts, tokenizer, maxlen=200):
    """Tokenize a batch of texts into a fixed-width matrix of token ids.

    Padding and truncation are requested explicitly (``padding='max_length'``
    and ``truncation=True``) rather than through the deprecated
    ``pad_to_max_length`` flag, so every row has exactly ``maxlen`` ids.

    Args:
        texts: Batch of raw strings handed to ``tokenizer.batch_encode_plus``.
        tokenizer: Tokenizer object exposing ``batch_encode_plus``.
        maxlen: Number of token ids per encoded text.

    Returns:
        ``np.ndarray`` built from the ``input_ids`` returned by the
        tokenizer. Only the ids are returned; the attention mask is not.
    """
    enc_di = tokenizer.batch_encode_plus(
        texts,
        return_token_type_ids=False,
        padding='max_length',
        truncation=True,
        is_split_into_words=False,
        max_length=maxlen,
    )

    return np.array(enc_di['input_ids'])

In [None]:
# Load the tokenizer that belongs to the configured pretrained checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=PRETRAINED_MODEL
)

In [None]:
# Replace the raw texts of both splits by their MAX_LEN-wide token-id matrices.
X_train, X_test = (
    regular_encode(list(texts), tokenizer, maxlen=MAX_LEN)
    for texts in (X_train, X_test)
)

In [None]:
# Check the weights of each class

from sklearn.utils.class_weight import compute_class_weight

# Collect the labels of class 0 followed by those of class 1 as NumPy arrays.
label_column = off_data['label']
zero_numpy = label_column[label_column == 0].to_numpy()
one_numpy = label_column[label_column == 1].to_numpy()
all_together = np.concatenate((zero_numpy, one_numpy))
unique_classes = np.unique(all_together)

# "balanced" weighting: the rarer a class, the larger its weight.
balanced_weights = compute_class_weight(
    "balanced",
    classes=unique_classes,
    y=all_together,
)

# Map each class label to its weight.
weights = dict(zip(unique_classes, balanced_weights))
print(weights)

In [None]:
# Input pipelines that feed the model during training and evaluation.
AUTOTUNE = tf.data.experimental.AUTOTUNE

# Training stream: repeated without end, shuffled through a 2048-element
# buffer, then batched. Because it never runs out, `fit` has to be told how
# many steps make up one epoch.
train_dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train))
train_dataset = train_dataset.repeat().shuffle(2048)
train_dataset = train_dataset.batch(BATCH_SIZE).prefetch(AUTOTUNE)

# Evaluation stream: one ordered pass over the held-out samples, cached after
# batching.
test_dataset = tf.data.Dataset.from_tensor_slices((X_test, y_test))
test_dataset = test_dataset.batch(BATCH_SIZE).cache().prefetch(AUTOTUNE)

In [None]:
# Total number of optimizer steps over the whole run; the learning-rate
# schedule decays over exactly this many steps.
train_data_size = X_train.shape[0]
steps_per_epoch = train_data_size // BATCH_SIZE
num_train_steps = steps_per_epoch * EPOCHS
initial_learning_rate = 3e-5

# Polynomial decay from the initial learning rate down to 0. `power` is left
# at its default, which makes the decay linear.
polynomial_decay = tf.keras.optimizers.schedules.PolynomialDecay(
    initial_learning_rate=initial_learning_rate,
    end_learning_rate=0,
    decay_steps=num_train_steps)

# `Adam` is referenced through `tf.keras.optimizers`: the notebook never
# imports the bare name.
# NOTE(review): the optimizer is created outside `strategy.scope()`; confirm
# that this is fine for the TPU setup in use.
optimizer = tf.keras.optimizers.Adam(
    learning_rate=polynomial_decay)

# Plot the schedule. Dedicated names are used so that the feature/label
# variables `X` / `y` from the split cell are not overwritten.
lr_steps = np.linspace(0, num_train_steps, 1001)
lr_values = [float(polynomial_decay(step)) for step in lr_steps]
plt.plot(lr_steps, lr_values)
plt.xlabel('Train step')
plt.ylabel('Learning rate')
plt.show()

In [None]:
with strategy.scope():
    # Pretrained encoder from transformers; its output is the input of the
    # classification head built below. `from_pt=True` requests loading from
    # PyTorch weights.
    transformer_layer = TFAutoModel.from_pretrained(
        PRETRAINED_MODEL,
        hidden_dropout_prob=0.1,
        attention_probs_dropout_prob=0.1,
        from_pt=True,
    )

    # Swap the builder to run another experiment: CNN, LSTM, or CNN-LSTM.
    model = build_model_cnn_lstm(transformer_layer, max_len=MAX_LEN)

    # Accuracy of the sigmoid output thresholded at 0.5.
    binary_accuracy = tf.keras.metrics.BinaryAccuracy(
        name="binary_accuracy", dtype=None, threshold=0.5
    )
    model.compile(
        loss='binary_crossentropy',
        optimizer=optimizer,
        metrics=[recall_m, precision_m, f1_m, binary_accuracy],
    )

model.summary()

In [None]:
# Number of batches in one pass over the training samples. The training
# dataset repeats without end, so `fit` needs this to delimit an epoch.
n_steps = X_train.shape[0] // BATCH_SIZE

# EarlyStopping stops the training as soon as the validation loss fails to
# improve for one epoch (patience=1).
es = tf.keras.callbacks.EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=1)

# ModelCheckpoint writes the model with the highest validation F1 seen so far
# to 'model.h5' (save_best_only=True).
mc = tf.keras.callbacks.ModelCheckpoint('model.h5', monitor='val_f1_m', mode='max', verbose=1, save_best_only=True)

# EPOCHS (not a literal) keeps the run length in sync with the learning-rate
# schedule, whose number of decay steps is derived from the same constant.
train_history = model.fit(train_dataset, steps_per_epoch=n_steps,
                          validation_data=test_dataset,
                          epochs=EPOCHS,
                          class_weight=weights,
                          callbacks=[es, mc])

# Uncomment the next statement to additionally save the model as it is at the
# end of training under "myModel"
# model.save("myModel")

In [None]:
# Loss curves: training vs. validation, one point per completed epoch.
history = train_history
loss_curves = [
    ('Loss', history.history['loss']),
    ('Validation Loss', history.history['val_loss']),
]
epochs = range(len(history.history['loss']))

for curve_label, curve_values in loss_curves:
    plt.plot(epochs, np.array(curve_values), label=curve_label)

plt.title('Training and Validation Losses')
plt.xlabel('Epochs')
plt.ylabel('Value')
plt.legend()
plt.show()

In [None]:
# Accuracy and F1 curves: training vs. validation, one point per completed epoch.
history = train_history
metric_curves = [
    ('Accuracy', history.history['binary_accuracy']),
    ('Validation Accuracy', history.history['val_binary_accuracy']),
    ('F1', history.history['f1_m']),
    ('Validation F1', history.history['val_f1_m']),
]
epochs = range(len(history.history['binary_accuracy']))

for curve_label, curve_values in metric_curves:
    plt.plot(epochs, np.array(curve_values), label=curve_label)

plt.title('Training and Validation Accuracy and F1')
plt.xlabel('Epochs')
plt.ylabel('Value')
plt.legend()
plt.show()