# Transfer learning with pre-trained BERT for text sentiment classification

In this notebook, we use the Twitter dataset for sentiment classification with a pre-trained BERT encoder and a binary classifier on top of it. Given a sentence, we classify whether it carries a negative meaning: negative sentences have label == 0, all other sentences have label == 1.

## Setup

In [None]:
# Dependencies of the preprocessing for BERT inputs and optimizing BERT
!pip install -q tensorflow-text
!pip install -q tf-models-official
!pip install transformers

In [None]:
import datetime
import os
import shutil
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
from official.nlp import optimization
from sklearn.model_selection import train_test_split
from transformers import pipeline

# Sentinel passed to `prefetch()` below so tf.data picks the buffer size itself
AUTOTUNE = tf.data.experimental.AUTOTUNE

# Only show TensorFlow log messages of level ERROR and above
tf.get_logger().setLevel('ERROR')

In [None]:
# Root directory for everything this notebook reads and writes
# (dataset folder, checkpoints, prediction CSV): the current working directory
path_prefix = Path.cwd()
print(path_prefix)

In [None]:
# Folder that holds the dataset files; created on the first run
data_path = path_prefix / 'data'
data_path.mkdir(exist_ok=True)

## Download Dataset
[Dataset](https://www.kaggle.com/c/ml2020spring-hw4)

There are three .txt files: training_label.txt, training_nolabel.txt and testing_data.txt.

- training_label.txt: training data with labels (0 or 1)
    - `+++$+++` is the separator between the label and the text
    - e.g., 1 +++$+++ are wtf ... awww thanks !

- training_nolabel.txt: training data without labels
    - We will use this training data for semi-supervised learning
    - e.g., hates being this burnt !! ouch

- testing_data.txt: the testing data to predict on

    >id,text

    >0,my dog ate our dinner . no , seriously ... he ate it .

    >1,omg last day sooon n of primary noooooo x im gona be swimming out of school wif the amount of tears am gona cry

    >2,stupid boys .. they ' re so .. stupid !

### Download the dataset if it is missing

In [None]:
# Work inside the data directory so the relative file names below resolve there.
# NOTE(review): if a download fails midway, the kernel stays in the data
# directory because the closing `%cd` is never reached.
%cd $data_path

if not os.path.exists('training_label.txt') or\
    not os.path.exists('training_nolabel.txt') or\
    not os.path.exists('testing_data.txt'):
    print("Dataset is incompleted . Downloading")
    # Method 1: fetch each file individually from Google Drive.
    # All three files are downloaded again even if only one is missing.
    !wget --no-check-certificate 'https://drive.google.com/uc?export=download&id=1dPHIl8ZnfDz_fxNd2ZeBYedTat2lfxcO' -O 'training_label.txt'
    !wget --no-check-certificate 'https://drive.google.com/uc?export=download&id=1x1rJOX_ETqnOZjdMAbEE2pqIjRNa8xcc' -O 'training_nolabel.txt'
    !wget --no-check-certificate 'https://drive.google.com/uc?export=download&id=16CtnQwSDCob9xmm6EdHHR7PNFNiOrQ30' -O 'testing_data.txt'

    # Method 2 (disabled alternative): download one zip archive with gdown and unpack it
    # !gdown --id '1lz0Wtwxsh5YCPdqQ3E3l_nbfJT1N13V8' --output data.zip
    # !unzip data.zip
    # !ls
else:
    print("data is all set")
   
# Return to the notebook's root directory
%cd $path_prefix

In [None]:
# Silence every Python warning raised from this point on
import warnings
warnings.filterwarnings('ignore')

## Preprocess dataset

### Setup paths and configurations

In [None]:
# Locations of the three dataset files, resolved against the notebook root.
# They are kept as plain strings because load_data() inspects the path text.
dataset_files = [
    'data/training_label.txt',
    'data/training_nolabel.txt',
    'data/testing_data.txt',
]
train_with_label, train_no_label, testing_data = (
    os.path.join(path_prefix, relative_path) for relative_path in dataset_files
)

### Read dataset from folder

#### Helper functions

In [None]:
def load_data(path, separator = '+++$+++'):
    """
    Read one dataset file, choosing the parser from the file name.

    Data format:
        With label:
            label, separator, text
        Without label:
            text
        Testing:
            id, text

    Dispatch rule (applied to the file name only, not to its directories):
    - name contains 'nolabel' -> load_non_labelled_data
    - name contains 'test'    -> load_testing_data
    - anything else           -> load_labelled_data

    Inputs:
    - path: str or os.PathLike. The path of the dataset
    - separator: str. The string separating label (or id) and text

    Outputs:
    - Labelled file: (x, y) with x a List of str and y a List of int.
    - Unlabelled or testing file: x only, a List of str.
    """

    # Look at the file name only. Matching against the whole path would send
    # every file to the wrong parser as soon as a parent directory happens to
    # contain 'test' or 'nolabel' (e.g. /home/me/test_project/data/...).
    filename = os.path.basename(os.fspath(path))

    if 'nolabel' in filename:
        return load_non_labelled_data(path, separator)
    if 'test' in filename:
        return load_testing_data(path, separator)

    return load_labelled_data(path, separator)


def load_labelled_data(path, separator):
    """
    Reading dataset with label.

    Data format (one sample per line):
        label, separator, text
        e.g. "1 +++$+++ are wtf ... awww thanks !"

    Blank lines are skipped, so a trailing empty line does not abort loading.

    Inputs:
    - path: str. The path of the dataset
    - separator: str. The string separating label and text

    Outputs:
    - x: List of str. The text of every sample
    - y: List of int. The label of every sample, aligned with x

    Raises:
    - ValueError: a non-blank line does not contain the separator, or the
      part before the separator is not an integer
    """

    x, y = [], []

    with open(path, 'r') as f:
        for line_no, raw_line in enumerate(f, start=1):
            line = raw_line.strip('\n')
            if not line.strip():
                # Nothing to parse on an empty line
                continue

            label_part, found, text_part = line.partition(separator)
            if not found:
                raise ValueError(
                    f"{path}:{line_no}: separator {separator!r} not found")

            # int() tolerates the space between the label and the separator
            y.append(int(label_part))
            # Drop the single space that follows the separator
            x.append(text_part[1:])

    return x, y


def load_non_labelled_data(path, separator):
    """
    Load the unlabelled sentences, one per line of the file.

    Data format:
        text

    Inputs:
    - path: str. The path of the dataset
    - separator: str. Not used here; accepted so that all loaders share
      the same signature

    Outputs:
    - x: List of str. Every line of the file with its newline removed
    """
    sentences = []
    with open(path, 'r') as handle:
        for raw_line in handle:
            sentences.append(raw_line.strip('\n'))

    return sentences


def load_testing_data(path, separator):
    """
    Reading testing set.

    Data format:
        title
        id1,text1
        id2,text2
            .
            .

    The first line (the header) is ignored. Each following line is split at
    the first occurrence of the separator only, so the text may itself contain
    the separator. Blank lines are skipped.

    Inputs:
    - path: str. The path of the dataset
    - separator: str. The string separating id and text

    Outputs:
    - X: List of str. The text of every sample, in file order

    Raises:
    - ValueError: a non-blank data line does not contain the separator
    """
    X = []

    with open(path, 'r') as f:
        # Skip the header line; the default keeps an empty file from raising
        next(f, None)

        for line_no, raw_line in enumerate(f, start=2):
            line = raw_line.strip('\n')
            if not line.strip():
                continue

            _, found, sentence = line.partition(separator)
            if not found:
                raise ValueError(
                    f"{path}:{line_no}: separator {separator!r} not found")

            X.append(sentence)

    return X

#### Load dataset

In [None]:
# Read 'training_label.txt', 'training_nolabel.txt' and 'testing_data.txt'
print("loading data ...")
X_train_label, y_train_label = load_data(train_with_label)

# Hold out 10% of the labelled data for validation. The random state is fixed
# so that the split, and therefore every metric reported below, is the same
# on each Restart & Run All.
split_seed = 42
X_train, X_val, y_train, y_val = train_test_split(X_train_label, 
                                                  y_train_label, 
                                                  test_size = 0.1,
                                                  random_state = split_seed)

X_train_no_label = load_data(train_no_label)

# The testing file is comma separated: "id,text"
X_test = load_data(testing_data, ',')

In [None]:
# Size of every split
print(f"Total number of the training data with label: {len(X_train)}")
print(f"Total number of the training data without label: {len(X_train_no_label)}")
print(f"Total number of the validation data: {len(X_val)}")
# This line reports the test set (it was previously mislabelled as validation data)
print(f"Total number of the testing data: {len(X_test)}")

# Labels are 0/1, so sum / count is the share of positive samples
print(f"Positive rate in training dataset: {np.sum(y_train) / len(y_train)}")
print(f"Positive rate in validation dataset: {np.sum(y_val) / len(y_val)}")

### Preprocess training and validation datasets

In [None]:
# Configuration
seed = 42
batch_size = 32

with tf.device('/cpu:0'):
    # Training dataset.
    # Order of the steps matters: cache the raw samples first, then shuffle,
    # then batch. `cache()` replays exactly what it saw during the first pass,
    # so shuffling *before* caching would freeze the sample order (and the
    # composition of every batch) for all later epochs.
    train_dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train))

    train_dataset = train_dataset.cache()
    train_dataset = train_dataset.shuffle(len(X_train),
                                          seed=seed,
                                          reshuffle_each_iteration=True)
    train_dataset = train_dataset.batch(batch_size)
    train_dataset = train_dataset.prefetch(AUTOTUNE)

    # Validation dataset.
    # Loss and accuracy do not depend on the order of the samples, so the
    # validation data is not shuffled.
    val_dataset = tf.data.Dataset.from_tensor_slices((X_val, y_val))

    val_dataset = val_dataset.batch(batch_size)
    val_dataset = val_dataset.cache().prefetch(AUTOTUNE)

In [None]:
# Peek at a single training batch: shapes of inputs/labels and the first sample
for x_batch, y_batch in train_dataset.take(1):
    print(f"x_batch shape: {x_batch.shape}",
          f"y_batch shape: {y_batch.shape}",
          f"{x_batch[0]}: {y_batch[0]}",
          sep='\n')

## Loading BERT from TensorFlow Hub

Here we use a small BERT, the original BERT, or an SST-2 expert BERT from TensorFlow Hub. More versions can be found [here](https://tfhub.dev/google/collections/bert/1).

In [None]:
# Pick the BERT encoder to use; the matching TF Hub handles are looked up below

bert_model_name = 'expert/bert_wiki_books_sst2'

# TF Hub handle of the encoder for every supported model name
map_name_to_handle = dict([
    ('bert_en_uncased_L-12_H-768_A-12',
     'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/3'),
    ('small_bert/bert_en_uncased_L-2_H-128_A-2',
     'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-2_H-128_A-2/1'),
    ('expert/bert_wiki_books_sst2',
     'https://tfhub.dev/google/experts/bert/wiki_books/sst2/2'),
])

# Every encoder listed above is paired with the same preprocessing model
map_model_to_preprocess = dict.fromkeys(
    map_name_to_handle,
    'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
)

tfhub_handle_encoder = map_name_to_handle[bert_model_name]
tfhub_handle_preprocess = map_model_to_preprocess[bert_model_name]

print(f'BERT model selected           : {tfhub_handle_encoder}')
print(f'Preprocess model auto-selected: {tfhub_handle_preprocess}')

### The preprocessing model

In [None]:
# Standalone copy of the preprocessing model, used in the next cell to inspect
# what it produces for a raw sentence
bert_preprocess_model = hub.KerasLayer(tfhub_handle_preprocess)

In [None]:
# Run one training sentence through the preprocessing model and show the
# tensors it produces.
for x_batch, y_batch in train_dataset.take(1):
    # The sentence is stored as `sample_text` rather than `text`, so the
    # `tensorflow_text as text` module imported at the top is not overwritten.
    sample_text = x_batch[0].numpy().decode('utf8')
    print(sample_text)
    # `text_test` is reused later for the plain check of the classifier
    text_test = [sample_text]
    text_preprocessed = bert_preprocess_model(text_test)

    print(f'Keys       : {list(text_preprocessed.keys())}')
    print(f'Shape      : {text_preprocessed["input_word_ids"].shape}')
    print(f'Word Ids   : {text_preprocessed["input_word_ids"][0, :]}')
    print(f'Input Mask : {text_preprocessed["input_mask"][0, :]}')
    print(f'Type Ids   : {text_preprocessed["input_type_ids"][0, :]}')

### The BERT model


## Build the sentiment classifier with pre-trained BERT

In [None]:
def buildClassifierModel():
    """
    Assemble the sentiment classifier.

    Pipeline: raw string -> TF Hub preprocessing -> BERT encoder (created with
    trainable=False) -> dropout -> 128-unit GELU layer -> dropout -> 1 unit.
    The last layer has no activation, so the model outputs one raw logit per
    input sentence.

    Outputs:
    - tf.keras.Model taking a batch of strings (input named 'text')
    """
    sentences = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')

    # Tokenise the raw strings, then encode them with BERT
    bert_inputs = hub.KerasLayer(tfhub_handle_preprocess,
                                 name='preprocessing')(sentences)
    bert_outputs = hub.KerasLayer(tfhub_handle_encoder,
                                  trainable=False,
                                  name='BERT_encoder')(bert_inputs)

    # Classification head applied to the pooled sentence representation
    head_layers = [
        tf.keras.layers.Dropout(0.1),
        tf.keras.layers.Dense(128, activation='gelu', name='dense'),
        tf.keras.layers.Dropout(0.1),
        tf.keras.layers.Dense(1, activation=None, name='classifier'),
    ]
    features = bert_outputs['pooled_output']
    for layer in head_layers:
        features = layer(features)

    return tf.keras.Model(sentences, features)

Plain check of the model

In [None]:
# Build the classifier and run it once on the sample sentence from above
model = buildClassifierModel()
bert_raw_result = model(tf.constant(text_test))
# The model returns a raw logit; the sigmoid maps it into (0, 1) for display
print(tf.sigmoid(bert_raw_result))

## Train the model

### Loss function

In [None]:
# The classifier's last layer has no activation, so the loss works on logits
loss_fn = tf.keras.losses.BinaryCrossentropy(from_logits=True)
# Reported as 'binary_accuracy' / 'val_binary_accuracy' in the training history
metrics = tf.metrics.BinaryAccuracy()

### Optimizer

For fine-tuning, we use the same optimizer that BERT was originally trained with: the "Adaptive Moments" (Adam). This optimizer minimizes the prediction loss and does regularization by weight decay (not using moments), which is also known as AdamW.

For the learning rate (init_lr), we use the same schedule as BERT pre-training: linear decay of a notional initial learning rate, prefixed with a linear warm-up phase over the first 10% of training steps (num_warmup_steps). In line with the BERT paper, the initial learning rate is smaller for fine-tuning (best of 5e-5, 3e-5, 2e-5).

In [None]:
epochs = 5
# Number of batches in one pass over the training dataset
steps_per_epoch = tf.data.experimental.cardinality(train_dataset).numpy()
num_train_steps = steps_per_epoch * epochs
# Warm up during the first 10% of all training steps
num_warmup_steps = int(0.1 * num_train_steps)

init_lr = 3e-5
# NOTE(review): the schedule is sized for exactly `num_train_steps` steps;
# confirm the learning rate it yields if this optimizer is used beyond them.
optimizer = optimization.create_optimizer(init_lr=init_lr,
                                          num_train_steps=num_train_steps,
                                          num_warmup_steps=num_warmup_steps,
                                          optimizer_type='adamw')


### Training

In [None]:
# Attach the optimizer, loss and metric defined above, then list the layers
model.compile(optimizer=optimizer,
              loss=loss_fn,
              metrics=metrics)
model.summary()

In [None]:
# Checkpoint
checkpoint_filepath = os.path.join(path_prefix, 'first_train_ckpt/')

# Remove checkpoints left over from a previous run.
# (`!rm -rf checkpoint_filepath` would delete a file literally named
# "checkpoint_filepath" instead of the directory stored in the variable.)
shutil.rmtree(checkpoint_filepath, ignore_errors=True)

# Save weights only, and only when the monitored value improves
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_filepath,
                                                               save_weights_only=True,
                                                               save_best_only=True)

In [None]:
#@title
# Tiny training dataset - for debugging
# tiny_train_dataset = tf.data.Dataset.from_tensor_slices((X_train[:1000], y_train[:1000]))

# tiny_train_dataset = tiny_train_dataset.batch(batch_size)
# tiny_train_dataset = tiny_train_dataset.cache().prefetch(AUTOTUNE)

# # Tiny validation dataset
# tiny_val_dataset = tf.data.Dataset.from_tensor_slices((X_val[:1000], y_val[:1000]))

# tiny_val_dataset = tiny_val_dataset.batch(batch_size)
# tiny_val_dataset = tiny_val_dataset.cache().prefetch(AUTOTUNE)


# print(f'Training model with {tfhub_handle_encoder}')
# history = model.fit(tiny_train_dataset, 
#                     validation_data = tiny_val_dataset, 
#                     epochs = epochs, 
#                     callbacks=[model_checkpoint_callback])

In [None]:
# Supervised training on the labelled data; the checkpoint callback stores the
# best weights, and `history` is plotted further below
print(f'Training model with {tfhub_handle_encoder}')
history = model.fit(train_dataset, 
                    validation_data=val_dataset, 
                    epochs = epochs, 
                    callbacks=[model_checkpoint_callback])

### Evaluate

In [None]:
# Rebuild the classifier and restore the weights stored by the checkpoint
# callback during the first training stage
best_model = buildClassifierModel()
best_model.load_weights(checkpoint_filepath)
best_model.compile(loss=loss_fn, metrics=metrics, optimizer=optimizer)

# evaluate() returns the loss followed by the compiled metric
val_results = best_model.evaluate(val_dataset)
loss, accuracy = val_results

print(f'Loss: {loss}', f'Accuracy: {accuracy}', sep='\n')

### See the training history

In [None]:
# Per-epoch loss and accuracy recorded by model.fit()
history_dict = history.history
print(history_dict.keys())

acc = history_dict['binary_accuracy']
val_acc = history_dict['val_binary_accuracy']
loss = history_dict['loss']
val_loss = history_dict['val_loss']

epochs_range = range(1, len(acc) + 1)

# Two stacked panels: loss on top, accuracy below.
# Red solid line = training, blue solid line = validation.
fig, (loss_ax, acc_ax) = plt.subplots(2, 1, figsize=(10, 6))

loss_ax.plot(epochs_range, loss, 'r', label='Training loss')
loss_ax.plot(epochs_range, val_loss, 'b', label='Validation loss')
loss_ax.set_title('Training and validation loss')
loss_ax.set_ylabel('Loss')
loss_ax.legend()

acc_ax.plot(epochs_range, acc, 'r', label='Training acc')
acc_ax.plot(epochs_range, val_acc, 'b', label='Validation acc')
acc_ax.set_title('Training and validation accuracy')
acc_ax.set_xlabel('Epochs')
acc_ax.set_ylabel('Accuracy')
acc_ax.legend(loc='lower right')

# tight_layout() only has an effect once the axes exist, so it is called last;
# it keeps the lower title from overlapping the upper panel
fig.tight_layout()

## Semi-supervised Learning 
We can further train the model using the training data without label

### Semi-supervised Learning
Here we use a simple self-training strategy: the model's confident predictions on the unlabelled data are turned into hard labels and added to the training set.

In [None]:
# Checkpoint
checkpoint_filepath_final = os.path.join(path_prefix, 'final_train_ckpt/')

# Remove checkpoints left over from a previous run.
# (`!rm -rf checkpoint_filepath_final` would delete a file literally named
# "checkpoint_filepath_final" instead of the directory stored in the variable.)
shutil.rmtree(checkpoint_filepath_final, ignore_errors=True)

# Save weights only, and only when the monitored value improves
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_filepath_final,
                                                               save_weights_only=True,
                                                               save_best_only=True)

In [None]:
# Confidence needed to accept a pseudo-label: an unlabelled sentence is used
# when its predicted probability is > threshold or < 1 - threshold
threshold = 0.8
# Upper bound on the number of pseudo-labelled sentences collected per round
num_samples = 50000

In [None]:
# Self-training: in every round, pseudo-label up to `num_samples` unlabelled
# sentences the current model is confident about, add them to the labelled
# training data and train for one more epoch.

# used[i] is True once sentence i has been turned into a pseudo-labelled sample
used = np.zeros(len(X_train_no_label), dtype=bool)
rng = np.random.default_rng(seed)

# Number of unlabelled sentences scored per predict() call. Scoring them in
# chunks instead of one sentence per call avoids the per-call overhead.
predict_chunk_size = 4096

# The optimizer used so far follows a learning-rate schedule that decays to its
# end value over exactly `num_train_steps` steps, all of which were consumed by
# the first training stage. Training on with it would barely update the model,
# so this stage gets a fresh optimizer. Its schedule length is an upper bound
# on the number of steps of all rounds together.
max_new_batches = int(np.ceil(num_samples / batch_size))
semi_train_steps = int(sum(steps_per_epoch + round_no * max_new_batches
                           for round_no in range(1, epochs + 1)))
semi_optimizer = optimization.create_optimizer(init_lr=init_lr,
                                               num_train_steps=semi_train_steps,
                                               num_warmup_steps=int(0.1 * semi_train_steps),
                                               optimizer_type='adamw')
best_model.compile(optimizer=semi_optimizer,
                   loss=loss_fn,
                   metrics=metrics)

# Pseudo-labelled samples accumulated over all rounds
pseudo_X = []
pseudo_y = []

for _ in range(epochs):
    new_X = []
    new_y = []

    # Visit the not-yet-used sentences in random order
    candidates = rng.permutation(np.flatnonzero(~used))

    for start in range(0, len(candidates), predict_chunk_size):
        chunk_idx = candidates[start : start + predict_chunk_size]

        with tf.device('/cpu:0'):
            chunk_dataset = tf.data.Dataset.from_tensor_slices(
                [X_train_no_label[idx] for idx in chunk_idx])
            chunk_dataset = chunk_dataset.batch(batch_size)

        chunk_logits = best_model.predict(chunk_dataset, verbose=0)
        chunk_prob = tf.sigmoid(chunk_logits).numpy().reshape(-1)

        # Keep only confident predictions, and no more than still needed
        confident = (chunk_prob > threshold) | (chunk_prob < (1 - threshold))
        picked = np.flatnonzero(confident)[: num_samples - len(new_X)]

        for pos in picked:
            new_X.append(X_train_no_label[chunk_idx[pos]])
            new_y.append(1 if chunk_prob[pos] > 0.5 else 0)
        used[chunk_idx[picked]] = True

        if len(new_X) >= num_samples:
            break

    if not new_X:
        # Nothing confident is left; an empty dataset cannot be concatenated
        print("No confident pseudo-labels found . Stop self-training")
        break

    pseudo_X.extend(new_X)
    pseudo_y.extend(new_y)

    with tf.device('/cpu:0'):
        semi_dataset = tf.data.Dataset.from_tensor_slices((pseudo_X, pseudo_y))
        semi_dataset = semi_dataset.batch(batch_size)

        # Augment dataset: labelled batches followed by all pseudo-labelled
        # batches. `train_dataset` itself is left untouched, so re-running
        # this cell does not stack augmentations on top of each other.
        augmented_dataset = train_dataset.concatenate(semi_dataset)

        # Batch count read from the dataset's cardinality; no need to
        # iterate over the whole dataset just to count it
        num_total_batch = tf.data.experimental.cardinality(augmented_dataset).numpy()

        # Mix labelled and pseudo-labelled batches
        augmented_dataset = augmented_dataset.shuffle(num_total_batch)
        augmented_dataset = augmented_dataset.prefetch(AUTOTUNE)

    # Train with the new augmented dataset
    history = best_model.fit(augmented_dataset, 
                             validation_data = val_dataset, 
                             epochs = 1, 
                             callbacks=[model_checkpoint_callback])


In [None]:
# Rebuild the classifier and restore the weights stored by the checkpoint
# callback during the semi-supervised stage
best_model = buildClassifierModel()
best_model.load_weights(checkpoint_filepath_final)
best_model.compile(loss=loss_fn, metrics=metrics, optimizer=optimizer)

# evaluate() returns the loss followed by the compiled metric
final_val_results = best_model.evaluate(val_dataset)
loss, accuracy = final_val_results

print(f'Loss: {loss}', f'Accuracy: {accuracy}', sep='\n')

## Testing

### Preprocess test dataset

In [None]:
# Test pipeline: sentences only (no labels), batched in file order so that
# prediction i belongs to test sentence i
with tf.device('/cpu:0'):
    test_dataset = tf.data.Dataset.from_tensor_slices(X_test)
    test_dataset = test_dataset.batch(batch_size)
    test_dataset = test_dataset.cache().prefetch(AUTOTUNE)

### Load the best model

In [None]:
print('\nload model ...')
# Rebuild the architecture, then restore the weights saved under
# `checkpoint_filepath_final`
best_model = buildClassifierModel()
best_model.load_weights(checkpoint_filepath_final)
# Compiled with the same optimizer, loss and metric objects as the other models
best_model.compile(optimizer=optimizer,
                    loss=loss_fn,
                    metrics=metrics)

### Make prediction

In [None]:
def testing(model, dataset):
    outputs = model.predict(dataset)

    outputs_prob = tf.math.sigmoid(outputs.reshape(-1))

    res = np.array([1 if prob > 0.5 else 0 for prob in outputs_prob])

    return res

In [None]:
# Predict with `best_model` (the weights reloaded from the final checkpoint),
# not with `model`, which still holds the weights from the end of the first
# training stage
outputs = testing(best_model, test_dataset)

In [None]:
# Save the predictions as CSV: one row per test sentence, with the row number
# (as a string) as id
prediction_ids = list(map(str, range(len(X_test))))
tmp = pd.DataFrame({"id": prediction_ids, "label": outputs})

print("save csv ...")
prediction_path = os.path.join(path_prefix, 'predict.csv')
tmp.to_csv(prediction_path, index=False)
print("Finish Predicting")