# **Transfer learning with DistilBert**

The goal of this exercise is to build a text classifier on top of the pretrained DistilBERT model published by Hugging Face.

In [32]:
!pip install -q transformers tfds-nightly

import matplotlib.pyplot as plt
import tensorflow.keras as keras
import pandas as pd

try: # this is only working on the 2nd try in colab :)
  from transformers import DistilBertTokenizer, TFDistilBertModel
except Exception as err: # so we catch the error and import it again
  from transformers import DistilBertTokenizer, TFDistilBertModel

import numpy as np
import tensorflow.keras as keras
from tensorflow.keras.layers import Dense, Input, Dropout

import tensorflow_datasets as tfds

# Tokenizer for the 'distilbert-base-uncased' checkpoint; encode_text() below
# reads this module-level name, so this cell must run before any text is encoded.
dbert_tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')


# Data Preparation

Clean the text and targets

In [33]:
def load_data(save_dir="./"):
  """Load the GLUE CoLA dataset and return its splits as DataFrames.

  Args:
    save_dir: Accepted for interface compatibility; the body does not use it.

  Returns:
    A tuple ``(train, val, test)`` of DataFrames built from the
    "train", "validation" and "test" splits, in that order.
  """
  splits = tfds.load('glue/cola', shuffle_files=True)
  return tuple(
      tfds.as_dataframe(splits[split_name])
      for split_name in ("train", "validation", "test")
  )

def prepare_raw_data(df):
  """Keep only the columns needed downstream and make the label categorical.

  Args:
    df: DataFrame containing at least "idx", "sentence" and "label" columns.

  Returns:
    A new DataFrame with exactly the columns "idx", "sentence", "label",
    where "label" has the pandas 'category' dtype. ``df`` is not modified.
  """
  wanted_columns = ["idx", "sentence", "label"]
  return df[wanted_columns].astype({"label": 'category'})

# Load the three splits and reduce each one to the idx/sentence/label columns.
train, val, test = (prepare_raw_data(split) for split in load_data())

In [34]:
def clean_data(df, min_tokens=5):
  """Deduplicate sentences and drop very short ones.

  Steps, in order:
    1. Collapse exact (sentence, label) duplicates to a single row.
    2. Remove every sentence that still occurs more than once; after step 1
       these are sentences that appear with conflicting labels.
    3. Add a ``token_count`` column (whitespace-separated tokens per sentence).
    4. Keep only rows with at least ``min_tokens`` tokens.

  The column is added with ``assign`` on a fresh frame instead of item
  assignment on a filtered frame, which avoids pandas' SettingWithCopyWarning.
  The input frame is never modified.

  Args:
    df: DataFrame with "sentence" and "label" columns; each sentence must
      support ``.split()`` (both ``bytes`` and ``str`` do).
    min_tokens: Minimum whitespace token count a sentence needs to be kept.

  Returns:
    A new DataFrame with the surviving rows, their original index labels and
    the ``token_count`` column.
  """
  deduped = (
      df
      .drop_duplicates(subset=["sentence", "label"])
      .drop_duplicates(subset=["sentence"], keep=False)
  )
  with_counts = deduped.assign(
      token_count=[len(sentence.split()) for sentence in deduped.sentence]
  )
  return with_counts[with_counts['token_count'] >= min_tokens]

# Apply the same cleaning rules to every split.
train, val, test = (clean_data(split) for split in (train, val, test))

# Preview the first rows of the cleaned train and test frames.
print(train.head())
print(test.head())

    idx                                           sentence label  token_count
0  1680  b'It is this hat that it is certain that he wa...     1           12
1  1456  b'Her efficient looking up of the answer pleas...     1           10
2  4223          b'Both the workers will wear carnations.'     1            6
3  4093  b'John enjoyed drawing trees for his syntax ho...     1            8
4  7111  b'We consider Leslie rather foolish, and Lou a...     1           10
    idx                                           sentence label  token_count
0   163              b'Brian was wiping behind the stove.'    -1            6
1   131         b'You could give a headache to a Tylenol.'    -1            8
2  1021                            b'I want to meet at 6.'    -1            6
4  1039    b"Many people said they were sick who weren't."    -1            8
5   778  b'A dog with brown spots chased a cat with no ...    -1           11


Prepare the text for DistilBert

In [35]:
def extract_text_and_y(df):
  """Split a frame into decoded sentences and a label array.

  Args:
    df: DataFrame with a "sentence" column of UTF-8 encoded ``bytes`` and a
      "label" column.

  Returns:
    A tuple ``(text, y)``: ``text`` is a list of ``str`` sentences and ``y``
    is a NumPy array holding the label values in row order.
  """
  sentences = list(map(lambda raw: raw.decode('utf-8'), df.sentence.values))
  # There are only two classes, so the labels stay a single column for a
  # sigmoid output; a multiclass problem could use
  # sklearn.preprocessing.OneHotEncoder here instead.
  targets = np.array(list(df.label.values))
  return sentences, targets

def encode_text(text, max_length=100):
    """Tokenize sentences with the module-level DistilBERT tokenizer.

    Every sequence is padded to exactly ``max_length`` tokens and longer ones
    are truncated, so all rows of the returned tensors have the same width.

    Args:
        text: Sentences to encode, as passed to ``dbert_tokenizer``.
        max_length: Fixed sequence length to pad/truncate to. Defaults to 100,
            the value previously hard-coded here.

    Returns:
        A tuple ``(input_ids, attention_mask)`` taken from the tokenizer
        output, returned as TensorFlow tensors (``return_tensors="tf"``).
    """
    encoded = dbert_tokenizer(
        text,
        return_tensors="tf",
        padding='max_length',
        truncation=True,
        max_length=max_length,
    )
    return encoded['input_ids'], encoded['attention_mask']

# Turn each split into decoded sentences plus a label array.
# NOTE(review): the test preview printed earlier shows label -1 on every
# displayed row, so test_y may carry no ground truth; confirm before
# computing any metric on the test split.
train_text, train_y = extract_text_and_y(clean_data(train))
val_text, val_y = extract_text_and_y(clean_data(val))
test_text, test_y = extract_text_and_y(clean_data(test))

# Tokenize the sentences into fixed-length id and attention-mask tensors.
train_input, train_mask = encode_text(train_text)
val_input, val_mask = encode_text(val_text)
test_input, test_mask = encode_text(test_text)

# The dictionary keys must match the names of the Keras Input layers
# ('inputs' and 'masks') created in build_model.
train_model_inputs_and_masks = dict(inputs=train_input, masks=train_mask)
val_model_inputs_and_masks = dict(inputs=val_input, masks=val_mask)
test_model_inputs_and_masks = dict(inputs=test_input, masks=test_mask)

# Modelling

## Build and Train Model

In [36]:
def build_model(base_model, trainable=False, params={}):
    """Stack a two-layer dense classification head on top of DistilBERT.

    The hidden state at sequence position 0 of the base model's
    ``last_hidden_state`` (the position of the CLS token in DistilBERT) is
    fed through two Dense+Dropout stages and a final single-unit sigmoid
    layer.

    Args:
        base_model: Model called as ``base_model(ids, attention_mask=mask)``
            whose output exposes ``last_hidden_state``. Its ``trainable``
            attribute is overwritten with ``trainable``.
        trainable: Whether the base model's weights are updated in training.
        params: Hyperparameter dictionary. Required keys: "max_seq_len",
            "layer_width1", "dropout1", "layer_width2", "dropout2".

    Returns:
        The assembled ``keras.Model`` taking the inputs named 'inputs' and
        'masks'. Its summary is printed as a side effect.
    """
    seq_len = params["max_seq_len"]
    token_ids = Input(shape=(seq_len,), dtype='int64', name='inputs')
    attn_mask = Input(shape=(seq_len,), dtype='int64', name='masks')

    # Freeze or unfreeze the pretrained encoder before wiring it in.
    base_model.trainable = trainable

    hidden_states = base_model(token_ids, attention_mask=attn_mask).last_hidden_state

    # Use only the first sequence position as the sentence representation.
    head = hidden_states[:, 0, :]

    # Two Dense+Dropout stages, sized by the hyperparameters in 'params'.
    for width_key, dropout_key in (("layer_width1", "dropout1"),
                                   ("layer_width2", "dropout2")):
        head = Dense(params[width_key], activation='relu')(head)
        head = Dropout(params[dropout_key])(head)

    probs = Dense(1, activation='sigmoid')(head)

    classifier = keras.Model(inputs=[token_ids, attn_mask], outputs=probs)
    classifier.summary()
    return classifier

# Pretrained DistilBERT encoder used as the base of the classifier.
dbert_model = TFDistilBertModel.from_pretrained('distilbert-base-uncased')

# Hyperparameters of the classification head; the sequence length is read
# from the encoded training inputs so it always matches the tokenizer padding.
params = dict(
    max_seq_len=train_input.shape[1],
    layer_width1=128,
    dropout1=0.3,
    layer_width2=64,
    dropout2=0.3,
)

model = build_model(dbert_model, params=params)

Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertModel: ['vocab_projector', 'vocab_layer_norm', 'vocab_transform', 'activation_13']
- This IS expected if you are initializing TFDistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFDistilBertModel were initialized from the model checkpoint at distilbert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


Model: "model_4"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 inputs (InputLayer)            [(None, 100)]        0           []                               
                                                                                                  
 masks (InputLayer)             [(None, 100)]        0           []                               
                                                                                                  
 tf_distil_bert_model_4 (TFDist  TFBaseModelOutput(l  66362880   ['inputs[0][0]',                 
 ilBertModel)                   ast_hidden_state=(N               'masks[0][0]']                  
                                one, 100, 768),                                                   
                                 hidden_states=None                                         

In [37]:
def compile_model(model):
    """Compile the model for binary classification.

    Uses binary cross-entropy loss and Adam with a learning rate of 1e-4,
    and tracks accuracy, ROC AUC, PR AUC, precision and recall.

    Args:
        model: Keras model to compile; it is compiled in place.

    Returns:
        The same model object, now compiled.
    """
    loss_fn = keras.losses.BinaryCrossentropy()
    optimizer = keras.optimizers.Adam(learning_rate=1e-4)

    # Metrics reported during training and validation, AUC variants included.
    tracked_metrics = [
        'accuracy',
        keras.metrics.AUC(curve="ROC", multi_label=True),
        keras.metrics.AUC(curve="PR", multi_label=True),
        keras.metrics.Precision(),
        keras.metrics.Recall(),
    ]

    model.compile(loss=loss_fn, optimizer=optimizer, metrics=tracked_metrics)
    return model

# Attach the loss, optimizer and metrics defined in compile_model to the model.
model = compile_model(model)

In [38]:
def train_model(model, model_inputs_and_masks_train, model_inputs_and_masks_val, y_train, y_val, batch_size, num_epochs):
    """Fit the model with early stopping on the validation loss.

    Training stops once ``val_loss`` fails to improve for one epoch
    (``patience=1``). When that happens the model would otherwise keep the
    weights of the last, non-improving epoch, so ``restore_best_weights=True``
    rolls it back to the epoch with the lowest validation loss.

    Args:
        model: Compiled Keras model.
        model_inputs_and_masks_train: Training inputs passed to ``model.fit``.
        model_inputs_and_masks_val: Validation inputs.
        y_train: Training targets.
        y_val: Validation targets.
        batch_size: Batch size passed to ``model.fit``.
        num_epochs: Maximum number of epochs to train.

    Returns:
        A tuple ``(model, history)`` where ``history`` is the object returned
        by ``model.fit``.
    """
    es = keras.callbacks.EarlyStopping(
        monitor="val_loss",
        mode='min',
        verbose=1,
        patience=1,
        # Without this the returned model holds the weights from the epoch
        # after the best one, i.e. the epoch that triggered the stop.
        restore_best_weights=True
    )
    history = model.fit(
            model_inputs_and_masks_train,
            y_train,
            batch_size=batch_size,
            epochs=num_epochs,
            verbose=1,
            validation_data=(model_inputs_and_masks_val, y_val),
            callbacks=[es]
        )
    return model, history

# Train for at most 15 epochs; early stopping inside train_model may end the run sooner.
model, history = train_model(
    model,
    train_model_inputs_and_masks,
    val_model_inputs_and_masks,
    train_y,
    val_y,
    batch_size=128,
    num_epochs=15,
)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 12: early stopping
