# Transfer Learning NLP. Ulises Bértolo.

### Creando el Dataset y la instancia del modelo

In [28]:
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
from datasets import load_dataset
import tensorflow as tf
import numpy as np

# Loading our dataset
tweet_dataset = load_dataset(path="tweet_eval", name="emotion")

Reusing dataset tweet_eval (C:\Users\200248\.cache\huggingface\datasets\tweet_eval\emotion\1.1.0\12aee5282b8784f3e95459466db4cdf45c6bf49719c25cdb0743d71ed0410343)


  0%|          | 0/3 [00:00<?, ?it/s]

In [31]:
# Instantiating our DistilBERT tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
model = TFAutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=4)

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/347M [00:00<?, ?B/s]

Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertForSequenceClassification: ['vocab_projector', 'activation_13', 'vocab_transform', 'vocab_layer_norm']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier', 'dropout_19', 'pre_classifier']
You should probably TRAIN this model on a down-stream task to be able to use i

### Preproceso de Datos

In [32]:
tweet_dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 3257
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 1421
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 374
    })
})

In [33]:
tweet_dataset['train']

Dataset({
    features: ['text', 'label'],
    num_rows: 3257
})

In [34]:
print(f"Sequence samples:\n {tweet_dataset['train']['text'][:2]}\n")
print(f"Label samples:\n {tweet_dataset['train']['label'][:2]}")

Sequence samples:
 ["“Worry is a down payment on a problem you may never have'. \xa0Joyce Meyer.  #motivation #leadership #worry", "My roommate: it's okay that we can't spell because we have autocorrect. #terrible #firstworldprobs"]

Label samples:
 [2, 0]


In [35]:
class_names = {0: "anger", 1: "joy", 2: "optimism", 3: "sadness"}

In [36]:
# A function for finding the length of the longest sequence in the data
def find_max_length(dataset):
    return len(max(dataset, key=lambda x: len(x.split())).split())

In [37]:
# Obtaining the length of the longest sequences in our data splits
train_max_length = find_max_length(tweet_dataset["train"]["text"])
val_max_length = find_max_length(tweet_dataset["validation"]["text"])
test_max_length = find_max_length(tweet_dataset["test"]["text"])

# Inspecting the length of the longest sequences
print(f"Longest sequence in train set has {train_max_length} words")
print(f"Longest sequence in val set has {val_max_length} words")
print(f"Longest sequence in test set has {test_max_length} words")

Longest sequence in train set has 33 words
Longest sequence in val set has 32 words
Longest sequence in test set has 36 words


In [38]:
# A function for discarding sequences beyond a specified length
def filter_dataset(dataset, num_words):    
    return dataset.filter(lambda x: len(x["text"].split()) <= num_words)

In [39]:
# Specifying the max length for sequences
num_words = 36

# Dropping sequences longer than the specified number from the dataset
filtered_dataset = filter_dataset(tweet_dataset, num_words)

  0%|          | 0/4 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [40]:
# Inspecting the shortened dataset
print(filtered_dataset)

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 3257
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 1421
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 374
    })
})


In [41]:
# A function for tokenizing our dataset
def tokenize_dataset(examples):
    return tokenizer(examples["text"], padding="max_length", 
                     truncation=True, max_length=36)

In [42]:
# Tokenizing our dataset
tokenized_dataset = filtered_dataset.map(tokenize_dataset)

0ex [00:00, ?ex/s]

0ex [00:00, ?ex/s]

0ex [00:00, ?ex/s]

In [43]:
# Inspecting the tokenized dataset
print(tokenized_dataset)

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 3257
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 1421
    })
    validation: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 374
    })
})


In [44]:
# Inspecting a training sample
print(tokenized_dataset["train"][0])

{'text': "“Worry is a down payment on a problem you may never have'. \xa0Joyce Meyer.  #motivation #leadership #worry", 'label': 2, 'input_ids': [101, 1523, 4737, 2003, 1037, 2091, 7909, 2006, 1037, 3291, 2017, 2089, 2196, 2031, 1005, 1012, 11830, 11527, 1012, 1001, 14354, 1001, 4105, 1001, 4737, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}


In [46]:
# Removing "text" and "label" columns from our data splits to craft features for the model
train_features = tokenized_dataset["train"].remove_columns(["text", "label"]).with_format("tensorflow")
val_features = tokenized_dataset["validation"].remove_columns(["text", "label"]).with_format("tensorflow")
test_features = tokenized_dataset["test"].remove_columns(["text", "label"]).with_format("tensorflow")

# Converting our features to TF Tensors
train_features = {x: train_features[x] for x in tokenizer.model_input_names}
val_features = {x: val_features[x] for x in tokenizer.model_input_names}
test_features = {x: test_features[x] for x in tokenizer.model_input_names}

In [47]:
# Inspecting expected model input names
print(tokenizer.model_input_names)

['input_ids', 'attention_mask']


In [48]:
# Inspecting our Tensors
print(train_features)

{'input_ids': <tf.Tensor: shape=(3257, 36), dtype=int64, numpy=
array([[  101,  1523,  4737, ...,     0,     0,     0],
       [  101,  2026, 18328, ...,     0,     0,     0],
       [  101,  2053,  2021, ...,     0,     0,     0],
       ...,
       [  101,  1030,  5310, ...,     0,     0,     0],
       [  101,  2017,  2031, ...,     0,     0,     0],
       [  101,  1030,  5310, ...,     0,     0,     0]], dtype=int64)>, 'attention_mask': <tf.Tensor: shape=(3257, 36), dtype=int64, numpy=
array([[1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       ...,
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0]], dtype=int64)>}


In [50]:
# Importing the function for one-hot encoding
from tensorflow.keras.utils import to_categorical

# Creating labels for each of the data splits
train_labels = to_categorical(tokenized_dataset["train"]["label"])
val_labels = to_categorical(tokenized_dataset["validation"]["label"])
test_labels = to_categorical(tokenized_dataset["test"]["label"])


# Inspecting training labels
print(train_labels[:5])

[[0. 0. 1. 0.]
 [1. 0. 0. 0.]
 [0. 1. 0. 0.]
 [1. 0. 0. 0.]
 [0. 0. 0. 1.]]


In [51]:
# Importing the TF Dataset class
from tensorflow.data import Dataset

# Creating TF Datasets for each of our data splits
train_dataset = Dataset.from_tensor_slices((train_features, train_labels))
val_dataset = Dataset.from_tensor_slices((val_features, val_labels))
test_dataset = Dataset.from_tensor_slices((test_features, test_labels))

# Shuffling and batching our data
train_dataset = train_dataset.shuffle(len(train_features), seed=2).batch(8)
val_dataset = val_dataset.shuffle(len(train_features), seed=2).batch(8)
test_dataset = test_dataset.shuffle(len(train_features), seed=2).batch(8)

### Applying fine tuning to the model

In [52]:
# Inspecting the model's architecture
model.summary()

Model: "tf_distil_bert_for_sequence_classification"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 distilbert (TFDistilBertMai  multiple                 66362880  
 nLayer)                                                         
                                                                 
 pre_classifier (Dense)      multiple                  590592    
                                                                 
 classifier (Dense)          multiple                  3076      
                                                                 
 dropout_19 (Dropout)        multiple                  0         
                                                                 
Total params: 66,956,548
Trainable params: 66,956,548
Non-trainable params: 0
_________________________________________________________________


In [53]:
# Freezing the DistilBERT block
model.layers[0].trainable = False

In [54]:
# Inspecting the model again to see the differences in trainable params
model.summary()

Model: "tf_distil_bert_for_sequence_classification"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 distilbert (TFDistilBertMai  multiple                 66362880  
 nLayer)                                                         
                                                                 
 pre_classifier (Dense)      multiple                  590592    
                                                                 
 classifier (Dense)          multiple                  3076      
                                                                 
 dropout_19 (Dropout)        multiple                  0         
                                                                 
Total params: 66,956,548
Trainable params: 593,668
Non-trainable params: 66,362,880
_________________________________________________________________


In [55]:
# A function defining our learning rate schedule
def lr_decay(epoch, lr):
    if epoch < 10:
        return lr
    else:
        return lr * np.exp(-0.1 * epoch)

In [56]:
# Instantiating our learning rate scheduler callback
lr_scheduler = tf.keras.callbacks.LearningRateScheduler(schedule=lr_decay, verbose=1)

In [58]:
# Setting some hyperparameters and compiling the model
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001), 
              loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True), 
              metrics=tf.keras.metrics.CategoricalAccuracy())

In [59]:
# Training the model
history = model.fit(train_dataset, validation_data=val_dataset, 
                    epochs=15, callbacks=[lr_scheduler])


Epoch 1: LearningRateScheduler setting learning rate to 0.0010000000474974513.
Epoch 1/15

Epoch 2: LearningRateScheduler setting learning rate to 0.0010000000474974513.
Epoch 2/15

Epoch 3: LearningRateScheduler setting learning rate to 0.0010000000474974513.
Epoch 3/15

Epoch 4: LearningRateScheduler setting learning rate to 0.0010000000474974513.
Epoch 4/15

Epoch 5: LearningRateScheduler setting learning rate to 0.0010000000474974513.
Epoch 5/15

Epoch 6: LearningRateScheduler setting learning rate to 0.0010000000474974513.
Epoch 6/15

Epoch 7: LearningRateScheduler setting learning rate to 0.0010000000474974513.
Epoch 7/15

Epoch 8: LearningRateScheduler setting learning rate to 0.0010000000474974513.
Epoch 8/15

Epoch 9: LearningRateScheduler setting learning rate to 0.0010000000474974513.
Epoch 9/15

Epoch 10: LearningRateScheduler setting learning rate to 0.0010000000474974513.
Epoch 10/15

Epoch 11: LearningRateScheduler setting learning rate to 0.0003678794586447782.
Epoch 1

In [60]:
# Evaluating our model on the test set
model.evaluate(test_dataset)



[0.696970522403717, 0.733286440372467]