# Basic Parts

In [1]:
from transformers import TFAutoModelForSequenceClassification
from transformers import TFBertModel
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from transformers import AdamWeightDecay
from sklearn.metrics import accuracy_score

In [2]:
## Load Data 
from datasets import load_dataset
dataset = load_dataset("financial_phrasebank", "sentences_allagree", split = "train")

In [3]:
## Data Tokenization & Train-validation-test split
import tensorflow as tf
import numpy as np
from transformers import AutoTokenizer, DataCollatorWithPadding

def tokenize_function(example):
    return tokenizer(example["sentence"], truncation=True)

def train_val_test_split(tokenized_dataset, test_size, val_size=0):
        tokenized_dataset.shuffle()
        temp1 = tokenized_dataset.train_test_split(test_size = test_size)
        tokenized_test = temp1["test"]
        temp2 = temp1["train"]
        temp3 = temp2.train_test_split(test_size = val_size/(1-test_size))
        tokenized_val = temp3["test"]
        tokenized_train = temp3["train"]
        return tokenized_train, tokenized_val, tokenized_test

checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
tokenized_dataset = dataset.map(tokenize_function, batched=True)

tokenized_dataset_train, tokenized_dataset_val, tokenized_dataset_test =train_val_test_split(tokenized_dataset, test_size=0.2, val_size=0.2)

In [4]:
##convert data to trainable data
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")
tf_train_dataset = tokenized_dataset_train.to_tf_dataset(
    columns=["attention_mask", "input_ids", "token_type_ids"],
    label_cols=["label"],
    shuffle=True,
    collate_fn=data_collator,
    batch_size=8,
)

tf_validation_dataset = tokenized_dataset_val.to_tf_dataset(
    columns=["attention_mask", "input_ids", "token_type_ids"],
    label_cols=["label"],
    shuffle=False,
    collate_fn=data_collator,
    batch_size=8,
)
tf_test_dataset = tokenized_dataset_test.to_tf_dataset(
    columns=["attention_mask", "input_ids", "token_type_ids"],
    label_cols=["label"],
    shuffle=False,
    collate_fn=data_collator,
    batch_size=8,
)

Old behaviour: columns=['a'], labels=['labels'] -> (tf.Tensor, tf.Tensor)  
             : columns='a', labels='labels' -> (tf.Tensor, tf.Tensor)  
New behaviour: columns=['a'],labels=['labels'] -> ({'a': tf.Tensor}, {'labels': tf.Tensor})  
             : columns='a', labels='labels' -> (tf.Tensor, tf.Tensor) 
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [13]:
## freeze the layers of the pretrained model
def createTransferModel(base_model, freeze_n=0, freeze = True):
    if freeze == True:
        for i, layer in enumerate(base_model.layers[:-freeze_n]):
            base_model.layers[i].trainable=False
    else:
        for i, layer in enumerate(base_model.layers[:]):
            base_model.layers[i].trainable=True

    return base_model

training with freezed layers

In [5]:
from transformers import TFAutoModelForSequenceClassification
from transformers import TFBertModel
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from transformers import AdamWeightDecay
from sklearn.metrics import accuracy_score
checkpoint = "bert-base-uncased"
model = TFAutoModelForSequenceClassification.from_pretrained(checkpoint,num_labels=3)
transfer_model = createTransferModel(model,1)
callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)
# optimizer = AdamWeightDecay(transfer_model.config)
transfer_model.compile(
    optimizer="adam",
    loss=SparseCategoricalCrossentropy(from_logits=True),
    metrics=["accuracy"],
)
transfer_model.fit(
    tf_train_dataset,
    validation_data=tf_validation_dataset,
    epochs = 10,
    callbacks=[callback]
)

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x2471237fb20>

In [6]:
transfer_model.summary()

Model: "tf_bert_for_sequence_classification"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  109482240 
                                                                 
 dropout_37 (Dropout)        multiple                  0         
                                                                 
 classifier (Dense)          multiple                  2307      
                                                                 
Total params: 109,484,547
Trainable params: 2,307
Non-trainable params: 109,482,240
_________________________________________________________________


In [7]:
## report the accuracy score of the validation dataset
from sklearn.metrics import accuracy_score
preds_val = tf.nn.softmax(transfer_model.predict(tf_validation_dataset)["logits"])
class_preds_val = np.argmax(preds_val, axis=1)
accuracy_val = accuracy_score(tokenized_dataset_val["label"],class_preds_val)
accuracy_val



0.7748344370860927

In [8]:
## report the accuracy score of the test dataset
preds_test = tf.nn.softmax(transfer_model.predict(tf_test_dataset)["logits"])
class_preds_test = np.argmax(preds_test, axis=1)
accuracy_test = accuracy_score(tokenized_dataset_test["label"],class_preds_test)
accuracy_test



0.7505518763796909

In [9]:
## unfreeze all the layers and train with a smaller learning rate
transfer_model_unfreezed = createTransferModel(transfer_model, freeze=False)
callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)
transfer_model_unfreezed.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.00001),
    loss=SparseCategoricalCrossentropy(from_logits=True),
    metrics=["accuracy"],
)
transfer_model_unfreezed.fit(
    tf_train_dataset,
    validation_data=tf_validation_dataset,
    epochs = 15,
    callbacks = [callback]
)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15


<keras.callbacks.History at 0x246ff379970>

In [10]:
preds_val = tf.nn.softmax(transfer_model_unfreezed.predict(tf_validation_dataset)["logits"])
class_preds_val = np.argmax(preds_val, axis=1)
accuracy_val = accuracy_score(tokenized_dataset_val["label"],class_preds_val)
accuracy_val



0.9668874172185431

In [11]:
preds_test = tf.nn.softmax(transfer_model_unfreezed.predict(tf_test_dataset)["logits"])
class_preds_test = np.argmax(preds_test, axis=1)
accuracy_test = accuracy_score(tokenized_dataset_test["label"],class_preds_test)
accuracy_test



0.9624724061810155

In [12]:
## Instead of head first then body, I train all the weights simultaneously from the begining.
transfer_model_allw = createTransferModel(model,freeze=False)
transfer_model_allw.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001),
    loss=SparseCategoricalCrossentropy(from_logits=True),
    metrics=["accuracy"],
)
transfer_model_allw.fit(
    tf_train_dataset,
    validation_data=tf_validation_dataset,
    epochs = 10
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x24a11980f70>

In [13]:
transfer_model_allw.summary()

Model: "tf_bert_for_sequence_classification"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  109482240 
                                                                 
 dropout_37 (Dropout)        multiple                  0         
                                                                 
 classifier (Dense)          multiple                  2307      
                                                                 
Total params: 109,484,547
Trainable params: 109,484,547
Non-trainable params: 0
_________________________________________________________________


In [14]:
preds_val = tf.nn.softmax(transfer_model_allw.predict(tf_validation_dataset)["logits"])
class_preds_val = np.argmax(preds_val, axis=1)
accuracy_val = accuracy_score(tokenized_dataset_val["label"],class_preds_val)
accuracy_val



0.9359823399558499

In [15]:
preds_test = tf.nn.softmax(transfer_model_allw.predict(tf_test_dataset)["logits"])
class_preds_test = np.argmax(preds_test, axis=1)
accuracy_test = accuracy_score(tokenized_dataset_test["label"],class_preds_test)
accuracy_test



0.9359823399558499

**Discussion**

In summary, I choose to use the pretrained model with checkpoint = "bert-base-uncased". I first freeze the layers of the pretrained model, and only the weights of the classification head can be trained. After fitting, the accuracy score on the validation dataset is around 77%, and the accuracy score on the test dataset is around 74%. We can see from the training process that the validation accuracy is increasing, but it can only reach around 80% at maximum. I then unfreeze all the layers of the model to enable training, with a smaller learning rate. The reasoning is that after getting the weights of the classifier head, the weights of the previously-freezed layers can be "fine-tuned" for our dataset, which are not supposed to change much. Amazingly, with all layers being trainable, the accuracy score on the validation dataset is around 96% and the accuracy score on the test dataset is around 96%.

Next, I also try another pre-trained model from the same checkpoint, but with no layer freezed at first. Namely, I train all the weights of the bert model as well as of the classification head simultaneously. The model's performance varies. In the case above, its performance turns out good, with a validation accuracy score of 94% and a test accuracy score of around 96%, even though it is not as good as the first approach. However, in some other cases of the training, with the same hyperparameters, the model could not improve its performance in terms of validation accuracy. The potential problem here is that the pretrained-model has an extremely huge amount of parameters. All layers being unfreezed simply gives too much freedom to the training process, and makes the training unstable, the model hard to find the optimum.

# Extra parts

## Create (and fit the model with) a TensorFlow Dataset (TFDS)

In [5]:
def tokenize_function(example):
    return tokenizer(example["sentence"], truncation=True, padding="max_length")
dataset_build = load_dataset("financial_phrasebank", "sentences_allagree", split = "train")
tokenized_dataset_build = dataset_build.map(tokenize_function, batched=True)

Map:   0%|          | 0/2264 [00:00<?, ? examples/s]

In [6]:
def np_get_batch(
    indices, dataset, cols_to_retain, collate_fn, collate_fn_args, columns_to_np_types, return_dict=False
):
    if not isinstance(indices, np.ndarray):
        indices = indices.numpy()

    is_batched = True
    if isinstance(indices, np.integer):
        batch = dataset[indices.item()]
        is_batched = False
    elif np.all(np.diff(indices) == 1):
        batch = dataset[indices[0] : indices[-1] + 1]
    elif isinstance(indices, np.ndarray):
        batch = dataset[indices]
    else:
        raise RuntimeError("Unexpected type for indices: {}".format(type(indices)))

    if cols_to_retain is not None:
        batch = {
            key: value
            for key, value in batch.items()
            if key in cols_to_retain or key in ("label", "label_ids", "labels")
        }

    if is_batched:
        actual_size = len(list(batch.values())[0])  # Get the length of one of the arrays, assume all same
        # Our collators expect a list of dicts, not a dict of lists/arrays, so we invert
        batch = [{key: value[i] for key, value in batch.items()} for i in range(actual_size)]
    batch = collate_fn(batch, **collate_fn_args)

    if return_dict:
        out_batch = {}
        for col, cast_dtype in columns_to_np_types.items():
            # In case the collate_fn returns something strange
            array = np.array(batch[col])
            array = array.astype(cast_dtype)
            out_batch[col] = array
    else:
        out_batch = []
        for col, cast_dtype in columns_to_np_types.items():
            # In case the collate_fn returns something strange
            array = np.array(batch[col])
            array = array.astype(cast_dtype)
            out_batch.append(array)
    return out_batch

In [7]:
output_signature = {'input_ids': tf.TensorSpec(shape=(None, None, None), dtype=tf.int64, name=None),
  'token_type_ids': tf.TensorSpec(shape=(None, None, None), dtype=tf.int64, name=None),
  'attention_mask': tf.TensorSpec(shape=(None, None, None), dtype=tf.int64, name=None),
  'labels': tf.TensorSpec(shape=(None, None), dtype=tf.int64, name=None)}
columns_to_np_types = {'input_ids': np.int64,
  'token_type_ids': np.int64,
  'attention_mask': np.int64,
  'labels': np.int64}

In [8]:
from functools import partial
getter_fn = partial(
        np_get_batch,
        dataset=tokenized_dataset_build,
        cols_to_retain=["attention_mask", "input_ids", "token_type_ids", "labels"],
        collate_fn=data_collator,
        collate_fn_args={},
        columns_to_np_types=columns_to_np_types,
        return_dict=False,
    )
tout = [tf.dtypes.as_dtype(dtype) for dtype in columns_to_np_types.values()]

@tf.function(input_signature=[tf.TensorSpec(None, tf.int64)])
def fetch_function(indices):
    output = tf.py_function(
        getter_fn,
        inp=[indices],
        Tout=tout,
    )
    return {key: output[i] for i, key in enumerate(columns_to_np_types.keys())}

tf_dataset = tf.data.Dataset.range(len(dataset))

tf_dataset = tf_dataset.map(fetch_function)

In [9]:
def ensure_shapes(input_dict):
        return {key: tf.ensure_shape(val, output_signature[key].shape[1:]) for key, val in input_dict.items()}
    
class Operation:
    def __init__(self, dataset):
        self.dataset = dataset

    def shuffle(self):
        self.dataset = self.dataset.shuffle(buffer_size = self.dataset.cardinality())
        return self.dataset

    def batch(self, batch_size):
        self.dataset = self.dataset.batch(batch_size, drop_remainder=True)
        return self.dataset

    def train_test_split(self, train_size):
        dataset1 = self.dataset.take(train_size)
        dataset2 = self.dataset.skip(train_size)
        return dataset1.map(ensure_shapes), dataset2.map(ensure_shapes)

In [10]:
operation = Operation(tf_dataset)
shuffled_dataset = operation.shuffle()
batched_dataset = operation.batch(8)
train_dataset, test_dataset = operation.train_test_split(800)

In [11]:
train_dataset

<MapDataset element_spec={'input_ids': TensorSpec(shape=(None, None), dtype=tf.int64, name=None), 'token_type_ids': TensorSpec(shape=(None, None), dtype=tf.int64, name=None), 'attention_mask': TensorSpec(shape=(None, None), dtype=tf.int64, name=None), 'labels': TensorSpec(shape=(None,), dtype=tf.int64, name=None)}>

In [14]:
checkpoint = "bert-base-uncased"
model = TFAutoModelForSequenceClassification.from_pretrained(checkpoint,num_labels=3)
transfer_model = createTransferModel(model,1)
# optimizer = AdamWeightDecay(transfer_model.config)
transfer_model.compile(
    optimizer="adam",
    loss=SparseCategoricalCrossentropy(from_logits=True),
    metrics=["accuracy"],
)
transfer_model.fit(
    train_dataset,
    epochs = 10
)

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x10bc7e91ac0>

**Discussion**

I build the tensorflow dataset from scratch, and then I write an class "operation" to shuffle, batch, and split the dataset. One thing I have taken care of the shape of the dataset after batching/splitting. In order for the dataset to be fed into the model, I removed the batch/take/skip dimension so that the tensor shape are compatible with what the model requires. Also, I choose to pad the data with max length, which might slow down the training process.

## Create your own Classification head

In [5]:
from transformers import TFBertModel
import keras
from transformers import TFAutoModelForSequenceClassification
from transformers import TFBertModel
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from transformers import AdamWeightDecay
        
class MyModel(keras.Model):
    def __init__(self):
        super().__init__()
        self.bertlayer = TFBertModel.from_pretrained("bert-base-uncased")
        self.dense1 = keras.layers.Dense(32, activation="relu")
        self.dropout = keras.layers.Dropout(0.1)
        self.dense2 = keras.layers.Dense(3, activation="softmax")

    def call(self, inputs):
  
        inputs1 = self.bertlayer(input_ids = inputs["input_ids"], attention_mask = inputs["attention_mask"], token_type_ids = inputs["token_type_ids"])
        inputs2 = self.dropout(inputs1[1])
        inputs3= self.dense1(inputs2)
        return self.dense2(inputs3)


In [6]:
model_own = MyModel()
model_own_freezed = createTransferModel(model_own,3)
callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)
model_own_freezed.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    loss=SparseCategoricalCrossentropy(from_logits=False),
    metrics=["accuracy"],
)
model_own_freezed.fit(
    tf_train_dataset,
    validation_data=tf_validation_dataset,
    epochs = 10,
    callbacks=[callback]
)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1329109dd00>

In [7]:
model_own_unfreezed = createTransferModel(model_own_freezed, freeze=False)
callback = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=3)
model_own_unfreezed.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001),
    loss=SparseCategoricalCrossentropy(from_logits=False),
    metrics=["accuracy"],
)
model_own_unfreezed.fit(
    tf_train_dataset,
    validation_data=tf_validation_dataset,
    epochs = 10,
    callbacks=[callback]
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x132618000d0>

In [9]:
from sklearn.metrics import accuracy_score
preds_val = model_own_freezed.predict(tf_validation_dataset)
class_preds_val = np.argmax(preds_val, axis=1)
accuracy_val = accuracy_score(tokenized_dataset_val["label"],class_preds_val)
accuracy_val



0.9139072847682119

In [11]:
preds_test = model_own_freezed.predict(tf_test_dataset)
class_preds_test = np.argmax(preds_test, axis=1)
accuracy_test = accuracy_score(tokenized_dataset_test["label"],class_preds_test)
accuracy_test



0.9116997792494481

**Discussion**

The key insight is that the pre-trained BERT model outputs contextualized token representations from the [CLS] special token. This output can serve as features that capture the semantic meaning of the input text. We can feed that into our own classifier head to make predictions. First, when loading the pre-trained BERT model using TFBertModel.from_pretrained, this model does not have any prediction head. It just outputs token embeddings. We first feed the [CLS] embedding into a Dense layer to reduce the dimensionality and introduce some nonlinearity with ReLU activation. The dropout layer regularizes the model. Finally, the last Dense layer has an output size of 3, to make predictions for the 3 classes in our dataset, with a softmax activation to output prediction probabilities.

## 2.3. Use different "flavors" of the dataset

In [12]:
class trainer():
    def __init__(self, dataset, checkpoint):
        self.dataset = dataset.shuffle(seed=42)
        self.checkpoint = checkpoint
        self.tokenizer = AutoTokenizer.from_pretrained(checkpoint)
        
    def tokenize_function(self, example):
        return self.tokenizer(example["sentence"], truncation=True, padding=True)

    def get_tokenized_data(self):
        tokenized_dataset = self.dataset.map(self.tokenize_function, batched=True)
        return tokenized_dataset

    def train_val_test_split(self, test_size, val_size=0):
        tokenized_dataset = self.get_tokenized_data()
        temp1 = tokenized_dataset.train_test_split(test_size = test_size)
        tokenized_test = temp1["test"]
        temp2 = temp1["train"]
        temp3 = temp2.train_test_split(test_size = val_size/(1-test_size))
        tokenized_val = temp3["test"]
        tokenized_train = temp3["train"]
        return tokenized_train, tokenized_val, tokenized_test


    def train(self, model, tokenized_dataset_train, tokenized_dataset_val=None,freeze_layer=0, epochs=10, unfreeze_after=False):
        data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")
        # tokenized_dataset_train, tokenized_dataset_val, tokenized_dataset_test = self.train_val_test_split(test_size=test_size, val_size = val_size)
        tf_train_dataset = tokenized_dataset_train.to_tf_dataset(
            columns=["attention_mask", "input_ids", "token_type_ids"],
            label_cols=["label"],
            shuffle=True,
            collate_fn=data_collator,
            batch_size=8,
        )

        tf_val_dataset = tokenized_dataset_val.to_tf_dataset(
            columns=["attention_mask", "input_ids", "token_type_ids"],
            label_cols=["label"],
            shuffle=False,
            collate_fn=data_collator,
            batch_size=8,
        )

        transfer_model = createTransferModel(model,freeze_n=freeze_layer)
        transfer_model.compile(
            optimizer="adam",
            loss=SparseCategoricalCrossentropy(from_logits=True),
            metrics=["accuracy"],
        )
        transfer_model.fit(
            tf_train_dataset,
            validation_data=tf_val_dataset,
            epochs = epochs, 
            callbacks = [tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)]
        )
        self.trained_model = transfer_model
        if unfreeze_after == True:
            transfer_model_unfreezed = createTransferModel(transfer_model, freeze=False)
            transfer_model_unfreezed.compile(
                optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001),
                loss=SparseCategoricalCrossentropy(from_logits=True),
                metrics=["accuracy"],
            )
            transfer_model_unfreezed.fit(
                tf_train_dataset,
                validation_data=tf_val_dataset,
                epochs = epochs,
                callbacks = [tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)]
            )
            self.trained_model = transfer_model_unfreezed
        return 

        
    def test_score(self, tokenized_dataset_test):

        tf_test_dataset = tokenized_dataset_test.to_tf_dataset(
            columns=["attention_mask", "input_ids", "token_type_ids"],
            label_cols=["label"],
            shuffle=False,
            collate_fn=data_collator,
            batch_size=8,
        )
        
        preds_test = tf.nn.softmax(self.trained_model.predict(tf_test_dataset)["logits"])
        class_preds_test = np.argmax(preds_test, axis=1)
# print(len(tokenized_dataset_val["label"]),len(class_preds_test))
        accuracy_test = accuracy_score(tokenized_dataset_test["label"],class_preds_test)
        return accuracy_test

    

In [13]:
dataset_all = load_dataset("financial_phrasebank", "sentences_allagree", split = "train")
dataset_75 = load_dataset("financial_phrasebank", "sentences_75agree", split = "train")
dataset_66 = load_dataset("financial_phrasebank", "sentences_66agree", split = "train")
dataset_50 = load_dataset("financial_phrasebank", "sentences_50agree", split = "train")

In [14]:
checkpoint = "bert-base-uncased"
model = TFAutoModelForSequenceClassification.from_pretrained(checkpoint,num_labels=3)
Trainer_all = trainer(dataset = dataset_all, checkpoint = checkpoint)
tokenized_dataset_train_all, tokenized_dataset_val_all, tokenized_dataset_test_all = Trainer_all.train_val_test_split(test_size=0.2, val_size=0.2)
Trainer_all.train(model = model,tokenized_dataset_train=tokenized_dataset_train_all, tokenized_dataset_val=tokenized_dataset_val_all,freeze_layer=1, epochs=10, unfreeze_after=True)

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/2264 [00:00<?, ? examples/s]

Epoch 1/10


Old behaviour: columns=['a'], labels=['labels'] -> (tf.Tensor, tf.Tensor)  
             : columns='a', labels='labels' -> (tf.Tensor, tf.Tensor)  
New behaviour: columns=['a'],labels=['labels'] -> ({'a': tf.Tensor}, {'labels': tf.Tensor})  
             : columns='a', labels='labels' -> (tf.Tensor, tf.Tensor) 


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10


In [15]:
Trainer_all.test_score(tokenized_dataset_test_all)



0.9646799116997793

In [16]:
checkpoint = "bert-base-uncased"
model = TFAutoModelForSequenceClassification.from_pretrained(checkpoint,num_labels=3)
Trainer_75 = trainer(dataset = dataset_75, checkpoint = checkpoint)
tokenized_dataset_train_75, tokenized_dataset_val_75, tokenized_dataset_test_75 = Trainer_75.train_val_test_split(test_size=0.2, val_size=0.2)
Trainer_75.train(model = model,tokenized_dataset_train=tokenized_dataset_train_75, tokenized_dataset_val=tokenized_dataset_val_75, freeze_layer=1, epochs=10, unfreeze_after=True)

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/3453 [00:00<?, ? examples/s]

Epoch 1/10


Old behaviour: columns=['a'], labels=['labels'] -> (tf.Tensor, tf.Tensor)  
             : columns='a', labels='labels' -> (tf.Tensor, tf.Tensor)  
New behaviour: columns=['a'],labels=['labels'] -> ({'a': tf.Tensor}, {'labels': tf.Tensor})  
             : columns='a', labels='labels' -> (tf.Tensor, tf.Tensor) 


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10


In [17]:
Trainer_75.test_score(tokenized_dataset_test_75)



0.8900144717800289

In [18]:
checkpoint = "bert-base-uncased"
model = TFAutoModelForSequenceClassification.from_pretrained(checkpoint,num_labels=3)
Trainer_66 = trainer(dataset = dataset_66, checkpoint = checkpoint)
tokenized_dataset_train_66, tokenized_dataset_val_66, tokenized_dataset_test_66 = Trainer_66.train_val_test_split(test_size=0.2, val_size=0.2)
Trainer_66.train(model = model,tokenized_dataset_train=tokenized_dataset_train_66, tokenized_dataset_val=tokenized_dataset_val_66, freeze_layer=1, epochs=10, unfreeze_after=True)

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/4217 [00:00<?, ? examples/s]

Epoch 1/10


Old behaviour: columns=['a'], labels=['labels'] -> (tf.Tensor, tf.Tensor)  
             : columns='a', labels='labels' -> (tf.Tensor, tf.Tensor)  
New behaviour: columns=['a'],labels=['labels'] -> ({'a': tf.Tensor}, {'labels': tf.Tensor})  
             : columns='a', labels='labels' -> (tf.Tensor, tf.Tensor) 


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10


In [19]:
Trainer_66.test_score(tokenized_dataset_test_66)



0.8281990521327014

In [20]:
checkpoint = "bert-base-uncased"
model = TFAutoModelForSequenceClassification.from_pretrained(checkpoint,num_labels=3)
Trainer_50 = trainer(dataset = dataset_50,  checkpoint = checkpoint)
tokenized_dataset_train_50, tokenized_dataset_val_50, tokenized_dataset_test_50 = Trainer_50.train_val_test_split(test_size=0.2, val_size=0.2)
Trainer_50.train(model = model,tokenized_dataset_train=tokenized_dataset_train_50, tokenized_dataset_val=tokenized_dataset_val_50, freeze_layer=1, epochs=10, unfreeze_after=True)

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/4846 [00:00<?, ? examples/s]

Epoch 1/10


Old behaviour: columns=['a'], labels=['labels'] -> (tf.Tensor, tf.Tensor)  
             : columns='a', labels='labels' -> (tf.Tensor, tf.Tensor)  
New behaviour: columns=['a'],labels=['labels'] -> ({'a': tf.Tensor}, {'labels': tf.Tensor})  
             : columns='a', labels='labels' -> (tf.Tensor, tf.Tensor) 


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10


In [21]:
Trainer_50.test_score(tokenized_dataset_test_50)



0.7649484536082474

**Discussion**

I apply the same model, the same training process, and the same hyperparameters separately to the four datasets, which are all-agreed, 75_agreed, 66_agreed, and 50-agreed. As we could see from the results, the test accuracy score declines with the quality of the datasets. 

The declining performance on the datasets with lower inter-annotator agreement makes sense, as the "correct" labels become more ambiguous and debatable as agreement drops. The all-agreed dataset likely has very clear, unambiguous labels that are easy to learn and generalize. There is no debate about what the correct label should be, making it easier for the model to recognize the patterns that correlate with each class. As agreement declines to 75%, there is more variability in the labels. For around 25% of examples, annotators disagreed on the best label. This introduces some noisy, potentially inconsistent labels into the training data, making the patterns harder to recognize. Performance declines, but there is still relatively high agreement. At 66% agreement, over 30% of the labels may be debatable. Different annotators can reasonably assign different labels to a substantial minority of examples. This makes it quite challenging for the model to properly learn the distinctions between classes. With only 50% agreement, the dataset likely contains a large fraction of examples that different people would legitimately label differently. Many examples likely have ambiguous qualities or lack strong cues that clearly differentiate the classes. This high level of subjectivity greatly reduces how learnable the categories are.

In essence, less agreement means more noisy, subjective, borderline examples. This reduces how clearly defined the class patterns are in the data, making generalization more difficult. The model likely learns superficial cues and struggles to match the wisdom of the crowd on ambiguous cases. Augmenting training data, handling label uncertainty, and other techniques may help address such issues. But fundamentally, subjective and unclear labels limit model performance.

## Address any Imbalanced Data issues

In [19]:
## Check the proportion of each class
label = tokenized_dataset_train["label"]
classes, count = np.unique(label, return_counts=True)

In [20]:
neg = count[0]
neu = count[1]
pos = count[2]
total = neg + neu + pos

In [21]:
weight_for_0 = (1 / neg) * (total)
weight_for_1 = (1 / neu) * (total)
weight_for_2 = (1 / pos) * (total)

class_weight = {0: weight_for_0, 1: weight_for_1, 2: weight_for_2}

print('Weight for class 0: {:.2f}'.format(weight_for_0))
print('Weight for class 1: {:.2f}'.format(weight_for_1))
print('Weight for class 2: {:.2f}'.format(weight_for_2))

Weight for class 0: 7.26
Weight for class 1: 1.63
Weight for class 2: 4.02


In [22]:
checkpoint = "bert-base-uncased"
model = TFAutoModelForSequenceClassification.from_pretrained(checkpoint,num_labels=3)
transfer_model_imbalanced = createTransferModel(model,1)
# optimizer = AdamWeightDecay(transfer_model.config)
transfer_model_imbalanced.compile(
    optimizer="adam",
    loss=SparseCategoricalCrossentropy(from_logits=True),
    metrics=["accuracy"],
)
transfer_model_imbalanced.fit(
    tf_train_dataset,
    validation_data=tf_validation_dataset,
    epochs = 10,
    class_weight=class_weight
)
transfer_model_imbalanced_unfreezed = createTransferModel(transfer_model_imbalanced,freeze=False)
# optimizer = AdamWeightDecay(transfer_model.config)
transfer_model_imbalanced_unfreezed.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001),
    loss=SparseCategoricalCrossentropy(from_logits=True),
    metrics=["accuracy"],
)
transfer_model_imbalanced_unfreezed.fit(
    tf_train_dataset,
    validation_data=tf_validation_dataset,
    epochs = 10,
    class_weight=class_weight
)

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x2072bb93b20>

In [23]:
from sklearn.metrics import accuracy_score
preds_val = transfer_model_imbalanced_unfreezed.predict(tf_validation_dataset)
class_preds_val = np.argmax(preds_val.logits, axis=1)
accuracy_val = accuracy_score(tokenized_dataset_val["label"],class_preds_val)
accuracy_val



0.9028697571743929

In [24]:
preds_test = transfer_model_imbalanced_unfreezed.predict(tf_test_dataset)
class_preds_test = np.argmax(preds_test.logits, axis=1)
accuracy_test = accuracy_score(tokenized_dataset_test["label"],class_preds_test)
accuracy_test



0.8962472406181016

**Discussion**

To deal with the potential data imbalance, I first take a look at the proportion of each class takes in the data. It turns out that in training data, there exist data imbalance with neutral class taking up to 61%. Therefore, I pass the class weights argument to the model fitting. Class weigght is an  optional dictionary mapping class indices (integers) to a weight (float) value, used for weighting the loss function (during training only). This can be useful to tell the model to "pay more attention" to samples from an under-represented class.



## Superior Error Analysis

### Is one class harder to correctly classify than the others?

In [138]:
def most_wrong_class(pred, true):
    count_wrong = [0, 0, 0]
    count_right = [0, 0, 0]
    for i in range(len(pred)):
        if pred[i] != true[i]:
            if true[i] == 0:
                count_wrong[0] += 1
            if true[i] == 1:
                count_wrong[1] +=1
            if true[i] == 2:
                count_wrong[2] += 1
        else:
            if true[i] == 0:
                count_right[0] += 1
            if true[i] == 1:
                count_right[1] +=1
            if true[i] == 2:
                count_right[2] += 1
    count_tot = list(np.add(count_wrong, count_right))
    percent_wrong = list(np.divide(count_wrong,count_tot))
    count_max = np.max(percent_wrong)
    most_wrong = percent_wrong.index(count_max)
    total = count_tot[most_wrong]
    print("Class %d is mostly wrongly classified, with %.2f%% of the class wrongly put."%(most_wrong, count_max*100))
    return 

In [139]:
### Here I use the models trained in the basic part.
preds_val_basic = transfer_model_unfreezed.predict(tf_validation_dataset)
class_preds_val_basic = np.argmax(preds_val_basic.logits, axis=1)
print("Regarding the model in the basic part, in the validation dataset:")
most_wrong_class(class_preds_val_basic, tokenized_dataset_val["label"])

Regarding the model in the basic part, in the validation dataset:
Class 2 is mostly wrongly classified, with 7.69% of the class wrongly put.


In [140]:
preds_test_basic = transfer_model_unfreezed.predict(tf_test_dataset)
class_preds_test_basic = np.argmax(preds_test_basic.logits, axis=1)
print("Regarding the model in the basic part, in the test dataset:")
most_wrong_class(class_preds_test_basic, tokenized_dataset_test["label"])

Regarding the model in the basic part, in the test dataset:
Class 2 is mostly wrongly classified, with 11.30% of the class wrongly put.


In [141]:
### Here I use the models trained based on imbalance adjustment.
preds_val_adjusted = transfer_model_imbalanced_unfreezed.predict(tf_validation_dataset)
class_preds_val_adjusted = np.argmax(preds_val_adjusted.logits, axis=1)
print("Regarding the model trained with class weights, in the validation dataset:")
most_wrong_class(class_preds_val_adjusted, tokenized_dataset_val["label"])

Regarding the model trained with class weights, in the validation dataset:
Class 2 is mostly wrongly classified, with 17.09% of the class wrongly put.


In [142]:
preds_test_adjusted = transfer_model_imbalanced_unfreezed.predict(tf_test_dataset)
class_preds_test_adjusted = np.argmax(preds_test_adjusted.logits, axis=1)
print("Regarding the model trained with class weights, in the test dataset:")
most_wrong_class(class_preds_test_adjusted, tokenized_dataset_test["label"])

Regarding the model trained with class weights, in the test dataset:
Class 2 is mostly wrongly classified, with 16.52% of the class wrongly put.


### If there exist patterns in those sentences that are wrongly classified?

In [143]:
def find_wrong_index(pred, true):
    result = []
    for i in range(len(pred)):
        if pred[i] != true[i]:
            result.append(i)
    return result

def find_right_index(pred, true):
    result = []
    for i in range(len(pred)):
        if pred[i] == true[i]:
            result.append(i)
    return result

In [144]:
class Analysis:
    def __init__(self, pred, true_dataset):
        self.pred = pred
        self.true_dataset = true_dataset
        self.wrong_sentence = []
        self.right_sentence = []

    def length_comparison(self):
        wrong_index = find_wrong_index(self.pred, self.true_dataset["label"])
        right_index = find_right_index(self.pred, self.true_dataset["label"])
        
        # wrong_sentence = []
        for i in range(len(wrong_index)):
            self.wrong_sentence.append(
                (self.true_dataset["sentence"][wrong_index[i]],
                 self.true_dataset["label"][wrong_index[i]],
                 self.pred[wrong_index[i]])
            )
            
        # right_sentence = []
        for i in range(len(right_index)):
            self.right_sentence.append(self.true_dataset["sentence"][right_index[i]])

        avg_len1=0
        for i in range(len(self.wrong_sentence)):
            avg_len1 = (avg_len1*i + len(self.wrong_sentence[i][0]))/(i+1)
        avg_len2=0
        for i in range(len(self.right_sentence)):
            avg_len2 = (avg_len2*i + len(self.right_sentence[i]))/(i+1)
            
        print("The average length of all the wrongly classified sentences is %.2f, the average lenghth of all the correctly classified sentences is %.2f."%(avg_len1, avg_len2))
        return

    def bias_analysis(self):
        pos_bias = 0
        neg_bias = 0
        ### For each tuple, the first element is the wrongly put sentence, the second is the true class, third the prediction class
        if not self.right_sentence or not self.wrong_sentence:
            wrong_index = find_wrong_index(self.pred, self.true_dataset["label"])
            right_index = find_right_index(self.pred, self.true_dataset["label"])
            
            for i in range(len(wrong_index)):
                self.wrong_sentence.append(
                    (self.true_dataset["sentence"][wrong_index[i]],
                     self.true_dataset["label"][wrong_index[i]],
                     self.pred[wrong_index[i]])
                )
                
            for i in range(len(right_index)):
                self.right_sentence.append(self.true_dataset["sentence"][right_index[i]])
        
        for i in range(len(self.wrong_sentence)):
            if self.wrong_sentence[i][1] < self.wrong_sentence[i][2]:
                ### if the prediction class is more positive than the true class, the model outputs positive bias
                pos_bias += 1
            else:
                neg_bias += 1
        pos_bias = pos_bias/len(self.wrong_sentence)*100
        neg_bias = neg_bias/len(self.wrong_sentence)*100
        print("%.2f%% of the predictions suffer from positive bias, and %.2f%% of thepredictions suffers from negative bias."%(pos_bias,neg_bias))
        return

In [145]:
print("Regarding the model in the basic part, in the validation dataset:")
val_basic = Analysis(class_preds_val_basic, tokenized_dataset_val)
val_basic.length_comparison()
val_basic.bias_analysis()

Regarding the model in the basic part, in the validation dataset:
The average length of all the wrongly classified sentences is 115.75, the average lenghth of all the correctly classified sentences is 122.83.
37.50% of the predictions suffer from positive bias, and 62.50% of thepredictions suffers from negative bias.


In [146]:
print("Regarding the model in the basic part, in the test dataset:")
test_basic = Analysis(class_preds_test_basic, tokenized_dataset_test)
test_basic.length_comparison()
test_basic.bias_analysis()

Regarding the model in the basic part, in the test dataset:
The average length of all the wrongly classified sentences is 151.72, the average lenghth of all the correctly classified sentences is 117.49.
11.11% of the predictions suffer from positive bias, and 88.89% of thepredictions suffers from negative bias.


In [147]:
print("Regarding the model trained with class weights, in the validation dataset:")
val_adjusted = Analysis(class_preds_val_adjusted, tokenized_dataset_val)
val_adjusted.length_comparison()
val_adjusted.bias_analysis()

Regarding the model trained with class weights, in the validation dataset:
The average length of all the wrongly classified sentences is 129.43, the average lenghth of all the correctly classified sentences is 121.85.
45.45% of the predictions suffer from positive bias, and 54.55% of thepredictions suffers from negative bias.


In [148]:
print("Regarding the model trained with class weights, in the test dataset:")
test_adjusted = Analysis(class_preds_test_adjusted, tokenized_dataset_test)
test_adjusted.length_comparison()
test_adjusted.bias_analysis()

Regarding the model trained with class weights, in the test dataset:
The average length of all the wrongly classified sentences is 137.40, the average lenghth of all the correctly classified sentences is 116.70.
46.81% of the predictions suffer from positive bias, and 53.19% of thepredictions suffers from negative bias.


**Discussion**

In this section, I try to apply both the model before and after the class weights adjustment to superior error analysis. Specifically, I look into both the validation dataset and the test dataset, in order to compare the average length of sentences, and to see what kind of bias the models have when making classification. 

As we could see from the error analysis of the model both from the basic part and from the class-weights-adjusted model, generally, the wrongly classified sentences are longer than those correctly classified. It makes sense in that longer sentences contain more semantic meanings and are more complex to analyze for the model. The second pattern is that positive or neutral sentences are more likely to be wrongly classified to the more negative class. The most wrongly classified class for both models is class 2, which is the positive class, proving that the models suffer more from negative bias.

## Experiment with different Pre-Trained models

### DistilBert Model

In [25]:
from transformers import TFAutoModelForSequenceClassification
from transformers import TFBertModel
from tensorflow.keras.losses import SparseCategoricalCrossentropy

checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
tokenized_dataset = dataset.map(tokenize_function, batched=True)

tokenized_dataset_train, tokenized_dataset_val, tokenized_dataset_test =train_val_test_split(tokenized_dataset, test_size=0.2, val_size=0.2)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")
tf_train_dataset = tokenized_dataset_train.to_tf_dataset(
    columns=["attention_mask", "input_ids"],
    label_cols=["label"],
    shuffle=True,
    collate_fn=data_collator,
    batch_size=8,
)

tf_validation_dataset = tokenized_dataset_val.to_tf_dataset(
    columns=["attention_mask", "input_ids"],
    label_cols=["label"],
    shuffle=False,
    collate_fn=data_collator,
    batch_size=8,
)
tf_test_dataset = tokenized_dataset_test.to_tf_dataset(
    columns=["attention_mask", "input_ids"],
    label_cols=["label"],
    shuffle=False,
    collate_fn=data_collator,
    batch_size=8,
)

model = TFAutoModelForSequenceClassification.from_pretrained(checkpoint,num_labels=3)
model.summary()

Map:   0%|          | 0/2264 [00:00<?, ? examples/s]

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_layer_norm.bias']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffe

Model: "tf_distil_bert_for_sequence_classification_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 distilbert (TFDistilBertMai  multiple                 66362880  
 nLayer)                                                         
                                                                 
 pre_classifier (Dense)      multiple                  590592    
                                                                 
 classifier (Dense)          multiple                  2307      
                                                                 
 dropout_157 (Dropout)       multiple                  0         
                                                                 
Total params: 66,955,779
Trainable params: 66,955,779
Non-trainable params: 0
_________________________________________________________________


In [17]:
transfer_model = createTransferModel(model,3)
# optimizer = AdamWeightDecay(transfer_model.config)
transfer_model.compile(
    optimizer="adam",
    loss=SparseCategoricalCrossentropy(from_logits=True),
    metrics=["accuracy"],
)
transfer_model.fit(
    tf_train_dataset,
    validation_data=tf_validation_dataset,
    epochs = 10
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x23aeef20820>

In [18]:
preds_val = tf.nn.softmax(transfer_model.predict(tf_validation_dataset)["logits"])
class_preds_val = np.argmax(preds_val, axis=1)
accuracy_val = accuracy_score(tokenized_dataset_val["label"],class_preds_val)
accuracy_val



0.8565121412803532

In [19]:
preds_test = tf.nn.softmax(transfer_model.predict(tf_test_dataset)["logits"])
class_preds_test = np.argmax(preds_test, axis=1)
accuracy_test = accuracy_score(tokenized_dataset_test["label"],class_preds_test)
accuracy_test



0.8852097130242825

In [20]:
transfer_model_unfreezed = createTransferModel(transfer_model,freeze=False)
# optimizer = AdamWeightDecay(transfer_model.config)
transfer_model_unfreezed.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001),
    loss=SparseCategoricalCrossentropy(from_logits=True),
    metrics=["accuracy"],
)
transfer_model_unfreezed.fit(
    tf_train_dataset,
    validation_data=tf_validation_dataset,
    epochs = 10
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x23aeeee7eb0>

In [21]:
preds_val = tf.nn.softmax(transfer_model_unfreezed.predict(tf_validation_dataset)["logits"])
class_preds_val = np.argmax(preds_val, axis=1)
accuracy_val = accuracy_score(tokenized_dataset_val["label"],class_preds_val)
accuracy_val



0.9514348785871964

In [22]:
preds_test = tf.nn.softmax(transfer_model_unfreezed.predict(tf_test_dataset)["logits"])
class_preds_test = np.argmax(preds_test, axis=1)
accuracy_test = accuracy_score(tokenized_dataset_test["label"],class_preds_test)
accuracy_test



0.9403973509933775

### Bertweet Model

In [26]:
checkpoint = "vinai/bertweet-base"
tokenizer = AutoTokenizer.from_pretrained(checkpoint, use_fast=False)
tokenized_dataset = dataset.map(tokenize_function, batched=True)

tokenized_dataset_train, tokenized_dataset_val, tokenized_dataset_test =train_val_test_split(tokenized_dataset, test_size=0.2, val_size=0.2)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")
tf_train_dataset = tokenized_dataset_train.to_tf_dataset(
    columns=["attention_mask", "input_ids"],
    label_cols=["label"],
    shuffle=True,
    collate_fn=data_collator,
    batch_size=8,
)

tf_validation_dataset = tokenized_dataset_val.to_tf_dataset(
    columns=["attention_mask", "input_ids"],
    label_cols=["label"],
    shuffle=False,
    collate_fn=data_collator,
    batch_size=8,
)
tf_test_dataset = tokenized_dataset_test.to_tf_dataset(
    columns=["attention_mask", "input_ids"],
    label_cols=["label"],
    shuffle=False,
    collate_fn=data_collator,
    batch_size=8,
)

bertweet = TFAutoModelForSequenceClassification.from_pretrained(checkpoint,num_labels=3)
bertweet.summary()

emoji is not installed, thus not converting emoticons or emojis into text. Install emoji: pip3 install emoji==0.6.0
All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

Some layers of TFRobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-base and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model: "tf_roberta_for_sequence_classification_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 roberta (TFRobertaMainLayer  multiple                 134309376 
 )                                                               
                                                                 
 classifier (TFRobertaClassi  multiple                 592899    
 ficationHead)                                                   
                                                                 
Total params: 134,902,275
Trainable params: 134,902,275
Non-trainable params: 0
_________________________________________________________________


In [27]:
transfer_bertweet = createTransferModel(bertweet,1)
# optimizer = AdamWeightDecay(transfer_model.config)
transfer_bertweet.compile(
    optimizer="adam",
    loss=SparseCategoricalCrossentropy(from_logits=True),
    metrics=["accuracy"],
)
transfer_bertweet.fit(
    tf_train_dataset,
    validation_data=tf_validation_dataset,
    epochs = 10
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x23b0da372b0>

In [28]:
preds_val = tf.nn.softmax(transfer_bertweet.predict(tf_validation_dataset)["logits"])
class_preds_val = np.argmax(preds_val, axis=1)
accuracy_val = accuracy_score(tokenized_dataset_val["label"],class_preds_val)
accuracy_val



0.7615894039735099

In [29]:
preds_test = tf.nn.softmax(transfer_bertweet.predict(tf_test_dataset)["logits"])
class_preds_test = np.argmax(preds_test, axis=1)
accuracy_test = accuracy_score(tokenized_dataset_test["label"],class_preds_test)
accuracy_test



0.8123620309050773

In [30]:
transfer_bertweet_unfreezed = createTransferModel(transfer_bertweet,freeze=False)
# optimizer = AdamWeightDecay(transfer_model.config)
transfer_bertweet_unfreezed.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001),
    loss=SparseCategoricalCrossentropy(from_logits=True),
    metrics=["accuracy"],
)
transfer_bertweet_unfreezed.fit(
    tf_train_dataset,
    validation_data=tf_validation_dataset,
    epochs = 10
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x23b460bef10>

In [31]:
preds_val = tf.nn.softmax(transfer_bertweet_unfreezed.predict(tf_validation_dataset)["logits"])
class_preds_val = np.argmax(preds_val, axis=1)
accuracy_val = accuracy_score(tokenized_dataset_val["label"],class_preds_val)
accuracy_val



0.9448123620309051

In [32]:
preds_test = tf.nn.softmax(transfer_bertweet_unfreezed.predict(tf_test_dataset)["logits"])
class_preds_test = np.argmax(preds_test, axis=1)
accuracy_test = accuracy_score(tokenized_dataset_test["label"],class_preds_test)
accuracy_test



0.9359823399558499

**Discussion**

To see if a larger/smaller model performs better, I first try distil-bert model, which has fewer parameters than the bert model in the basic part. Before fine-tuning, it actually achives better performance with a validation accuracy around 85% and a test accuracy around 88%. However, after unfreezing all the weights and fine-tuning, the bert model has better performance. This result makes sense because the bert model with all layers unfreezed has more parameters to be fine-tuned. A larger model again has greater learning capacity - its larger set of parameters can adapt better to new data. 
More parameters enable capturing more complex patterns when fine-tuning on downstream tasks

To see if a model trained on different data will have a different performance, I try Bertweet model, which is trained on English tweets data. Before fine-tuning, it also has a better performance than Bert. However, after fine-tuning, the performance is not as good. The good performance before fine tuning cound be due to the fact that bertweet is a large model. The Bert model is trained on Wikipedia and book data, while the bertweet is trained on tweets data, which could be more casual and less compatible with financial news. Therefore, after fine-tuning, the performance of Bertweet is not as good as Bert..

## Experiment with Fine-Tuning

In [4]:
## Load Data 
from datasets import load_dataset
import tensorflow as tf
import numpy as np
from transformers import AutoTokenizer, DataCollatorWithPadding
dataset_1 = load_dataset("financial_phrasebank", "sentences_allagree", split = "train")

def tokenize_function(example):
    return tokenizer(example["sentence"], truncation=True, padding=True)

def train_val_test_split(tokenized_dataset, test_size, val_size=0):
        temp1 = tokenized_dataset.train_test_split(test_size = test_size)
        tokenized_test = temp1["test"]
        temp2 = temp1["train"]
        temp3 = temp2.train_test_split(test_size = val_size/(1-test_size))
        tokenized_val = temp3["test"]
        tokenized_train = temp3["train"]
        return tokenized_train, tokenized_val, tokenized_test

checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
tokenized_dataset = dataset_1.map(tokenize_function, batched=True)

tokenized_dataset_train, tokenized_dataset_val, tokenized_dataset_test =train_val_test_split(tokenized_dataset, test_size=0.2, val_size=0.2)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")
tf_train_dataset = tokenized_dataset_train.to_tf_dataset(
    columns=["attention_mask", "input_ids", "token_type_ids"],
    label_cols=["label"],
    shuffle=True,
    collate_fn=data_collator,
    batch_size=8,
)

tf_validation_dataset = tokenized_dataset_val.to_tf_dataset(
    columns=["attention_mask", "input_ids", "token_type_ids"],
    label_cols=["label"],
    shuffle=False,
    collate_fn=data_collator,
    batch_size=8,
)
tf_test_dataset = tokenized_dataset_test.to_tf_dataset(
    columns=["attention_mask", "input_ids", "token_type_ids"],
    label_cols=["label"],
    shuffle=False,
    collate_fn=data_collator,
    batch_size=8,
)
checkpoint = "bert-base-uncased"
model_sampling = TFAutoModelForSequenceClassification.from_pretrained(checkpoint,num_labels=3)
transfer_model_sampling = createTransferModel(model_sampling,1)
callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)
# optimizer = AdamWeightDecay(transfer_model.config)
transfer_model_sampling.compile(
    optimizer="adam",
    loss=SparseCategoricalCrossentropy(from_logits=True),
    metrics=["accuracy"],
)
transfer_model_sampling.fit(
    tf_train_dataset,
    validation_data=tf_validation_dataset,
    epochs = 10,
    callbacks=[callback]
)

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x17994bae3d0>

In [5]:
from datasets import Dataset
def fine_tuning_with_sampling(model, sample_size, class_weights=None):
    pd_tokenized_dataset_train = tokenized_dataset_train.to_pandas()
    if class_weights == None:
        pd_sampled = pd_tokenized_dataset_train.sample(sample_size)
    else:
        pd_sampled = pd_tokenized_dataset_train.sample(sample_size,weights=class_weights)
    dset = Dataset.from_pandas(pd_sampled)
    tf_dset = dset.to_tf_dataset(
        columns=["attention_mask", "input_ids", "token_type_ids"],
        label_cols=["label"],
        shuffle=False,
        collate_fn=data_collator,
        batch_size=8,
    )
    model_unfreezed = createTransferModel(model, freeze=False)
    callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)
    model_unfreezed.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.00001),
        loss=SparseCategoricalCrossentropy(from_logits=True),
        metrics=["accuracy"],
    )
    model_unfreezed.fit(
        tf_dset,
        validation_data=tf_validation_dataset,
        epochs = 15,
        callbacks = [callback]
    )
    preds_test = tf.nn.softmax(model_unfreezed.predict(tf_test_dataset)["logits"])
    class_preds_test = np.argmax(preds_test, axis=1)
    accuracy_test = accuracy_score(tokenized_dataset_test["label"],class_preds_test)
    return accuracy_test

In [6]:
import pandas as pd
def calculate_prob(data):
    weights = []
    pd_data = data.to_pandas()
    tot_length = len(pd_data)
    neg_prob = len(pd_data[pd_data["label"]==0]["label"])/tot_length
    neu_prob = len(pd_data[pd_data["label"]==1]["label"])/tot_length
    pos_prob = len(pd_data[pd_data["label"]==2]["label"])/tot_length
    label_list = list(pd_data["label"])
    for i in range(len(label_list)):
        if label_list[i] == 0:
            weights.append(neg_prob)
        elif label_list[i] == 1:
            weights.append(neu_prob)
        else:
            weights.append(pos_prob)
    return weights

In [7]:
weights = calculate_prob(tokenized_dataset_train)

In [8]:
fine_tuning_with_sampling(model=transfer_model_sampling, sample_size=400)

Epoch 1/15


Old behaviour: columns=['a'], labels=['labels'] -> (tf.Tensor, tf.Tensor)  
             : columns='a', labels='labels' -> (tf.Tensor, tf.Tensor)  
New behaviour: columns=['a'],labels=['labels'] -> ({'a': tf.Tensor}, {'labels': tf.Tensor})  
             : columns='a', labels='labels' -> (tf.Tensor, tf.Tensor) 


Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15


0.9470198675496688

In [10]:
fine_tuning_with_sampling(model=transfer_model_sampling, sample_size=800)

Epoch 1/15


Old behaviour: columns=['a'], labels=['labels'] -> (tf.Tensor, tf.Tensor)  
             : columns='a', labels='labels' -> (tf.Tensor, tf.Tensor)  
New behaviour: columns=['a'],labels=['labels'] -> ({'a': tf.Tensor}, {'labels': tf.Tensor})  
             : columns='a', labels='labels' -> (tf.Tensor, tf.Tensor) 


Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15


0.9558498896247241

In [9]:
fine_tuning_with_sampling(model=transfer_model_sampling, sample_size=1200)

Epoch 1/15


Old behaviour: columns=['a'], labels=['labels'] -> (tf.Tensor, tf.Tensor)  
             : columns='a', labels='labels' -> (tf.Tensor, tf.Tensor)  
New behaviour: columns=['a'],labels=['labels'] -> ({'a': tf.Tensor}, {'labels': tf.Tensor})  
             : columns='a', labels='labels' -> (tf.Tensor, tf.Tensor) 


Epoch 2/15
Epoch 3/15
Epoch 4/15


0.9646799116997793

In [11]:
fine_tuning_with_sampling(model=transfer_model_sampling, sample_size=800,class_weights=weights)

Epoch 1/15


Old behaviour: columns=['a'], labels=['labels'] -> (tf.Tensor, tf.Tensor)  
             : columns='a', labels='labels' -> (tf.Tensor, tf.Tensor)  
New behaviour: columns=['a'],labels=['labels'] -> ({'a': tf.Tensor}, {'labels': tf.Tensor})  
             : columns='a', labels='labels' -> (tf.Tensor, tf.Tensor) 


Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15


0.9646799116997793

**Discussion**

To see how the out-of-sample performance changes with the length of the training dataset in the process of fine-tuning, I write a fine_tuning_with_sampling function to help. The first sampling method is randomly taking a fixed sample size of data from the whole training dataset. Then, we fine tune the model based on the sampled data. As we could see from above, **the out-of-sample performance increases with the sample size**. The second sampling method is to make sure that each class has the sample probability of being sampled. Namely, we calculate the prior probability of each class in the training dataset, and assign this probability to sampling. **By sampling with class weights considered, we can see that the fine-tuned model has a better performance with the same sample size.**

## In-context learning

*Relative difficulty: low but fun !*

Can you use few-shot learning successfully (i.e., no further training) ?

It would be great to do this for Financial PhraseBank but the sentences may be too long
- pre-trained models have maximum sequence lengths that may be too small

Propose some interesting task related to Finance and try to achieve Few Shot Learning on the task.

In [17]:
input_text = """According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing . neutral
   For the last quarter of 2010 , Componenta 's net sales doubled to EUR131m from EUR76m for the same period a year earlier , while it moved to a zero pre-tax profit from a pre-tax loss of EUR7m . positive
   The OMX Helsinki index was down 0.34 pct at 8,256.02 on turnover of 813.191 mln eur . negative

   The growth of net sales has continued favourably in the Middle East and Africaand in Asia Pacific ."""


In [31]:
from transformers import DistilBertTokenizer, TFDistilBertForSequenceClassification
from datasets import load_dataset
checkpoint = "distilbert-base-uncased"
tokenizer = DistilBertTokenizer.from_pretrained(checkpoint)
tokenized_input = tokenizer(input_text, return_tensors = "tf").input_ids

In [35]:
model = TFDistilBertForSequenceClassification.from_pretrained(checkpoint,num_labels=3)
preds = model.predict(tokenized_input)["logits"]
class_preds = np.argmax(preds, axis=1)
class_preds

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_transform.weight']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
You should 



array([2], dtype=int64)

**Discussion**

Using few-shot learning, we do not train the model and try to feed some contexts to the model and hopefully we could get the prediction. Here, I feed three classified sentences as context and one sentence (should be positive) to the model. Without any further training, the model successfully predicts that the sentence is positive.