# w266 Masked language modeling (MLM) & learning rate fine-tuning (Fit)

## Packages and Libraries

In [None]:
# Installs
!pip install -q transformers
!pip install pydot

# data processing packages
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

## NN packages
import tensorflow as tf
from tensorflow import keras
from keras import backend as K
from tensorflow.keras.optimizers.schedules import PolynomialDecay
from sklearn.metrics import classification_report

# NLP packages
from transformers import BertTokenizer,TFAutoModel, TFBertModel, BertForSequenceClassification,TFAutoModelForSequenceClassification


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m22.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m23.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m56.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m50.1 MB/s[0m eta [36m0:00:00[0m


### Mounting Drive

In [None]:
# Mount Google Drive into the Colab runtime at /content/drive; the data and
# model-weight paths used in this notebook all live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Importing/Preprocessing data

In [None]:
# Read the pre-cleaned train and test CSVs from the mounted Drive folder.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/drive/MyDrive/Colab Notebooks/w266/data/clean_train_data.csv',
        '/content/drive/MyDrive/Colab Notebooks/w266/data/clean_test_data.csv',
    )
)

In [None]:
# Hold out 20% of the labelled training rows for validation; the fixed
# random_state keeps the split identical across runs.
x_train, x_val, y_train, y_val = train_test_split(
    train_data['text'],
    train_data['label'],
    test_size=0.20,
    random_state=42,
)
# The test file is used whole: `text` as features, `label` as targets.
x_test, y_test = test_data['text'], test_data['label']

In [None]:
# Show the feature/label shapes of each split, one blank line between splits.
print(
    "\n\n".join(
        f"x_{split} shape: {features.shape}\ny_{split} shape: {labels.shape}"
        for split, features, labels in (
            ("train", x_train, y_train),
            ("val", x_val, y_val),
            ("test", x_test, y_test),
        )
    )
)


x_train shape: (13592,)
y_train shape: (13592,)

x_val shape: (3398,)
y_val shape: (3398,)

x_test shape: (4117,)
y_test shape: (4117,)


## Global Variables

In [None]:
# Display names for the 20 classes, passed as `target_names` to the
# classification reports below.
# NOTE(review): presumably listed in integer label-id order -- confirm against
# the dataset's label mapping.
target_names = [
    "Analyst Update",
    "Fed | Central Banks",
    "Company | Product News",
    "Treasuries | Corporate Debt",
    "Dividend",
    "Earnings",
    "Energy | Oil",
    "Financials",
    "Currencies",
    "General News | Opinion",
    "Gold | Metals | Materials",
    "IPO",
    "Legal | Regulation",
    "M&A | Investments",
    "Macro",
    "Markets",
    "Politics",
    "Personnel Change",
    "Stock Commentary",
    "Stock Movement",
]

In [None]:
# Learning-rate schedule: decay from 5e-5 down to 0 over the whole training run.
#
# PolynomialDecay is evaluated at the optimizer's step counter, which advances
# once per batch, so decay_steps has to count batches rather than training
# examples. Counting examples (len(x_train) * num_epochs) stretches the
# schedule by a factor of the batch size and the rate never gets near
# end_learning_rate before training stops.
num_epochs = 5
train_batch_size = 8  # keep in sync with the batch_size passed to .fit()
steps_per_epoch = -(-len(x_train) // train_batch_size)  # ceiling division: the last partial batch is a step too
num_train_steps = steps_per_epoch * num_epochs
lr_scheduler = PolynomialDecay(
    initial_learning_rate=5e-5,
    end_learning_rate=0.,
    decay_steps=num_train_steps
)

## Utility Functions

In [None]:
#Function for creating tokenized data and outputs for models

def create_datasets(tokenizer, train, val, test, max_length=None):
  """Tokenize the train, validation and test texts with the same tokenizer.

  Each split is padded to the longest sequence in that split and returned as
  TensorFlow tensors. Truncation is enabled so that an over-long text cannot
  produce a sequence longer than the limit the tokenizer knows about.

  Args:
    tokenizer: callable tokenizer (e.g. a BertTokenizer) accepting a list of
      strings plus `padding`, `truncation`, `max_length` and `return_tensors`.
    train: iterable of training texts.
    val: iterable of validation texts.
    test: iterable of test texts.
    max_length: optional cap on the tokenized length. None (the default)
      leaves the limit to the tokenizer's own configuration.

  Returns:
    Tuple `(train_encodings, valid_encodings, test_encodings)`.
  """

  def encode(texts):
    # list(...) so that pandas Series and other iterables are accepted.
    return tokenizer(list(texts),
                     padding=True,
                     truncation=True,
                     max_length=max_length,
                     return_tensors='tf')

  return encode(train), encode(val), encode(test)


In [None]:
 # Function for creating model
def create_bert_multiclass_model(model,
                                 num_classes = 20,
                                 hidden_size = 201,
                                 dropout=0.3,
                                 learning_rate=0.00005,
                                 activation='softmax'):
    """
    Build and compile a multiclass classifier on top of a BERT encoder.

    The encoder's pooler output is passed through one ReLU hidden layer with
    dropout, then through a dense output layer with `num_classes` units.

    Args:
        model: BERT encoder that is called with a dict holding `input_ids`,
            `token_type_ids` and `attention_mask`; element [1] of its output
            is taken as the pooled representation.
        num_classes: number of units in the output layer.
        hidden_size: number of units in the hidden dense layer.
        dropout: dropout rate applied after the hidden layer.
        learning_rate: forwarded to Adam's `learning_rate`; callers pass
            either a float or a learning-rate schedule object.
        activation: activation of the output layer. The loss is created with
            `from_logits=False`, so this should produce probabilities.

    Returns:
        A compiled Keras model whose inputs are, in order,
        `[input_ids, token_type_ids, attention_mask]`.
    """

    # Symbolic inputs; the sequence dimension is None so any padded length fits.
    input_ids = tf.keras.layers.Input(shape=(None,), dtype=tf.int64, name='input_ids_layer')
    token_type_ids = tf.keras.layers.Input(shape=(None,), dtype=tf.int64, name='token_type_ids_layer')
    attention_mask = tf.keras.layers.Input(shape=(None,), dtype=tf.int64, name='attention_mask_layer')

    bert_inputs = {'input_ids': input_ids,
                   'token_type_ids': token_type_ids,
                   'attention_mask': attention_mask}

    # Run the encoder and keep its pooled output.
    bert_out = model(bert_inputs)
    pooler_output = bert_out[1] # bert_out.pooler_output

    # Classification head: hidden ReLU layer -> dropout -> per-class output.
    last_hidden_output = tf.keras.layers.Dense(hidden_size, activation='relu', name='last_hidden_output')(pooler_output)
    last_hidden_output = tf.keras.layers.Dropout(dropout, name='dropout')(last_hidden_output)
    bert_cls_prediction = tf.keras.layers.Dense(num_classes, activation=activation, name='cls_output')(last_hidden_output)

    # Sparse loss: targets are integer class ids, not one-hot vectors.
    # `metrics` is passed as a list, which is the documented form of the argument.
    bert_cls_model = tf.keras.Model(inputs=[input_ids, token_type_ids, attention_mask], outputs=bert_cls_prediction)
    bert_cls_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
                           loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False),
                           metrics=['accuracy'])

    return bert_cls_model

## MLM-Fit
Masked Language Model (MLM) checkpoints fine-tuned with a learning-rate schedule (Fit)

### BERT-Base

In [None]:
# Restore the BERT-base tokenizer and encoder saved after MLM training.
# The checkpoint is loaded into a plain TFBertModel (encoder only).
bert_mlm_tokenizer = BertTokenizer.from_pretrained(
    '/content/drive/MyDrive/Colab Notebooks/w266/Model Weights/BERT-base-MLM-tokenizer'
)
bert_mlm_model = TFBertModel.from_pretrained(
    "/content/drive/MyDrive/Colab Notebooks/w266/Model Weights/BERT-base-MLM"
)

Some layers from the model checkpoint at /content/drive/MyDrive/Colab Notebooks/w266/Model Weights/BERT-base-MLM were not used when initializing TFBertModel: ['mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFBertModel were not initialized from the model checkpoint at /content/drive/MyDrive/Colab Notebooks/w266/Model Weights/BERT-base-MLM and are newly initialized: ['bert/pooler/dense/bias:0', 'bert/pooler/dense/kernel:0']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Tokenize the train / validation / test texts with the BERT-base-MLM tokenizer.
(
    bert_mlm_train_encodings,
    bert_mlm_valid_encodings,
    bert_mlm_test_encodings,
) = create_datasets(bert_mlm_tokenizer, x_train, x_val, x_test)

In [None]:
# Wrap the BERT-base-MLM encoder in the 20-class classifier and train it with
# the decaying learning-rate schedule instead of a constant rate.
bert_mlm_fit = create_bert_multiclass_model(
    bert_mlm_model,
    num_classes=20,
    learning_rate=lr_scheduler,
)

In [None]:
# Fine-tune the BERT-base-MLM-Fit classifier: 5 epochs, batches of 8, with the
# validation split scored at the end of every epoch.
bert_mlm_fit_history = bert_mlm_fit.fit(
    x=[
        bert_mlm_train_encodings.input_ids,
        bert_mlm_train_encodings.token_type_ids,
        bert_mlm_train_encodings.attention_mask,
    ],
    y=y_train,
    validation_data=(
        [
            bert_mlm_valid_encodings.input_ids,
            bert_mlm_valid_encodings.token_type_ids,
            bert_mlm_valid_encodings.attention_mask,
        ],
        y_val,
    ),
    batch_size=8,
    epochs=5,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
# Score the fine-tuned BERT-base-MLM-Fit model on the test split.
# The result is indexed as [0] = loss, [1] = accuracy.
bert_mlm_fit_results = bert_mlm_fit.evaluate(
    x=[
        bert_mlm_test_encodings.input_ids,
        bert_mlm_test_encodings.token_type_ids,
        bert_mlm_test_encodings.attention_mask,
    ],
    y=y_test,
    batch_size=8,
)

print(
    f"Model accuracy: {bert_mlm_fit_results[1]}\n"
    f"Model loss: {bert_mlm_fit_results[0]}"
)

Model accuracy: 0.8785523176193237
Model loss: 0.583951473236084


In [None]:
# Per-class precision / recall / F1 for BERT-base-MLM-Fit on the test split.
# predict() yields one score per class; argmax over the last axis picks the
# predicted class id for each example.
bert_mlm_fit_y_pred = bert_mlm_fit.predict(
    [
        bert_mlm_test_encodings.input_ids,
        bert_mlm_test_encodings.token_type_ids,
        bert_mlm_test_encodings.attention_mask,
    ]
)
pred_bert_mlm_fit_model = tf.argmax(bert_mlm_fit_y_pred, axis=-1)

print(
    classification_report(
        y_test,
        pred_bert_mlm_fit_model.numpy(),
        target_names=target_names,
        digits=4,
    )
)



                             precision    recall  f1-score   support

             Analyst Update     0.9737    0.5068    0.6667        73
        Fed | Central Banks     0.8628    0.9112    0.8864       214
     Company | Product News     0.8418    0.9366    0.8867       852
Treasuries | Corporate Debt     0.8841    0.7922    0.8356        77
                   Dividend     0.9588    0.9588    0.9588        97
                   Earnings     0.9750    0.9669    0.9710       242
               Energy | Oil     0.7733    0.7945    0.7838       146
                 Financials     0.8750    0.9187    0.8963       160
                 Currencies     0.8966    0.8125    0.8525        32
     General News | Opinion     0.8651    0.7440    0.8000       336
  Gold | Metals | Materials     1.0000    0.3077    0.4706        13
                        IPO     0.8667    0.9286    0.8966        14
         Legal | Regulation     0.8595    0.8739    0.8667       119
          M&A | Investments     0

### FinBert

In [None]:
# Restore the FinBert tokenizer and encoder saved after MLM training.
# The checkpoint is loaded into a plain TFBertModel (encoder only).
finbert_mlm_tokenizer = BertTokenizer.from_pretrained(
    '/content/drive/MyDrive/Colab Notebooks/w266/Model Weights/FinBert-MLM-tokenizer'
)
finbert_mlm_model = TFBertModel.from_pretrained(
    "/content/drive/MyDrive/Colab Notebooks/w266/Model Weights/FinBert-MLM"
)

Some layers from the model checkpoint at /content/drive/MyDrive/Colab Notebooks/w266/Model Weights/FinBert-MLM were not used when initializing TFBertModel: ['mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFBertModel were not initialized from the model checkpoint at /content/drive/MyDrive/Colab Notebooks/w266/Model Weights/FinBert-MLM and are newly initialized: ['bert/pooler/dense/bias:0', 'bert/pooler/dense/kernel:0']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Tokenize the train / validation / test texts with the FinBert-MLM tokenizer.
(
    finbert_mlm_train_encodings,
    finbert_mlm_valid_encodings,
    finbert_mlm_test_encodings,
) = create_datasets(finbert_mlm_tokenizer, x_train, x_val, x_test)

In [None]:
# Wrap the FinBert-MLM encoder in the 20-class classifier and train it with
# the decaying learning-rate schedule instead of a constant rate.
finbert_mlm_fit = create_bert_multiclass_model(
    finbert_mlm_model,
    num_classes=20,
    learning_rate=lr_scheduler,
)

In [None]:
# Fine-tune the FinBert-MLM-Fit classifier: 5 epochs, batches of 8, with the
# validation split scored at the end of every epoch.
finbert_mlm_fit_history = finbert_mlm_fit.fit(
    x=[
        finbert_mlm_train_encodings.input_ids,
        finbert_mlm_train_encodings.token_type_ids,
        finbert_mlm_train_encodings.attention_mask,
    ],
    y=y_train,
    validation_data=(
        [
            finbert_mlm_valid_encodings.input_ids,
            finbert_mlm_valid_encodings.token_type_ids,
            finbert_mlm_valid_encodings.attention_mask,
        ],
        y_val,
    ),
    batch_size=8,
    epochs=5,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
# Score the fine-tuned FinBert-MLM-Fit model on the test split.
# The result is indexed as [0] = loss, [1] = accuracy.
finbert_mlm_fit_results = finbert_mlm_fit.evaluate(
    x=[
        finbert_mlm_test_encodings.input_ids,
        finbert_mlm_test_encodings.token_type_ids,
        finbert_mlm_test_encodings.attention_mask,
    ],
    y=y_test,
    batch_size=8,
)

print(
    f"Model accuracy: {finbert_mlm_fit_results[1]}\n"
    f"Model loss: {finbert_mlm_fit_results[0]}"
)

Model accuracy: 0.8678649663925171
Model loss: 0.5805019736289978


In [None]:
# Per-class precision / recall / F1 for FinBert-MLM-Fit on the test split.
# predict() yields one score per class; argmax over the last axis picks the
# predicted class id for each example.
finbert_mlm_fit_y_pred = finbert_mlm_fit.predict(
    [
        finbert_mlm_test_encodings.input_ids,
        finbert_mlm_test_encodings.token_type_ids,
        finbert_mlm_test_encodings.attention_mask,
    ]
)
pred_finbert_mlm_fit_model = tf.argmax(finbert_mlm_fit_y_pred, axis=-1)

print(
    classification_report(
        y_test,
        pred_finbert_mlm_fit_model.numpy(),
        target_names=target_names,
        digits=4,
    )
)

                             precision    recall  f1-score   support

             Analyst Update     0.8000    0.7123    0.7536        73
        Fed | Central Banks     0.9344    0.7991    0.8615       214
     Company | Product News     0.9302    0.8451    0.8856       852
Treasuries | Corporate Debt     0.7470    0.8052    0.7750        77
                   Dividend     0.9792    0.9691    0.9741        97
                   Earnings     0.9532    0.9256    0.9392       242
               Energy | Oil     0.8516    0.9041    0.8771       146
                 Financials     0.7906    0.9437    0.8604       160
                 Currencies     0.7692    0.9375    0.8451        32
     General News | Opinion     0.7051    0.8185    0.7576       336
  Gold | Metals | Materials     0.7500    0.9231    0.8276        13
                        IPO     0.8750    1.0000    0.9333        14
         Legal | Regulation     0.8045    0.8992    0.8492       119
          M&A | Investments     0