# w266 MLM-Cit-Fit

## Packages and Libraries

In [None]:
# Installs
!pip install -q transformers
!pip install pydot

# data processing packages
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

## NN packages
import tensorflow as tf
from tensorflow import keras
from keras import backend as K
from tensorflow.keras.optimizers.schedules import PolynomialDecay
from sklearn.metrics import classification_report

# NLP packages
from transformers import BertTokenizer,TFAutoModel, TFBertModel, BertForSequenceClassification,TFAutoModelForSequenceClassification


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m51.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m34.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m125.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m83.6 MB/s[0m eta [36m0:00:00[0m


## Importing/Preprocessing data

In [None]:
# Mount Google Drive; the dataset and model-weight paths used in this notebook live under it.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
# Read the train and test CSV files from the mounted Drive folder.
TRAIN_CSV_PATH = '/content/drive/MyDrive/w266/Datasets/clean_train_data.csv'
TEST_CSV_PATH = '/content/drive/MyDrive/w266/Datasets/clean_test_data.csv'

train_data, test_data = (
    pd.read_csv(csv_path) for csv_path in (TRAIN_CSV_PATH, TEST_CSV_PATH)
)

In [None]:
# Hold out 20% of the training file for validation (fixed seed so the split is repeatable).
x_train, x_val, y_train, y_val = train_test_split(
    train_data["text"],
    train_data["label"],
    test_size=0.20,
    random_state=42,
)
# The test file is used as-is: texts are the inputs, labels are the targets.
x_test, y_test = test_data["text"], test_data["label"]

In [None]:
# Report the size of every split; the embedded "\n" leaves a blank line between split pairs.
split_shape_lines = [
    f"x_train shape: {x_train.shape}",
    f"y_train shape: {y_train.shape}\n",
    f"x_val shape: {x_val.shape}",
    f"y_val shape: {y_val.shape}\n",
    f"x_test shape: {x_test.shape}",
    f"y_test shape: {y_test.shape}",
]
print("\n".join(split_shape_lines))


x_train shape: (13592,)
y_train shape: (13592,)

x_val shape: (3398,)
y_val shape: (3398,)

x_test shape: (4117,)
y_test shape: (4117,)


## Global Variables

In [None]:
# Display names for the 20 classes, passed to classification_report as `target_names`.
# NOTE(review): presumably position i names integer label i — confirm against the dataset's label mapping.
target_names = [
    "Analyst Update",
    "Fed | Central Banks",
    "Company | Product News",
    "Treasuries | Corporate Debt",
    "Dividend",
    "Earnings",
    "Energy | Oil",
    "Financials",
    "Currencies",
    "General News | Opinion",
    "Gold | Metals | Materials",
    "IPO",
    "Legal | Regulation",
    "M&A | Investments",
    "Macro",
    "Markets",
    "Politics",
    "Personnel Change",
    "Stock Commentary",
    "Stock Movement",
]

In [None]:
# Learning-rate schedule: decay from 5e-5 down to 0 over the whole fine-tuning run.
num_epochs = 5
# Mini-batch size assumed for training; keep in sync with the `batch_size` passed to `fit`.
train_batch_size = 8
# The optimizer advances the schedule once per batch (not once per example), so the
# decay horizon is batches-per-epoch * epochs. Ceiling division counts a final partial batch.
steps_per_epoch = (len(x_train) + train_batch_size - 1) // train_batch_size
num_train_steps = steps_per_epoch * num_epochs
lr_scheduler = PolynomialDecay(
    initial_learning_rate=5e-5,
    end_learning_rate=0.,
    decay_steps=num_train_steps
)

## Utility Functions

In [None]:
#Function for creating tokenized data and outputs for models

def create_datasets(tokenizer, train, val, test, max_length=None):
  """Tokenize the train, validation and test texts with identical settings.

  Args:
    tokenizer: Callable tokenizer (e.g. a BERT tokenizer) that accepts a list of
      strings plus `padding`, `truncation`, `max_length` and `return_tensors`
      keyword arguments.
    train: Iterable of training texts.
    val: Iterable of validation texts.
    test: Iterable of test texts.
    max_length: Optional cap on the number of tokens per example. The default
      `None` is forwarded unchanged, leaving the limit to the tokenizer.

  Returns:
    A tuple `(train_encodings, valid_encodings, test_encodings)` holding the
    tokenizer's output for each split, in that order.
  """

  def _encode(texts):
    # list() materialises the input (e.g. a pandas Series) into a plain list of strings.
    # truncation=True keeps over-long examples from exceeding the encoder's length limit.
    return tokenizer(list(texts),
                     padding=True,
                     truncation=True,
                     max_length=max_length,
                     return_tensors='tf')

  return _encode(train), _encode(val), _encode(test)


In [None]:
 # Function for creating model
def create_bert_multiclass_model(model,
                                 num_classes = 20,
                                 hidden_size = 201,
                                 dropout=0.3,
                                 learning_rate=0.00005,
                                 activation='softmax'):
    """
    Build and compile a Keras classifier on top of a BERT encoder's pooler output.

    The encoder is called on three integer inputs (`input_ids`, `token_type_ids`,
    `attention_mask`); element 1 of its output (the pooler output) feeds a ReLU Dense
    layer, dropout, and a final Dense layer with `num_classes` units.

    Args:
        model: Encoder that is callable on a dict with keys `input_ids`,
            `token_type_ids` and `attention_mask` (e.g. a `TFBertModel`).
        num_classes: Number of units in the output layer.
        hidden_size: Number of units in the hidden Dense layer.
        dropout: Dropout rate applied after the hidden layer.
        learning_rate: Value forwarded to the Adam optimizer's `learning_rate`
            argument (a float or a learning-rate schedule).
        activation: Activation of the output layer. With `None` or `'linear'` the
            outputs are raw logits and the loss is configured with
            `from_logits=True`; otherwise the loss expects probabilities.

    Returns:
        The compiled `tf.keras.Model`. Its inputs are ordered
        `[input_ids, token_type_ids, attention_mask]`.
    """

    # Variable-length integer inputs, one per tokenizer output field.
    input_ids = tf.keras.layers.Input(shape=(None,), dtype=tf.int64, name='input_ids_layer')
    token_type_ids = tf.keras.layers.Input(shape=(None,), dtype=tf.int64, name='token_type_ids_layer')
    attention_mask = tf.keras.layers.Input(shape=(None,), dtype=tf.int64, name='attention_mask_layer')

    bert_inputs = {'input_ids': input_ids,
                   'token_type_ids': token_type_ids,
                   'attention_mask': attention_mask}

    # Run the encoder and keep element 1 of its output (the pooler output).
    bert_out = model(bert_inputs)
    pooler_output = bert_out[1] # bert_out.pooler_output

    # Classification head: Dense(relu) -> Dropout -> Dense(num_classes).
    hidden = tf.keras.layers.Dense(hidden_size, activation='relu', name='last_hidden_output')(pooler_output)
    hidden = tf.keras.layers.Dropout(dropout, name='dropout')(hidden)
    class_output = tf.keras.layers.Dense(num_classes, activation=activation, name='cls_output')(hidden)

    # Keep the loss consistent with the head: a linear head emits logits, not probabilities.
    outputs_are_logits = activation is None or activation == 'linear'

    bert_cls_model = tf.keras.Model(inputs=[input_ids, token_type_ids, attention_mask], outputs=class_output)
    bert_cls_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
                           loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=outputs_are_logits),
                           metrics=['accuracy'])

    return bert_cls_model

## MLM-Cit-Fit
Masked Language Modeling, Cluster Inter-Training, and Fine-Tuning Strategies


### BERT-Base

In [None]:
# Load the BERT-base-MLM-Cit tokenizer and encoder weights saved on Drive
BERT_MLM_CIT_TOKENIZER_DIR = '/content/drive/MyDrive/w266/Model-Weights/BERT-base-MLM-Cit-Tokenizer'
BERT_MLM_CIT_MODEL_DIR = "/content/drive/MyDrive/w266/Model-Weights/BERT-base-MLM-Cit"

bert_mlm_cit_tokenizer = BertTokenizer.from_pretrained(BERT_MLM_CIT_TOKENIZER_DIR)
bert_mlm_cit_model = TFBertModel.from_pretrained(BERT_MLM_CIT_MODEL_DIR)

All model checkpoint layers were used when initializing TFBertModel.

All the layers of TFBertModel were initialized from the model checkpoint at /content/drive/MyDrive/w266/Model-Weights/BERT-base-MLM-Cit.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [None]:
# Tokenize the train / validation / test texts with the BERT-base-MLM-Cit tokenizer
(
    bert_mlm_cit_train_encodings,
    bert_mlm_cit_valid_encodings,
    bert_mlm_cit_test_encodings,
) = create_datasets(bert_mlm_cit_tokenizer, x_train, x_val, x_test)

In [None]:
# Build the BERT-base-MLM-Cit-Fit classifier; its optimizer follows lr_scheduler
bert_mlm_cit_fit = create_bert_multiclass_model(
    model=bert_mlm_cit_model,
    num_classes=20,
    learning_rate=lr_scheduler,
)

In [None]:
# Train BERT-base-MLM-Cit-Fit, validating on the held-out validation split
# Inputs are ordered [input_ids, token_type_ids, attention_mask].
bert_mlm_cit_train_inputs = [
    bert_mlm_cit_train_encodings.input_ids,
    bert_mlm_cit_train_encodings.token_type_ids,
    bert_mlm_cit_train_encodings.attention_mask,
]
bert_mlm_cit_valid_inputs = [
    bert_mlm_cit_valid_encodings.input_ids,
    bert_mlm_cit_valid_encodings.token_type_ids,
    bert_mlm_cit_valid_encodings.attention_mask,
]

bert_mlm_cit_fit_model_history = bert_mlm_cit_fit.fit(
    bert_mlm_cit_train_inputs,
    y_train,
    validation_data=(bert_mlm_cit_valid_inputs, y_val),
    batch_size=8,
    epochs=5,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
# Score BERT-base-MLM-Cit-Fit on the test split
bert_mlm_cit_test_inputs = [
    bert_mlm_cit_test_encodings.input_ids,
    bert_mlm_cit_test_encodings.token_type_ids,
    bert_mlm_cit_test_encodings.attention_mask,
]
bert_mlm_cit_fit_results = bert_mlm_cit_fit.evaluate(
    bert_mlm_cit_test_inputs,
    y_test,
    batch_size=8,
)

# Index 0 of the results is reported as the loss, index 1 as the accuracy.
bert_mlm_cit_fit_summary = (
    f"Model accuracy: {bert_mlm_cit_fit_results[1]}\n"
    f"Model loss: {bert_mlm_cit_fit_results[0]}"
)
print(bert_mlm_cit_fit_summary)


Model accuracy: 0.8829244375228882
Model loss: 0.6949968338012695


In [None]:
# Per-class precision / recall / F1 for BERT-base-MLM-Cit-Fit on the test split
bert_mlm_cit_report_inputs = [
    bert_mlm_cit_test_encodings.input_ids,
    bert_mlm_cit_test_encodings.token_type_ids,
    bert_mlm_cit_test_encodings.attention_mask,
]
bert_mlm_cit_fit_y_pred = bert_mlm_cit_fit.predict(bert_mlm_cit_report_inputs)
# Pick the highest-scoring class along the last axis of the predictions.
pred_bert_mlm_cit_fit_model = tf.argmax(bert_mlm_cit_fit_y_pred, axis=-1)

bert_mlm_cit_fit_report = classification_report(
    y_test,
    pred_bert_mlm_cit_fit_model.numpy(),
    target_names=target_names,
    digits=4,
)
print(bert_mlm_cit_fit_report)



                             precision    recall  f1-score   support

             Analyst Update     0.9259    0.6849    0.7874        73
        Fed | Central Banks     0.8850    0.9346    0.9091       214
     Company | Product News     0.9052    0.8850    0.8950       852
Treasuries | Corporate Debt     0.9333    0.7273    0.8175        77
                   Dividend     0.9500    0.9794    0.9645        97
                   Earnings     0.9444    0.9835    0.9636       242
               Energy | Oil     0.9167    0.8288    0.8705       146
                 Financials     0.9459    0.8750    0.9091       160
                 Currencies     0.7692    0.9375    0.8451        32
     General News | Opinion     0.7895    0.8036    0.7965       336
  Gold | Metals | Materials     0.5000    0.9231    0.6486        13
                        IPO     0.9286    0.9286    0.9286        14
         Legal | Regulation     0.9000    0.8319    0.8646       119
          M&A | Investments     0

In [None]:
# Checkpoint the BERT-base-MLM-Cit-Fit encoder and tokenizer to Drive
# NOTE(review): this saves `bert_mlm_cit_model` (the encoder object), not the compiled
# `bert_mlm_cit_fit` wrapper; presumably the Dense head layers are not written — confirm this is intended.
BERT_MLM_CIT_FIT_MODEL_DIR = '/content/drive/MyDrive/w266/Model-Weights/BERT-base-MLM-Cit-Fit'
BERT_MLM_CIT_FIT_TOKENIZER_DIR = '/content/drive/MyDrive/w266/Model-Weights/BERT-base-MLM-Cit-Fit-tokenizer'

bert_mlm_cit_model.save_pretrained(BERT_MLM_CIT_FIT_MODEL_DIR)

# Kept as the cell's last expression so its return value is displayed.
bert_mlm_cit_tokenizer.save_pretrained(BERT_MLM_CIT_FIT_TOKENIZER_DIR)

('/content/drive/MyDrive/w266/Model-Weights/BERT-base-MLM-Cit-Fit-tokenizer/tokenizer_config.json',
 '/content/drive/MyDrive/w266/Model-Weights/BERT-base-MLM-Cit-Fit-tokenizer/special_tokens_map.json',
 '/content/drive/MyDrive/w266/Model-Weights/BERT-base-MLM-Cit-Fit-tokenizer/vocab.txt',
 '/content/drive/MyDrive/w266/Model-Weights/BERT-base-MLM-Cit-Fit-tokenizer/added_tokens.json')

### FinBert

In [None]:
# Load the FinBert-MLM-Cit tokenizer and encoder weights saved on Drive
FINBERT_MLM_CIT_TOKENIZER_DIR = '/content/drive/MyDrive/w266/Model-Weights/FinBert-MLM-Cit-Tokenizer'
FINBERT_MLM_CIT_MODEL_DIR = "/content/drive/MyDrive/w266/Model-Weights/FinBert-MLM-Cit"

finbert_mlm_cit_tokenizer = BertTokenizer.from_pretrained(FINBERT_MLM_CIT_TOKENIZER_DIR)
finbert_mlm_cit_model = TFBertModel.from_pretrained(FINBERT_MLM_CIT_MODEL_DIR)

All model checkpoint layers were used when initializing TFBertModel.

All the layers of TFBertModel were initialized from the model checkpoint at /content/drive/MyDrive/w266/Model-Weights/FinBert-MLM-Cit.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [None]:
# Tokenize the train / validation / test texts with the FinBert-MLM-Cit tokenizer
(
    finbert_mlm_cit_train_encodings,
    finbert_mlm_cit_valid_encodings,
    finbert_mlm_cit_test_encodings,
) = create_datasets(finbert_mlm_cit_tokenizer, x_train, x_val, x_test)

In [None]:
# Build the FinBert-MLM-Cit-Fit classifier; its optimizer follows lr_scheduler
finbert_mlm_cit_fit = create_bert_multiclass_model(
    model=finbert_mlm_cit_model,
    num_classes=20,
    learning_rate=lr_scheduler,
)

In [None]:
# Train FinBert-MLM-Cit-Fit, validating on the held-out validation split
# Inputs are ordered [input_ids, token_type_ids, attention_mask].
finbert_mlm_cit_train_inputs = [
    finbert_mlm_cit_train_encodings.input_ids,
    finbert_mlm_cit_train_encodings.token_type_ids,
    finbert_mlm_cit_train_encodings.attention_mask,
]
finbert_mlm_cit_valid_inputs = [
    finbert_mlm_cit_valid_encodings.input_ids,
    finbert_mlm_cit_valid_encodings.token_type_ids,
    finbert_mlm_cit_valid_encodings.attention_mask,
]

finbert_mlm_cit_fit_model_history = finbert_mlm_cit_fit.fit(
    finbert_mlm_cit_train_inputs,
    y_train,
    validation_data=(finbert_mlm_cit_valid_inputs, y_val),
    batch_size=8,
    epochs=5,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
# Score FinBert-MLM-Cit-Fit on the test split
finbert_mlm_cit_test_inputs = [
    finbert_mlm_cit_test_encodings.input_ids,
    finbert_mlm_cit_test_encodings.token_type_ids,
    finbert_mlm_cit_test_encodings.attention_mask,
]
finbert_mlm_cit_fit_results = finbert_mlm_cit_fit.evaluate(
    finbert_mlm_cit_test_inputs,
    y_test,
    batch_size=8,
)

# Index 0 of the results is reported as the loss, index 1 as the accuracy.
finbert_mlm_cit_fit_summary = (
    f"Model accuracy: {finbert_mlm_cit_fit_results[1]}\n"
    f"Model loss: {finbert_mlm_cit_fit_results[0]}"
)
print(finbert_mlm_cit_fit_summary)


Model accuracy: 0.8593636155128479
Model loss: 0.7672273516654968


In [None]:
# Per-class precision / recall / F1 for FinBert-MLM-Cit-Fit on the test split
finbert_mlm_cit_report_inputs = [
    finbert_mlm_cit_test_encodings.input_ids,
    finbert_mlm_cit_test_encodings.token_type_ids,
    finbert_mlm_cit_test_encodings.attention_mask,
]
finbert_mlm_cit_fit_y_pred = finbert_mlm_cit_fit.predict(finbert_mlm_cit_report_inputs)
# Pick the highest-scoring class along the last axis of the predictions.
pred_finbert_mlm_cit_fit_model = tf.argmax(finbert_mlm_cit_fit_y_pred, axis=-1)

finbert_mlm_cit_fit_report = classification_report(
    y_test,
    pred_finbert_mlm_cit_fit_model.numpy(),
    target_names=target_names,
    digits=4,
)
print(finbert_mlm_cit_fit_report)



                             precision    recall  f1-score   support

             Analyst Update     0.9333    0.7671    0.8421        73
        Fed | Central Banks     0.8155    0.8879    0.8501       214
     Company | Product News     0.8483    0.8862    0.8668       852
Treasuries | Corporate Debt     0.8767    0.8312    0.8533        77
                   Dividend     0.9792    0.9691    0.9741        97
                   Earnings     0.9237    0.9504    0.9369       242
               Energy | Oil     0.8125    0.8904    0.8497       146
                 Financials     0.9211    0.8750    0.8974       160
                 Currencies     0.8235    0.8750    0.8485        32
     General News | Opinion     0.8247    0.7143    0.7656       336
  Gold | Metals | Materials     0.5652    1.0000    0.7222        13
                        IPO     0.8667    0.9286    0.8966        14
         Legal | Regulation     0.7692    0.8403    0.8032       119
          M&A | Investments     0

In [None]:
# Checkpoint the FinBert-MLM-Cit-Fit encoder and tokenizer to Drive
# NOTE(review): this saves `finbert_mlm_cit_model` (the encoder object), not the compiled
# `finbert_mlm_cit_fit` wrapper; presumably the Dense head layers are not written — confirm this is intended.
FINBERT_MLM_CIT_FIT_MODEL_DIR = '/content/drive/MyDrive/w266/Model-Weights/FinBert-MLM-Cit-Fit'
FINBERT_MLM_CIT_FIT_TOKENIZER_DIR = '/content/drive/MyDrive/w266/Model-Weights/FinBert-MLM-Cit-Fit-tokenizer'

finbert_mlm_cit_model.save_pretrained(FINBERT_MLM_CIT_FIT_MODEL_DIR)

# Kept as the cell's last expression so its return value is displayed.
finbert_mlm_cit_tokenizer.save_pretrained(FINBERT_MLM_CIT_FIT_TOKENIZER_DIR)

('/content/drive/MyDrive/w266/Model-Weights/FinBert-MLM-Cit-Fit-tokenizer/tokenizer_config.json',
 '/content/drive/MyDrive/w266/Model-Weights/FinBert-MLM-Cit-Fit-tokenizer/special_tokens_map.json',
 '/content/drive/MyDrive/w266/Model-Weights/FinBert-MLM-Cit-Fit-tokenizer/vocab.txt',
 '/content/drive/MyDrive/w266/Model-Weights/FinBert-MLM-Cit-Fit-tokenizer/added_tokens.json')