# w266 Masked language modeling (MLM) & Cluster inter-training (Cit)

## Packages and Libraries

In [None]:
# Installs
!pip install sib-clustering
!pip install -q transformers

# data processing packages
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

## NN packages
import tensorflow as tf
from tensorflow import keras
from keras import backend as K
from tensorflow.keras.optimizers.schedules import PolynomialDecay
from sklearn.metrics import classification_report
from sib import SIB
from sklearn.feature_extraction.text import CountVectorizer

# NLP packages
from transformers import BertTokenizer,TFAutoModel, TFBertModel, BertForSequenceClassification,TFAutoModelForSequenceClassification


Collecting sib-clustering
  Downloading sib_clustering-0.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (608 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m608.3/608.3 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: sib-clustering
Successfully installed sib-clustering-0.2.0
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m14.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m22.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m38.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m32.0 MB/s[0m eta [36m0:00:00[0m
[?25h

### Mounting Drive

In [None]:
# Mount Google Drive so the data and model-weight paths under /content/drive resolve
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Global Variables

In [None]:
# Display names for the 20 news categories.
# Order is significant: the list is handed to `classification_report` as
# `target_names`, which matches names to label values by position.
target_names = [
    "Analyst Update",
    "Fed | Central Banks",
    "Company | Product News",
    "Treasuries | Corporate Debt",
    "Dividend",
    "Earnings",
    "Energy | Oil",
    "Financials",
    "Currencies",
    "General News | Opinion",
    "Gold | Metals | Materials",
    "IPO",
    "Legal | Regulation",
    "M&A | Investments",
    "Macro",
    "Markets",
    "Politics",
    "Personnel Change",
    "Stock Commentary",
    "Stock Movement",
]

## Utility Functions

In [None]:
#Function for creating tokenized data and outputs for models

def create_datasets(tokenizer, train, val, test, max_length=None):
  """Tokenize the train/validation/test texts with the same settings.

  Args:
    tokenizer: callable tokenizer (e.g. a Hugging Face BertTokenizer) that
      accepts a list of strings plus `padding`, `truncation`, `max_length`
      and `return_tensors` keyword arguments.
    train, val, test: iterables of raw text (lists, pandas Series, ...);
      each is materialised with `list(...)` before tokenizing.
    max_length: optional cap on tokens per sequence. None leaves the limit
      to the tokenizer's own configured maximum.

  Returns:
    Tuple `(train_encodings, valid_encodings, test_encodings)` holding
    whatever the tokenizer returns for each split. Each split is padded to
    the longest sequence within that split.
  """

  def _encode(texts):
    # truncation=True keeps over-long texts within the model's maximum input
    # length instead of producing ids the encoder cannot embed.
    return tokenizer(list(texts),
                     padding=True,
                     truncation=True,
                     max_length=max_length,
                     return_tensors='tf')

  return _encode(train), _encode(val), _encode(test)


In [None]:
 # Function for creating model
def create_bert_multiclass_model(model,
                                 num_classes = 20,
                                 hidden_size = 201,
                                 dropout=0.3,
                                 learning_rate=0.00005,
                                 activation='softmax'):
    """
    Build and compile a simple multiclass classifier on top of a BERT encoder.

    The encoder's pooler output feeds a ReLU dense layer, a dropout layer and a
    final dense classification layer.

    Args:
        model: BERT encoder that is called on a dict with the keys 'input_ids',
            'token_type_ids' and 'attention_mask'; element [1] of its output is
            used as the pooled representation. The encoder is embedded as-is
            (not copied), so training the returned model updates the weights
            of this very object.
        num_classes: number of units in the output layer.
        hidden_size: number of units in the hidden dense layer.
        dropout: dropout rate applied after the hidden dense layer.
        learning_rate: learning rate for the Adam optimizer.
        activation: activation of the output layer. With None or 'linear' the
            output is treated as logits by the loss; any other value is
            treated as producing probabilities.

    Returns:
        A compiled Keras model taking [input_ids, token_type_ids,
        attention_mask] and trained with sparse categorical cross-entropy
        (integer class labels), reporting accuracy.
    """

    # building bert inputs (variable sequence length)
    input_ids = tf.keras.layers.Input(shape=(None,), dtype=tf.int64, name='input_ids_layer')
    token_type_ids = tf.keras.layers.Input(shape=(None,), dtype=tf.int64, name='token_type_ids_layer')
    attention_mask = tf.keras.layers.Input(shape=(None,), dtype=tf.int64, name='attention_mask_layer')

    bert_inputs = {'input_ids': input_ids,
                   'token_type_ids': token_type_ids,
                   'attention_mask': attention_mask}

    # running the encoder and taking its pooled output
    bert_out = model(bert_inputs)
    pooler_output = bert_out[1] # bert_out.pooler_output

    # building hidden layers
    last_hidden_output = tf.keras.layers.Dense(hidden_size, activation='relu', name='last_hidden_output')(pooler_output)
    last_hidden_output = tf.keras.layers.Dropout(dropout, name='dropout')(last_hidden_output)
    bert_cls_prediction = tf.keras.layers.Dense(num_classes, activation=activation, name='cls_output')(last_hidden_output)

    # The loss must know whether the head emits logits or probabilities;
    # the default 'softmax' head keeps from_logits=False.
    outputs_are_logits = activation is None or activation == 'linear'

    # compiling model
    bert_cls_model = tf.keras.Model(inputs=[input_ids, token_type_ids, attention_mask], outputs=bert_cls_prediction)
    bert_cls_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
                           loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=outputs_are_logits),
                           metrics=['accuracy'])

    return bert_cls_model

## Importing and vectorizing data

In [None]:
# Load the cleaned train/test splits from Drive.
# Later cells read the `text` and `label` columns of both frames.
train_data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/w266/data/clean_train_data.csv')
test_data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/w266/data/clean_test_data.csv')

In [None]:
# create count vectors using the 10K most frequent words
# (fit on the full training frame, i.e. before the train/validation split below)
vectorizer = CountVectorizer(max_features=10000)
X = vectorizer.fit_transform(train_data.text)

### sIB Model

In [None]:
# SIB initialization and clustering; parameters:
# partition the documents into 50 clusters (n_clusters=50); the classifier head
#   later trained on these labels is created with num_classes=50 to match
# fixed seed for the random initializations (random_state=128)
# perform 10 random initializations (n_init=10); the best one is returned.
# up to 15 optimization iterations in each initialization (max_iter=15)
# use all cores in the running machine for parallel execution (n_jobs=-1)
sib = SIB(n_clusters=50, random_state=128, n_init=10,
          n_jobs=-1, max_iter=15, verbose=True)
sib.fit(X)

Initialization complete
sIB information stats on best partition:
	I(T;Y) = 2.2845, H(T) = 5.5975
	I(T;Y)/I(X;Y) = 0.3192
	H(T)/H(X) = 0.3984


In [None]:
# One sIB cluster id per row of X (and therefore per row of train_data)
y_sib = sib.labels_
y_sib

array([28,  4, 36, ...,  4, 13, 28], dtype=int32)

### Creating train/val data sets with sIB results & test data set

In [None]:
# Replacing original labels with sIB labels
# The text column is selected by name rather than by position so the cell keeps
# working if the CSV column order changes; the row index of train_data is kept
# so the true labels can be looked up by index later.
assert len(y_sib) == len(train_data), "sIB must yield exactly one cluster label per training row"
train_data_sib = train_data[['text']].copy()
train_data_sib['labels'] = y_sib.tolist()
train_data_sib

Unnamed: 0,text,labels
0,Here are Thursday's biggest analyst calls: App...,28
1,Buy Las Vegas Sands as travel to Singapore bui...,4
2,"Piper Sandler downgrades DocuSign to sell, cit...",36
3,"Analysts react to Tesla's latest earnings, bre...",28
4,Netflix and its peers are set for a ‘return to...,28
...,...,...
16985,KfW credit line for Uniper could be raised to ...,0
16986,KfW credit line for Uniper could be raised to ...,0
16987,Russian sells 1 bln roubles at one-year repo...,4
16988,Global ESG bond issuance posts H1 dip as supra...,13


In [None]:
# creating train/val datasets: 80/20 split of texts and sIB labels, fixed seed
x_train, x_val, sib_y_train, sib_y_val = train_test_split(
    train_data_sib['text'],
    train_data_sib['labels'],
    test_size=0.20,
    random_state=42,
)

# report the size of each split (the trailing "\n" leaves a blank line per pair)
print(f"x_train shape: {x_train.shape}")
print(f"sib_y_train shape: {sib_y_train.shape}\n")
print(f"x_val shape: {x_val.shape}")
print(f"sib_y_val shape: {sib_y_val.shape}\n")

x_train shape: (13592,)
sib_y_train shape: (13592,)

x_val shape: (3398,)
sib_y_val shape: (3398,)



In [None]:
# Creating true y values data set
# The sIB splits keep the original row index of train_data, so the true labels
# for exactly the same rows are fetched by index label.
true_labels = train_data['label']
y_train = true_labels.loc[sib_y_train.index]
y_val = true_labels.loc[sib_y_val.index]

print(f"y_train shape: {y_train.shape}\n")
print(f"y_val shape: {y_val.shape}\n")

y_train shape: (13592,)

y_val shape: (3398,)



In [None]:
# Test texts and their true labels, taken straight from the test frame
x_test = test_data['text']
y_test = test_data['label']

print(f"x_test shape: {x_test.shape}\n")
print(f"y_test shape: {y_test.shape}\n")

x_test shape: (4117,)

y_test shape: (4117,)



### BERT-base

In [None]:
# Loading BERT-base-MLM tokenizer/model from the checkpoints saved on Drive.
# Loading as TFBertModel drops the MLM head; per the load log below, the pooler
# weights are newly initialized and only get trained by the fits that follow.
bert_mlm_tokenizer = BertTokenizer.from_pretrained('/content/drive/MyDrive/Colab Notebooks/w266/Model Weights/BERT-base-MLM-tokenizer')
bert_mlm_model = TFBertModel.from_pretrained("/content/drive/MyDrive/Colab Notebooks/w266/Model Weights/BERT-base-MLM")

Some layers from the model checkpoint at /content/drive/MyDrive/Colab Notebooks/w266/Model Weights/BERT-base-MLM were not used when initializing TFBertModel: ['mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFBertModel were not initialized from the model checkpoint at /content/drive/MyDrive/Colab Notebooks/w266/Model Weights/BERT-base-MLM and are newly initialized: ['bert/pooler/dense/kernel:0', 'bert/pooler/dense/bias:0']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Creating BERT-base-MLM encodings for the train/validation/test texts
# (tokenized once and reused by both the sIB and the Cit model below)
bert_mlm_train_encodings, bert_mlm_valid_encodings, bert_mlm_test_encodings = create_datasets(bert_mlm_tokenizer, x_train, x_val, x_test)

#### sIB model

In [None]:
# Creating BERT-base-MLM-sib model
# num_classes=50 matches the number of sIB clusters (n_clusters=50)
bert_mlm_sib = create_bert_multiclass_model(bert_mlm_model, num_classes=50)

In [None]:
# Inter-training step: fit the BERT-base-MLM classifier on the sIB cluster labels
# (5 epochs, batch size 8), validating on the held-out sIB labels.
bert_mlm_sib_train_inputs = [
    bert_mlm_train_encodings.input_ids,
    bert_mlm_train_encodings.token_type_ids,
    bert_mlm_train_encodings.attention_mask,
]
bert_mlm_sib_valid_inputs = [
    bert_mlm_valid_encodings.input_ids,
    bert_mlm_valid_encodings.token_type_ids,
    bert_mlm_valid_encodings.attention_mask,
]

bert_mlm_sib_model_history = bert_mlm_sib.fit(
    bert_mlm_sib_train_inputs,
    sib_y_train,
    validation_data=(bert_mlm_sib_valid_inputs, sib_y_val),
    batch_size=8,
    epochs=5,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
# Evaluating BERT-base-MLM-sIB model
# This head predicts one of the 50 sIB cluster ids, so it has to be scored
# against sIB cluster labels. `y_test` holds the 20 true classes (a different
# label space) and the test set has no cluster labels, so the held-out
# validation split with `sib_y_val` is used instead.
bert_mlm_sib_results = bert_mlm_sib.evaluate([bert_mlm_valid_encodings.input_ids, bert_mlm_valid_encodings.token_type_ids, bert_mlm_valid_encodings.attention_mask],
                                         sib_y_val,
                                         batch_size=8)

# evaluate() returns [loss, accuracy] for the single compiled metric
print(f"Model accuracy: {bert_mlm_sib_results[1]}\n"+
      f"Model loss: {bert_mlm_sib_results[0]}")

Model accuracy: 0.011658974923193455
Model loss: 10.500276565551758


##### MLM-Cit

In [None]:
# Creating BERT-base-MLM-Cit model
# Reuses the same `bert_mlm_model` object that was trained inside the sIB model,
# so the encoder starts from the cluster-inter-trained weights; only the
# classification head (20 true classes) is new.
bert_mlm_cit = create_bert_multiclass_model(bert_mlm_model, num_classes=20)

In [None]:
# Fine-tuning step: fit the BERT-base-MLM-Cit classifier on the true labels
# (5 epochs, batch size 8), validating on the held-out true labels.
bert_mlm_cit_train_inputs = [
    bert_mlm_train_encodings.input_ids,
    bert_mlm_train_encodings.token_type_ids,
    bert_mlm_train_encodings.attention_mask,
]
bert_mlm_cit_valid_inputs = [
    bert_mlm_valid_encodings.input_ids,
    bert_mlm_valid_encodings.token_type_ids,
    bert_mlm_valid_encodings.attention_mask,
]

bert_mlm_cit_model_history = bert_mlm_cit.fit(
    bert_mlm_cit_train_inputs,
    y_train,
    validation_data=(bert_mlm_cit_valid_inputs, y_val),
    batch_size=8,
    epochs=5,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
# Evaluating the BERT-base-MLM-Cit model on the test set against the true labels
bert_mlm_cit_test_inputs = [
    bert_mlm_test_encodings.input_ids,
    bert_mlm_test_encodings.token_type_ids,
    bert_mlm_test_encodings.attention_mask,
]
bert_mlm_cit_results = bert_mlm_cit.evaluate(
    bert_mlm_cit_test_inputs,
    y_test,
    batch_size=8,
)

# evaluate() returns [loss, accuracy] for the single compiled metric
print(f"Model accuracy: {bert_mlm_cit_results[1]}\n"+
      f"Model loss: {bert_mlm_cit_results[0]}")


Model accuracy: 0.8746660351753235
Model loss: 0.5195614695549011


In [None]:
# Computing BERT-base-MLM-Cit per-class precision/recall/F1 on the test set
bert_mlm_cit_y_pred = bert_mlm_cit.predict([bert_mlm_test_encodings.input_ids, bert_mlm_test_encodings.token_type_ids, bert_mlm_test_encodings.attention_mask])
# predicted class = index of the highest score along the last (class) axis
pred_bert_mlm_cit_model = tf.argmax(bert_mlm_cit_y_pred, axis=-1)

print(classification_report(y_test, pred_bert_mlm_cit_model.numpy(), target_names=target_names, digits=4))



                             precision    recall  f1-score   support

             Analyst Update     0.8710    0.7397    0.8000        73
        Fed | Central Banks     0.8571    0.9252    0.8899       214
     Company | Product News     0.8943    0.8638    0.8788       852
Treasuries | Corporate Debt     0.8906    0.7403    0.8085        77
                   Dividend     0.9785    0.9381    0.9579        97
                   Earnings     0.9402    0.9752    0.9574       242
               Energy | Oil     0.7840    0.8699    0.8247       146
                 Financials     0.8909    0.9187    0.9046       160
                 Currencies     0.7750    0.9688    0.8611        32
     General News | Opinion     0.7453    0.8274    0.7842       336
  Gold | Metals | Materials     0.9000    0.6923    0.7826        13
                        IPO     0.8750    1.0000    0.9333        14
         Legal | Regulation     0.9320    0.8067    0.8649       119
          M&A | Investments     0

In [None]:
# Checkpointing BERT-base-MLM-Cit model
# This saves `bert_mlm_model`, the encoder shared by the sIB and Cit Keras models
# (both were built around this same object); the Keras classification heads
# are not part of this checkpoint.
bert_mlm_model.save_pretrained('/content/drive/MyDrive/Colab Notebooks/w266/Model Weights/BERT-base-MLM-Cit')

# Checkpointing BERT-base-MLM-Cit tokenizer
bert_mlm_tokenizer.save_pretrained('/content/drive/MyDrive/Colab Notebooks/w266/Model Weights/BERT-base-MLM-Cit-tokenizer')

('/content/drive/MyDrive/Colab Notebooks/w266/Model Weights/BERT-base-MLM-Cit-tokenizer/tokenizer_config.json',
 '/content/drive/MyDrive/Colab Notebooks/w266/Model Weights/BERT-base-MLM-Cit-tokenizer/special_tokens_map.json',
 '/content/drive/MyDrive/Colab Notebooks/w266/Model Weights/BERT-base-MLM-Cit-tokenizer/vocab.txt',
 '/content/drive/MyDrive/Colab Notebooks/w266/Model Weights/BERT-base-MLM-Cit-tokenizer/added_tokens.json')

### FinBert

In [None]:
# Loading FinBert-MLM tokenizer/model from the checkpoints saved on Drive.
# Loading as TFBertModel drops the MLM head; per the load log below, the pooler
# weights are newly initialized and only get trained by the fits that follow.
finbert_mlm_tokenizer = BertTokenizer.from_pretrained('/content/drive/MyDrive/Colab Notebooks/w266/Model Weights/FinBert-MLM-tokenizer')
finbert_mlm_model = TFBertModel.from_pretrained("/content/drive/MyDrive/Colab Notebooks/w266/Model Weights/FinBert-MLM")

Some layers from the model checkpoint at /content/drive/MyDrive/Colab Notebooks/w266/Model Weights/FinBert-MLM were not used when initializing TFBertModel: ['mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFBertModel were not initialized from the model checkpoint at /content/drive/MyDrive/Colab Notebooks/w266/Model Weights/FinBert-MLM and are newly initialized: ['bert/pooler/dense/kernel:0', 'bert/pooler/dense/bias:0']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Creating FinBert-MLM encodings for the train/validation/test texts
# (same text splits as the BERT-base run, tokenized with the FinBert tokenizer)
finbert_mlm_train_encodings, finbert_mlm_valid_encodings, finbert_mlm_test_encodings = create_datasets(finbert_mlm_tokenizer, x_train, x_val, x_test)

##### sIB Model

In [None]:
# Creating FinBert-MLM-sIB model
# num_classes=50 matches the number of sIB clusters (n_clusters=50)
finbert_mlm_sib = create_bert_multiclass_model(finbert_mlm_model, num_classes=50)

In [None]:
# Inter-training step: fit the FinBert-MLM classifier on the sIB cluster labels
# (5 epochs, batch size 8), validating on the held-out sIB labels.
finbert_mlm_sib_train_inputs = [
    finbert_mlm_train_encodings.input_ids,
    finbert_mlm_train_encodings.token_type_ids,
    finbert_mlm_train_encodings.attention_mask,
]
finbert_mlm_sib_valid_inputs = [
    finbert_mlm_valid_encodings.input_ids,
    finbert_mlm_valid_encodings.token_type_ids,
    finbert_mlm_valid_encodings.attention_mask,
]

finbert_mlm_sib_model_history = finbert_mlm_sib.fit(
    finbert_mlm_sib_train_inputs,
    sib_y_train,
    validation_data=(finbert_mlm_sib_valid_inputs, sib_y_val),
    batch_size=8,
    epochs=5,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
# Evaluating FinBert-MLM-sIB model
# This head predicts one of the 50 sIB cluster ids, so it has to be scored
# against sIB cluster labels. `y_test` holds the 20 true classes (a different
# label space) and the test set has no cluster labels, so the held-out
# validation split with `sib_y_val` is used instead.
finbert_mlm_sib_results = finbert_mlm_sib.evaluate([finbert_mlm_valid_encodings.input_ids, finbert_mlm_valid_encodings.token_type_ids, finbert_mlm_valid_encodings.attention_mask],
                                         sib_y_val,
                                         batch_size=8)

# evaluate() returns [loss, accuracy] for the single compiled metric
print(f"Model accuracy: {finbert_mlm_sib_results[1]}\n"+
      f"Model loss: {finbert_mlm_sib_results[0]}")

Model accuracy: 0.011658974923193455
Model loss: 10.815556526184082


##### MLM-Cit

In [None]:
# Creating FinBert-MLM-Cit model
# Reuses the same `finbert_mlm_model` object that was trained inside the sIB
# model, so the encoder starts from the cluster-inter-trained weights; only the
# classification head (20 true classes) is new.
finbert_mlm_cit = create_bert_multiclass_model(finbert_mlm_model, num_classes=20)

In [None]:
# Fine-tuning step: fit the FinBert-MLM-Cit classifier on the true labels
# (5 epochs, batch size 8), validating on the held-out true labels.
finbert_mlm_cit_train_inputs = [
    finbert_mlm_train_encodings.input_ids,
    finbert_mlm_train_encodings.token_type_ids,
    finbert_mlm_train_encodings.attention_mask,
]
finbert_mlm_cit_valid_inputs = [
    finbert_mlm_valid_encodings.input_ids,
    finbert_mlm_valid_encodings.token_type_ids,
    finbert_mlm_valid_encodings.attention_mask,
]

finbert_mlm_cit_model_history = finbert_mlm_cit.fit(
    finbert_mlm_cit_train_inputs,
    y_train,
    validation_data=(finbert_mlm_cit_valid_inputs, y_val),
    batch_size=8,
    epochs=5,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
# Evaluating the FinBert-MLM-Cit model on the test set against the true labels
finbert_mlm_cit_test_inputs = [
    finbert_mlm_test_encodings.input_ids,
    finbert_mlm_test_encodings.token_type_ids,
    finbert_mlm_test_encodings.attention_mask,
]
finbert_mlm_cit_results = finbert_mlm_cit.evaluate(
    finbert_mlm_cit_test_inputs,
    y_test,
    batch_size=8,
)

# evaluate() returns [loss, accuracy] for the single compiled metric
print(f"Model accuracy: {finbert_mlm_cit_results[1]}\n"+
      f"Model loss: {finbert_mlm_cit_results[0]}")


Model accuracy: 0.8649501800537109
Model loss: 0.6081891059875488


In [None]:
# Computing FinBert-MLM-Cit per-class precision/recall/F1 on the test set
finbert_mlm_cit_y_pred = finbert_mlm_cit.predict([finbert_mlm_test_encodings.input_ids, finbert_mlm_test_encodings.token_type_ids, finbert_mlm_test_encodings.attention_mask])
# predicted class = index of the highest score along the last (class) axis
pred_finbert_mlm_cit_model = tf.argmax(finbert_mlm_cit_y_pred, axis=-1)

print(classification_report(y_test, pred_finbert_mlm_cit_model.numpy(), target_names=target_names, digits=4))



                             precision    recall  f1-score   support

             Analyst Update     0.8788    0.7945    0.8345        73
        Fed | Central Banks     0.9435    0.7804    0.8542       214
     Company | Product News     0.8435    0.9237    0.8818       852
Treasuries | Corporate Debt     0.8485    0.7273    0.7832        77
                   Dividend     0.9896    0.9794    0.9845        97
                   Earnings     0.9742    0.9380    0.9558       242
               Energy | Oil     0.9538    0.8493    0.8986       146
                 Financials     0.9351    0.9000    0.9172       160
                 Currencies     0.6250    0.9375    0.7500        32
     General News | Opinion     0.6389    0.8899    0.7438       336
  Gold | Metals | Materials     0.6316    0.9231    0.7500        13
                        IPO     1.0000    0.2143    0.3529        14
         Legal | Regulation     0.8952    0.7899    0.8393       119
          M&A | Investments     0

In [None]:
# Checkpointing FinBert-MLM-Cit model
# This saves `finbert_mlm_model`, the encoder shared by the sIB and Cit Keras
# models (both were built around this same object); the Keras classification
# heads are not part of this checkpoint.
finbert_mlm_model.save_pretrained('/content/drive/MyDrive/Colab Notebooks/w266/Model Weights/FinBert-MLM-Cit')

# Checkpointing FinBert-MLM-Cit tokenizer
finbert_mlm_tokenizer.save_pretrained('/content/drive/MyDrive/Colab Notebooks/w266/Model Weights/FinBert-MLM-Cit-tokenizer')

('/content/drive/MyDrive/Colab Notebooks/w266/Model Weights/FinBert-MLM-Cit-tokenizer/tokenizer_config.json',
 '/content/drive/MyDrive/Colab Notebooks/w266/Model Weights/FinBert-MLM-Cit-tokenizer/special_tokens_map.json',
 '/content/drive/MyDrive/Colab Notebooks/w266/Model Weights/FinBert-MLM-Cit-tokenizer/vocab.txt',
 '/content/drive/MyDrive/Colab Notebooks/w266/Model Weights/FinBert-MLM-Cit-tokenizer/added_tokens.json')