# w266 Cluster inter-training (Cit) & learning rate fine-tuning (Fit)

## Packages and Libraries

In [None]:
# Installs
!pip install sib-clustering
!pip install -q transformers
!pip install pydot

# data processing packages
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

## NN packages
import tensorflow as tf
from tensorflow import keras
from keras import backend as K
from tensorflow.keras.optimizers.schedules import PolynomialDecay
from sklearn.metrics import classification_report

# NLP packages
from transformers import BertTokenizer,TFAutoModel, TFBertModel, BertForSequenceClassification,TFAutoModelForSequenceClassification


Collecting sib-clustering
  Downloading sib_clustering-0.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (608 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/608.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m608.3/608.3 kB[0m [31m32.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: sib-clustering
Successfully installed sib-clustering-0.2.0
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m20.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m34.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m79.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m78.8 MB/s[0m eta [36m0:00:00[0m


### Mounting Drive

In [None]:
# Mount Google Drive at /content/drive; the dataset CSVs and model checkpoints
# used throughout this notebook are read from and written to paths under it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Importing/Preprocessing data

In [None]:
# Load the cleaned train and test splits from Drive, one DataFrame per CSV.
train_data = pd.read_csv(
    filepath_or_buffer='/content/drive/MyDrive/Colab Notebooks/w266/data/clean_train_data.csv',
)
test_data = pd.read_csv(
    filepath_or_buffer='/content/drive/MyDrive/Colab Notebooks/w266/data/clean_test_data.csv',
)

In [None]:
# Hold out 20% of the labelled training rows for validation; the fixed
# random_state keeps the split identical from run to run.
x_train, x_val, y_train, y_val = train_test_split(
    train_data["text"],
    train_data["label"],
    test_size=0.20,
    random_state=42,
)

# The test CSV is used whole: `text` column as inputs, `label` column as targets.
x_test, y_test = test_data["text"], test_data["label"]

In [None]:
# Report the size of every split in one call; sep="\n" puts each entry on its
# own line, and the embedded "\n" leaves a blank line between splits.
print(
    f"x_train shape: {x_train.shape}",
    f"y_train shape: {y_train.shape}\n",
    f"x_val shape: {x_val.shape}",
    f"y_val shape: {y_val.shape}\n",
    f"x_test shape: {x_test.shape}",
    f"y_test shape: {y_test.shape}",
    sep="\n",
)


x_train shape: (13592,)
y_train shape: (13592,)

x_val shape: (3398,)
y_val shape: (3398,)

x_test shape: (4117,)
y_test shape: (4117,)


## Global Variables

In [None]:
# Human-readable class names, passed as `target_names` to classification_report.
# Order matters: position i is presumably the name of integer label i
# (verify against the label encoding used when the CSVs were built).
target_names = [
    "Analyst Update",
    "Fed | Central Banks",
    "Company | Product News",
    "Treasuries | Corporate Debt",
    "Dividend",
    "Earnings",
    "Energy | Oil",
    "Financials",
    "Currencies",
    "General News | Opinion",
    "Gold | Metals | Materials",
    "IPO",
    "Legal | Regulation",
    "M&A | Investments",
    "Macro",
    "Markets",
    "Politics",
    "Personnel Change",
    "Stock Commentary",
    "Stock Movement",
]

In [None]:
# Learning-rate schedule: decay from 5e-5 down to 0 across the whole training run
# (PolynomialDecay with its default power is a linear ramp).
#
# The schedule advances once per optimizer update, i.e. once per *batch*, so
# decay_steps must be the total number of batches. Using len(x_train) * epochs
# (examples, not batches) would stretch the schedule by a factor of the batch
# size and the learning rate would never get close to its end value.
num_epochs = 5
train_batch_size = 8  # must match the batch_size passed to Model.fit in the training cells
steps_per_epoch = -(-len(x_train) // train_batch_size)  # ceil division: a final partial batch is still a step
num_train_steps = steps_per_epoch * num_epochs
lr_scheduler = PolynomialDecay(
    initial_learning_rate=5e-5,
    end_learning_rate=0.,
    decay_steps=num_train_steps
)

## Utility Functions

In [None]:
#Function for creating tokenized data and outputs for models

def create_datasets(tokenizer, train, val, test, max_length=None):
  """Tokenize the train, validation and test texts with the same settings.

  Args:
    tokenizer: callable tokenizer (e.g. a Hugging Face BertTokenizer) that
      accepts a list of strings plus padding/truncation keyword arguments.
    train: iterable of raw training texts.
    val: iterable of raw validation texts.
    test: iterable of raw test texts.
    max_length: optional cap on tokens per example. None defers to the
      tokenizer's own configured limit.

  Returns:
    Tuple (train_encodings, valid_encodings, test_encodings), each being
    whatever the tokenizer returns for that split. Each split is padded to
    the longest example within that split.
  """

  def _encode(texts):
    # truncation=True guards against texts longer than the model can accept;
    # without it a single over-long example breaks the forward pass.
    return tokenizer(list(texts),
                     padding=True,
                     truncation=True,
                     max_length=max_length,
                     return_tensors='tf')

  train_encodings = _encode(train)
  valid_encodings = _encode(val)
  test_encodings = _encode(test)

  return train_encodings, valid_encodings, test_encodings


In [None]:
 # Function for creating model
def create_bert_multiclass_model(model,
                                 num_classes = 20,
                                 hidden_size = 201,
                                 dropout=0.3,
                                 learning_rate=0.00005,
                                 activation='softmax'):
    """
    Build and compile a multiclass classifier on top of a BERT encoder.

    The encoder's pooler output feeds one ReLU hidden layer, dropout, and a
    final dense layer with `num_classes` units.

    Args:
        model: BERT-style encoder called on a dict with 'input_ids',
            'token_type_ids' and 'attention_mask'. It is used directly as a
            layer (not copied), so training the returned model acts on this
            same object.
        num_classes: number of output units / target classes.
        hidden_size: width of the hidden dense layer.
        dropout: dropout rate applied after the hidden layer.
        learning_rate: value handed to Adam; a float or a learning-rate
            schedule object.
        activation: activation of the output layer. With None or 'linear'
            the outputs are logits and the loss is configured accordingly.

    Returns:
        A compiled tf.keras.Model whose inputs are, in order,
        [input_ids, token_type_ids, attention_mask] and which reports
        sparse categorical cross-entropy loss and accuracy.
    """

    # building bert inputs (variable sequence length)
    input_ids = tf.keras.layers.Input(shape=(None,), dtype=tf.int64, name='input_ids_layer')
    token_type_ids = tf.keras.layers.Input(shape=(None,), dtype=tf.int64, name='token_type_ids_layer')
    attention_mask = tf.keras.layers.Input(shape=(None,), dtype=tf.int64, name='attention_mask_layer')

    bert_inputs = {'input_ids': input_ids,
                   'token_type_ids': token_type_ids,
                   'attention_mask': attention_mask}

    # running the encoder; index 1 of its output is the pooler output
    bert_out = model(bert_inputs)
    pooler_output = bert_out[1]

    # building classification head
    hidden = tf.keras.layers.Dense(hidden_size, activation='relu', name='last_hidden_output')(pooler_output)
    hidden = tf.keras.layers.Dropout(dropout, name='dropout')(hidden)
    bert_cls_prediction = tf.keras.layers.Dense(num_classes, activation=activation, name='cls_output')(hidden)

    # The loss must know whether it receives probabilities or raw logits;
    # derive that from the output activation instead of assuming softmax.
    outputs_are_logits = activation is None or activation == 'linear'

    # compiling model
    bert_cls_model = tf.keras.Model(inputs=[input_ids, token_type_ids, attention_mask], outputs=bert_cls_prediction)
    bert_cls_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
                           loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=outputs_are_logits),
                           metrics=['accuracy'])  # list form: accepted by every Keras version

    return bert_cls_model

## Cit-Fit
Cluster Inter-Training (Cit) and Learning-Rate Fine-Tuning (Fit) Strategies

### BERT-Base

In [None]:
# Loading BERT-Base-Cit tokenizer/model
# Both come from local checkpoints on the mounted Drive rather than from a hub
# model id; presumably these are the weights produced by the cluster
# inter-training (Cit) stage -- confirm against the notebook that wrote them.
cit_bert_tokenizer = BertTokenizer.from_pretrained('/content/drive/MyDrive/Colab Notebooks/w266/Model Weights/BERT-base-Cit-tokenizer')
cit_bert_model = TFBertModel.from_pretrained("/content/drive/MyDrive/Colab Notebooks/w266/Model Weights/BERT-base-Cit")

All model checkpoint layers were used when initializing TFBertModel.

All the layers of TFBertModel were initialized from the model checkpoint at /content/drive/MyDrive/Colab Notebooks/w266/Model Weights/BERT-base-Cit.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [None]:
# Creating BERT-Base-Cit encodings
# Tokenizes the train/validation/test texts with the BERT-Base-Cit tokenizer;
# each split is padded independently, so sequence lengths can differ per split.
cit_bert_train_encodings, cit_bert_valid_encodings, cit_bert_test_encodings = create_datasets(cit_bert_tokenizer, x_train, x_val, x_test)

In [None]:
# Creating BERT-base-Cit-Fit model
# Wraps the loaded Cit encoder with the classification head and compiles it;
# passing lr_scheduler as learning_rate makes Adam follow the decay schedule
# instead of a constant rate.
bert_cit_fit = create_bert_multiclass_model(cit_bert_model, num_classes=20, learning_rate=lr_scheduler)

In [None]:
# Running BERT-base-Cit-Fit model
# Inputs are listed in the order the model's Input layers were declared:
# input ids, token type ids, attention mask.
# epochs is taken from num_epochs (the value the learning-rate schedule was
# sized with) so the two cannot drift apart.
bert_cit_fit_model_history = bert_cit_fit.fit(
    x=[cit_bert_train_encodings.input_ids,
       cit_bert_train_encodings.token_type_ids,
       cit_bert_train_encodings.attention_mask],
    y=y_train,
    validation_data=(
        [cit_bert_valid_encodings.input_ids,
         cit_bert_valid_encodings.token_type_ids,
         cit_bert_valid_encodings.attention_mask],
        y_val,
    ),
    batch_size=8,
    epochs=num_epochs,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
# Checkpointing BERT-base-Cit-Fit model
# Saves the encoder object that was used as a layer inside bert_cit_fit, so it
# presumably carries the fine-tuned weights at this point (confirm the encoder
# was trainable). This cell does not save bert_cit_fit itself, so the dense
# classification head is not part of this checkpoint.
cit_bert_model.save_pretrained('/content/drive/MyDrive/Colab Notebooks/w266/Model Weights/BERT-base-Cit-Fit')

# Checkpointing BERT-base-Cit-Fit tokenizer
# Stored next to the weights so the checkpoint can be reloaded as a pair.
cit_bert_tokenizer.save_pretrained('/content/drive/MyDrive/Colab Notebooks/w266/Model Weights/BERT-base-Cit-Fit-tokenizer')

('/content/drive/MyDrive/Colab Notebooks/w266/Model Weights/BERT-base-Cit-Fit-tokenizer/tokenizer_config.json',
 '/content/drive/MyDrive/Colab Notebooks/w266/Model Weights/BERT-base-Cit-Fit-tokenizer/special_tokens_map.json',
 '/content/drive/MyDrive/Colab Notebooks/w266/Model Weights/BERT-base-Cit-Fit-tokenizer/vocab.txt',
 '/content/drive/MyDrive/Colab Notebooks/w266/Model Weights/BERT-base-Cit-Fit-tokenizer/added_tokens.json')

In [None]:
# Evaluating BERT-base-Cit-Fit on the held-out test split.
# evaluate() returns [loss, accuracy], matching the loss and the single metric
# the model was compiled with.
bert_cit_fit_results = bert_cit_fit.evaluate(
    x=[cit_bert_test_encodings.input_ids,
       cit_bert_test_encodings.token_type_ids,
       cit_bert_test_encodings.attention_mask],
    y=y_test,
    batch_size=8,
)

print(
    f"Model accuracy: {bert_cit_fit_results[1]}\n"
    f"Model loss: {bert_cit_fit_results[0]}"
)

Model accuracy: 0.8678649663925171
Model loss: 0.6201757788658142


In [None]:
# Computing BERT-base-Cit-Fit F1 metric
# predict() yields one score per class for every test example; argmax over the
# last axis turns those scores into predicted class ids.
bert_cit_fit_y_pred = bert_cit_fit.predict([cit_bert_test_encodings.input_ids, cit_bert_test_encodings.token_type_ids, cit_bert_test_encodings.attention_mask])
pred_bert_cit_fit_model = tf.argmax(bert_cit_fit_y_pred, axis=-1)

# `labels` is pinned to every class id (0 .. len(target_names) - 1, the ids
# the classifier can output) so each report row lines up with its name even if
# a class is absent from both y_test and the predictions; without it,
# classification_report raises when the observed label count differs from
# len(target_names).
print(classification_report(y_test,
                            pred_bert_cit_fit_model.numpy(),
                            labels=list(range(len(target_names))),
                            target_names=target_names,
                            digits=4))



                             precision    recall  f1-score   support

             Analyst Update     0.7037    0.7808    0.7403        73
        Fed | Central Banks     0.8883    0.8551    0.8714       214
     Company | Product News     0.9382    0.8369    0.8846       852
Treasuries | Corporate Debt     0.9206    0.7532    0.8286        77
                   Dividend     0.9794    0.9794    0.9794        97
                   Earnings     0.9789    0.9587    0.9687       242
               Energy | Oil     0.8832    0.8288    0.8551       146
                 Financials     0.8427    0.9375    0.8876       160
                 Currencies     0.6667    1.0000    0.8000        32
     General News | Opinion     0.7832    0.8065    0.7947       336
  Gold | Metals | Materials     0.5000    0.6154    0.5517        13
                        IPO     0.8667    0.9286    0.8966        14
         Legal | Regulation     0.9474    0.7563    0.8411       119
          M&A | Investments     0

### FinBert

In [None]:
# Loading FinBert-Cit tokenizer/model
# Both come from local checkpoints on the mounted Drive rather than from a hub
# model id; presumably these are the weights produced by the cluster
# inter-training (Cit) stage -- confirm against the notebook that wrote them.
cit_finbert_tokenizer = BertTokenizer.from_pretrained('/content/drive/MyDrive/Colab Notebooks/w266/Model Weights/FinBert-Cit-tokenizer')
cit_finbert_model = TFBertModel.from_pretrained("/content/drive/MyDrive/Colab Notebooks/w266/Model Weights/FinBert-Cit")

All model checkpoint layers were used when initializing TFBertModel.

All the layers of TFBertModel were initialized from the model checkpoint at /content/drive/MyDrive/Colab Notebooks/w266/Model Weights/FinBert-Cit.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [None]:
# Creating FinBert-Cit encodings
# Tokenizes the train/validation/test texts with the FinBert-Cit tokenizer;
# each split is padded independently, so sequence lengths can differ per split.
cit_finbert_train_encodings, cit_finbert_valid_encodings, cit_finbert_test_encodings = create_datasets(cit_finbert_tokenizer, x_train, x_val, x_test)

In [None]:
# Creating FinBert-Cit-Fit model
# Wraps the loaded FinBert-Cit encoder with the classification head and
# compiles it; passing lr_scheduler as learning_rate makes Adam follow the
# decay schedule instead of a constant rate.
finbert_cit_fit = create_bert_multiclass_model(cit_finbert_model, num_classes=20, learning_rate=lr_scheduler)

In [None]:
# Running FinBert-Cit-Fit model
# Inputs are listed in the order the model's Input layers were declared:
# input ids, token type ids, attention mask.
# epochs is taken from num_epochs (the value the learning-rate schedule was
# sized with) so the two cannot drift apart.
finbert_cit_fit_model_history = finbert_cit_fit.fit(
    x=[cit_finbert_train_encodings.input_ids,
       cit_finbert_train_encodings.token_type_ids,
       cit_finbert_train_encodings.attention_mask],
    y=y_train,
    validation_data=(
        [cit_finbert_valid_encodings.input_ids,
         cit_finbert_valid_encodings.token_type_ids,
         cit_finbert_valid_encodings.attention_mask],
        y_val,
    ),
    batch_size=8,
    epochs=num_epochs,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
# Checkpointing FinBert-Cit-Fit model
# Saves the encoder object that was used as a layer inside finbert_cit_fit, so
# it presumably carries the fine-tuned weights at this point (confirm the
# encoder was trainable). This cell does not save finbert_cit_fit itself, so
# the dense classification head is not part of this checkpoint.
cit_finbert_model.save_pretrained('/content/drive/MyDrive/Colab Notebooks/w266/Model Weights/FinBert-Cit-Fit')

# Checkpointing FinBert-Cit-Fit tokenizer
# Stored next to the weights so the checkpoint can be reloaded as a pair.
cit_finbert_tokenizer.save_pretrained('/content/drive/MyDrive/Colab Notebooks/w266/Model Weights/FinBert-Cit-Fit-tokenizer')

('/content/drive/MyDrive/Colab Notebooks/w266/Model Weights/FinBert-Cit-Fit-tokenizer/tokenizer_config.json',
 '/content/drive/MyDrive/Colab Notebooks/w266/Model Weights/FinBert-Cit-Fit-tokenizer/special_tokens_map.json',
 '/content/drive/MyDrive/Colab Notebooks/w266/Model Weights/FinBert-Cit-Fit-tokenizer/vocab.txt',
 '/content/drive/MyDrive/Colab Notebooks/w266/Model Weights/FinBert-Cit-Fit-tokenizer/added_tokens.json')

In [None]:
# Evaluating FinBert-Cit-Fit on the held-out test split.
# evaluate() returns [loss, accuracy], matching the loss and the single metric
# the model was compiled with.
finbert_cit_fit_results = finbert_cit_fit.evaluate(
    x=[cit_finbert_test_encodings.input_ids,
       cit_finbert_test_encodings.token_type_ids,
       cit_finbert_test_encodings.attention_mask],
    y=y_test,
    batch_size=8,
)

print(
    f"Model accuracy: {finbert_cit_fit_results[1]}\n"
    f"Model loss: {finbert_cit_fit_results[0]}"
)

Model accuracy: 0.8710225820541382
Model loss: 0.620046854019165


In [None]:
# Computing FinBert-Cit-Fit F1 metric
# predict() yields one score per class for every test example; argmax over the
# last axis turns those scores into predicted class ids.
finbert_cit_fit_y_pred = finbert_cit_fit.predict([cit_finbert_test_encodings.input_ids, cit_finbert_test_encodings.token_type_ids, cit_finbert_test_encodings.attention_mask])
pred_finbert_cit_fit_model = tf.argmax(finbert_cit_fit_y_pred, axis=-1)

# `labels` is pinned to every class id (0 .. len(target_names) - 1, the ids
# the classifier can output) so each report row lines up with its name even if
# a class is absent from both y_test and the predictions; without it,
# classification_report raises when the observed label count differs from
# len(target_names).
print(classification_report(y_test,
                            pred_finbert_cit_fit_model.numpy(),
                            labels=list(range(len(target_names))),
                            target_names=target_names,
                            digits=4))



                             precision    recall  f1-score   support

             Analyst Update     0.9388    0.6301    0.7541        73
        Fed | Central Banks     0.8768    0.8318    0.8537       214
     Company | Product News     0.8564    0.9096    0.8822       852
Treasuries | Corporate Debt     0.8571    0.7792    0.8163        77
                   Dividend     0.9592    0.9691    0.9641        97
                   Earnings     0.9630    0.9669    0.9649       242
               Energy | Oil     0.8355    0.8699    0.8523       146
                 Financials     0.9068    0.9125    0.9097       160
                 Currencies     0.7045    0.9688    0.8158        32
     General News | Opinion     0.8036    0.8036    0.8036       336
  Gold | Metals | Materials     0.6000    0.6923    0.6429        13
                        IPO     0.8125    0.9286    0.8667        14
         Legal | Regulation     0.7910    0.8908    0.8379       119
          M&A | Investments     0