### Libraries & Packages

In [None]:
# data processing packages
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import json

## NN packages
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.optimizers.schedules import PolynomialDecay
from sklearn.metrics import classification_report

# NLP packages
!pip install -q transformers
!pip install pydot
from transformers import BertTokenizer,TFAutoModel, TFBertModel, BertForSequenceClassification,TFAutoModelForSequenceClassification
from nltk.tokenize import TweetTokenizer





### Importing/Preprocessing data

In [None]:
# Mount Google Drive at /content/drive; the data files read later in this
# notebook live under /content/drive/MyDrive (Colab-only step).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Cleaned training examples (text + label columns) stored on the mounted Drive
train_data = pd.read_csv(
    '/content/drive/MyDrive/w266/data/clean_train_data.csv'
)
# Cleaned held-out test examples, read from their own file
test_data = pd.read_csv(
    '/content/drive/MyDrive/w266/data/clean_test_data.csv'
)

In [None]:
# Preview the loaded training frame (rendered as this cell's output)
train_data

Unnamed: 0.1,Unnamed: 0,text,label
0,0,Here are Thursday's biggest analyst calls: App...,0
1,1,Buy Las Vegas Sands as travel to Singapore bui...,0
2,2,"Piper Sandler downgrades DocuSign to sell, cit...",0
3,3,"Analysts react to Tesla's latest earnings, bre...",0
4,4,Netflix and its peers are set for a ‘return to...,0
...,...,...,...
16985,16985,KfW credit line for Uniper could be raised to ...,3
16986,16986,KfW credit line for Uniper could be raised to ...,3
16987,16987,Russian sells 1 bln roubles at one-year repo...,3
16988,16988,Global ESG bond issuance posts H1 dip as supra...,3


In [None]:
# Hold out 20% of the labelled training rows for validation; the fixed
# random_state keeps the split repeatable across runs.
x_train, x_val, y_train, y_val = train_test_split(
    train_data.text,
    train_data.label,
    test_size=0.20,
    random_state=42,
)
# The test split comes from its own file and is used as-is
x_test, y_test = test_data.text, test_data.label

In [None]:
# Report the size of every split; a blank line separates the train and val
# groups, and nothing trails the final (test) group.
split_shapes = [
    ("train", x_train, y_train, "\n"),
    ("val", x_val, y_val, "\n"),
    ("test", x_test, y_test, ""),
]
for split_name, split_texts, split_labels, gap in split_shapes:
    print(f"x_{split_name} shape: {split_texts.shape}")
    print(f"y_{split_name} shape: {split_labels.shape}{gap}")


x_train shape: (13592,)
y_train shape: (13592,)

x_val shape: (3398,)
y_val shape: (3398,)

x_test shape: (4117,)
y_test shape: (4117,)


### Global Variables

In [None]:
# Creating of learning rate schedule
num_epochs = 5

# The schedule is advanced once per optimizer step (i.e. once per batch), not
# once per training example, so the decay horizon must be counted in batches.
# This value has to match the batch_size=8 passed to the fit() calls below;
# counting examples instead would stretch the decay over 8x too many steps and
# the learning rate would barely move during training.
train_batch_size = 8
steps_per_epoch = (len(x_train) + train_batch_size - 1) // train_batch_size  # ceiling division
num_train_steps = steps_per_epoch * num_epochs

# Decay from 5e-5 down to 0 over the full training run
lr_scheduler = PolynomialDecay(
    initial_learning_rate=5e-5,
    end_learning_rate=0.,
    decay_steps=num_train_steps
)

In [None]:
# Human-readable class names passed to classification_report as target_names.
# Order matters: presumably position i names integer label i — verify against
# the label encoding used when the CSVs were built.
target_names = [
    "Analyst Update",
    "Fed | Central Banks",
    "Company | Product News",
    "Treasuries | Corporate Debt",
    "Dividend",
    "Earnings",
    "Energy | Oil",
    "Financials",
    "Currencies",
    "General News | Opinion",
    "Gold | Metals | Materials",
    "IPO",
    "Legal | Regulation",
    "M&A | Investments",
    "Macro",
    "Markets",
    "Politics",
    "Personnel Change",
    "Stock Commentary",
    "Stock Movement",
]

### Utility functions

In [None]:
# Function for creating tokenized data and outputs for models

def create_datasets(tokenizer, train=x_train, val=x_val, test=x_test):
  """Tokenize the train/validation/test texts with the given tokenizer.

  Args:
    tokenizer: callable tokenizer (e.g. a BertTokenizer) applied to each split.
    train, val, test: iterables of raw text to encode. The defaults are the
      x_train/x_val/x_test objects that exist when this function is defined
      (Python binds default values at definition time), so pass the splits
      explicitly if they are rebuilt afterwards.

  Returns:
    A tuple (train_encodings, valid_encodings, test_encodings) of TensorFlow
    encodings, each padded to the longest sequence within its own split.
  """

  def _encode(texts):
    # truncation=True caps sequences at the tokenizer's maximum length so an
    # unusually long text cannot exceed what the encoder accepts; texts that
    # are already short enough are encoded exactly as before.
    return tokenizer(list(texts), padding=True, truncation=True, return_tensors='tf')

  train_encodings = _encode(train)
  valid_encodings = _encode(val)
  test_encodings = _encode(test)

  return train_encodings, valid_encodings, test_encodings


In [None]:
 # Function for creating model
def create_bert_multiclass_model(model,
                                 num_classes = 20,
                                 hidden_size = 201,
                                 dropout=0.3,
                                 learning_rate=lr_scheduler,
                                 activation='softmax'):
    """
    Build and compile a simple classification model on top of a BERT encoder.

    The encoder's pooler output (element 1 of its outputs) feeds a ReLU dense
    layer, a dropout layer, and a final dense layer with `num_classes` units.

    Args:
        model: BERT-style encoder called with a dict of `input_ids`,
            `token_type_ids` and `attention_mask` tensors.
        num_classes: number of units in the output layer.
        hidden_size: number of units in the hidden dense layer.
        dropout: dropout rate applied after the hidden layer.
        learning_rate: learning rate (float or schedule) given to Adam.
        activation: activation of the output layer. The loss is built with
            `from_logits=False`, so this should produce probabilities.

    Returns:
        A compiled Keras model whose inputs are, in order,
        [input_ids, token_type_ids, attention_mask].
    """

    bert_model = model

    # building bert inputs (variable sequence length, integer ids)
    input_ids = tf.keras.layers.Input(shape=(None,), dtype=tf.int64, name='input_ids_layer')
    token_type_ids = tf.keras.layers.Input(shape=(None,), dtype=tf.int64, name='token_type_ids_layer')
    attention_mask = tf.keras.layers.Input(shape=(None,), dtype=tf.int64, name='attention_mask_layer')

    bert_inputs = {'input_ids': input_ids,
                   'token_type_ids': token_type_ids,
                   'attention_mask': attention_mask}

    # building bert model
    bert_out = bert_model(bert_inputs)
    pooler_output = bert_out[1] # bert_out.pooler_output

    # building hidden layers
    last_hidden_output = tf.keras.layers.Dense(hidden_size, activation='relu', name='last_hidden_output')(pooler_output)
    last_hidden_output = tf.keras.layers.Dropout(dropout, name='dropout')(last_hidden_output)
    bert_cls_prediction = tf.keras.layers.Dense(num_classes, activation=activation, name='cls_output')(last_hidden_output)

    # compiling model
    bert_cls_model = tf.keras.Model(inputs=[input_ids, token_type_ids, attention_mask], outputs=bert_cls_prediction)
    # Integer class labels -> sparse categorical cross-entropy.
    # metrics is passed as a list, the form compile() documents.
    bert_cls_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
                           loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False),
                           metrics=['accuracy'])

    return bert_cls_model

In [None]:
# Function to show model parameters/shape
def depict_model(model):
  """Print the layer summary of `model` and display its architecture diagram."""

  # summary() prints the table itself and returns None, so it is called
  # directly; wrapping it in display() would also render a stray "None".
  model.summary()
  display(keras.utils.plot_model(model, show_shapes=False, show_dtype=False, show_layer_names=True, dpi=90))

In [None]:
# Load the new emoji tokens created using the Tweet Tokenizer in the EDA notebook

# JSON is read as UTF-8 explicitly so decoding of the emoji tokens does not
# depend on the platform's default locale encoding.
with open('/content/drive/MyDrive/w266/data/tweet_emoji_tokens.json', "r", encoding="utf-8") as file:
    emoji_tokens = json.load(file)

### BERT-base model

In [None]:
# Load the tokenizer and the encoder from the same bert-base-uncased checkpoint
bert_checkpoint = 'bert-base-uncased'
bert_tokenizer = BertTokenizer.from_pretrained(bert_checkpoint)
bert_model = TFBertModel.from_pretrained(bert_checkpoint)

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

In [None]:
# Vocabulary size before extending the tokenizer
print(len(bert_tokenizer))

# Adding emoji tokens to the bert tokenizer.
# Pass the loaded tokens themselves: the quoted list ["emoji_tokens"] would add
# one token literally spelled 'emoji_tokens' and none of the emojis.
# Assumes emoji_tokens (loaded from JSON above) is a collection of token
# strings — TODO confirm against the EDA notebook that wrote the file.
bert_tokenizer.add_tokens(list(emoji_tokens))

# Vocabulary size after the tokens were added
print(len(bert_tokenizer))

30522
30523


In [None]:
# Resize the encoder's token-embedding matrix to the tokenizer's current length
# so that ids of tokens added to the tokenizer have an embedding row.
bert_model.resize_token_embeddings(len(bert_tokenizer))

<transformers.models.bert.modeling_tf_bert.TFBertEmbeddings at 0x7bc270c7aa10>

In [None]:
# Tokenize the default train/validation/test splits with the bert-base-uncased tokenizer
(
    base_train_encodings,
    base_valid_encodings,
    base_test_encodings,
) = create_datasets(bert_tokenizer)

In [None]:
# Creating bert-base-uncased model
# (classification head on top of the bert-base encoder; every other argument
# of create_bert_multiclass_model keeps its default value)
bert_base_Fit = create_bert_multiclass_model(bert_model)

In [None]:
# Fine-tune the BERT-base classifier on the training split, with the validation
# split passed as validation_data.
# Input order must match the model's inputs: ids, token types, attention mask.
base_train_inputs = [
    base_train_encodings.input_ids,
    base_train_encodings.token_type_ids,
    base_train_encodings.attention_mask,
]
base_valid_inputs = [
    base_valid_encodings.input_ids,
    base_valid_encodings.token_type_ids,
    base_valid_encodings.attention_mask,
]
bertbase_model_history = bert_base_Fit.fit(
    base_train_inputs,
    y_train,
    validation_data=(base_valid_inputs, y_val),
    batch_size=8,
    epochs=num_epochs,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
# Score the fine-tuned BERT-base classifier on the held-out test split
base_test_inputs = [
    base_test_encodings.input_ids,
    base_test_encodings.token_type_ids,
    base_test_encodings.attention_mask,
]
bert_base_fit_results = bert_base_Fit.evaluate(
    base_test_inputs,
    y_test,
    batch_size=8,
)

# evaluate() returns the loss first, followed by the compiled metric (accuracy)
bert_test_loss = bert_base_fit_results[0]
bert_test_accuracy = bert_base_fit_results[1]
print(f"Model accuracy: {bert_test_accuracy}\nModel loss: {bert_test_loss}")

Model accuracy: 0.868836522102356
Model loss: 0.5609752535820007


In [None]:
# Per-class precision/recall/F1 for the BERT-base classifier on the test split
base_test_inputs = [
    base_test_encodings.input_ids,
    base_test_encodings.token_type_ids,
    base_test_encodings.attention_mask,
]
bert_fit_y_pred = bert_base_Fit.predict(base_test_inputs)
# Predicted class = index of the highest score along the last axis
pred_bert_fit_model = tf.argmax(bert_fit_y_pred, axis=-1)

bert_fit_report = classification_report(
    y_test,
    pred_bert_fit_model.numpy(),
    target_names=target_names,
    digits=4,
)
print(bert_fit_report)

# Observation from the recorded run: worse performance compared to the version
# that did not have emoji tokens.

                             precision    recall  f1-score   support

             Analyst Update     0.9524    0.5479    0.6957        73
        Fed | Central Banks     0.9006    0.6776    0.7733       214
     Company | Product News     0.9198    0.8885    0.9039       852
Treasuries | Corporate Debt     1.0000    0.7013    0.8244        77
                   Dividend     0.9574    0.9278    0.9424        97
                   Earnings     0.9442    0.9793    0.9615       242
               Energy | Oil     0.9561    0.7466    0.8385       146
                 Financials     0.8352    0.9500    0.8889       160
                 Currencies     0.7805    1.0000    0.8767        32
     General News | Opinion     0.7839    0.8095    0.7965       336
  Gold | Metals | Materials     0.6250    0.7692    0.6897        13
                        IPO     0.6364    1.0000    0.7778        14
         Legal | Regulation     0.8000    0.8739    0.8353       119
          M&A | Investments     0

In [None]:
# Saving model & weights
# bert_base_Fit.save_weights('./BERT-base-Fit-weights.h5')
# bert_base_Fit.save('./BERT-base-Fit-model.keras')
# bert_model.save_weights('./Bert-base-fit-modelWeights.h5')

### Finbert-pretrained Model
Basic pretrained model

In [None]:
# Load the FinBERT encoder (converted from its PyTorch weights) and the
# matching tokenizer from the same checkpoint
finbert_checkpoint = 'yiyanghkust/finbert-pretrain'
finbert_base_model = TFAutoModel.from_pretrained(
    finbert_checkpoint,
    ignore_mismatched_sizes=True,
    from_pt=True,
)
finbert_base_tokenizer = BertTokenizer.from_pretrained(finbert_checkpoint)

Downloading (…)lve/main/config.json:   0%|          | 0.00/359 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/442M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/226k [00:00<?, ?B/s]

In [None]:
# Vocabulary size before extending the tokenizer
print(len(finbert_base_tokenizer))

# Adding emoji tokens to the finbert tokenizer.
# Pass the loaded tokens themselves: the quoted list ["emoji_tokens"] refers to
# the literal string 'emoji_tokens', not to the emojis loaded from JSON.
# Assumes emoji_tokens is a collection of token strings — TODO confirm against
# the EDA notebook that wrote the file.
finbert_base_tokenizer.add_tokens(list(emoji_tokens))

# Vocabulary size after the tokens were added
print(len(finbert_base_tokenizer))

30874
30874


In [None]:
# Resize the FinBERT encoder's token-embedding matrix to the FinBERT
# tokenizer's length. The target must be finbert_base_model: resizing
# bert_model here would leave FinBERT unchanged and alter the BERT-base
# encoder that was already fine-tuned above.
finbert_base_model.resize_token_embeddings(len(finbert_base_tokenizer))

<transformers.models.bert.modeling_tf_bert.TFBertEmbeddings at 0x7bc270c7aa10>

In [None]:
# Tokenize the default train/validation/test splits with the FinBERT tokenizer
(
    finbert_base_train_encodings,
    finbert_base_valid_encodings,
    finbert_base_test_encodings,
) = create_datasets(finbert_base_tokenizer)

In [None]:
# Creating FinBert-Fit model
# (classification head on top of the FinBERT encoder; every other argument
# of create_bert_multiclass_model keeps its default value)
finbert_fit_model = create_bert_multiclass_model(finbert_base_model)

In [None]:
# Fine-tune the FinBERT classifier on the training split, with the validation
# split passed as validation_data.
# Input order must match the model's inputs: ids, token types, attention mask.
finbert_train_inputs = [
    finbert_base_train_encodings.input_ids,
    finbert_base_train_encodings.token_type_ids,
    finbert_base_train_encodings.attention_mask,
]
finbert_valid_inputs = [
    finbert_base_valid_encodings.input_ids,
    finbert_base_valid_encodings.token_type_ids,
    finbert_base_valid_encodings.attention_mask,
]
finbert_base_model_history = finbert_fit_model.fit(
    finbert_train_inputs,
    y_train,
    validation_data=(finbert_valid_inputs, y_val),
    batch_size=8,
    epochs=num_epochs,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
# Score the fine-tuned FinBERT classifier on the held-out test split
finbert_test_inputs = [
    finbert_base_test_encodings.input_ids,
    finbert_base_test_encodings.token_type_ids,
    finbert_base_test_encodings.attention_mask,
]
finbert_fit_results = finbert_fit_model.evaluate(
    finbert_test_inputs,
    y_test,
    batch_size=8,
)

# evaluate() returns the loss first, followed by the compiled metric (accuracy)
finbert_test_loss = finbert_fit_results[0]
finbert_test_accuracy = finbert_fit_results[1]
print(f"Model accuracy: {finbert_test_accuracy}\nModel loss: {finbert_test_loss}")

Model accuracy: 0.8756375908851624
Model loss: 0.5996012091636658


In [None]:
# Per-class precision/recall/F1 for the FinBERT classifier on the test split
finbert_test_inputs = [
    finbert_base_test_encodings.input_ids,
    finbert_base_test_encodings.token_type_ids,
    finbert_base_test_encodings.attention_mask,
]
finbert_fit_y_pred = finbert_fit_model.predict(finbert_test_inputs)
# Predicted class = index of the highest score along the last axis
pred_finbert_fit_model = tf.argmax(finbert_fit_y_pred, axis=-1)

finbert_fit_report = classification_report(
    y_test,
    pred_finbert_fit_model.numpy(),
    target_names=target_names,
    digits=4,
)
print(finbert_fit_report)



                             precision    recall  f1-score   support

             Analyst Update     0.7949    0.8493    0.8212        73
        Fed | Central Banks     0.8916    0.8458    0.8681       214
     Company | Product News     0.8767    0.9096    0.8929       852
Treasuries | Corporate Debt     0.7927    0.8442    0.8176        77
                   Dividend     0.9890    0.9278    0.9574        97
                   Earnings     0.9262    0.9339    0.9300       242
               Energy | Oil     0.9070    0.8014    0.8509       146
                 Financials     0.9478    0.7937    0.8639       160
                 Currencies     0.6809    1.0000    0.8101        32
     General News | Opinion     0.7841    0.8214    0.8023       336
  Gold | Metals | Materials     0.5625    0.6923    0.6207        13
                        IPO     0.8667    0.9286    0.8966        14
         Legal | Regulation     0.8991    0.8235    0.8596       119
          M&A | Investments     0

In [None]:
# Saving FinBert-Fit model & weights
# finbert_fit_model.save_weights('./FinBert-Fit-weights.h5')
# finbert_fit_model.save('./FinBert-Fit-model.keras')
# finbert_base_model.save_weights('./FinBert-fit-modelWeights.h5')