# w266 Cluster inter-training




### Packages

In [None]:
!pip install sib-clustering
!pip install -q transformers
!pip install pydot

# data processing packages
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# NN Packages
import tensorflow as tf
from tensorflow import keras

# ML packages
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import metrics
from sklearn.metrics import classification_report
from sib import SIB

# NLP packages
from transformers import BertTokenizer,TFAutoModel, TFBertModel, BertForSequenceClassification,TFAutoModelForSequenceClassification


Collecting sib-clustering
  Downloading sib_clustering-0.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (608 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m608.3/608.3 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: sib-clustering
Successfully installed sib-clustering-0.2.0
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m18.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m34.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m33.3 MB/s[0m eta [36m0:00:00[0m


### Global Variables

In [None]:
# Display names of the 20 topic classes. The list is passed as `target_names`
# to classification_report, so its order is assumed to follow the integer
# label ids (0..19) -- TODO confirm against the labelling of the CSVs.
target_names = [
    "Analyst Update",
    "Fed | Central Banks",
    "Company | Product News",
    "Treasuries | Corporate Debt",
    "Dividend",
    "Earnings",
    "Energy | Oil",
    "Financials",
    "Currencies",
    "General News | Opinion",
    "Gold | Metals | Materials",
    "IPO",
    "Legal | Regulation",
    "M&A | Investments",
    "Macro",
    "Markets",
    "Politics",
    "Personnel Change",
    "Stock Commentary",
    "Stock Movement",
]

### Utility Functions

In [None]:
#Function for creating tokenized data and outputs for models

def create_datasets(tokenizer, train, val, test, max_length=512):
  """Tokenize the train / validation / test texts with the same settings.

  Args:
    tokenizer: callable tokenizer (e.g. a Hugging Face ``BertTokenizer``).
      It is invoked as ``tokenizer(texts, padding=True, truncation=True,
      max_length=max_length, return_tensors='tf')``.
    train: iterable of raw text strings for the training split.
    val: iterable of raw text strings for the validation split.
    test: iterable of raw text strings for the test split.
    max_length: maximum number of tokens kept per text. Defaults to 512,
      the position-embedding limit of BERT-base style encoders; longer
      texts are truncated instead of failing later inside the model.

  Returns:
    Tuple ``(train_encodings, valid_encodings, test_encodings)`` holding
    whatever the tokenizer returns for each split, in that order.
  """

  def _encode(texts):
    # list(...) materialises pandas Series / generators into the plain list
    # the tokenizer is given; padding=True pads to the longest text in the split.
    return tokenizer(list(texts),
                     padding=True,
                     truncation=True,
                     max_length=max_length,
                     return_tensors='tf')

  return _encode(train), _encode(val), _encode(test)


In [None]:
 # Function for creating model
def create_bert_multiclass_model(model,
                                 num_classes,
                                 hidden_size = 201,
                                 dropout=0.3,
                                 learning_rate=0.00005,
                                 activation='softmax'):
    """
    Build and compile a classification model on top of a BERT encoder,
    using the encoder's pooler output as the sentence representation.

    Args:
        model: encoder that is called with a dict holding 'input_ids',
            'token_type_ids' and 'attention_mask' and whose result exposes
            `.pooler_output`. The object is used as-is (not copied), so every
            classifier built from the same `model` shares its weights.
        num_classes: number of units in the output layer.
        hidden_size: units of the ReLU dense layer placed before the output.
        dropout: dropout rate applied after the hidden dense layer.
        learning_rate: learning rate of the Adam optimizer.
        activation: activation of the output layer. The loss is built with
            `from_logits=False`, i.e. it expects this layer to emit
            probabilities (as with the default 'softmax').

    Returns:
        A compiled Keras model taking the inputs in the order
        [input_ids, token_type_ids, attention_mask] and trained with sparse
        (integer) class labels, reporting accuracy.
    """
    layers = tf.keras.layers

    # Variable-length integer inputs, one per tokenizer output.
    input_ids = layers.Input(shape=(None,), dtype=tf.int64, name='input_ids_layer')
    token_type_ids = layers.Input(shape=(None,), dtype=tf.int64, name='token_type_ids_layer')
    attention_mask = layers.Input(shape=(None,), dtype=tf.int64, name='attention_mask_layer')

    # The encoder is fed a dict keyed by the tokenizer's field names.
    bert_out = model({'input_ids': input_ids,
                      'token_type_ids': token_type_ids,
                      'attention_mask': attention_mask})
    pooler_output = bert_out.pooler_output

    # Classification head: dense(relu) -> dropout -> dense(num_classes).
    hidden = layers.Dense(hidden_size, activation='relu', name='last_hidden_output')(pooler_output)
    hidden = layers.Dropout(dropout, name='dropout')(hidden)
    cls_prediction = layers.Dense(num_classes, activation=activation, name='cls_output')(hidden)

    bert_cls_model = tf.keras.Model(inputs=[input_ids, token_type_ids, attention_mask],
                                    outputs=cls_prediction)
    # `metrics` is passed as a list: a bare string is rejected by newer Keras releases.
    bert_cls_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
                           loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False),
                           metrics=['accuracy'])

    return bert_cls_model

### Importing Data & BOW

In [None]:
# Mount Google Drive at /content/drive; the dataset CSVs and the model
# checkpoints used below live under that path. Colab-only: relies on the
# google.colab package.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load the cleaned train / test CSVs from the mounted Drive. Later cells read
# the `text` and `label` columns from both frames.
train_data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/w266/finalProject/data/clean_train_data.csv')
test_data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/w266/finalProject/data/clean_test_data.csv')

In [None]:
# Bag-of-words features for sIB: term counts restricted to the 10K most
# frequent words of the training texts.
vectorizer = CountVectorizer(max_features=10000)
X = vectorizer.fit_transform(train_data['text'])

In [None]:
# Sanity check: (number of training texts, vocabulary size) of the count matrix.
X.shape

(16990, 10000)

### sIB Model

In [None]:
# Cluster the count vectors into 50 groups with sIB.
#   n_init=10        -> 10 random initializations; the best one is returned
#   max_iter=15      -> up to 15 optimization iterations per initialization
#   n_jobs=-1        -> use all cores of the running machine
#   random_state=128 -> fixed seed for the random initializations
sib = SIB(
    n_clusters=50,
    n_init=10,
    max_iter=15,
    n_jobs=-1,
    random_state=128,
    verbose=True,
)
sib.fit(X)

Initialization complete
sIB information stats on best partition:
	I(T;Y) = 2.2845, H(T) = 5.5975
	I(T;Y)/I(X;Y) = 0.3192
	H(T)/H(X) = 0.3984


In [None]:
# Cluster id assigned by sIB to each row of X (one entry per training text);
# these serve as pseudo-labels for the inter-training step.
y_sib = sib.labels_
y_sib

array([28,  4, 36, ...,  4, 13, 28], dtype=int32)

### Creating train/val data sets with sIB results & test data set

In [None]:
# Replacing original labels with sIB labels.
# The text column is selected by name (not by position) so a change in the
# CSV column order cannot silently pick a different column. The original
# index is kept, which later cells use to look up the true labels.
train_data_sib = train_data[['text']].copy()
# tolist() stores the cluster ids as plain Python ints, one per row of train_data.
train_data_sib['labels'] = y_sib.tolist()
train_data_sib

Unnamed: 0,text,labels
0,Here are Thursday's biggest analyst calls: App...,28
1,Buy Las Vegas Sands as travel to Singapore bui...,4
2,"Piper Sandler downgrades DocuSign to sell, cit...",36
3,"Analysts react to Tesla's latest earnings, bre...",28
4,Netflix and its peers are set for a ‘return to...,28
...,...,...
16985,KfW credit line for Uniper could be raised to ...,0
16986,KfW credit line for Uniper could be raised to ...,0
16987,Russian sells 1 bln roubles at one-year repo...,4
16988,Global ESG bond issuance posts H1 dip as supra...,13


In [None]:
# Persist the texts with their sIB pseudo-labels to Drive for reuse.
# NOTE(review): no `index=` argument is given, so pandas also writes the row
# index as an unnamed first column -- confirm that is wanted by readers of this file.
train_data_sib.to_csv('/content/drive/MyDrive/Colab Notebooks/w266/finalProject/data/sib_train.csv')

In [None]:
# 80/20 train/validation split of the texts and their sIB pseudo-labels.
# The split keeps the original row index, which is reused to fetch the true labels.
x_train, x_val, sib_y_train, sib_y_val = train_test_split(
    train_data_sib['text'],
    train_data_sib['labels'],
    test_size=0.20,
    random_state=42,
)

print(f"x_train shape: {x_train.shape}")
print(f"sib_y_train shape: {sib_y_train.shape}\n")
print(f"x_val shape: {x_val.shape}")
print(f"sib_y_val shape: {sib_y_val.shape}\n")

x_train shape: (13592,)
sib_y_train shape: (13592,)

x_val shape: (3398,)
sib_y_val shape: (3398,)



In [None]:
# True labels for the same rows as the sIB split: look them up in train_data
# by the index labels carried over from train_test_split.
y_train = train_data['label'].loc[sib_y_train.index]
y_val = train_data['label'].loc[sib_y_val.index]

print(f"y_train shape: {y_train.shape}\n")
print(f"y_val shape: {y_val.shape}\n")

y_train shape: (13592,)

y_val shape: (3398,)



In [None]:
# Sanity check: the split returns a pandas Series (create_datasets converts it with list()).
type(x_train)

pandas.core.series.Series

In [None]:
# Held-out test texts and their true labels, taken as-is from test_data.
x_test = test_data.text
y_test = test_data.label

print(f"x_test shape: {x_test.shape}\n")
print(f"y_test shape: {y_test.shape}\n")

x_test shape: (4117,)

y_test shape: (4117,)



In [None]:
# Sanity check: same container type as x_train.
type(x_test)

pandas.core.series.Series

### BERT-base
Applying transfer learning from SIB to BERT-base model

In [None]:
# Loading bert-base-uncased tokenizer/model.
# TFAutoModel returns the encoder only; the classification head is added
# later by create_bert_multiclass_model.
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_base_model = TFAutoModel.from_pretrained("bert-base-uncased")

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

In [None]:
# Loading previously saved SIB-Bert-base weights
# bert_base_model.load_weights('./sib_bert-base(1).h5', by_name=True)

In [None]:
# Creating bert-base-uncased encodings for the train / validation / test texts
base_train_encodings, base_valid_encodings, base_test_encodings = create_datasets(bert_tokenizer, x_train, x_val, x_test)

In [None]:
# Sanity check

# base_train_encodings

In [None]:
# Creating SIB-bert-base-uncased model: bert-base encoder plus a 50-way head,
# one output per sIB cluster (n_clusters=50).
train_sib_bert_base_model = create_bert_multiclass_model(bert_base_model, num_classes=50)

In [None]:
# Inter-training step for BERT-base: fit the 50-way model on the sIB
# pseudo-labels for 5 epochs (batch size 8), validating on the held-out split.
train_sib_bertbase_model_history = train_sib_bert_base_model.fit(
    x=[
        base_train_encodings.input_ids,
        base_train_encodings.token_type_ids,
        base_train_encodings.attention_mask,
    ],
    y=sib_y_train,
    validation_data=(
        [
            base_valid_encodings.input_ids,
            base_valid_encodings.token_type_ids,
            base_valid_encodings.attention_mask,
        ],
        sib_y_val,
    ),
    batch_size=8,
    epochs=5,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
# Checkpoint: save the SIB-bert-base model (trained on sIB labels) to Drive
train_sib_bert_base_model.save('/content/drive/MyDrive/Colab Notebooks/w266/Model Weights/train_sib_bertbase_model.keras')

In [None]:
# Creating the final bert-base classifier with a fresh 20-way head (one output
# per entry of target_names). The same `bert_base_model` instance that was
# just trained on the sIB labels is passed in, so the encoder is reused
# rather than reloaded from the pretrained checkpoint.
BERT_base_Cit = create_bert_multiclass_model(bert_base_model, num_classes=20)

In [None]:
# Fine-tune the sIB-inter-trained BERT-base model on the true labels
# for 5 epochs (batch size 8), validating on the held-out split.
BERT_base_Cit_model_history = BERT_base_Cit.fit(
    x=[
        base_train_encodings.input_ids,
        base_train_encodings.token_type_ids,
        base_train_encodings.attention_mask,
    ],
    y=y_train,
    validation_data=(
        [
            base_valid_encodings.input_ids,
            base_valid_encodings.token_type_ids,
            base_valid_encodings.attention_mask,
        ],
        y_val,
    ),
    batch_size=8,
    epochs=5,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
# Score BERT_base_Cit on the test set; evaluate() yields the loss first and
# the compiled accuracy metric second.
cit_bertbase_results = BERT_base_Cit.evaluate(
    x=[
        base_test_encodings.input_ids,
        base_test_encodings.token_type_ids,
        base_test_encodings.attention_mask,
    ],
    y=y_test,
    batch_size=8,
)

print(
    f"Model accuracy: {cit_bertbase_results[1]}\n"
    f"Model loss: {cit_bertbase_results[0]}"
)

Model accuracy: 0.870293915271759
Model loss: 0.5849003195762634


In [None]:
# Per-class precision / recall / F1 for BERT-base-Cit on the test set:
# predict class probabilities, take the arg-max as the predicted class id.
base_cit_y_pred = BERT_base_Cit.predict(
    [
        base_test_encodings.input_ids,
        base_test_encodings.token_type_ids,
        base_test_encodings.attention_mask,
    ]
)
pred_base_Cit_model = tf.argmax(base_cit_y_pred, axis=-1)

print(
    classification_report(
        y_test,
        pred_base_Cit_model.numpy(),
        target_names=target_names,
        digits=4,
    )
)



                             precision    recall  f1-score   support

             Analyst Update     0.8525    0.7123    0.7761        73
        Fed | Central Banks     0.8815    0.8692    0.8753       214
     Company | Product News     0.8637    0.8850    0.8742       852
Treasuries | Corporate Debt     0.9333    0.7273    0.8175        77
                   Dividend     0.9789    0.9588    0.9688        97
                   Earnings     0.9512    0.9669    0.9590       242
               Energy | Oil     0.7844    0.8973    0.8371       146
                 Financials     0.8605    0.9250    0.8916       160
                 Currencies     0.8286    0.9062    0.8657        32
     General News | Opinion     0.8066    0.7321    0.7676       336
  Gold | Metals | Materials     0.7333    0.8462    0.7857        13
                        IPO     0.6500    0.9286    0.7647        14
         Legal | Regulation     0.8448    0.8235    0.8340       119
          M&A | Investments     0

In [None]:
# Saving BERT-base-Cit model/weights: weights only (.h5) and the full model (.keras).
# NOTE(review): both files go to the current working directory, not to the
# mounted Drive like the sIB checkpoint -- confirm they are copied out before
# the runtime is discarded.
BERT_base_Cit.save_weights('./BERT-base-Cit-weights.h5')
BERT_base_Cit.save('./BERT-base-Cit-model.keras')

### Fin-bert

Applying transfer learning from SIB to Fin-bert model



In [None]:
# Loading finbert-pretrained encoder and its tokenizer.
# from_pt=True converts the published PyTorch checkpoint into the TF model.
finbert_base_model = TFAutoModel.from_pretrained('yiyanghkust/finbert-pretrain', ignore_mismatched_sizes=True, from_pt=True)
finbert_base_tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-pretrain')

Downloading (…)lve/main/config.json:   0%|          | 0.00/359 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/442M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/226k [00:00<?, ?B/s]

In [None]:
# Creating Fin-Bert encodings: the same texts as for BERT-base, tokenized with
# the FinBERT tokenizer.
fin_train_encodings, fin_valid_encodings, fin_test_encodings = create_datasets(finbert_base_tokenizer, x_train, x_val, x_test)

In [None]:
# Creating SIB-Fin-bert model: FinBERT encoder plus a 50-way head, one output
# per sIB cluster (n_clusters=50).
train_sib_finbert_base_model = create_bert_multiclass_model(finbert_base_model, num_classes=50)

In [None]:
# Inter-training step for FinBERT: fit the 50-way model on the sIB
# pseudo-labels for 5 epochs (batch size 8), validating on the held-out split.
train_sib_finbert_base_model_history = train_sib_finbert_base_model.fit(
    x=[
        fin_train_encodings.input_ids,
        fin_train_encodings.token_type_ids,
        fin_train_encodings.attention_mask,
    ],
    y=sib_y_train,
    validation_data=(
        [
            fin_valid_encodings.input_ids,
            fin_valid_encodings.token_type_ids,
            fin_valid_encodings.attention_mask,
        ],
        sib_y_val,
    ),
    batch_size=8,
    epochs=5,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
# Checkpoint: save the SIB-fin-bert model (trained on sIB labels) to Drive
train_sib_finbert_base_model.save('/content/drive/MyDrive/Colab Notebooks/w266/Model Weights/train_sib_finbert_model.keras')

In [None]:
# Creating the final FinBERT classifier with a fresh 20-way head (one output
# per entry of target_names). The same `finbert_base_model` instance that was
# just trained on the sIB labels is passed in, so the encoder is reused
# rather than reloaded from the pretrained checkpoint.
FinBert_Cit = create_bert_multiclass_model(finbert_base_model, num_classes=20)

In [None]:
# Fine-tune the sIB-inter-trained FinBERT model on the true labels
# for 5 epochs (batch size 8), validating on the held-out split.
FinBert_Cit_model_history = FinBert_Cit.fit(
    x=[
        fin_train_encodings.input_ids,
        fin_train_encodings.token_type_ids,
        fin_train_encodings.attention_mask,
    ],
    y=y_train,
    validation_data=(
        [
            fin_valid_encodings.input_ids,
            fin_valid_encodings.token_type_ids,
            fin_valid_encodings.attention_mask,
        ],
        y_val,
    ),
    batch_size=8,
    epochs=5,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
# Evaluating the FinBert-Cit model on the test set; evaluate() yields the loss
# first and the compiled accuracy metric second.
# The result gets its own name so the BERT-base scores kept in
# `cit_bertbase_results` are not overwritten and both models stay comparable.
cit_finbert_results = FinBert_Cit.evaluate([fin_test_encodings.input_ids, fin_test_encodings.token_type_ids, fin_test_encodings.attention_mask],
                                         y_test,
                                         batch_size=8)

print(f"Model accuracy: {cit_finbert_results[1]}\n"+
      f"Model loss: {cit_finbert_results[0]}")

Model accuracy: 0.8576633334159851
Model loss: 0.6967729330062866


In [None]:
# Per-class precision / recall / F1 for FinBert-Cit on the test set:
# predict class probabilities, take the arg-max as the predicted class id.
fin_cit_y_pred = FinBert_Cit.predict(
    [
        fin_test_encodings.input_ids,
        fin_test_encodings.token_type_ids,
        fin_test_encodings.attention_mask,
    ]
)
pred_Fin_Cit_model = tf.argmax(fin_cit_y_pred, axis=-1)

print(
    classification_report(
        y_test,
        pred_Fin_Cit_model.numpy(),
        target_names=target_names,
        digits=4,
    )
)



                             precision    recall  f1-score   support

             Analyst Update     0.9400    0.6438    0.7642        73
        Fed | Central Banks     0.9171    0.8785    0.8974       214
     Company | Product News     0.8641    0.8955    0.8795       852
Treasuries | Corporate Debt     0.8857    0.8052    0.8435        77
                   Dividend     0.9792    0.9691    0.9741        97
                   Earnings     0.9352    0.9545    0.9448       242
               Energy | Oil     0.9423    0.6712    0.7840       146
                 Financials     0.7723    0.9750    0.8619       160
                 Currencies     0.7838    0.9062    0.8406        32
     General News | Opinion     0.8315    0.6905    0.7545       336
  Gold | Metals | Materials     0.6875    0.8462    0.7586        13
                        IPO     0.8750    1.0000    0.9333        14
         Legal | Regulation     0.8306    0.8655    0.8477       119
          M&A | Investments     0

In [None]:
# Saving FinBert-Cit model/weights: weights only (.h5) and the full model (.keras).
# NOTE(review): both files go to the current working directory, not to the
# mounted Drive like the sIB checkpoint -- confirm they are copied out before
# the runtime is discarded.
FinBert_Cit.save_weights('./FinBert-Cit-weights.h5')
FinBert_Cit.save('./FinBert-Cit-model.keras')

### References


*   sIB: https://github.com/IBM/sib


