In [49]:
#https://github.com/ProsusAI/finBERT

In [1]:
# Create datafiles
# cd
# source ./anaconda3/bin/activate
# conda activate finbert
# cd /mnt/files/workdata/work/python-scripts/prediction_analyzer/predict-stock-quotes/finbert
# python scripts/datasets.py --data_path /mnt/files/workdata/work/python-scripts/prediction_analyzer/predict-stock-quotes/finbert/data/benzinga_data/benzinga_text_labels.txt

# FinBERT Example Notebook

This notebook shows how to train and use the FinBERT pre-trained language model for financial sentiment analysis.

## Modules 

In [1]:
from pathlib import Path
import shutil
from collections import Counter
import os
import logging
import sys
sys.path.append('..')

from textblob import TextBlob
from pprint import pprint
from sklearn.metrics import classification_report

from transformers import AutoModelForSequenceClassification

from finbert.finbert import *
import finbert.utils as tools

%load_ext autoreload
%autoreload 2

# Project root: the parent of the current working directory.
project_dir = Path.cwd().parent
# Show the full text of every DataFrame cell. `None` means "no limit"; the old
# sentinel -1 was deprecated in pandas 1.0 and is rejected by pandas 2.x.
pd.set_option('display.max_colwidth', None)



In [2]:
# Root-logger setup: timestamped records with an ERROR threshold.
# (basicConfig is a no-op if the root logger already has handlers.)
logging.basicConfig(
    level=logging.ERROR,
    datefmt='%m/%d/%Y %H:%M:%S',
    format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
)

## Prepare the model

### Setting path variables:
1. `lm_path`: the path for the pre-trained language model (If vanilla Bert is used then no need to set this one).
2. `cl_path`: the path where the classification model is saved.
3. `cl_data_path`: the path of the directory that contains the data files of `train.csv`, `validation.csv`, `test.csv`.
---

In the initialization of `bertmodel`, we can either use the original pre-trained weights from Google by giving `bm = 'bert-base-uncased'`, or our further pre-trained language model by giving `bm = lm_path`.


---
All of the model's configuration is controlled through the `config` variable.

#### Path to models from docker container

In [3]:
# Directory of the model files taken from the docker container.
path_docker_models = project_dir.joinpath('models', 'docker_models', 'finbert_model')

In [17]:
# NOTE(review): inspection-only cell that was executed out of order (count 17).
# `cl_data_path` is assigned in the next cell, so on a fresh kernel
# (Restart & Run All) this line raises NameError.
cl_data_path

PosixPath('/mnt/files/workdata/work/python-scripts/prediction_analyzer/predict_stock_quotes/finbert/data/sentiment_data')

In [4]:
# Pre-trained language model: the docker-exported model directory.
lm_path = path_docker_models
# Where the fine-tuned classification model is saved.
cl_path = project_dir.joinpath('models', 'docker_models', 'finbert_model', 'prepare_model')
# Directory containing train.csv, validation.csv and test.csv.
cl_data_path = project_dir.joinpath('finbert', 'data', 'sentiment_data')

###  Configuring training parameters

You can find the explanations of the training parameters in the class docstrings.

In [5]:
# Clean the cl_path so a new run does not start from a previous run's output.
# ignore_errors=True keeps the removal best-effort (e.g. the directory may not
# exist yet) without a bare `except:` that would also hide NameError or
# KeyboardInterrupt.
shutil.rmtree(cl_path, ignore_errors=True)

# Load the sequence-classification model from the language-model directory;
# num_labels=3 matches the three sentiment labels used in this notebook.
bertmodel = AutoModelForSequenceClassification.from_pretrained(
    lm_path,
    cache_dir=None,
    num_labels=3,
)

# Training configuration (parameter meanings are documented on the Config class).
config = Config(
    # data / model locations
    data_dir=cl_data_path,
    model_dir=cl_path,
    bert_model=bertmodel,
    # task
    output_mode='classification',
    max_seq_length=48,
    # optimisation
    num_train_epochs=4,
    train_batch_size=32,
    learning_rate=2e-5,
    warm_up_proportion=0.2,
    discriminate=True,
    gradual_unfreeze=True,
    # -1 is passed for local_rank, as in the original call
    local_rank=-1,
)

`finbert` is our main class that encapsulates all the functionality. The list of class labels should be given in the prepare_model method call with label_list parameter.

In [8]:
# Build the FinBert wrapper from the training configuration.
finbert = FinBert(config)
# NOTE(review): presumably selects the tokenizer/vocabulary to load — confirm in finbert.finbert.
finbert.base_model = 'bert-base-uncased'
# Same values as already passed to Config(...) above; set again explicitly on the instance.
finbert.config.discriminate=True
finbert.config.gradual_unfreeze=True

In [9]:
# The position in label_list is the class id seen in the logged examples
# (positive -> 0, neutral -> 2).
finbert.prepare_model(label_list=['positive', 'negative', 'neutral'])

06/11/2021 15:36:00 - INFO - finbert.finbert -   device: cpu n_gpu: 0, distributed training: False, 16-bits training: False


## Fine-tune the model

In [20]:
# Get the training examples (presumably read from train.csv under cl_data_path — TODO confirm).
train_data = finbert.get_data('train')

In [23]:
# Create the model object that is handed to finbert.train() in the Training section.
model = finbert.create_the_model()

### [Optional] Fine-tune only a subset of the model
The variable `freeze` determines the last layer (out of 12) to be frozen. You can skip this part if you want to fine-tune the whole model.

<span style="color:red">Important: </span>
Execute this step if you want a shorter training time at the expense of accuracy.

In [50]:
#This is for fine-tuning a subset of the model.

# freeze = 6

# for param in model.bert.embeddings.parameters():
#     param.requires_grad = False
    
# for i in range(freeze):
#     for param in model.bert.encoder.layer[i].parameters():
#         param.requires_grad = False

### Training

In [24]:
# Fine-tune on the training examples; the returned model is the one evaluated below.
# The logged run took about 29 minutes for 4 epochs on CPU.
trained_model = finbert.train(train_examples = train_data, model = model)

06/11/2021 15:57:51 - INFO - finbert.utils -   *** Example ***
06/11/2021 15:57:51 - INFO - finbert.utils -   guid: train-1
06/11/2021 15:57:51 - INFO - finbert.utils -   tokens: [CLS] has anyone seen this ‚ au ##re ##gul ##atory fee ? ‚ au what percentage is the fee ? can only be seen of you scroll down on a past sale order . [SEP]
06/11/2021 15:57:51 - INFO - finbert.utils -   input_ids: 101 2038 3087 2464 2023 1522 8740 2890 24848 14049 7408 1029 1522 8740 2054 7017 2003 1996 7408 1029 2064 2069 2022 2464 1997 2017 17186 2091 2006 1037 2627 5096 2344 1012 102 0 0 0 0 0 0 0 0 0 0 0 0 0
06/11/2021 15:57:51 - INFO - finbert.utils -   attention_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0
06/11/2021 15:57:51 - INFO - finbert.utils -   token_type_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
06/11/2021 15:57:51 - INFO - finbert.utils -   label: neutral (id = 2)
06/11/2021 15:57

Iteration:   0%|          | 0/249 [00:00<?, ?it/s]

06/11/2021 16:01:51 - INFO - finbert.utils -   *** Example ***
06/11/2021 16:01:51 - INFO - finbert.utils -   guid: validation-1
06/11/2021 16:01:51 - INFO - finbert.utils -   tokens: [CLS] morgan stanley maintains over ##weight on q ##or ##vo , raises price target to $ 223 [SEP]
06/11/2021 16:01:51 - INFO - finbert.utils -   input_ids: 101 5253 6156 9319 2058 11179 2006 1053 2953 6767 1010 13275 3976 4539 2000 1002 20802 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
06/11/2021 16:01:51 - INFO - finbert.utils -   attention_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
06/11/2021 16:01:51 - INFO - finbert.utils -   token_type_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
06/11/2021 16:01:51 - INFO - finbert.utils -   label: positive (id = 0)
06/11/2021 16:01:51 - INFO - finbert.finbert -   ***** Loading data *****
06/11/2021 16:01:51 - INFO - finbert.finbert -

Validating:   0%|          | 0/52 [00:00<?, ?it/s]

Validation losses: [0.42696812003850937]
No best model found


Epoch:  25%|██▌       | 1/4 [04:37<13:51, 277.16s/it]

Iteration:   0%|          | 0/249 [00:00<?, ?it/s]

06/11/2021 16:08:04 - INFO - finbert.utils -   *** Example ***
06/11/2021 16:08:04 - INFO - finbert.utils -   guid: validation-1
06/11/2021 16:08:04 - INFO - finbert.utils -   tokens: [CLS] morgan stanley maintains over ##weight on q ##or ##vo , raises price target to $ 223 [SEP]
06/11/2021 16:08:04 - INFO - finbert.utils -   input_ids: 101 5253 6156 9319 2058 11179 2006 1053 2953 6767 1010 13275 3976 4539 2000 1002 20802 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
06/11/2021 16:08:04 - INFO - finbert.utils -   attention_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
06/11/2021 16:08:04 - INFO - finbert.utils -   token_type_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
06/11/2021 16:08:04 - INFO - finbert.utils -   label: positive (id = 0)
06/11/2021 16:08:04 - INFO - finbert.finbert -   ***** Loading data *****
06/11/2021 16:08:04 - INFO - finbert.finbert -

Validating:   0%|          | 0/52 [00:00<?, ?it/s]

Epoch:  50%|█████     | 2/4 [10:49<11:06, 333.13s/it]

Validation losses: [0.42696812003850937, 0.46287959226622033]


Iteration:   0%|          | 0/249 [00:00<?, ?it/s]

06/11/2021 16:16:09 - INFO - finbert.utils -   *** Example ***
06/11/2021 16:16:09 - INFO - finbert.utils -   guid: validation-1
06/11/2021 16:16:09 - INFO - finbert.utils -   tokens: [CLS] morgan stanley maintains over ##weight on q ##or ##vo , raises price target to $ 223 [SEP]
06/11/2021 16:16:09 - INFO - finbert.utils -   input_ids: 101 5253 6156 9319 2058 11179 2006 1053 2953 6767 1010 13275 3976 4539 2000 1002 20802 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
06/11/2021 16:16:09 - INFO - finbert.utils -   attention_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
06/11/2021 16:16:09 - INFO - finbert.utils -   token_type_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
06/11/2021 16:16:09 - INFO - finbert.utils -   label: positive (id = 0)
06/11/2021 16:16:09 - INFO - finbert.finbert -   ***** Loading data *****
06/11/2021 16:16:09 - INFO - finbert.finbert -

Validating:   0%|          | 0/52 [00:00<?, ?it/s]

Epoch:  75%|███████▌  | 3/4 [18:58<06:44, 404.13s/it]

Validation losses: [0.42696812003850937, 0.46287959226622033, 0.456798695314389]


Iteration:   0%|          | 0/249 [00:00<?, ?it/s]

06/11/2021 16:26:07 - INFO - finbert.utils -   *** Example ***
06/11/2021 16:26:07 - INFO - finbert.utils -   guid: validation-1
06/11/2021 16:26:07 - INFO - finbert.utils -   tokens: [CLS] morgan stanley maintains over ##weight on q ##or ##vo , raises price target to $ 223 [SEP]
06/11/2021 16:26:07 - INFO - finbert.utils -   input_ids: 101 5253 6156 9319 2058 11179 2006 1053 2953 6767 1010 13275 3976 4539 2000 1002 20802 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
06/11/2021 16:26:07 - INFO - finbert.utils -   attention_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
06/11/2021 16:26:07 - INFO - finbert.utils -   token_type_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
06/11/2021 16:26:07 - INFO - finbert.utils -   label: positive (id = 0)
06/11/2021 16:26:07 - INFO - finbert.finbert -   ***** Loading data *****
06/11/2021 16:26:07 - INFO - finbert.finbert -

Validating:   0%|          | 0/52 [00:00<?, ?it/s]

Epoch: 100%|██████████| 4/4 [28:55<00:00, 433.88s/it]

Validation losses: [0.42696812003850937, 0.46287959226622033, 0.456798695314389, 0.4658872416386238]





In [25]:
# 2021-06-11 Validation losses: [0.42696812003850937, 0.46287959226622033, 0.456798695314389, 0.4658872416386238]

## Test the model

`finbert.evaluate` returns a DataFrame that contains the true label and the logit values for each example.

In [35]:
# Get the test examples used for the evaluation below.
test_data = finbert.get_data('test')

In [36]:
# Evaluate the fine-tuned model; later cells read the `predictions` (logits)
# and `labels` columns of the returned DataFrame.
results = finbert.evaluate(examples=test_data, model=trained_model)

06/14/2021 11:08:13 - INFO - finbert.utils -   *** Example ***
06/14/2021 11:08:13 - INFO - finbert.utils -   guid: test-1
06/14/2021 11:08:13 - INFO - finbert.utils -   tokens: [CLS] rec ##ap : der ##mt ##ech q ##1 earnings [SEP]
06/14/2021 11:08:13 - INFO - finbert.utils -   input_ids: 101 28667 9331 1024 4315 20492 15937 1053 2487 16565 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
06/14/2021 11:08:13 - INFO - finbert.utils -   attention_mask: 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
06/14/2021 11:08:13 - INFO - finbert.utils -   token_type_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
06/14/2021 11:08:13 - INFO - finbert.utils -   label: neutral (id = 2)
06/14/2021 11:08:13 - INFO - finbert.finbert -   ***** Loading data *****
06/14/2021 11:08:13 - INFO - finbert.finbert -     Num examples = 5665
06/14/2021 11:08:13 - INFO - finbert.finbert -

Testing:   0%|          | 0/178 [00:00<?, ?it/s]

In [37]:
def report(df, cols=['label','prediction','logits']):
    """Print the weighted loss, accuracy and classification report for `df`.

    Parameters
    ----------
    df : DataFrame
        One row per evaluated example.
    cols : sequence of three column names
        In order: true labels, predicted labels, logits.

    The loss is a CrossEntropyLoss weighted with the notebook-level
    `finbert.class_weights`. Nothing is returned; everything is printed.
    """
    criterion = CrossEntropyLoss(weight=finbert.class_weights)
    true_col, pred_col, logit_col = cols[0], cols[1], cols[2]

    logits = torch.tensor(list(df[logit_col]))
    targets = torch.tensor(list(df[true_col]))
    loss = criterion(logits, targets)
    print(f"Loss:{loss:.2f}")

    # Fraction of rows where the prediction equals the true label.
    accuracy = (df[true_col] == df[pred_col]).sum() / len(df)
    print(f"Accuracy:{accuracy:.2f}")

    print("\nClassification Report:")
    print(classification_report(df[true_col], df[pred_col]))

In [38]:
# Predicted class id = position of the largest value in each row's `predictions` entry.
results['prediction'] = results['predictions'].map(lambda logits: np.argmax(logits, axis=0))

In [44]:
# The evaluate() output names its columns 'labels' / 'predictions', so the
# defaults of report() are overridden here.
report(results,cols=['labels','prediction','predictions'])

06/14/2021 11:10:36 - INFO - numexpr.utils -   NumExpr defaulting to 8 threads.


Loss:0.41
Accuracy:0.85

Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.86      0.87      2530
           1       0.74      0.83      0.78       797
           2       0.86      0.84      0.85      2338

    accuracy                           0.85      5665
   macro avg       0.82      0.84      0.83      5665
weighted avg       0.85      0.85      0.85      5665



### Get predictions

In [39]:
# Read the tab-separated test split so predictions can be placed next to the original text.
db_test = pd.read_csv(cl_data_path / 'test.csv', sep='\t')

In [40]:
# Series assignment aligns on the index. This assumes `results` rows are in the
# same order as test.csv and both use a default 0..n-1 index — TODO confirm.
db_test['prediction'] = results.prediction

In [41]:
# Class-id encoding of the sentiment labels, and its inverse for decoding predictions.
label_finbert = dict(positive=0, negative=1, neutral=2)
finbert_label = {class_id: name for name, class_id in label_finbert.items()}

In [42]:
# Encode the true labels as class ids and decode the predicted ids to label names.
# `.map` is the value-mapping call: it yields an integer column directly, whereas
# `.replace` from str to int relies on object-dtype downcasting that pandas 2.2
# deprecates. Values missing from the mapping become NaN instead of passing
# through unchanged, so an unexpected label is visible immediately.
db_test['label_num'] = db_test['label'].map(label_finbert)
db_test['predict_label'] = db_test['prediction'].map(finbert_label)
db_test.head()

Unnamed: 0.1,Unnamed: 0,text,label,prediction,label_num,predict_label
0,14687,Recap: DermTech Q1 Earnings,neutral,2,2,neutral
1,5190,"Roth Capital Maintains Buy on Perion Network, Raises Price Target to $34",positive,0,0,positive
2,13748,"Morgan Stanley Maintains Equal-Weight on Datadog, Raises Price Target to $120",positive,0,0,positive
3,15606,Hill-Rom Holdings FQ2 2021 Earnings Preview,neutral,2,2,neutral
4,16522,"Apple Q2 Earnings Report Reinforces It Will Be \'Top-Performing\' FAANG Stock Of 2021, Says Munster",neutral,2,2,neutral


In [43]:
# `classification_report` is already imported in the notebook's import cell,
# so it is not re-imported here.
# Per-class metrics printed with 5 digits, for comparison with the logged runs below.
y_test = db_test['label_num'].to_numpy()
predicted = db_test['prediction'].to_numpy()

print(classification_report(y_test, predicted, digits=5))

              precision    recall  f1-score   support

           0    0.87325   0.86047   0.86681      2530
           1    0.73549   0.82685   0.77850       797
           2    0.86380   0.84089   0.85219      2338

    accuracy                        0.84766      5665
   macro avg    0.82418   0.84274   0.83250      5665
weighted avg    0.84997   0.84766   0.84835      5665



In [None]:
# 2021-06-11 - train file - 8K and new training model result
#               precision    recall  f1-score   support

#            0    0.86189   0.86007   0.86098      1901
#            1    0.74151   0.84228   0.78869       596
#            2    0.85623   0.81608   0.83567      1642

#     accuracy                        0.84006      4139
#    macro avg    0.81987   0.83948   0.82845      4139
# weighted avg    0.84231   0.84006   0.84053      4139

In [None]:
# 2021-06-06 - train file - 21K result
#               precision    recall  f1-score   support

#            0    0.84095   0.87191   0.85615      1983
#            1    0.71965   0.78922   0.75283       631
#            2    0.87508   0.80872   0.84059      1767

#     accuracy                        0.83451      4381
#    macro avg    0.81189   0.82328   0.81653      4381
# weighted avg    0.83725   0.83451   0.83499      4381