# Behind the Pipeline

## Sentiment Analysis (finbert)

In [40]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
from utils.bert_models_result import result_output_finbert, result_output_bert
import torch
import torch.nn.functional as F
from pprint import pprint

In [2]:
raw_inputs = [
    "This is an absolute joke!",
    "Cristiano Ronaldo is better than Messi.",
    "I'm genuinly happy for his success."
]

In [3]:
finbert_checkpoint = "ProsusAI/finbert"

### Using Pipeline

In [4]:
sent_pipe_finbert = pipeline(task='sentiment-analysis', model=finbert_checkpoint)

Device set to use mps:0


In [5]:
# sent_pipe.model.config

In [6]:
pprint(sent_pipe_finbert(raw_inputs))

[{'label': 'negative', 'score': 0.7869253158569336},
 {'label': 'neutral', 'score': 0.5224577188491821},
 {'label': 'positive', 'score': 0.7966387867927551}]


### Using AutoTokenizer and AutoModelForSequenceClassification 

In [7]:
# initialize tokenizer
finbert_tokenizer = AutoTokenizer.from_pretrained(finbert_checkpoint)

In [8]:
finbert_tokenizer

BertTokenizerFast(name_or_path='ProsusAI/finbert', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True, added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
)

In [9]:
# Extract token data
finbert_token_data = finbert_tokenizer(
    raw_inputs, 
    padding=True, 
    truncation=True,
    return_tensors="pt"
)

In [10]:
finbert_token_data

{'input_ids': tensor([[  101,  2023,  2003,  2019,  7619,  8257,   999,   102,     0,     0,
             0,     0,     0],
        [  101, 13675,  2923, 15668,  8923,  2080,  2003,  2488,  2084,  6752,
          2072,  1012,   102],
        [  101,  1045,  1005,  1049,  8991, 20023,  2135,  3407,  2005,  2010,
          3112,  1012,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [11]:
finbert_tokens = []
for inp in raw_inputs:
    finbert_tokens.append(finbert_tokenizer.tokenize(inp))

In [12]:
pprint(finbert_tokens, compact=True)

[['this', 'is', 'an', 'absolute', 'joke', '!'],
 ['cr', '##ist', '##iano', 'ronald', '##o', 'is', 'better', 'than', 'mess',
  '##i', '.'],
 ['i', "'", 'm', 'gen', '##uin', '##ly', 'happy', 'for', 'his', 'success', '.']]


In [13]:
# initialize model
finbert_model = AutoModelForSequenceClassification.from_pretrained(finbert_checkpoint)
# model

In [14]:
# model.config

In [15]:
# input the token data to our model
# Forward pass
with torch.no_grad():
    finbert_model_output = finbert_model(**finbert_token_data)

In [16]:
finbert_model_output

SequenceClassifierOutput(loss=None, logits=tensor([[-1.6557,  2.0578,  0.6569],
        [ 0.9340, -2.0800,  1.0718],
        [ 1.6271, -2.4764,  0.1948]]), hidden_states=None, attentions=None)

In [17]:
# logits
finbert_logits = finbert_model_output['logits']
finbert_logits

tensor([[-1.6557,  2.0578,  0.6569],
        [ 0.9340, -2.0800,  1.0718],
        [ 1.6271, -2.4764,  0.1948]])

In [18]:
finbert_logits.shape

torch.Size([3, 3])

In [19]:
# softmax
finbert_output_probs = F.softmax(input=finbert_logits, dim=-1)
finbert_output_probs

tensor([[0.0192, 0.7869, 0.1939],
        [0.4552, 0.0223, 0.5225],
        [0.7966, 0.0132, 0.1902]])

In [20]:
# results 
finbert_final_result = result_output_finbert(finbert_model, finbert_output_probs)
finbert_final_result

[{'label': 'NEGATIVE', 'score': 0.7869253158569336},
 {'label': 'NEUTRAL', 'score': 0.5224579572677612},
 {'label': 'POSITIVE', 'score': 0.7966387867927551}]

In [21]:
pprint(sent_pipe_finbert(raw_inputs))

[{'label': 'negative', 'score': 0.7869253158569336},
 {'label': 'neutral', 'score': 0.5224577188491821},
 {'label': 'positive', 'score': 0.7966387867927551}]


In [22]:
# save model
#  finbert_model.save_pretrained('./saved_models/finbert_model/')

## Sentiment Analysis (bert)

In [23]:
bert_checkpoint = "distilbert/distilbert-base-uncased-finetuned-sst-2-english"

In [24]:
sent_pipe_bert = pipeline('sentiment-analysis', model=bert_checkpoint)

Device set to use mps:0


In [25]:
sent_pipe_bert(raw_inputs)

[{'label': 'NEGATIVE', 'score': 0.9991211295127869},
 {'label': 'POSITIVE', 'score': 0.9997335076332092},
 {'label': 'POSITIVE', 'score': 0.9998682737350464}]

In [26]:
# initialize tokenizer
bert_tokenizer = AutoTokenizer.from_pretrained(bert_checkpoint)

In [27]:
# Extract token data
bert_token_data = bert_tokenizer(
    raw_inputs, 
    padding=True, 
    truncation=True,
    return_tensors="pt"
)

In [28]:
bert_token_data

{'input_ids': tensor([[  101,  2023,  2003,  2019,  7619,  8257,   999,   102,     0,     0,
             0,     0,     0],
        [  101, 13675,  2923, 15668,  8923,  2080,  2003,  2488,  2084,  6752,
          2072,  1012,   102],
        [  101,  1045,  1005,  1049,  8991, 20023,  2135,  3407,  2005,  2010,
          3112,  1012,   102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [29]:
bert_tokens = []
for inp in raw_inputs:
    bert_tokens.append(bert_tokenizer.tokenize(inp))

In [31]:
pprint(bert_tokens, compact=True)

[['this', 'is', 'an', 'absolute', 'joke', '!'],
 ['cr', '##ist', '##iano', 'ronald', '##o', 'is', 'better', 'than', 'mess',
  '##i', '.'],
 ['i', "'", 'm', 'gen', '##uin', '##ly', 'happy', 'for', 'his', 'success', '.']]


In [32]:
bert_model = AutoModelForSequenceClassification.from_pretrained(bert_checkpoint)

In [33]:
bert_model_output = bert_model(**bert_token_data)

In [34]:
bert_model_output

SequenceClassifierOutput(loss=None, logits=tensor([[ 3.9217, -3.1143],
        [-3.9471,  4.2828],
        [-4.2709,  4.6639]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [35]:
bert_logits = bert_model_output['logits']
bert_logits

tensor([[ 3.9217, -3.1143],
        [-3.9471,  4.2828],
        [-4.2709,  4.6639]], grad_fn=<AddmmBackward0>)

In [36]:
bert_output_probs = F.softmax(bert_logits, dim=-1)
bert_output_probs

tensor([[9.9912e-01, 8.7888e-04],
        [2.6649e-04, 9.9973e-01],
        [1.3171e-04, 9.9987e-01]], grad_fn=<SoftmaxBackward0>)

In [41]:
bert_final_result = result_output_bert(model=bert_model, output_probs=bert_output_probs)

In [42]:
bert_final_result

[{'label': 'NEGATIVE', 'score': 0.9991211295127869},
 {'label': 'POSITIVE', 'score': 0.9997335076332092},
 {'label': 'POSITIVE', 'score': 0.9998682737350464}]

In [43]:
sent_pipe_bert(raw_inputs)

[{'label': 'NEGATIVE', 'score': 0.9991211295127869},
 {'label': 'POSITIVE', 'score': 0.9997335076332092},
 {'label': 'POSITIVE', 'score': 0.9998682737350464}]