In [1]:
! pip install transformers



# Behind Pipelines

## pipeline usage

In [25]:
# Pipelines
from transformers import pipeline
raw_inputs =  [
        "I've been waiting for a HuggingFace course my whole life.",
        "I hate this so much!",
        ]
# Official way:
# model = pipeline("Taks_Name", possible_params)
# model(input, possible_params)
model = pipeline("sentiment-analysis")
model(raw_inputs)

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use cpu


[{'label': 'POSITIVE', 'score': 0.9598049521446228},
 {'label': 'NEGATIVE', 'score': 0.9994558691978455}]

## AutoModel and AutoTokenizer
The **output of AutoModel** contians last_hidden_state which shape is made of
1.    the input sequnce length,
2.    batch size (number of inputs), and
3.    hidden state size (the vector dimention of each input).

In [26]:
from transformers import AutoTokenizer
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"    # A model name from the list of HuggingFace models.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

raw_inputs = [
    "I've been waiting for a HuggingFace course my whole life.",
    "I hate this so much!",
]
inputs    = tokenizer(raw_inputs, padding=True, truncation=True, return_tensors="pt")
print(inputs)


from transformers import AutoModel
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"    # NOTE: This model should be exactly the same model used for the AutoTokenizer
model      = AutoModel.from_pretrained(checkpoint)
output    = model(**inputs)

print(output.keys())
print (output.last_hidden_state.shape)

{'input_ids': tensor([[  101,  1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,
          2607,  2026,  2878,  2166,  1012,   102],
        [  101,  1045,  5223,  2023,  2061,  2172,   999,   102,     0,     0,
             0,     0,     0,     0,     0,     0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]])}
odict_keys(['last_hidden_state'])
torch.Size([2, 16, 768])


## Model

The **model** is represented by its embeddings layer and the subsequent layers.
*   The embeddings layer converts each input ID in the tokenized input into a vector that represents the associated token.
*   The subsequent layers manipulate those vectors using the attention mechanism to produce the final representation of the sentences.


In [31]:
from transformers import BertConfig, BertModel
config = BertConfig()
#print (config)
model = BertModel(config)                              # Randomly initialized model
model = BertModel.from_pretrained("bert-base-cased")  # initialize with the bert-based-cased pretrained model

input_seq = ["Hello!", "Cool.", "Nice!"]
encoded_seq = [
    [101, 7592, 999, 102],
    [101, 4658, 1012, 102],
    [101, 3835, 999, 102],
]
model_input = torch.tensor(encoded_seq)
output = model(model_input)
print(output.keys())
model.save_pretrained("Directory_name_path")

odict_keys(['last_hidden_state', 'pooler_output'])


## Model Head

**AutoModelForSequenceClassification** contains AutoModel + Head. When AutoModel delivers [seq_length, betch_size, hidden_state], this model process the output through one more step and delivers "logits".

The following **archs** handle the Model-Head for variouse tasks:
 *   *Model (retrieve the hidden states)
 *   *ForCausalLM
 *   *ForMaskedLM
 *   *ForMultipleChoice
 *   *ForQuestionAnswering
 *   *ForSequenceClassification
 *   *ForTokenClassification
 *   ...

In [35]:
from transformers import AutoModelForSequenceClassification
classifier = AutoModelForSequenceClassification.from_pretrained(checkpoint)
output = classifier(**inputs)
print(output.keys())
print(output.logits.shape)

import torch
prediction = torch.nn.functional.softmax(output.logits, dim=-1)
print (prediction)
print(classifier.config.id2label)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


odict_keys(['logits'])
torch.Size([2, 2])
tensor([[0.4370, 0.5630],
        [0.4432, 0.5568]], grad_fn=<SoftmaxBackward0>)
{0: 'LABEL_0', 1: 'LABEL_1'}


## Tokenizer
Tokenizatin Algorithms are:
1. word-based
2. character-based
3. subword-based (BPE, WordPiece , unigram(or sentencePiece))

Tokenizer processes sequences in two steps
1. tokenize()
2. convert_tokens_to_ids().

You could use these steps (Aproach 1 in the following code) or simply call the the tokenizer object itself (Aproach 2).

IDs could be decodeed into sequences using decode().

**Note:** sometimes the ids created by Aproach1 are not the same as ids created by tokenize.__call__ (Approach2).
Infact, some models add special words at the beginning or at the end (or both!) of the sequences.

In any case, the tokenizer.__call__ knows which ones are expected and will deal with this for you, while tokenize+convert_tokens_to_ids returns exactly the same ids related to the sequence tokens.

In [39]:
# Saving tokens:
from transformers import BertTokenizer
checkpoint = 'bert-base-cased'
tokenizer = BertTokenizer.from_pretrained(checkpoint)
tokenizer.save_pretrained("a_dirctory_to_save_tokenizer")

# How tokenizer works:
import torch
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

# Aprotch 1: Internal Steps
from transformers import  AutoTokenizer
the_tokenizer = AutoTokenizer.from_pretrained(checkpoint)
raw_input  = "This is my input data!"
tokens     = the_tokenizer.tokenize(raw_input)
IDs        = the_tokenizer.convert_tokens_to_ids(tokens)
IDs_tensor = torch.tensor([IDs])            # This tensor of ids could be fed into a model. the [] is addede to handle the model input compatibility
decoded_string = the_tokenizer.decode(IDs)  # converts IDs to tokens and group them together inorder to create a string


output = model(IDs_tensor)
print(output.logits)

# Aprotch 2: use the tokenizer object
# simply use the object of AutoTokenizer (or any other Tokenizer such as BertTokenizer Object)
from transformers import  AutoTokenizer
input = the_tokenizer(raw_input, padding=True, truncation = True , return_tensors='pt')
output = model(**input)
print(output.logits)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tensor([[-0.0036, -0.0829]], grad_fn=<AddmmBackward0>)
tensor([[-0.1554,  0.0003]], grad_fn=<AddmmBackward0>)


## Multiple Sequence
1. Batchig same length sequnces
2. Batchig different length sequnces
    1. Padding
    2. Attention Mask
3. Truncate the sequnce if they are too long. Most models can handle sequences of up to 512 or 1024 tokens.
    1.  Longformer and LED  are models that support very long sequences.
    2. to truncate a sequence: tr_seq = seq[:max], while max is the maximum length of a sequence which could be supported by the model

**Note**: the padding token-id could be found in **tokenizer.pad_token_id**


In [43]:
# 1) Batchig same length sequnces
batched_ids = [IDs,IDs]
bectched_output = model(torch.tensor(batched_ids))

# 2) Batchig different length sequnces
seq1 = [200, 200, 200]
seq2 = [200, 200]
batched_ids = [seq1, seq2]
#print(model(torch.tensor(batched_ids)).logits)  # Got Error. Need Padding and Mask

# 2-1) Padding
padding_ids = tokenizer.pad_token_id
padded_batched_ids = [seq1, [200, 200, padding_ids]]
print(model(torch.tensor([seq1])).logits)
print(model(torch.tensor([seq2])).logits)
print(model(torch.tensor(padded_batched_ids)).logits) # The result are not the same! Need to define whic value should be considered in the calculations. To do so, we use Attenstion Mask

# 2-2) Attention Mask
attention_mask = [
    [1, 1, 1],
    [1, 1, 0],
]   # 0 Shows the related valuse shoul not be considered in the calculations

print(model(
    torch.tensor(padded_batched_ids),
    attention_mask = torch.tensor(attention_mask)
    ).logits)

tensor([[-0.2260,  0.0486]], grad_fn=<AddmmBackward0>)
tensor([[-0.2355,  0.0344]], grad_fn=<AddmmBackward0>)
tensor([[-0.2260,  0.0486],
        [-0.1968, -0.0216]], grad_fn=<AddmmBackward0>)
tensor([[-0.2260,  0.0486],
        [-0.2355,  0.0344]], grad_fn=<AddmmBackward0>)


## Tokenized or Multiple sequences
Tokenizer Object can handle Padding, Mask Attention, as well as Truncate automatically.

padding could be done according to:
* The  Longest sequnce ("longest")
* The model maximum supported length ("max_length")
* The specified max length (padding="max_length",max_length=8")

Truncation is supported in the following forms:
* Truncate sequences that are longer than the model max length using truncation=True . For example BERT or DistilBERT max-length are 512
*  Truncate the sequences that are longer than the specified max length using (truncation=True, max_length=8)

Tokenizer could retunrs tensor in the following formats:
*  PyTorch tensors     (return_tensors="pt")
*  TensorFlow tensors  (return_tensors="tf")
*  Numpy tensors       (return_tensors="np"

Lets take a look:

In [45]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

input_sequences = ["This is the first sequence of tokens", "And this is the second one", "The list could be continued !"]
input_sequences = ["I've been waiting for a HuggingFace course my whole life.", "So have I!"]

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer  = AutoTokenizer.from_pretrained (checkpoint)

input = tokenizer(input_sequences, padding = "longest")                      # padding could be Longest sequnce
input = tokenizer(input_sequences, padding = "max_length")                   # Or The model maximum supported length
input = tokenizer(input_sequences, padding = "max_length", max_length=8)     # Or The specified max length (padding="max_length",max_length=8")

inputs = tokenizer(input_sequences, truncation=True)                         # Will truncate the sequences that are longer than the model max length(512 for BERT or DistilBERT)
inputs = tokenizer(input_sequences, truncation=True, max_length=8)           # Will truncate the sequences that are longer than the specified max length

inputs = tokenizer(input_sequences, padding=True, return_tensors="pt")       # Returns PyTorch tensors
inputs = tokenizer(input_sequences, padding=True, return_tensors="tf")       # Returns TensorFlow tensors
inputs = tokenizer(input_sequences, padding=True, return_tensors="np")       # Returns Numpy tensors

## Altogether

In [48]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

input_sequences = ["This is the first sequence of tokens", "And this is the second one", "The list could be continued !"]
input_sequences = ["I've been waiting for a HuggingFace course my whole life.", "So have I!"]

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
model      = AutoModelForSequenceClassification.from_pretrained(checkpoint)
tokenizer  = AutoTokenizer.from_pretrained (checkpoint)
inputs = tokenizer(input_sequences, padding=True, truncation=True, return_tensors="pt")
output = model(**inputs)
print(output.keys(), output.logits)

odict_keys(['logits']) tensor([[-1.5607,  1.6123],
        [-3.6183,  3.9137]], grad_fn=<AddmmBackward0>)
