In [1]:
from transformers import AutoTokenizer

# Hub identifier of the checkpoint to work with; the same name is reused
# further down when the model weights are loaded.
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=checkpoint)

In [3]:
# Two sample sentences of different lengths, tokenized together as one batch.
raw_inputs = [
    "I've been waiting for a HuggingFace course my whole life.",
    "I hate this so much!",
]

# The tokenizer returns PyTorch tensors (return_tensors="pt"):
#   input_ids      -> integer ids of the tokens, taken from the model's
#                     vocabulary; these are what the model turns into
#                     embedding vectors
#   attention_mask -> 1 for real tokens, 0 for padding positions
# padding=True pads the shorter sentence so both rows have the same length
# (visible as the trailing 0s in the printed input_ids).
inputs = tokenizer(
    raw_inputs,
    padding=True,
    truncation=True,
    return_tensors="pt",
)
print(inputs)

{'input_ids': tensor([[  101,  1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,
          2607,  2026,  2878,  2166,  1012,   102],
        [  101,  1045,  5223,  2023,  2061,  2172,   999,   102,     0,     0,
             0,     0,     0,     0,     0,     0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]])}


In [4]:
from transformers import AutoModel

# AutoModel loads only the base Transformer network, without a task head
# (the loading message below reports the classifier weights as unused).
# Given the tokenized inputs, this model returns "hidden states" (also called
# "features"): high-dimensional vectors that represent the model's contextual
# understanding of the input.
# The hidden states are in turn the input to a second component, the "head",
# and the head determines which task the full model solves.
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
model = AutoModel.from_pretrained(pretrained_model_name_or_path=checkpoint)


Downloading:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased-finetuned-sst-2-english were not used when initializing DistilBertModel: ['classifier.bias', 'classifier.weight', 'pre_classifier.weight', 'pre_classifier.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [6]:
import torch

# Inference only: run the forward pass with autograd disabled so that no
# computation graph is built and kept alive for the returned tensors.
with torch.no_grad():
    outputs = model(**inputs)
print(outputs.last_hidden_state.shape)

# last_hidden_state has shape (batch size, sequence length, hidden size):
#   batch size      -> number of input sentences (2 here)
#   sequence length -> number of tokens per sentence after padding (16 here)
#   hidden size     -> dimension of the vector produced for each token (768 here)
# Model outputs behave like namedtuples / dictionaries: fields such as
# last_hidden_state can be read by attribute.

torch.Size([2, 16, 768])


In [11]:
# Each task has its own architecture class (*Model, *ForCausalLM,
# *ForMaskedLM, *ForTokenClassification, etc.), so for a concrete task it is
# best practice to load the task-specific class instead of the generic
# 'AutoModel'.

from transformers import AutoModelForSequenceClassification

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
# Rebinds `model` (and `outputs` below) to the sequence-classification variant.
model = AutoModelForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path=checkpoint
)
# Same tokenized batch as before, now passed through the classification model.
outputs = model(**inputs)

In [12]:
print(outputs.logits.shape)

# A task-specific architecture returns a task-specific output:
# here the logits have one row per input sentence and one column per label,
# i.e. 2 sentences x 2 labels.

torch.Size([2, 2])


In [14]:
# The model returns raw scores ('logits'), not 'probabilities'.
# Applying softmax over the last dimension (the labels) normalizes each row
# into probabilities that sum to 1.
# NOTE: the recorded output shows a grad_fn on the result, i.e. the logits
# were computed with autograd tracking enabled; for pure inference the forward
# pass can be run under torch.no_grad().

import torch

predictions = torch.softmax(outputs.logits, dim=-1)
print(predictions)

tensor([[4.0195e-02, 9.5980e-01],
        [9.9946e-01, 5.4418e-04]], grad_fn=<SoftmaxBackward>)


In [16]:
# id2label maps each column index of the predictions to its label name:
# index 0 is "NEGATIVE" and index 1 is "POSITIVE" (not "a value close to 0").
# For the predictions printed above: the first sentence is POSITIVE
# (~0.96 at index 1) and the second is NEGATIVE (~0.999 at index 0).
model.config.id2label

{0: 'NEGATIVE', 1: 'POSITIVE'}