# Behind the pipeline

In [1]:
!pip install datasets evaluate transformers[sentencepiece]

Collecting datasets
  Downloading datasets-2.14.4-py3-none-any.whl (519 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.3/519.3 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting transformers[sentencepiece]
  Downloading transformers-4.32.0-py3-none-any.whl (7.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.5/7.5 MB[0m [31m23.0 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━

# How Pipeline() function works in behind?
1. Preprocessing
-> Text to numbers using Autotokenizer with padding and trancation.
2. Model
-> Takes those preprocessing text and returns logits. We use AutoModel, AutoModelForSequenceClassification.
3. Postprocessing
-> softmax to map those logits to different labels.  

## Preprocessing

In [2]:
from transformers import AutoTokenizer
# Model weights
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

Downloading (…)okenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

In [3]:
raw_inputs = [
    "I don't dislike this movie.",
    "This movie is sick."
]
inputs = tokenizer(raw_inputs, padding=True, truncation=True, return_tensors="pt")
print(inputs)

{'input_ids': tensor([[  101,  1045,  2123,  1005,  1056, 18959,  2023,  3185,  1012,   102],
        [  101,  2023,  3185,  2003,  5305,  1012,   102,     0,     0,     0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 0, 0, 0]])}


## Model

In [7]:
from transformers import AutoModel
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
model = AutoModel.from_pretrained(checkpoint)

Downloading model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

In [8]:
outputs = model(**inputs)
print(outputs.last_hidden_state.shape)

torch.Size([2, 10, 768])


In [9]:
from transformers import AutoModelForSequenceClassification
checkpoint= "distilbert-base-uncased-finetuned-sst-2-english"
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
output = model(**inputs)

In [11]:
print(output.logits.shape)

torch.Size([2, 2])


In [12]:
print(output.logits)

tensor([[-2.5427,  2.5646],
        [ 4.6569, -3.7025]], grad_fn=<AddmmBackward0>)


In [14]:
import torch
pred = torch.nn.functional.softmax(output.logits,dim=-1)
print(pred)

tensor([[6.0164e-03, 9.9398e-01],
        [9.9977e-01, 2.3412e-04]], grad_fn=<SoftmaxBackward0>)


In [15]:
model.config.id2label

{0: 'NEGATIVE', 1: 'POSITIVE'}

In [19]:
# What we see.
from transformers import pipeline

classifier = pipeline("sentiment-analysis")
classifier([
    "I don't dislike this movie.",
    "This movie is sick."
])

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


[{'label': 'POSITIVE', 'score': 0.9939836859703064},
 {'label': 'NEGATIVE', 'score': 0.999765932559967}]