In [15]:
#pip install transformers --user
#pip install torch --user 

In [33]:
from transformers import pipeline, AutoTokenizer,AutoModelForSequenceClassification
import torch
import torch.nn.functional as F


In [17]:
classifier = pipeline("sentiment-analysis")
results=classifier(["I'm so happy to see you", 'I am kind of not okay'])
results

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


[{'label': 'POSITIVE', 'score': 0.9998531341552734},
 {'label': 'NEGATIVE', 'score': 0.9997685551643372}]

# How the pipeline works - under the hood 

## Tokenizer ====> Model ====> postprocessing 

---
<ol>
    <li>Raw text (I'm Good )</li>
    <li>outputids (101, 2002,566... 102 ) </li>
    <li>Logits ([-4.021, 5.2458] )</li>
    <li>prediction ({POSITIVE:98.245%, NEGATIVE:2.547%} )</li>
    </ol>
    
    
    
    

## Tokenizer 

raw text > tokens ->add special token (at the beginning and at the end of the text) -> input ids

In [2]:
from transformers import AutoTokenizer

model_name = "distilbert-base-uncased-finetuned-sst-2-english"

tokenizer = AutoTokenizer.from_pretrained(model_name)
# from_pretrained -> will download and cache the vocabulary associated with model_name (distilbert-base-uncased-finetuned-sst-2-english)
raw_inputs = ["I'm so happy to see you", 'I am kind of not okay']

inputs= tokenizer(raw_inputs, padding=True, truncation=True, return_tensors='pt')
# The inputs are not equal length all the time, so padding= true will add zeros to keep equal dimension array
# truncation=True any sentence longer than that maximum the model can handle will be truncated

print(inputs)

{'input_ids': tensor([[ 101, 1045, 1005, 1049, 2061, 3407, 2000, 2156, 2017,  102],
        [ 101, 1045, 2572, 2785, 1997, 2025, 3100,  102,    0,    0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 0, 0]])}


# Model

In [4]:

from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(model_name)
outputs = model(**inputs)
print(outputs.logits)

tensor([[-4.2329,  4.5928],
        [ 4.6835, -3.6875]], grad_fn=<AddmmBackward0>)


# Postprocessing 

In [5]:
import torch

predictions = torch.nn.functional.softmax(outputs.logits, dim=1)
print(predictions)

tensor([[1.4689e-04, 9.9985e-01],
        [9.9977e-01, 2.3145e-04]], grad_fn=<SoftmaxBackward0>)


In [6]:
labels = torch.argmax(predictions, dim=1)

In [7]:
labels

tensor([1, 0])

In [8]:
labels = [model.config.id2label[label_id] for label_id in labels.tolist()]

In [9]:
labels

['POSITIVE', 'NEGATIVE']