In [1]:
!pip install torch
!pip install transformers



## Importing Required Libraries

In [2]:
from transformers import pipeline
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer



def warn(*args,**kwargs):
    pass
import warnings
warnings.warn = warn
warnings.filterwarnings('ignore')

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


## Text Classification with DistilBERT

#### Load the model and the tokenizer

In [9]:
# load the tokenizer and the model
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased-finetuned-sst-2-english')
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased-finetuned-sst-2-english')

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Error while downloading from https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english/resolve/main/model.safetensors: HTTPSConnectionPool(host='cas-bridge.xethub.hf.co', port=443): Read timed out.
Trying to resume download...


model.safetensors:   8%|7         | 21.0M/268M [00:00<?, ?B/s]

Error while downloading from https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english/resolve/main/model.safetensors: HTTPSConnectionPool(host='cas-bridge.xethub.hf.co', port=443): Read timed out.
Trying to resume download...


model.safetensors:  31%|###1      | 83.9M/268M [00:00<?, ?B/s]

Error while downloading from https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english/resolve/main/model.safetensors: HTTPSConnectionPool(host='cas-bridge.xethub.hf.co', port=443): Read timed out.
Trying to resume download...


model.safetensors:  31%|###1      | 83.9M/268M [00:00<?, ?B/s]

## Process the input text

In [40]:
# sample text
text = "Congratulations! You've won a free ticket to the Bahamas. Reply WIN to claim."

# tokenize the input text
tokens = tokenizer(text,return_tensors='pt')

print(tokens)

{'input_ids': tensor([[  101, 23156,   999,  2017,  1005,  2310,  2180,  1037,  2489,  7281,
          2000,  1996, 17094,  1012,  7514,  2663,  2000,  4366,  1012,   102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}


In [41]:
tokens["input_ids"]

tensor([[  101, 23156,   999,  2017,  1005,  2310,  2180,  1037,  2489,  7281,
          2000,  1996, 17094,  1012,  7514,  2663,  2000,  4366,  1012,   102]])

attention_mask is essential for correctly processing padded sequences, ensuring efficient computation, and maintaining model performance. 
Even when no tokens are explicitly masked, it helps the model differentiate between actual content and padding, which is critical for accurate and efficient processing of input data

In [42]:
with torch.no_grad():
    outputs = model(input_ids = tokens["input_ids"], attention_mask = tokens["attention_mask"])

In [43]:
outputs

SequenceClassifierOutput(loss=None, logits=tensor([[-3.9954,  4.3336]]), hidden_states=None, attentions=None)

In [44]:
logits = outputs.logits
logits.shape

torch.Size([1, 2])

## Post Process the output

In [45]:
# Convert logits to probabilities
probs = torch.softmax(logits,dim = -1)

# get the predicted class
predicted_class = torch.argmax(probs, dim = -1)

# Map the predicted class to the label
labels = ["NEGATIVE","POSITIVE"]
predicted_label = labels[predicted_class]

print(f"Predicted label: {predicted_label}")

Predicted label: POSITIVE


## Text Generation with GPT-2

In [46]:
# load the tokenizer and the model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

In [47]:
model = GPT2LMHeadModel.from_pretrained("gpt2")

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [64]:
# Process the input text
prompt = "Once upon a time"

prompt_encoded = tokenizer(prompt,return_tensors = 'pt')

prompt_encoded

{'input_ids': tensor([[7454, 2402,  257,  640]]), 'attention_mask': tensor([[1, 1, 1, 1]])}

## Perform Inference

* `inputs:` input token IDs form tokenizer
* `attention_mask:` mask indicating which tokens to attend to
* `pad_token:` Padding token ID set to the end of sequence token ID
* `max_length:` Maximum Length of the generated sequences
* `num_return_sequence:` Number of sequences to generate


In [65]:
# generate text
generate_text = model.generate(
    prompt_encoded.input_ids,
    attention_mask = prompt_encoded.attention_mask,
    pad_token_id = tokenizer.eos_token_id,
    max_length = 50,
    num_return_sequences = 1
    
)

In [66]:
generate_text

tensor([[7454, 2402,  257,  640,   11,  262,  995,  373,  257, 1295,  286, 1049,
         8737,  290, 1049, 3514,   13,  383,  995,  373,  257, 1295,  286, 1049,
         3514,   11,  290,  262,  995,  373,  257, 1295,  286, 1049, 3514,   13,
          383,  995,  373,  257, 1295,  286, 1049, 3514,   11,  290,  262,  995,
          373,  257]])

In [67]:
with torch.no_grad():
    outputs = model(**prompt_encoded)

In [68]:
outputs.logits

tensor([[[ -34.5646,  -34.4082,  -38.3080,  ...,  -41.6997,  -39.7802,
           -35.0522],
         [ -84.7255,  -82.9326,  -87.0165,  ...,  -91.6667,  -86.2354,
           -84.7094],
         [-109.0799, -105.7258, -109.9115,  ..., -114.2847, -107.6933,
          -105.3613],
         [ -57.8935,  -58.5540,  -64.7374,  ...,  -64.9437,  -62.9294,
           -60.0624]]])

In [69]:
# decode the generated text
text_output = tokenizer.decode(generate_text[0], skip_special_tokens = True)
print(text_output)

Once upon a time, the world was a place of great beauty and great danger. The world was a place of great danger, and the world was a place of great danger. The world was a place of great danger, and the world was a


## Hugging Face Pipeline() Function

The `pipeline()` function from the huggingface transformers library  is a high level API designed to simplify the usage of pretrained models for various natural language processing (NLP) task.

transformers.pipeline(
    
    task: str,
    model: Optional = None,
    config: Optional = None,
    tokenizer: Optional = None,
    feature_extractor: Optional = None,
    framework: Optional = None,
    revision: str = 'main',
    use_fast: bool = True,
    model_kwargs: Dict[str, Any] = None,
    **kwargs
)

**Parameters**

* task: str
  * The task to perform, such as "text classification", "text-generation", "question-answering"
  * example: "text-classification"
* model: Optional
    *  The model to use. This can be string (model identifier from huggingface model hub), a path to a directory
 

* **Task types**
    1. Text Classification:
    2. Text Generation:
    3. Q-A answering
    4. summarization
    5. Translation
    6. fill mask
    7. zero shot classification
    8. feature extraction
    9. named entity recognition  

## Text Classification using pipeline()

Initialize the pipeline for text classification task. load a pretrained text classification model and use it to classify a sample text.


In [77]:
## load the text classification model
classifier = pipeline(task = "text-classification",model = "distilbert-base-uncased-finetuned-sst-2-english")

## Classify a sample task
result = classifier("Congratulations! You've won the lottery")
print(result)

[{'label': 'POSITIVE', 'score': 0.9998331069946289}]


The output will return a dictionary, where each dictionary contains
* label: the predicted label
* scores: the confidence score of predictions

## Language detection using Pipeline()

In [78]:
# load the classifier
classifier = pipeline(task = 'text-classification', model = 'papluca/xlm-roberta-base-language-detection')
result = classifier('Bonjour, comment ca va?')
print(result)

config.json: 0.00B [00:00, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

Error while downloading from https://huggingface.co/papluca/xlm-roberta-base-language-detection/resolve/main/model.safetensors: HTTPSConnectionPool(host='cas-bridge.xethub.hf.co', port=443): Read timed out.
Trying to resume download...


model.safetensors:   2%|1         | 21.0M/1.11G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/502 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

[{'label': 'fr', 'score': 0.9924910664558411}]


## Text Generation Using Pipeline()

In [80]:
## text generation using pipeline
generator = pipeline(task = 'text-generation', model = 'gpt2')

prompt = 'once upon a time'

result = generator(prompt,max_length = 50,num_return_sequences = 1)

# print the generated text
print(result[0]['generated_text'])

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


once upon a time.

I will not be silent. I will not say something unteniably. This is what was seen.

But you cannot hide from it.

You said, "Do not let me be surprised;


if we use:
print(result) --> we might get this
[{'generated_text': 'Once upon a time there was a little rabbit who loved...'}]

result -> a list
result[0] -> the first dictionary in that list
result[0]['generated_text'] -> the actual generated string



## Text generation using t5 with pipeline()

In [81]:
generator = pipeline(task = 'text2text-generation', model = 't5-small')
prompt = 'translate: English to French : How are you?'

result = generator(prompt,max_length =50, num_return_sequences = 1)

print(result[0]['generated_text'])

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

: Comment êtes-vous?


## Fill Mask task by pipeline()

In [86]:
mask_filler = pipeline(task = 'fill-mask', model = 'bert-base-uncased')

prompt = "The capital of France is [MASK]"

result = mask_filler(prompt)

for r in result:
    print(f"{r['sequence']} (score: {r['score']:.4f})")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'bert.pooler.dense.bias', 'cls.seq_relationship.weight', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


the capital of france is. (score: 0.7154)
the capital of france is ; (score: 0.2527)
the capital of france is | (score: 0.0281)
the capital of france is! (score: 0.0021)
the capital of france is? (score: 0.0013)
