In [None]:
!pip install transformers



In [None]:
import transformers

In [None]:
# Let's talk about pipeline()
# pipeline() connects a model with its pre-processing (tokenisation) and post-processing (decoding) steps.
# On first use it downloads the model for the task (if none is specified, a default model is chosen based on the task) and caches it, so calling it again does not download again.

# Features (tasks available through pipeline()):
# feature-extraction (get the vector representation of a text)
# fill-mask
# ner (named entity recognition)
# question-answering
# sentiment-analysis
# summarization
# text-generation
# translation
# zero-shot-classification

In [None]:
# sentiment analysis
from transformers import pipeline

# no model is named, so the task default is loaded (a DistilBERT checkpoint fine-tuned on SST-2)
sent_pipeline = pipeline("sentiment-analysis")

# single input: a plain string gives a one-element list of {label, score}
single_sentence = "I love you"
print(sent_pipeline(single_sentence))

# batch input: a list of strings gives one {label, score} dict per sentence
sentence_batch = [
    "I have been waiting for this to happen for a long time.",
    "It was the best in the worst possible manner.",
]
print(sent_pipeline(sentence_batch))

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use cpu


[{'label': 'POSITIVE', 'score': 0.9998656511306763}]
[{'label': 'NEGATIVE', 'score': 0.9863830804824829}, {'label': 'NEGATIVE', 'score': 0.9972840547561646}]


In [None]:
# zero-shot classification: rank caller-supplied candidate labels for a piece of text

zsc = pipeline("zero-shot-classification")     # task default: facebook/bart-large-mnli

candidate_topics = ["education", "entertainment", "business"]
zsc_result = zsc("This is a great place to spend the night!", candidate_labels=candidate_topics)
# the result holds the input sequence plus the labels and their scores, highest score first
print(zsc_result)

No model was supplied, defaulted to facebook/bart-large-mnli and revision d7645e1 (https://huggingface.co/facebook/bart-large-mnli).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use cpu


{'sequence': 'This is a great place to spend the night!', 'labels': ['entertainment', 'business', 'education'], 'scores': [0.9663365483283997, 0.017656628042459488, 0.016006896272301674]}


In [None]:
# text-generation: the model continues the given prompt

txt_gen = pipeline("text-generation")   # task default: openai-community/gpt2

prompt = "Hi, I am Charles Xavier"
txt_gen(prompt)

No model was supplied, defaulted to openai-community/gpt2 and revision 607a30d (https://huggingface.co/openai-community/gpt2).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use cpu
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'Hi, I am Charles Xavier University\'s president." So that\'s some good. Good. We\'re looking forward to seeing you as president of all four schools. As it stands, we would like to thank you for your time. It would have been'}]

In [None]:
# specify a model explicitly instead of relying on the task default

txt_gen2 = pipeline("text-generation", model="distilgpt2")

# truncation=True states the truncation behaviour explicitly. Without it the tokenizer
# warns that max_length was given without truncation and falls back to the same
# 'longest_first' strategy, so the result is unchanged and only the warning goes away.
txt_gen2(
    "Elephants are cute animals indeed!",
    num_return_sequences=2,   # ask for two alternative continuations
    max_length=50,            # length cap passed to both tokenisation and generation
    truncation=True,
)

Device set to use cpu
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': "Elephants are cute animals indeed! Their ability to hunt is amazing. They can be very sensitive and tough to touch. One's size and shape are completely different. They are very cute.\nDaughter\nDr. Phil is the man behind"},
 {'generated_text': "Elephants are cute animals indeed!\n\n\nAs I've explained above, alligators have big jaws, which makes them much more durable than any other animal I've heard of! The fact of the matter, when they were first introduced, they"}]

In [None]:
# fill-mask: predict the token hidden behind <mask>

fill_mask = pipeline("fill-mask")   # task default: distilbert/distilroberta-base

# without top_k the pipeline returns its default number of candidates (5 in the output below)
capital_candidates = fill_mask("The capital of France is <mask>.")
print(capital_candidates)

# top_k limits how many candidates are returned
deep_learning_candidates = fill_mask("Deep Learning is all about <mask>", top_k=2)
print(deep_learning_candidates)

No model was supplied, defaulted to distilbert/distilroberta-base and revision fb53ab8 (https://huggingface.co/distilbert/distilroberta-base).
Using a pipeline without specifying a model name and revision in production is not recommended.
Some weights of the model checkpoint at distilbert/distilroberta-base were not used when initializing RobertaForMaskedLM: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu


[{'score': 0.24869529902935028, 'token': 2201, 'token_str': ' Paris', 'sequence': 'The capital of France is Paris.'}, {'score': 0.060371093451976776, 'token': 12790, 'token_str': ' Lyon', 'sequence': 'The capital of France is Lyon.'}, {'score': 0.029179412871599197, 'token': 4612, 'token_str': ' Barcelona', 'sequence': 'The capital of France is Barcelona.'}, {'score': 0.021435674279928207, 'token': 5459, 'token_str': ' Berlin', 'sequence': 'The capital of France is Berlin.'}, {'score': 0.020067056640982628, 'token': 12696, 'token_str': ' Monaco', 'sequence': 'The capital of France is Monaco.'}]
[{'score': 0.09428312629461288, 'token': 2239, 'token_str': ' learning', 'sequence': 'Deep Learning is all about learning'}, {'score': 0.02784472145140171, 'token': 31512, 'token_str': ' persistence', 'sequence': 'Deep Learning is all about persistence'}]


In [None]:
# NER (named entity recognition)

# aggregation_strategy="simple" merges adjacent tokens that belong to the same entity
# into one result (e.g. "Ayush Sur", "Google Colab"). It is the replacement for the
# deprecated grouped_entities=True flag and produces the same grouping.
ner = pipeline("ner", aggregation_strategy="simple")     # task default: dbmdz/bert-large-cased-finetuned-conll03-english

ner("Hello! I am Ayush Sur from Kolkata studying in Hugging Face and using Google Colab right now")

No model was supplied, defaulted to dbmdz/bert-large-cased-finetuned-conll03-english and revision 4c53496 (https://huggingface.co/dbmdz/bert-large-cased-finetuned-conll03-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu


[{'entity_group': 'PER',
  'score': 0.9991752,
  'word': 'Ayush Sur',
  'start': 12,
  'end': 21},
 {'entity_group': 'LOC',
  'score': 0.99825805,
  'word': 'Kolkata',
  'start': 27,
  'end': 34},
 {'entity_group': 'ORG',
  'score': 0.60294354,
  'word': '##gging Face',
  'start': 49,
  'end': 59},
 {'entity_group': 'MISC',
  'score': 0.9414491,
  'word': 'Google Colab',
  'start': 70,
  'end': 82}]

In [None]:
question_answerer = pipeline("question-answering")  # task default: distilbert/distilbert-base-cased-distilled-squad
# the answer is extracted from the supplied context: the result carries the answer text and its start/end offsets

qa_question = "Where do I work?"
qa_context = "My name is Ayush, from Kolkata, recently workign on YouTube"
question_answerer(question=qa_question, context=qa_context)

No model was supplied, defaulted to distilbert/distilbert-base-cased-distilled-squad and revision 564e9b5 (https://huggingface.co/distilbert/distilbert-base-cased-distilled-squad).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use cpu


{'score': 0.6807622909545898, 'start': 52, 'end': 59, 'answer': 'YouTube'}

In [None]:
summarizer = pipeline("summarization")  # task default: sshleifer/distilbart-cnn-12-6

# text to condense (encyclopedia-style excerpt, citation markers left in)
article = """ India, officially the Republic of India,[j][21] is a country in South Asia. It is the seventh-largest country by area; the most populous country from June 2023 onwards;[22][23] and since its independence in 1947, the world's most populous democracy.[24][25][26] Bounded by the Indian Ocean on the south, the Arabian Sea on the southwest, and the Bay of Bengal on the southeast, it shares land borders with Pakistan to the west;[k] China, Nepal, and Bhutan to the north; and Bangladesh and Myanmar to the east. In the Indian Ocean, India is near Sri Lanka and the Maldives; its Andaman and Nicobar Islands share a maritime border with Thailand, Myanmar, and Indonesia.Modern humans arrived on the Indian subcontinent from Africa no later than 55,000 years ago.[28][29][30] Their long occupation, predominantly in isolation as hunter-gatherers, has made the region highly diverse, second only to Africa in human genetic diversity.[31] Settled life emerged on the subcontinent in the western margins of the Indus river basin 9,000 years ago, evolving gradually into the Indus Valley Civilisation of the third millennium BCE.[32] By 1200 BCE, an archaic form of Sanskrit, an Indo-European language, had diffused into India from the northwest.[33][34] Its hymns recorded the dawning of Hinduism in India.[35] India's pre-existing Dravidian languages were supplanted in the northern regions.[36] By 400 BCE, caste had emerged within Hinduism,[37] and Buddhism and Jainism had arisen, proclaiming social orders unlinked to heredity.[38] Early political consolidations gave rise to the loose-knit Maurya and Gupta Empires.[39] Widespread creativity suffused this era,[40] but the status of women declined,[41] and untouchability became an organized belief.[l][42] In South India, the Middle kingdoms exported Dravidian language scripts and religious cultures to the kingdoms of Southeast Asia.[43]"""

# min_length / max_length bound the length of the generated summary
summarizer(article, max_length=250, min_length=100)


No model was supplied, defaulted to sshleifer/distilbart-cnn-12-6 and revision a4f8f3e (https://huggingface.co/sshleifer/distilbart-cnn-12-6).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use cpu


[{'summary_text': ' India, officially the Republic of India, is a country in South Asia . It is the seventh-largest country by area; the most populous country from June 2023 onwards . Bounded by the Indian Ocean on the south, the Arabian Sea on the southwest, and the Bay of Bengal on the southeast . It shares land borders with Pakistan to the west; China, Nepal, and Bhutan to the north; and Bangladesh and Myanmar to the east . Its Andaman and Nicobar Islands share a maritime border with Thailand, Myanmar, and Indonesia .'}]

In [None]:
# What happens inside a pipeline():
# preprocess -> model -> postprocess

# preprocess: tokenisation
from transformers import AutoTokenizer

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"          # checkpoint name: selects which model's tokenizer to load
tokenizer = AutoTokenizer.from_pretrained(checkpoint)                   # the tokenizer files are cached after the first download


In [None]:
# two sentences of different length, to show padding at work
raw_inputs = [
    "LLM is one of the most fascinating thing I have ever seen",
    "Thanks to the community",
]

# padding=True: with several sentences, the shorter ones are padded with extra tokens so the batch can be processed in parallel
# truncation=True: tokens beyond max_length (if not given, the model-specific limit) are cut off from the right
# return_tensors="pt": return PyTorch tensors
inputs = tokenizer(raw_inputs, padding=True, truncation=True, return_tensors="pt")
print(inputs)
# inputs: dictionary with input_ids (the token ids) and attention_mask (0 on pad positions so attention ignores the padding)

{'input_ids': tensor([[  101,  2222,  2213,  2003,  2028,  1997,  1996,  2087, 17160,  2518,
          1045,  2031,  2412,  2464,   102],
        [  101,  4283,  2000,  1996,  2451,   102,     0,     0,     0,     0,
             0,     0,     0,     0,     0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]])}


In [None]:
# model: the middle step; it works on the tokenised inputs
# A model can carry different heads for different tasks,
# e.g. *ForSequenceClassification, *ForMaskedLM, *ForQuestionAnswering.
# The default (plain Model, no head) returns the last hidden state rather than task logits.
# ****** IMPORTANT ******
# A language model can have different heads, but they share the same body (everything before the head) -> the transformer encoder for BERT.
# The head's architecture differs per task because each task produces a different kind of answer.

from transformers import AutoModel

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
model = AutoModel.from_pretrained(checkpoint)
outputs = model(**inputs)    # `inputs` is the tokenizer output built in the earlier cell
print(outputs.last_hidden_state.shape)    # one hidden vector per token: [2, 15, 768] for the 2 x 15 batch of ids

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

torch.Size([2, 15, 768])


In [None]:
# sequence classification: the same checkpoint, loaded with its classification head

from transformers import AutoModelForSequenceClassification

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
outputs = model(**inputs)        # **inputs unpacks the dict into keyword arguments (input_ids=..., attention_mask=...) - plain Python syntax
outputs        # raw logits, one row per sentence and one column per class; not probabilities yet

SequenceClassifierOutput(loss=None, logits=tensor([[-4.0629,  4.3351],
        [-4.2504,  4.5804]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [None]:
# last phase: post-processing
# turn the logits into probabilities with softmax, then map them to labels

import torch

# softmax along the last dim (across the class columns), so every row sums to 1
out = outputs.logits.softmax(dim=-1)
print(out)

tensor([[2.2527e-04, 9.9977e-01],
        [1.4614e-04, 9.9985e-01]], grad_fn=<SoftmaxBackward0>)


In [None]:
# toy 2x2 example of what `dim` means in a reduction
temp = torch.tensor([[1, 2], [3, 4]], requires_grad=False)

# dim=0 collapses the row axis -> one sum per column
temp2 = temp.sum(dim=0)
# dim=1 collapses the column axis -> one sum per row
temp3 = temp.sum(dim=1)

print(temp2, temp3, sep="\n")

tensor([4, 6])
tensor([3, 7])


In [None]:
# three stacked 2x2 matrices -> shape (3, 2, 2)
temp_3d = torch.tensor(
    [
        [[1, 2], [2, 3]],
        [[4, 2], [5, 3]],
        [[11, 20], [21, 0]],
    ]
)
print(temp_3d.shape)

torch.Size([3, 2, 2])


In [None]:
# good example: reducing dim 0 adds the three 2x2 matrices element-wise, leaving a single 2x2 result
torch.sum(temp_3d, dim=0)

tensor([[16, 24],
        [28,  6]])

In [None]:
# labels: the model config maps each output column index to its class name
model.config.id2label

{0: 'NEGATIVE', 1: 'POSITIVE'}

In [None]:
# per id2label, column 0 is NEGATIVE and column 1 is POSITIVE,
# so both sentences are classified POSITIVE (probability ~0.9998 each)
out

tensor([[2.2527e-04, 9.9977e-01],
        [1.4614e-04, 9.9985e-01]], grad_fn=<SoftmaxBackward0>)

In [None]:
# Alternative to AutoModel

# The AutoModel class and all of its relatives are actually simple wrappers over the wide variety of models available in the library.
# It’s a clever wrapper as it can automatically guess the appropriate model architecture for your checkpoint, and then instantiates a model with this architecture.

# Say we just want to build the model architecture, without pretrained weights:

from transformers import BertConfig, BertModel

config = BertConfig()          # default hyper-parameters: hidden_size, num_attention_heads, num_hidden_layers, ...
print(config)
model = BertModel(config)      # untrained model with random weights (scale set by config.initializer_range, not Kaiming init)

BertConfig {
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.48.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}



In [None]:
# To load the model together with its pretrained weights

# No explicit config is needed: from_pretrained() fetches the checkpoint's config.json along with the weights
from transformers import BertModel
model = BertModel.from_pretrained("bert-base-cased")

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

In [None]:
# model.save_pretrained("rel_loc")
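# it saves 2 files: config.json and a weights file (model.safetensors in recent transformers versions, pytorch_model.bin in older ones)
# the weights file stores the parameters; config.json stores the metadata needed to rebuild the architecture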

In [None]:
# tokenizers

# word-level, character-level, BPE, SentencePiece, ...

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
# named `text` rather than `input`, so the built-in input() is not shadowed for the rest of the kernel session
text = "Hello I am Ayush"

# encoded = tokenizer(text, padding=True, truncation=True, return_tensors="pt")            # what we have done: string to ids
# encoded

# step 1: split the string into sub-word tokens ('##' prefixes a piece that continues the previous token)
tokenized = tokenizer.tokenize(text)
print(tokenized)

# step 2: look up the vocabulary id of each token (one id per token, nothing else is added)
ids = tokenizer.convert_tokens_to_ids(tokenized)
print(ids)

# step 3: map the ids back to a string
decoded_string = tokenizer.decode(ids)
print(decoded_string)

['Hello', 'I', 'am', 'A', '##yu', '##sh']
[8667, 146, 1821, 138, 9379, 2737]
Hello I am Ayush


In [None]:
# Note: HF AutoModel expects a 2D tensor (a batch of sequences) as input

from transformers import AutoTokenizer, AutoModel

checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModel.from_pretrained(checkpoint)

# named `text` rather than `input`, so the built-in input() is not shadowed
text = "This is a great cinema."
tokenized = tokenizer.tokenize(text)
ids = tokenizer.convert_tokens_to_ids(tokenized)

# Deliberate failure: a 1-D tensor of ids has no batch dimension.
# The error is caught and printed so the demonstration stays visible
# without stopping "Restart & Run All" at this cell.
try:
    out = model(torch.tensor(ids))
except IndexError as err:
    print(f"{type(err).__name__}: {err}")      # IndexError: too many indices for tensor of dimension 1

IndexError: too many indices for tensor of dimension 1

In [None]:
# add the missing batch dimension: shape (6,) -> (1, 6)
batched_ids = torch.tensor(ids).unsqueeze(0)
output = model(batched_ids)
output.last_hidden_state.shape

torch.Size([1, 6, 768])

In [None]:
# or: let tokenizer() do all the steps in a single call
tokenized_inp = tokenizer( "This is a great cinema.", padding=True, truncation=True, return_tensors='pt')       # automatically adds the extra (batch) dim
# padding=True (same as 'longest') -> pad to the longest sequence in the batch; with truncation=True, tokens beyond the model's limit are cut off
# padding='max_length' -> pad to the max_length specified (if none is given, the model's maximum sequence length)
output = model(**tokenized_inp)
# output = model(torch.tensor([tokenized_inp.input_ids]))      # if return_tensors is not 'pt'
output.last_hidden_state.shape

torch.Size([1, 8, 768])

In [None]:
# manual route (tokenize + convert_tokens_to_ids): 6 ids, no special tokens
torch.tensor(ids)

tensor([1188, 1110,  170, 1632, 7678,  119])

In [None]:
# tokenizer() route: 8 ids - the same 6 wrapped in 101 ... 102 - plus token_type_ids and attention_mask
tokenized_inp

{'input_ids': [101, 1188, 1110, 170, 1632, 7678, 119, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1]}

In [None]:
# the two routes differ in length because tokenizer() adds two extra ids; decoding them shows what they are
tokenizer.decode([101,102])        # 101 -> [CLS], 102 -> [SEP]: the special tokens from the BERT paper

'[CLS] [SEP]'

In [None]:
# Hence, always call tokenizer() directly instead of breaking tokenisation down into manual steps: it also adds the special tokens ([CLS], [SEP]).