In [36]:
from transformers import AutoTokenizer

# Load the cased BERT tokenizer, encode one sentence, and print the raw
# encoding (input_ids, token_type_ids, attention_mask).
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
encoded_input = tokenizer(text="Hello, I'm a Damodar Bagale!")

print(encoded_input)

{'input_ids': [101, 8667, 117, 146, 112, 182, 170, 8732, 16848, 1197, 18757, 6997, 1162, 106, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [37]:
# Round-trip the ids back to text; the decoded string includes the special
# tokens the tokenizer added around the sentence.
tokenizer.decode(encoded_input.input_ids)

"[CLS] Hello, I ' m a Damodar Bagale! [SEP]"

In [38]:
# Encode a sentence pair in one call; the printed token_type_ids show which
# positions belong to the first (0) and second (1) sentence.
encoded_input = tokenizer(
    text="How are you?",
    text_pair="I'm fine, thank you!",
)
print(encoded_input)

{'input_ids': [101, 1731, 1132, 1128, 136, 102, 146, 112, 182, 2503, 117, 6243, 1128, 106, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [39]:
# Let's do the same, but across different models,
# to see how each one tokenizes.

from transformers import AutoTokenizer

# Checkpoints to compare, keyed by a display label that names the subword
# algorithm each tokenizer uses.
tokenizers = {
    "GPT-2 (BPE)": "gpt2",
    "Bert (WordPiece)": "bert-base-cased",
    # T5 uses a SentencePiece (Unigram) vocabulary, not BPE.
    "T5 (SentencePiece Unigram)": "t5-small",
}

# A list rather than a set: set iteration order is arbitrary, which made the
# sentences print in a different order than they are written here.
test_texts = [
    "Shree Ganeshaya Namah !!",
    "Wau, Tokenization is really interssting !",
    "I love my Country, Nepal",
]

for name, model in tokenizers.items():
    # `model` is the checkpoint identifier passed to from_pretrained.
    tokenizer = AutoTokenizer.from_pretrained(model)
    print(f" \n{'-'*50}")
    print(f"  {name} -- vocab: {tokenizer.vocab_size:,}")
    print('='*60)

    for text in test_texts:
        # tokenize() yields the subword strings; encode() yields the ids that
        # are then decoded back to text for comparison with the input.
        tokens = tokenizer.tokenize(text)
        ids = tokenizer.encode(text)
        decoded = tokenizer.decode(ids)

        print(f"\n Text: {text}")
        print(f"Tokens: {tokens}")
        print(f"count: {len(tokens)} tokens")
        print(f" Decoded: {decoded}")

    print()

 
--------------------------------------------------
  GPT-2 (BPE) -- vocab: 50,257

 Text: Wau, Tokenization is really interssting !
Tokens: ['W', 'au', ',', 'ĠToken', 'ization', 'Ġis', 'Ġreally', 'Ġinter', 's', 'st', 'ing', 'Ġ!']
count: 12 tokens
 Decoded: Wau, Tokenization is really interssting !

 Text: I love my Country, Nepal
Tokens: ['I', 'Ġlove', 'Ġmy', 'ĠCountry', ',', 'ĠNepal']
count: 6 tokens
 Decoded: I love my Country, Nepal

 Text: Shree Ganeshaya Namah !!
Tokens: ['Sh', 'ree', 'ĠGan', 'esh', 'aya', 'ĠNam', 'ah', 'Ġ!!']
count: 8 tokens
 Decoded: Shree Ganeshaya Namah !!

 
--------------------------------------------------
  Bert (WordPiece) -- vocab: 28,996

 Text: Wau, Tokenization is really interssting !
Tokens: ['W', '##au', ',', 'To', '##ken', '##ization', 'is', 'really', 'inter', '##ss', '##ting', '!']
count: 12 tokens
 Decoded: [CLS] Wau, Tokenization is really interssting! [SEP]

 Text: I love my Country, Nepal
Tokens: ['I', 'love', 'my', 'Country', ',', 'Nepal']


In [40]:
# Padding & truncation: encode the same batch under three tokenizer settings
# and compare the resulting per-sequence lengths.

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
sequences = [
    "Hello, Its me ...",
    "I am a student from Nepal, who love his country",
    "Actually I am enjoying understanding it deeply after vibing, lol",
]

banner = '-' * 60
print('\n' + banner)
print("Padding & Truncation")
print(banner)

# (label, tokenizer keyword arguments) for each run, in display order:
# no padding, padding only, then padding plus truncation to max_length.
runs = (
    ("\nNo padding --lengths: ", {}),
    ("With padding -- lengths : ", {"padding": True}),
    (
        "With padding & truncation -- lengths : ",
        {"padding": True, "truncation": True, "max_length": 14},
    ),
)

for label, options in runs:
    result = tokenizer(sequences, **options)
    print(label, list(map(len, result['input_ids'])))


------------------------------------------------------------
Padding & Truncation
------------------------------------------------------------

No padding --lengths:  [9, 13, 15]
With padding -- lengths :  [15, 15, 15]
With padding & truncation -- lengths :  [14, 14, 14]


In [41]:
# Checking different models' architectures using their config files.

# Fixed: the original line read `rom transformers import ...`, a SyntaxError.
from transformers import AutoConfig, AutoModel, AutoModelForSequenceClassification

# Checkpoints whose architectures are compared, keyed by display name.
models = {
    "BERT base": "bert-base-uncased",
    "DistilBERT": "distilbert-base-uncased",
    "GPT-2": "gpt2",
}

print("MODEL COMPARISON")
print("="*80)

for name, checkpoint in models.items():
    # The printed config lists the architecture hyperparameters
    # (hidden size, number of layers, attention heads, vocab size, ...).
    config = AutoConfig.from_pretrained(checkpoint)
    # NOTE(review): the loaded model is not used in this cell; it is kept in
    # case later cells rely on `model` -- confirm, then drop if unused.
    model = AutoModel.from_pretrained(checkpoint)

    print(config)

MODEL COMPARISON
BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.57.3",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

DistilBertConfig {
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.