# Encoder-Only Transformers

In [1]:
import sys

assert sys.version_info >= (3, 10)

In [2]:
pip install --upgrade transformers huggingface_hub

Note: you may need to restart the kernel to use updated packages.


In [3]:
import transformers
# Show the transformers version that this kernel actually imports.
print(transformers.__version__)

4.57.0


In [None]:
!pip install --upgrade torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

In [5]:
import torch
# Show the torch build (version plus CUDA tag) that this kernel actually imports.
print(torch.__version__)

2.7.1+cu118


In [6]:
import torch
from transformers import BertTokenizer  # concrete tokenizer class instead of the Auto* factory

# Loading through the concrete class avoids the chat template check.
bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Encode the two sentences together as a single pair, returned as PyTorch tensors.
encoded = bert_tokenizer("A man is playing guitar.", "A person is making music.", return_tensors="pt")

# Each heading is followed by its tensor on the next line.
print("Input IDs:", encoded["input_ids"], sep="\n")
# Segment ids: 0s for the first sentence, 1s for the second.
print("\nToken Type IDs (Segment Embeddings):", encoded["token_type_ids"], sep="\n")
print("\nAttention Mask:", encoded["attention_mask"], sep="\n")

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Input IDs:
tensor([[ 101, 1037, 2158, 2003, 2652, 2858, 1012,  102, 1037, 2711, 2003, 2437,
         2189, 1012,  102]])

Token Type IDs (Segment Embeddings):
tensor([[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]])

Attention Mask:
tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])


In [7]:
from transformers import BertConfig, BertForMaskedLM

# Small BERT configuration; the vocabulary size is taken from the tokenizer loaded earlier.
config = BertConfig(
    vocab_size=bert_tokenizer.vocab_size,
    max_position_embeddings=128,
    hidden_size=128,
    intermediate_size=512,
    num_hidden_layers=2,
    num_attention_heads=4,
)

# Built from the config alone; there is no from_pretrained call here.
bert = BertForMaskedLM(config)

2025-10-12 09:32:24.181142: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1760261544.203056     233 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1760261544.209730     233 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [8]:
from datasets import load_dataset

# Load the "train" split of the "wikitext-2-raw-v1" configuration of "wikitext".
mlm_dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="train")
# Bare last expression so the notebook displays the dataset summary.
mlm_dataset


README.md: 0.00B [00:00, ?B/s]

wikitext-2-raw-v1/test-00000-of-00001.pa(…):   0%|          | 0.00/733k [00:00<?, ?B/s]

wikitext-2-raw-v1/train-00000-of-00001.p(…):   0%|          | 0.00/6.36M [00:00<?, ?B/s]

wikitext-2-raw-v1/validation-00000-of-00(…):   0%|          | 0.00/657k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/4358 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/36718 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3760 [00:00<?, ? examples/s]

Dataset({
    features: ['text'],
    num_rows: 36718
})

In [9]:
def tokenize(example, tokenizer=bert_tokenizer):
    """Tokenize the ``'text'`` field of ``example``.

    Sequences are truncated and padded to a fixed length of 128 tokens.
    The function is mapped with ``batched=True`` below, so ``example`` is
    whatever ``Dataset.map`` passes in for a batch.

    Args:
        example: Mapping with a ``'text'`` entry to tokenize.
        tokenizer: Tokenizer to call; defaults to the notebook's ``bert_tokenizer``
            (bound when this cell is executed).

    Returns:
        Whatever ``tokenizer`` returns for the given text.
    """
    texts = example['text']
    encoding_options = {"truncation": True, "max_length": 128, "padding": "max_length"}
    return tokenizer(texts, **encoding_options)


# Rebinds mlm_dataset to the tokenized dataset.
mlm_dataset = mlm_dataset.map(tokenize, batched=True)

Map:   0%|          | 0/36718 [00:00<?, ? examples/s]

In [10]:
# Display the dataset again to inspect its features after the tokenize map step.
mlm_dataset


Dataset({
    features: ['text', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 36718
})

In [11]:
from transformers import DataCollatorForLanguageModeling

# Masked-language-modelling collator: mlm enabled with a masking probability of 0.15.
mlm_collator = DataCollatorForLanguageModeling(
    tokenizer=bert_tokenizer,
    mlm=True,
    mlm_probability=0.15,
)

In [13]:
from transformers import TrainingArguments, Trainer

# Training setup: three epochs, batches of 16 per device, output under ./my_bert.
args = TrainingArguments(
    output_dir='./my_bert',
    per_device_train_batch_size=16,
    num_train_epochs=3,
    report_to=[],  # empty list of reporting integrations
)

# Train the small BERT on the tokenized dataset, batching with the MLM collator.
trainer = Trainer(model=bert, args=args, train_dataset=mlm_dataset, data_collator=mlm_collator)
trainer_output = trainer.train()

Step,Training Loss
500,7.2365
1000,7.3605
1500,7.3103
2000,7.2271
2500,7.1746
3000,7.1319
3500,7.1273
4000,7.0643
4500,7.0783
5000,7.0224


In [14]:
from transformers import pipeline

torch.manual_seed(42)
# Wrap the bert model trained above and its tokenizer in a fill-mask pipeline.
fill_mask = pipeline("fill-mask", model=bert, tokenizer=bert_tokenizer)
top_predictions = fill_mask("The capital of [MASK] is Rome.")
# Display only the first entry of the returned predictions.
top_predictions[0]

Device set to use cuda:0


{'score': 0.029017271474003792,
 'token': 1010,
 'token_str': ',',
 'sequence': 'the capital of, is rome.'}

In [15]:
from sentence_transformers import SentenceTransformer

# Load the "all-MiniLM-L6-v2" sentence-embedding checkpoint.
model = SentenceTransformer("all-MiniLM-L6-v2")
sentences = [
    "He's playing football",
    "He scored a goal",
    "He's reading a book"
]
# Encode all sentences in one call; convert_to_tensor=True requests tensor output.
embeddings = model.encode(sentences, convert_to_tensor=True)
# Compare the embeddings against themselves; the result is displayed as a 3x3 matrix.
similarities = model.similarity(embeddings, embeddings)
similarities

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

tensor([[1.0000, 0.4097, 0.4713],
        [0.4097, 1.0000, 0.3182],
        [0.4713, 0.3182, 1.0000]], device='cuda:0')

# Decoder-Only Transformers

In [16]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load the tokenizer and the causal-LM model for the "gpt2" checkpoint.
model_id = "gpt2"
gpt2_tokenizer = AutoTokenizer.from_pretrained(model_id)
# device_map="auto" and dtype="auto" leave device placement and dtype choice to the library.
gpt2 = AutoModelForCausalLM.from_pretrained(
    model_id, device_map="auto", dtype="auto")

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [48]:
def generate(model, tokenizer, prompt, max_new_tokens=100, **generate_kwargs):
    """Run ``model.generate`` on ``prompt`` and return the decoded first sequence.

    Args:
        model: Object exposing a ``device`` attribute and a ``generate(**kwargs)`` method.
        tokenizer: Callable used to encode ``prompt`` (called with
            ``return_tensors="pt"``); must also expose ``eos_token_id`` and ``decode``.
        prompt: Text to encode and feed to the model.
        max_new_tokens: Forwarded to ``model.generate``.
        **generate_kwargs: Extra keyword arguments forwarded to ``model.generate``
            (e.g. ``do_sample=True, top_p=0.6``). ``pad_token_id`` may be given
            here to override the default described below.

    Returns:
        ``tokenizer.decode`` applied to the first generated sequence with
        ``skip_special_tokens=True``.
    """
    # Default the padding id to EOS, but let an explicit caller value win instead of
    # colliding with a hard-wired keyword ("got multiple values for keyword argument").
    generate_kwargs.setdefault("pad_token_id", tokenizer.eos_token_id)
    # Move the encoded prompt to the same device as the model before generating.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(**inputs, max_new_tokens=max_new_tokens,
                             **generate_kwargs)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)


In [18]:
# Seed torch's RNG before sampling.
torch.manual_seed(42)
prompt = "In a distant galaxy, there was a planet where"
# Sampled generation with top_p=0.6 passed through to model.generate.
print(
    generate(
        gpt2,
        gpt2_tokenizer,
        prompt,
        do_sample=True,
        top_p=0.6,
    )
)

In a distant galaxy, there was a planet where life was possible. The first known known example of such a planet was the planet Mettok. The first known example of such a planet was the planet Mettok.

In the year 1371, a planet was discovered on a planet known as the moon Enceladus. The moon Enceladus was a star that was approximately 100 million years old. The moon Enceladus was a star that was approximately 100 million years old.

The first known example of


In [19]:
# One worked example (France -> Paris) followed by the question to be completed.
DEFAULT_TEMPLATE = "Capital city of France = Paris\nCapital city of {country} ="


def get_capital_city(model, tokenizer, country, template=DEFAULT_TEMPLATE):
    """Ask ``model`` to complete a capital-city prompt for ``country``.

    Args:
        model: Passed through to ``generate``.
        tokenizer: Passed through to ``generate``.
        country: Value substituted for ``{country}`` in ``template``.
        template: Prompt template containing a ``{country}`` placeholder.

    Returns:
        The first line of the text generated after the prompt, with surrounding
        whitespace of the continuation stripped. Returns ``""`` when the
        continuation is empty or whitespace-only (instead of raising IndexError).
    """
    prompt = template.format(country=country)
    extended_text = generate(model, tokenizer, prompt, max_new_tokens=10)
    # The decoded text starts with the prompt; slice it off by length.
    answer_lines = extended_text[len(prompt):].strip().splitlines()
    # "".splitlines() is [], so guard before indexing the first line.
    return answer_lines[0] if answer_lines else ""

In [20]:
# Ask GPT-2 for India's capital using the default one-example template.
get_capital_city(gpt2, gpt2_tokenizer, "India")

'Mumbai'

In [24]:
# Rebinds model_id (previously "gpt2") to the Mistral 7B v0.3 checkpoint.
model_id = "mistralai/Mistral-7B-v0.3"
mistral7b_tokenizer = AutoTokenizer.from_pretrained(model_id)
# device_map='auto' and dtype='auto' leave device placement and dtype choice to the library.
mistral7b = AutoModelForCausalLM.from_pretrained(
    model_id, device_map='auto', dtype='auto'
)

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/587k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/601 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.55G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

In [52]:
# The prompt is spelled out here instead of reading the global `prompt`, which at this
# point in a top-to-bottom run still holds the galaxy prompt set in the GPT-2 sampling cell.
generate(mistral7b, mistral7b_tokenizer, 'How to go on Mars')

'How to go on Mars?\n\nThe first manned mission to Mars is scheduled for 2030.\n\nThe first manned mission to Mars is scheduled for 2030.\n\nThe first manned mission to Mars is scheduled for 2030.\n\nThe first manned mission to Mars is scheduled for 2030.\n\nThe first manned mission to Mars is scheduled for 2030.\n\nThe first manned mission to Mars'

In [53]:
# Same capital-city query as for GPT-2, now answered by the Mistral model.
get_capital_city(mistral7b, mistral7b_tokenizer, "India")

'New Delhi'

In [54]:
# Plain question with no extra framing; generate() uses its default max_new_tokens=100.
prompt = 'How to go on Mars'
generate(mistral7b, mistral7b_tokenizer, prompt)

'How to go on Mars?\n\nThe first manned mission to Mars is scheduled for 2030.\n\nThe first manned mission to Mars is scheduled for 2030.\n\nThe first manned mission to Mars is scheduled for 2030.\n\nThe first manned mission to Mars is scheduled for 2030.\n\nThe first manned mission to Mars is scheduled for 2030.\n\nThe first manned mission to Mars'

In [55]:
# Preamble describing a helpful chatbot named Tom; it begins and ends with a newline.
Tom_intro ="""
Tom is an amazing chatbot. It knows everything and it's incredibly helpful.
"""
# Build "<intro>Me:<question>\nTom:" so the model's continuation starts at Tom's reply.
full_prompt = f'{Tom_intro}Me:{prompt}\nTom:'
extended_text = generate(mistral7b, mistral7b_tokenizer, full_prompt, max_new_tokens=100)
# Drop the leading prompt by length and trim surrounding whitespace.
answer = extended_text[len(full_prompt):].strip()
print(answer)

You can go on Mars by taking a spaceship.
Me:How to go on Mars
Tom:You can go on Mars by taking a spaceship.
Me:How to go on Mars
Tom:You can go on Mars by taking a spaceship.
Me:How to go on Mars
Tom:You can go on Mars by taking a spaceship.
Me:How to go on Mars
Tom:You can go on Mars by taking a spaceship.


In [56]:
# Keep only the text before the first "\nMe:" marker, i.e. the first reply in the continuation.
print(answer.split('\nMe:')[0])

You can go on Mars by taking a spaceship.
