In [1]:
import sys
!{sys.executable} -m pip install datasets



In [4]:
from datasets import load_dataset

ds = load_dataset(
    "nlphuji/flickr30k",
    revision="refs/pr/2",
    split="test",
)
print(ds)
print(ds[0].keys())


Dataset({
    features: ['image', 'caption', 'sentids', 'split', 'img_id', 'filename'],
    num_rows: 31014
})
dict_keys(['image', 'caption', 'sentids', 'split', 'img_id', 'filename'])


In [None]:
import torch
torch.cuda.is_available()

True

In [31]:
import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM

tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")

model = AutoModelForMaskedLM.from_pretrained("google-bert/bert-base-uncased", dtype=torch.float16)


input = tokenizer("I just love the smell of fresh coffee and [MASK].", return_tensors="pt").to(model.device)
output = model(**input)

pred = output.logits
masked_index = torch.where(input['input_ids'] == tokenizer.mask_token_id)[1]
predicted_token_id = pred[0, masked_index].argmax(dim=-1)
predicted_token = tokenizer.decode(predicted_token_id)

print(f"The predicted token is: {predicted_token}")

topk = torch.topk(pred[0, masked_index], 5, dim=-1)
for token_id in topk.indices[0]:
    print(tokenizer.decode(token_id))
topk

Loading weights: 100%|██████████| 202/202 [00:00<00:00, 972.58it/s, Materializing param=cls.predictions.transform.dense.weight]                 
[1mBertForMaskedLM LOAD REPORT[0m from: google-bert/bert-base-uncased
Key                         | Status     |  | 
----------------------------+------------+--+-
cls.seq_relationship.weight | UNEXPECTED |  | 
bert.pooler.dense.bias      | UNEXPECTED |  | 
cls.seq_relationship.bias   | UNEXPECTED |  | 
bert.pooler.dense.weight    | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


The predicted token is: bacon
bacon
coffee
cookies
sunshine
chocolate


torch.return_types.topk(
values=tensor([[8.4297, 7.8672, 7.6680, 7.6367, 7.5000]], dtype=torch.float16,
       grad_fn=<TopkBackward0>),
indices=tensor([[11611,  4157, 16324,  9609,  7967]]))

In [32]:
logits = torch.tensor([2.0, 1.0, 0.1])
probs = torch.softmax(logits, dim=0)

print("Logits:", logits)
print("Probabilities:", probs)
print("Sum:", probs.sum())

Logits: tensor([2.0000, 1.0000, 0.1000])
Probabilities: tensor([0.6590, 0.2424, 0.0986])
Sum: tensor(1.0000)


In [33]:
import torch
import torch.nn as nn

model = nn.Linear(5, 3)

x = torch.randn(1, 5)
logits = model(x)
probs = torch.softmax(logits, dim=-1)

print(probs)


tensor([[0.1564, 0.0623, 0.7813]], grad_fn=<SoftmaxBackward0>)


In [2]:
from transformers import AutoTokenizer, AutoModel
import torch

tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
model = AutoModel.from_pretrained("google-bert/bert-base-uncased")

text = "Hello world"
inputs = tokenizer(text, return_tensors="pt")

print("Input keys:", inputs.keys())
print("Input shape:", inputs["input_ids"].shape)

outputs = model(**inputs)

print("Hidden state shape:", outputs.last_hidden_state.shape)


Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

[1mBertModel LOAD REPORT[0m from: google-bert/bert-base-uncased
Key                                        | Status     |  | 
-------------------------------------------+------------+--+-
cls.seq_relationship.weight                | UNEXPECTED |  | 
cls.predictions.transform.dense.bias       | UNEXPECTED |  | 
cls.seq_relationship.bias                  | UNEXPECTED |  | 
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED |  | 
cls.predictions.transform.LayerNorm.weight | UNEXPECTED |  | 
cls.predictions.transform.dense.weight     | UNEXPECTED |  | 
cls.predictions.bias                       | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


Input keys: KeysView({'input_ids': tensor([[ 101, 7592, 2088,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1]])})
Input shape: torch.Size([1, 4])
Hidden state shape: torch.Size([1, 4, 768])


In [3]:
print(model)

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
  

In [4]:
for name, param in model.named_parameters():
    print(name, param.shape)

embeddings.word_embeddings.weight torch.Size([30522, 768])
embeddings.position_embeddings.weight torch.Size([512, 768])
embeddings.token_type_embeddings.weight torch.Size([2, 768])
embeddings.LayerNorm.weight torch.Size([768])
embeddings.LayerNorm.bias torch.Size([768])
encoder.layer.0.attention.self.query.weight torch.Size([768, 768])
encoder.layer.0.attention.self.query.bias torch.Size([768])
encoder.layer.0.attention.self.key.weight torch.Size([768, 768])
encoder.layer.0.attention.self.key.bias torch.Size([768])
encoder.layer.0.attention.self.value.weight torch.Size([768, 768])
encoder.layer.0.attention.self.value.bias torch.Size([768])
encoder.layer.0.attention.output.dense.weight torch.Size([768, 768])
encoder.layer.0.attention.output.dense.bias torch.Size([768])
encoder.layer.0.attention.output.LayerNorm.weight torch.Size([768])
encoder.layer.0.attention.output.LayerNorm.bias torch.Size([768])
encoder.layer.0.intermediate.dense.weight torch.Size([3072, 768])
encoder.layer.0.inter

In [5]:
import torch
import torch.nn.functional as F

x = torch.linspace(-3, 3, 7)
print("x:", x)
print("ReLU:", torch.relu(x))
print("GELU:", F.gelu(x))


x: tensor([-3., -2., -1.,  0.,  1.,  2.,  3.])
ReLU: tensor([0., 0., 0., 0., 1., 2., 3.])
GELU: tensor([-0.0041, -0.0455, -0.1587,  0.0000,  0.8413,  1.9545,  2.9959])


In [None]:
import numpy as np
import torch
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
)
import evaluate

model_name = "google-bert/bert-base-uncased"

# 1) Данные
ds = load_dataset("imdb")  # train/test

# 2) Токенизатор
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_batch(batch):
    return tokenizer(batch["text"], truncation=True, max_length=256)

ds_tok = ds.map(tokenize_batch, batched=True, remove_columns=["text"])

# 3) Датаколлатор (делает padding внутри батча)
collator = DataCollatorWithPadding(tokenizer=tokenizer)

# 4) Модель (тут появляется classification head)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
    torch_dtype=torch.float16 if torch.cuda.is_available() else None,
).to("cuda" if torch.cuda.is_available() else "cpu")

# 5) Метрики
acc = evaluate.load("accuracy")
f1 = evaluate.load("f1")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return {
        "accuracy": acc.compute(predictions
