<a href="https://colab.research.google.com/github/Tvorozh0k/MyGPT/blob/main/MyGPT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Подключение библиотек

In [1]:
#@title Подключение библиотеки PyTorch

import torch
import torch.nn as nn
from torch.nn import functional as F

In [2]:
#@title Работа с трансформерами из Hugging Face

!pip install transformers



In [47]:
!pip install tiktoken

Collecting tiktoken
  Downloading tiktoken-0.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Downloading tiktoken-0.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.2 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.1/1.2 MB[0m [31m3.8 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━[0m [32m0.7/1.2 MB[0m [31m9.5 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tiktoken
Successfully installed tiktoken-0.8.0


In [None]:
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

## Теория

### 0. Архитектура GPT2

In [21]:
from transformers import AutoModelForCausalLM, GPT2LMHeadModel

checkpoint = 'gpt2'

model = AutoModelForCausalLM.from_pretrained('gpt2')
print(model)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)


### 1. Как работает `Dropout`?

На вход подается `tensor`. Слой `nn.Dropout(p)` проходится по всем элементам тензора и с вероятностью `p` обнуляет элементы. Все элементы, также, умножаются на значение $\frac{1}{1-p}$.



$$\begin{bmatrix}0.9225 & 1.3627 \\ 0.2685 & 1.1718\end{bmatrix} \to {\text{nn.Dropout(0.2)}} \to \begin{bmatrix}0.9225 \cdot \frac{1}{1-0.2} & 0 \\ 0.2685 \cdot \frac{1}{1-0.2} & 1.1718 \cdot \frac{1}{1-0.2}\end{bmatrix} = \begin{bmatrix}1.1531 & 0 \\ 0.3357 & 1.4647\end{bmatrix}$$

Почему умножение происходит на $\frac{1}{1-p}$? Пусть дан тензор размерности $n \times m$, все элементы которого равны $v$. Тогда, сумма элементов тензора равна $nmv$. После применения `dropout` с параметром $p$ математическое ожидание суммы равно $(1-p)nmv$ (сумма оставшихся ненулевых элементов). Таким образом, мы потеряли в сумме, мы потеряли ненулевые значения, и теперь мы хотим их перераспределить так, чтобы сумма осталось примерно такой же. Для этого нужно все оставшиеся ненулевые элементы в тензоре умножить на $${\frac{1}{1-p}}$$. Тогда сумма: $\frac{1}{1-p}(1-p)nmv = nmv$.

In [None]:
#@title Пример

d = nn.Dropout(p=0.2)

input_tensor = torch.randn(2, 2)

print(f"[BEFORE DROPOUT]:")
print(f"input_tensor:\n{input_tensor}")

print(f"\n[AFTER DROPOUT]:")
print(f"input_tensor:\n{d(input_tensor)}")

[BEFORE DROPOUT]:
input_tensor:
tensor([[0.9225, 1.3627],
        [0.2685, 1.1718]])

[AFTER DROPOUT]:
input_tensor:
tensor([[1.1531, 0.0000],
        [0.3357, 1.4647]])


### 2. Арифметические операции над тензорами высшего порядка

#### a. Перемножение трехмерных тензоров

$$A = \begin{bmatrix}A_1 & A_2 & \dots & A_n\end{bmatrix}, \;\;B = \begin{bmatrix}B_1 & B_2 & \dots & B_n\end{bmatrix} \quad \to \quad C = AB = \begin{bmatrix}A_1 \cdot B_1 & A_2 \cdot B_2 & \dots & A_n \cdot B_n\end{bmatrix}$$

$$A: (n, m, p), \;\; B: (n, p, s) \to C = AB: (n, m, s)$$

In [59]:
a = torch.randint(5, (2, 2, 3))
print(f"a[2, 2, 3]:\n{a}\n")

b = torch.randint(5, (2, 3, 2))
print(f"b[2, 3, 2]:\n{b}\n")

c = a @ b
print(f"c[2, 2, 2]:\n{c}")

a[2, 2, 3]:
tensor([[[2, 2, 2],
         [4, 4, 1]],

        [[3, 3, 2],
         [3, 3, 1]]])

b[2, 3, 2]:
tensor([[[1, 1],
         [1, 2],
         [1, 3]],

        [[1, 1],
         [2, 3],
         [2, 4]]])

c[2, 2, 2]:
tensor([[[ 6, 12],
         [ 9, 15]],

        [[13, 20],
         [11, 16]]])


> Аналогично и с другими измерениями (4 и выше)

#### b. Транспонирование тензоров

In [2]:
a = torch.randn((1, 2, 3, 4))
print(f"Before: {a.shape}")

a = a.transpose(1, 2)
print(f"After: {a.shape}")

Before: torch.Size([1, 2, 3, 4])
After: torch.Size([1, 3, 2, 4])


In [6]:
a = torch.tensor([[[1, 2], [3, 4]], [[5, 6], [7, 8]], [[9, 10], [11, 12]]])

print(a.shape)
print(a)

print(a.transpose(0, 1).shape)
print(a.transpose(0, 1))

print(a.transpose(0, 2).shape)
print(a.transpose(0, 2))

print(a.transpose(1, 2).shape)
print(a.transpose(1, 2))

torch.Size([3, 2, 2])
tensor([[[ 1,  2],
         [ 3,  4]],

        [[ 5,  6],
         [ 7,  8]],

        [[ 9, 10],
         [11, 12]]])
torch.Size([2, 3, 2])
tensor([[[ 1,  2],
         [ 5,  6],
         [ 9, 10]],

        [[ 3,  4],
         [ 7,  8],
         [11, 12]]])
torch.Size([2, 2, 3])
tensor([[[ 1,  5,  9],
         [ 3,  7, 11]],

        [[ 2,  6, 10],
         [ 4,  8, 12]]])
torch.Size([3, 2, 2])
tensor([[[ 1,  3],
         [ 2,  4]],

        [[ 5,  7],
         [ 6,  8]],

        [[ 9, 11],
         [10, 12]]])


### 3. Как работает `scaled_dot_product_attention`?

In [9]:
#@title Генерируем случайные матрицы $Q$, $K$ и $V$ и маску $M$

q = torch.randn((3, 2))
k = torch.randn((3, 2))
v = torch.randn((3, 2))

m = torch.tril(torch.ones(3, 3))
m = m.masked_fill(m==0, float('-inf')) - 1

print(f"q:\n{q}\n")
print(f"k:\n{k}\n")
print(f"v:\n{v}\n")
print(f"m:\n{m}")

q:
tensor([[ 0.6891, -0.2827],
        [ 1.8924, -0.3903],
        [ 1.7954,  1.0172]])

k:
tensor([[ 1.0728,  1.2521],
        [ 0.1752, -0.2029],
        [-0.2853, -1.4800]])

v:
tensor([[ 0.4326, -1.5850],
        [ 0.2770, -0.8679],
        [ 1.2798, -1.3554]])

m:
tensor([[0., -inf, -inf],
        [0., 0., -inf],
        [0., 0., 0.]])


In [10]:
#@title Собственная реализация

out = q

out @= k.T
print(f"QK^T:\n{out}")

out *= 2 ** -0.5
print(f"QK^T / sqrt(d_k):\n{out}")

out += m
print(f"QK^T / sqrt(d_k) + M:\n{out}")

out = F.softmax(out, dim=-1)
print(f"Softmax(QK^T / sqrt(d_k) + M):\n{out}")

out @= v
print(f"Softmax(QK^T / sqrt(d_k) + M) * V:\n{out}")

QK^T:
tensor([[ 0.3853,  0.1781,  0.2218],
        [ 1.5415,  0.4108,  0.0377],
        [ 3.1996,  0.1082, -2.0177]])
QK^T / sqrt(d_k):
tensor([[ 0.2724,  0.1259,  0.1569],
        [ 1.0900,  0.2904,  0.0267],
        [ 2.2625,  0.0765, -1.4267]])
QK^T / sqrt(d_k) + M:
tensor([[ 0.2724,    -inf,    -inf],
        [ 1.0900,  0.2904,    -inf],
        [ 2.2625,  0.0765, -1.4267]])
Softmax(QK^T / sqrt(d_k) + M):
tensor([[1.0000, 0.0000, 0.0000],
        [0.6899, 0.3101, 0.0000],
        [0.8792, 0.0988, 0.0220]])
Softmax(QK^T / sqrt(d_k) + M) * V:
tensor([[ 0.4326, -1.5850],
        [ 0.3844, -1.3626],
        [ 0.4359, -1.5091]])


In [14]:
#@title Готовый метод
print(nn.functional.scaled_dot_product_attention(q, k, v, attn_mask=m))
print(nn.functional.scaled_dot_product_attention(q, k, v, is_causal=True)) # передавать маску НЕ ОБЯЗАТЕЛЬНО

tensor([[ 0.4326, -1.5850],
        [ 0.3844, -1.3626],
        [ 0.4359, -1.5091]])
tensor([[ 0.4326, -1.5850],
        [ 0.3844, -1.3626],
        [ 0.4359, -1.5091]])


### 4. Как работает `LayerNorm`?

In [6]:
#@title Генерация данных

a = torch.randn((3, 4))

print(f"a:\n{a}")

a:
tensor([[ 0.7600, -0.7958,  0.1175,  0.8433],
        [-0.5222,  2.5007, -1.0763, -1.5622],
        [-0.0441, -1.3374,  0.6868, -0.0910]])


In [14]:
#@title Собственная реализация

print((a[0] - torch.mean(a[0])) / torch.sqrt(torch.var(a[0], correction=0) + 1e-05))

tensor([ 0.8058, -1.5653, -0.1734,  0.9329])


In [7]:
#@title Готовый метод

ln = nn.LayerNorm(4)

print(f"LayerNorm weights:\n{ln.weight}\n")
print(f"LayerNorm bias:\n{ln.bias}\n")

print(f"LayerNorm(x):\n{ln(a)}")

LayerNorm weights:
Parameter containing:
tensor([1., 1., 1., 1.], requires_grad=True)

LayerNorm bias:
Parameter containing:
tensor([0., 0., 0., 0.], requires_grad=True)

LayerNorm(x):
tensor([[ 0.8058, -1.5653, -0.1734,  0.9329],
        [-0.2258,  1.6846, -0.5759, -0.8829],
        [ 0.2094, -1.5686,  1.2143,  0.1449]],
       grad_fn=<NativeLayerNormBackward0>)


## Реализация классов

### 1. FFN (Feed Forward Network)

**Размерности:**

**[INPUT]: `(batch_size, sequence_length, n_embd)`**

1. `c_fc`: `(batch_size, sequence_length, n_embd)` $\to$ `(batch_size, sequence_length, 4 * n_embd)`
2. `gelu`: `(batch_size, sequence_length, 4 * n_embd)` $\to$ `(batch_size, sequence_length, 4 * n_embd)`
3. `c_proj`: `(batch_size, sequence_length, 4 * n_embd)` $\to$ `(batch_size, sequence_length, n_embd)`
4. `dropout`: `(batch_size, sequence_length, n_embd)` $\to$ `(batch_size, sequence_length, n_embd)`

**[OUTPUT]: `(batch_size, sequence_length, n_embd)`**

In [15]:
#@title Реализация класса

class FeedForward(nn.Module):

    def __init__(self, config):
        super().__init__()

        self.c_fc = nn.Linear(config.n_embd, 4 * config.n_embd)
        self.gelu = nn.GELU(approximate='tanh')
        self.c_proj = nn.Linear(4 * config.n_embd, config.n_embd)
        self.dropout = nn.Dropout(p=0.1)

    def forward(self, x):
        x = self.c_fc(x)
        x = self.gelu(x)
        x = self.c_proj(x)
        x = self.dropout(x)

        return x

In [None]:
#@title Пример работы

class ExampleConfig:

    def __init__(self):
        self.n_embd = 2


config = ExampleConfig()
ff_layer = FeedForward(config)

input_tensor = torch.randn(config.n_embd)
print(f"Input tensor (x): {input_tensor}")

print("\nFeed Forward [FIRST_STEP]:\n")

# layer parameters
print(f"c_fc.weight:\n{ff_layer.c_fc.weight}")
print(f"c_fc.bias:\n{ff_layer.c_fc.bias}")

# validation
print(f"[CLASS] x = c_fc(x): {ff_layer.c_fc(input_tensor)}")
print(f"[CHECK]: xW^T + b = {input_tensor @ ff_layer.c_fc.weight.T + ff_layer.c_fc.bias}")

print("\nFeed Forward [SECOND_STEP]:\n")

# validation
print(f"[CLASS] x = GELU(c_fc(x)): {ff_layer.gelu(ff_layer.c_fc(input_tensor))}")

m = nn.GELU(approximate='tanh')
print(f"[CHECK]: x = GELU(xW^T + b) = {m(input_tensor @ ff_layer.c_fc.weight.T + ff_layer.c_fc.bias)}")

print("\nFeed Forward [THIRD_STEP]:\n")

# layer parameters
print(f"c_fc.weight:\n{ff_layer.c_proj.weight}")
print(f"c_fc.bias:\n{ff_layer.c_proj.bias}")

# validation
print(f"[CLASS] x = c_proj(GELU(c_fc(x))): {ff_layer.c_proj(ff_layer.gelu(ff_layer.c_fc(input_tensor)))}")
print(f"[CHECK]: GELU(xW^T + b)W^T + b = {m(input_tensor @ ff_layer.c_fc.weight.T + ff_layer.c_fc.bias) @ ff_layer.c_proj.weight.T + ff_layer.c_proj.bias}")

print("\nFeed Forward [FOURTH_STEP, DROPOUT]:\n")
print(f"[CLASS] x = FeedForward(x): {ff_layer.forward(input_tensor)}")

Input tensor (x): tensor([-1.5839,  0.0219])

Feed Forward [FIRST_STEP]:

c_fc.weight:
Parameter containing:
tensor([[-0.5494,  0.1919],
        [-0.5304, -0.4630],
        [ 0.6974,  0.6517],
        [-0.6406, -0.3088],
        [ 0.2704,  0.4117],
        [-0.5300, -0.2456],
        [ 0.1082,  0.5320],
        [ 0.6661,  0.2216]], requires_grad=True)
c_fc.bias:
Parameter containing:
tensor([-0.6870,  0.5093, -0.4400,  0.3836, -0.6684, -0.0895,  0.2645, -0.6611],
       requires_grad=True)
[CLASS] x = c_fc(x): tensor([ 0.1875,  1.3393, -1.5304,  1.3914, -1.0877,  0.7446,  0.1048, -1.7112],
       grad_fn=<ViewBackward0>)
[CHECK]: xW^T + b = tensor([ 0.1875,  1.3393, -1.5304,  1.3914, -1.0877,  0.7446,  0.1048, -1.7112],
       grad_fn=<AddBackward0>)

Feed Forward [SECOND_STEP]:

[CLASS] x = GELU(c_fc(x)): tensor([ 0.1077,  1.2182, -0.0966,  1.2770, -0.1507,  0.5746,  0.0568, -0.0746],
       grad_fn=<GeluBackward0>)
[CHECK]: x = GELU(xW^T + b) = tensor([ 0.1077,  1.2182, -0.0966,  1.2

### 2. MultiHeadAttention

**Размерности:**

**[INPUT]: `(batch_size, sequence_length, n_embd)`**

1. `c_attn`: `(batch_size, sequence_length, n_embd)` $\to$ `(batch_size, sequence_length, 3 * n_embd)`
2. **[SPLIT]**: `(batch_size, sequence_length, 3 * n_embd)` $\to$ `(batch_size, sequence_length, n_embd)`
3. **[HEADS]**: `(batch_size, sequence_length, n_embd)` $\to$ `(batch_size, sequence_length, n_heads, head_size)`
4. **[TRANSPOSE]**: `(batch_size, sequence_length, n_heads, head_size)` $\to$ `(batch_size, n_heads, sequence_length, head_size)`
5. **$Q \cdot K^T$**: `(batch_size, n_heads, sequence_length, head_size)` $\times$ `(batch_size, n_heads, head_size, sequence_length)` $\to$ `(batch_size, n_heads, sequence_length, sequence_length)`
6. **[SELF-ATTENTION]**: `(batch_size, n_heads, sequence_length, sequence_length)` $\times$ `(batch_size, n_heads, sequence_length, head_size)` $\to$ `(batch_size, n_heads, sequence_length, head_size)`
7. **[TRANSPOSE]**: `(batch_size, n_heads, sequence_length, head_size)` $\to$ `(batch_size, sequence_length, n_heads, head_size)`
8. **[MERGE]**: `(batch_size, sequence_length, n_heads, head_size)` $\to$ `(batch_size, sequence_length, n_embd)`
9. `c_proj`: `(batch_size, sequence_length, n_embd)` $\to$ `(batch_size, sequence_length, n_embd)`
**[OUTPUT]: `(batch_size, sequence_length, n_embd)`**

In [4]:
#@title Реализация класса

class MultiHeadAttention(nn.Module):

    def __init__(self, config):
        super().__init__()

        # n_embd = n_heads * head_size
        assert config.n_embd % config.n_heads == 0, "Please, check the divisibility of n_embd by n_heads"

        # c_attn = [w_k | w_q | w_v] (concatenated)
        self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd)
        self.c_proj = nn.Linear(config.n_embd, config.n_embd)

        self.attn_dropout = nn.Dropout(p=0.1)
        self.resid_dropout = nn.Dropout(p=0.1)

        # n_heads, head_size
        self.NH = config.n_heads
        self.HS = config.n_embd // config.n_heads

    def forward(self, x):
        # batch_size, sequence_length, n_embd
        BS, SL, EMB = x.shape

        qkv = self.c_attn(x)
        q, k, v = qkv.split(EMB, dim=2)

        q = q.view(BS, SL, self.NH, self.HS).transpose(1, 2)
        k = k.view(BS, SL, self.NH, self.HS).transpose(1, 2)
        v = v.view(BS, SL, self.NH, self.HS).transpose(1, 2)

        y = F.scaled_dot_product_attention(q, k, v, is_causal=True)
        y = self.attn_dropout(y)

        y = y.transpose(1, 2).contiguous().view(BS, SL, EMB)

        y = self.c_proj(y)
        y = self.resid_dropout(y)

        return y

In [5]:
#@title Пример

class ExampleConfig:

    def __init__(self):
        self.n_embd = 8
        self.n_heads = 2


config = ExampleConfig()
m = MultiHeadAttention(config)

x = torch.randn((3, 4, 8))
print(m(x).shape)

torch.Size([3, 4, 8])


### 3. Block

**Размерности:**

**[INPUT]: `(batch_size, sequence_length, n_embd)`**

1. `attn`: `(batch_size, sequence_length, n_embd)` $\to$ `(batch_size, sequence_length, n_embd)`
2. `mlp`: `(batch_size, sequence_length, n_embd)` $\to$ `(batch_size, sequence_length, n_embd)`

**[OUTPUT]: `(batch_size, sequence_length, n_embd)`**

In [17]:
#@title Реализация класса

class Block(nn.Module):

    def __init__(self, config):
        super().__init__()

        self.ln_1 = nn.LayerNorm(config.n_embd)
        self.attn = MultiHeadAttention(config)

        self.ln_2 = nn.LayerNorm(config.n_embd)
        self.mlp = FeedForward(config)

    def forward(self, x):
        # residual connection
        x = x + self.attn(self.ln_1(x))
        x = x + self.mlp(self.ln_2(x))

        return x

In [18]:
#@title Пример

class ExampleConfig:

    def __init__(self):
        self.n_embd = 8
        self.n_heads = 2


config = ExampleConfig()
bl = Block(config)

x = torch.randn((3, 4, 8))
print(bl(x).shape)

torch.Size([3, 4, 8])


### 4. GPTConfig

In [43]:
#@title Реализация класса

from dataclasses import dataclass

@dataclass
class GPTConfig:
    block_size: int = 1024 # max sequence length
    vocab_size: int = 50257 # number of tokens: 50,000 BPE merges + 256 bytes tokens + 1 <|endoftext|> token
    n_layers: int = 12 # number of layers
    n_heads: int = 12 # number of heads
    n_embd: int = 768 # embedding dimension

### 5. MyGPT

In [44]:
#@title Реализация класса

class MyGPT(nn.Module):

    def __init__(self, config):
        super().__init__()

        self.config = config

        self.transformer = nn.ModuleDict(dict(
            wte = nn.Embedding(config.vocab_size, config.n_embd),
            wpe = nn.Embedding(config.block_size, config.n_embd),
            h = nn.ModuleList([Block(config) for _ in range(config.n_layers)]),
            ln_f = nn.LayerNorm(config.n_embd)
        ))

        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)

        # weight sharing scheme
        self.transformer.wte.weight = self.lm_head.weight

    def forward(self, idx, targets=None):
        BS, SL = idx.shape

        assert SL <= config.block_size, f"Cannot forward sequence of length {T}, block size is only {self.config.block_size}"

        tok_emb = self.transformer.wte(idx)

        pos = torch.arange(0, SL, dtype=torch.long)
        pos_emb = self.transformer.wpe(pos)

        x = tok_emb + pos_emb

        for block in self.transformer.h:
            x = block(x)

        x = self.transformer.ln_f(x)
        logits = self.lm_head(x)

        loss = None

        if targets is not None:
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))

        return logits, loss

    @classmethod
    def from_pretrained(cls, model_type):
        """Loads pretrained GPT-2 model weights from huggingface"""
        assert model_type in {'gpt2', 'gpt2-medium', 'gpt2-large', 'gpt2-xl'}
        from transformers import GPT2LMHeadModel

        print("loading weights from pretrained gpt: %s" % model_type)

        # n_layer, n_head and n_embd are determined from model_type
        config_args = {
            'gpt2':         dict(n_layers=12, n_heads=12, n_embd=768),  # 124M params
            'gpt2-medium':  dict(n_layers=24, n_heads=16, n_embd=1024), # 350M params
            'gpt2-large':   dict(n_layers=36, n_heads=20, n_embd=1280), # 774M params
            'gpt2-xl':      dict(n_layers=48, n_heads=25, n_embd=1600), # 1558M params
        }[model_type]
        config_args['vocab_size'] = 50257
        config_args['block_size'] = 1024

        # create a from-scratch initialized minGPT model
        config = GPTConfig(**config_args)
        model = MyGPT(config)

        sd = model.state_dict()
        sd_keys = sd.keys()
        sd_keys = [k for k in sd_keys if not k.endswith('.attn.bias')] # discard this mask / buffer, not a param

        # init a huggingface/transformers model
        model_hf = GPT2LMHeadModel.from_pretrained(model_type)
        sd_hf = model_hf.state_dict()

        # copy while ensuring all of the parameters are aligned and match in names and shapes
        sd_keys_hf = sd_hf.keys()
        sd_keys_hf = [k for k in sd_keys_hf if not k.endswith('.attn.masked_bias')] # ignore these, just a buffer
        sd_keys_hf = [k for k in sd_keys_hf if not k.endswith('.attn.bias')] # same, just the mask (buffer)
        transposed = ['attn.c_attn.weight', 'attn.c_proj.weight', 'mlp.c_fc.weight', 'mlp.c_proj.weight']

        # basically the openai checkpoints use a "Conv1D" module, but we only want to use a vanilla Linear
        # this means that we have to transpose these weights when we import them
        assert len(sd_keys_hf) == len(sd_keys), f"mismatched keys: {len(sd_keys_hf)} != {len(sd_keys)}"

        for k in sd_keys_hf:
            if any(k.endswith(w) for w in transposed):
                # special treatment for the Conv1D weights we need to transpose
                assert sd_hf[k].shape[::-1] == sd[k].shape
                with torch.no_grad():
                    sd[k].copy_(sd_hf[k].t())
            else:
                # vanilla copy over the other parameters
                assert sd_hf[k].shape == sd[k].shape
                with torch.no_grad():
                    sd[k].copy_(sd_hf[k])

        return model

    def configure_optimizers(self, weight_decay, learning_rate, device_type):
        # start with all of the candidate parameters (that require grad)
        param_dict = {pn: p for pn, p in self.named_parameters()}
        param_dict = {pn: p for pn, p in param_dict.items() if p.requires_grad}

        # create optim groups. Any parameters that is 2D will be weight decayed, otherwise no.
        # i.e. all weight tensors in matmuls + embeddings decay, all biases and layernorms don't.
        decay_params = [p for n, p in param_dict.items() if p.dim() >= 2]
        nodecay_params = [p for n, p in param_dict.items() if p.dim() < 2]
        optim_groups = [
            {'params': decay_params, 'weight_decay': weight_decay},
            {'params': nodecay_params, 'weight_decay': 0.0}
        ]
        num_decay_params = sum(p.numel() for p in decay_params)
        num_nodecay_params = sum(p.numel() for p in nodecay_params)
        if master_process:
            print(f"num decayed parameter tensors: {len(decay_params)}, with {num_decay_params:,} parameters")
            print(f"num non-decayed parameter tensors: {len(nodecay_params)}, with {num_nodecay_params:,} parameters")
        # Create AdamW optimizer and use the fused version if it is available
        fused_available = 'fused' in inspect.signature(torch.optim.AdamW).parameters
        use_fused = fused_available and device_type == "cuda"
        if master_process:
            print(f"using fused AdamW: {use_fused}")
        optimizer = torch.optim.AdamW(optim_groups, lr=learning_rate, betas=(0.9, 0.95), eps=1e-8, fused=use_fused)
        return optimizer

In [45]:
model = MyGPT.from_pretrained("gpt2")

loading weights from pretrained gpt: gpt2


In [48]:
import tiktoken

enc = tiktoken.get_encoding("gpt2")

In [57]:
model.eval()

num_return_sequences = 4
max_length = 32

tokens = enc.encode("Hello, I'm a language model,")
tokens = torch.tensor(tokens, dtype=torch.long)
tokens = tokens.unsqueeze(0).repeat(num_return_sequences, 1)

xgen = tokens
sample_rng = torch.Generator()
sample_rng.manual_seed(42)

while xgen.size(1) < max_length:
    # forward the model to get the logits
    with torch.no_grad():
        with torch.autocast(device_type='cpu', dtype=torch.bfloat16):
            logits, loss = model(xgen) # (B, T, vocab_size)
        # take the logits at the last position
        logits = logits[:, -1, :] # (B, vocab_size)
        # get the probabilities
        probs = F.softmax(logits, dim=-1)
        # do top-k sampling of 50 (huggingface pipeline default)
        # topk_probs here becomes (5, 50), topk_indices is (5, 50)
        topk_probs, topk_indices = torch.topk(probs, 50, dim=-1)
        # select a token from the top-k probabilities
        # note: multinomial does not demand the input to sum to 1
        ix = torch.multinomial(topk_probs, 1, generator=sample_rng) # (B, 1)
        # gather the corresponding indices
        xcol = torch.gather(topk_indices, -1, ix) # (B, 1)
        # append to the sequence
        xgen = torch.cat((xgen, xcol), dim=1)
# print the generated text
for i in range(num_return_sequences):
    tokens = xgen[i, :max_length].tolist()
    decoded = enc.decode(tokens)
    print(f"rank {ddp_rank} sample {i}: {decoded}")

AttributeError: 'ExampleConfig' object has no attribute 'block_size'