In [1]:
import torch
from model import Transformer
from transformers import AutoTokenizer  # pip install transformers
from utils import (
    BATCH_SIZE,
    BLOCK_SIZE,
    DEVICE,
    DROPOUT,
    LEARNING_RATE,
    NUM_EMBED,
    NUM_HEAD,
    NUM_LAYER,
    MAX_ITER,
    EVAL_INTER,
    encode,
    decode,
    get_batch,
    save_model_to_chekpoint,
    estimate_loss,
)






Token indices sequence length is longer than the specified maximum sequence length for this model (37443 > 512). Running this sequence through the model will result in indexing errors


Model with 89.48M parameters


In [None]:
# raw data
path_do_data = "data/english.txt"
data_raw = open(path_do_data, encoding="utf-8").read()
# we use pretrained BERT tokenizer for performance improvements
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
vocab_size = tokenizer.vocab_size
# data_raw = data_raw[4000000:] # short dataset

# train/val split
data = encode(text_seq=data_raw, tokenizer=tokenizer)
n = int(0.9 * len(data))  # first 90% will be train, rest val
train_data = data[:n]
val_data = data[n:]

# train a new model
model = Transformer(
    vocab_size=vocab_size,
    num_embed=NUM_EMBED,
    block_size=BLOCK_SIZE,
    num_heads=NUM_HEAD,
    num_layers=NUM_LAYER,
    dropout=DROPOUT,
)
# load model to GPU if available
m = model.to(DEVICE)
# print the number of parameters in the model
print(
    "Model with {:.2f}M parameters".format(sum(p.numel() for p in m.parameters()) / 1e6)
)

In [4]:
# optimizer takes the model's parameters and the learning rate as input,
# and updates the parameters during the training process in order to
# minimize the loss function.
optimizer = torch.optim.AdamW(m.parameters(), lr=LEARNING_RATE)
MAX_ITER = 500
EVAL_INTER=50
for step in range(MAX_ITER):
    #print(f'step -> {step}')
    # every EVAL_INTER evaluate the loss on train and val sets
    if step % EVAL_INTER == 0 or step == MAX_ITER - 1:
        loss_train = estimate_loss(
            data=train_data, model=m, block_size=BLOCK_SIZE, batch_size=BATCH_SIZE
        )
        loss_val = estimate_loss(
            data=val_data, model=m, block_size=BLOCK_SIZE, batch_size=BATCH_SIZE
        )
        print("step {:10} | train loss {:6.4f} | val loss {:6.4f}".format(step, loss_train, loss_val))

    # sample a batch of data
    xb, yb = get_batch(data=train_data, block_size=BLOCK_SIZE, batch_size=BATCH_SIZE)
    logits, loss = m.forward(xb, yb)
    # zero_grad() method sets the gradients of all parameters in the optimizer to zero
    optimizer.zero_grad(set_to_none=True)
    # backward() method on the loss variable calculates the gradients 
    # of the loss with respect to the model's parameters.
    loss.backward()
    # step() method on the optimizer updates the model's parameters 
    # using the calculated gradients, in order to minimize the loss.
    optimizer.step()



step          0 | train loss 0.4995 | val loss 8.0422
step         50 | train loss 0.4098 | val loss 8.2266
step        100 | train loss 0.3308 | val loss 8.7301
step        150 | train loss 0.2732 | val loss 8.9612
step        200 | train loss 0.2346 | val loss 8.9518
step        250 | train loss 0.2345 | val loss 9.1536
step        300 | train loss 0.2041 | val loss 9.4915
step        350 | train loss 0.1985 | val loss 9.7089
step        400 | train loss 0.1832 | val loss 9.4781
step        450 | train loss 0.1838 | val loss 9.4950
step        499 | train loss 0.1817 | val loss 9.7152


In [3]:
save_model_to_chekpoint(model=m, path_to_checkpoint="checkpoint", epoch=step)

Successfully saved the model to checkpoint/checkpoint_epoch-0_19.03.2023_19:19:43.pt


In [5]:
# generate some output based on the context
context = torch.zeros((1, 1), dtype=torch.long, device=DEVICE)
print(
    decode(
        enc_sec=m.generate(idx=context, max_new_tokens=100, block_size=BLOCK_SIZE)[0],
        tokenizer=tokenizer,
    )
)

[PAD] of capture has increased a lot, hence we don't a lot of look at max - pooling operator, but we will not be using, but now the capacity of a logarithmic relationship between classification accuracy and receptive field, which suggests that large receptive field size, or right slant ( pixels ). receptive field! this is how this works in a nutshell : split the image into patches ( 16x16 ) flatten the patches produce lower - dimensional linear embed


In [14]:
import numpy as np
# hyperparameters
BATCH_SIZE = 32  # how many independent sequences will we process in parallel?
BLOCK_SIZE = 64  # what is the maximum context length for predictions?
MAX_ITER = 5000  # number of training iterations
EVAL_INTER = 50
LEARNING_RATE = 3e-4
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
NUM_HEAD = 6
NUM_EMBED = NUM_HEAD * 128
NUM_LAYER = 6
DROPOUT = 0.2


layout = np.zeros([3, 3], dtype=bool)
layout

array([[False, False, False],
       [False, False, False],
       [False, False, False]])

In [39]:
q=np.random.randint(10,size=(2,3))
k=np.random.randint(10,size=(3,5))
q,k

(array([[8, 6, 1],
        [3, 0, 3]]),
 array([[3, 0, 6, 7, 5],
        [8, 0, 5, 7, 5],
        [8, 9, 6, 0, 4]]))

In [28]:
kL=np.tril(k)
kU=np.tril(k.T)
kU,kL

(array([[4, 0, 0, 0],
        [8, 7, 0, 0],
        [3, 9, 5, 0],
        [4, 9, 6, 7]]),
 array([[4, 0, 0, 0],
        [3, 7, 0, 0],
        [4, 4, 5, 0],
        [4, 5, 1, 7]]))

In [None]:
for i in range(4):
    for j in range(4)
    row=q_idx//2
    col=q_idx//2
    q[q_idx,]

In [31]:
np.matmul(q,k)

array([[ 75, 142, 115, 142],
       [ 94, 147, 117, 161],
       [ 62,  88,  72,  96],
       [ 78, 117,  97, 147]])

In [36]:
len(k)

TypeError: object of type 'int' has no len()

In [47]:
num_row_A=len(q)
num_col_A=len(q[0])
num_col_B=len(k[0])
num_row_B=len(k)
print(f'num_row_A {num_row_A},num_col_B {num_col_B}, num_row_B {num_row_B}')


num_row_A 2,num_col_B 5, num_row_B 3


array([[0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.]])

In [64]:
result=np.zeros([num_row_A,num_col_B])
result
for i in range(num_row_A):
    # iterating by column by B
    for j in range(num_col_B):
        # iterating by rows of B
        for n in range(num_row_B):
            if(n<1 and j < 1):
                result[i][j] += q[i][n] * k[n][j]

result

array([[24.,  0.,  0.,  0.,  0.],
       [ 9.,  0.,  0.,  0.,  0.]])

array([[80.,  9., 84., 98., 74.],
       [33., 27., 36., 21., 27.]])

In [58]:
np.matmul(q,k)

array([[80,  9, 84, 98, 74],
       [33, 27, 36, 21, 27]])

In [8]:
import torch
import torch_blocksparse

# Z: non-sparse batch dimension
# H: sparse batch dimension
# M: row dimension
# N: column dimension
Z, H, M, N, K = 4, 2, 256, 512, 384
a = torch.rand((Z, H, M, K), dtype=torch.float32).cuda()
b = torch.rand((Z, H, K, N), dtype=torch.float32).cuda()
# create sparsity layout
block = 16
layout = torch.randint(0, 2, (H, M//block, N//block))
# create object for Sparse = trans(Dense) x Dense (sdd)
# some overhead there as it pre-computes look-up tables 
# internally needed by GPU kernels
dot = torch_blocksparse.MatMul(layout, block, 'sdd', trans_a=True, trans_b=False)
c = dot(a, b)
# create object for Sparse = softmax(Sparse)
softmax = torch_blocksparse.Softmax(layout, block)
d = softmax(c)

ModuleNotFoundError: No module named 'torch_blocksparse_cpp_utils'

In [10]:
!pip install triton
!pip install torch_blocksparse_cpp_utils

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
[31mERROR: Could not find a version that satisfies the requirement torch_blocksparse_cpp_utils (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for torch_blocksparse_cpp_utils[0m[31m
[0m