In [2]:
import torch
import torch.nn as nn
from torch.nn import functional as F
import mlflow as mlflow
import tiktoken

# Hyper Parameters
batch_size = 64
block_size = 128
n_embd = 32
n_head = 4
nvocab = 65
bias = False
dropout = 0.2
Bias = False
import math

In [53]:
torch.manual_seed(1337)
c = nn.Linear(n_embd,3*n_embd)

In [54]:
torch.manual_seed(1337)
embedding_table = nn.Embedding(nvocab,n_embd)
positional_embedding = nn.Embedding(block_size,n_embd)

In [55]:
torch.manual_seed(1337)
d = embedding_table(torch.randint(65,(batch_size,block_size)))+positional_embedding(torch.arange(block_size))

In [56]:
class CasualSelfAttention1(nn.Module):
    def __init__(self):
        torch.manual_seed(1337)
        super().__init__()
        
        assert n_embd % n_head == 0
        self.headsize = n_embd//n_head
        self.csAttn = nn.Linear(n_embd,3 *n_embd ,bias = Bias)
        self.mh_proj = nn.Linear(n_embd,n_embd ,bias = Bias)
        self.register_buffer("tril",torch.tril(torch.ones(block_size,block_size)).view(1,1,block_size,block_size))
        self.attdropout = nn.Dropout(dropout)
        self.mhdropout = nn.Dropout(dropout)
    
    def forward(self,x):
        torch.manual_seed(1337)
        B,T,C = x.shape
        q,k,v = self.csAttn(x).split(n_embd,dim=-1)
        
        k = k.view(B,T,n_head,C // n_head).transpose(1,2)
        q = q.view(B,T,n_head,C // n_head).transpose(1,2)
        v = v.view(B,T,n_head,C // n_head).transpose(1,2)
        
        wei = (q @ k.transpose(-2,-1)) * (1.0/math.sqrt(k.size(-1)))
        wei = wei.masked_fill(self.tril[:,:,:T,:T]==0,float('-inf'))
        wei = F.softmax(wei,dim=-1)
        wei = self.attdropout(wei)
        out = wei @ v
        out = out.transpose(1,2).contiguous().view(B,T,C)
        out = self.mhdropout(self.mh_proj(out))
        return out


In [57]:
class CausalSelfAttention(nn.Module):

    def __init__(self):
        super().__init__()
        assert n_embd % n_head == 0
        
        # key, query, value projections for all heads, but in a batch
        self.c_attn = nn.Linear(n_embd, 3 * n_embd, bias=bias)
        # output projection
        self.c_proj = nn.Linear(n_embd, n_embd, bias=bias)
        # regularization
        self.attn_dropout = nn.Dropout(dropout)
        self.resid_dropout = nn.Dropout(dropout)
        self.n_head = n_head
        self.n_embd = n_embd
        self.dropout = dropout
        # flash attention make GPU go brrrrr but support is only in PyTorch >= 2.0
        print("WARNING: using slow attention. Flash Attention requires PyTorch >= 2.0")
        # causal mask to ensure that attention is only applied to the left in the input sequence
        self.register_buffer("bias", torch.tril(torch.ones(block_size, block_size))
                                    .view(1, 1, block_size, block_size))

    def forward(self, x):
        torch.manual_seed(1337)
        B, T, C = x.size() # batch size, sequence length, embedding dimensionality (n_embd)

        # calculate query, key, values for all heads in batch and move head forward to be the batch dim
        q, k, v  = self.c_attn(x).split(self.n_embd, dim=2)
        k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
        q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
        v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
        # print(k.shape,q.shape,v.shape)
        # causal self-attention; Self-attend: (B, nh, T, hs) x (B, nh, hs, T) -> (B, nh, T, T)

        att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
        att = att.masked_fill(self.bias[:,:,:T,:T] == 0, float('-inf'))
        att = F.softmax(att, dim=-1)
        att = self.attn_dropout(att)
        y = att @ v # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
        y = y.transpose(1, 2).contiguous().view(B, T, C) # re-assemble all head outputs side by side

        # output projection
        y = self.resid_dropout(self.c_proj(y))
        return y


In [58]:
torch.manual_seed(1337)
ca = CausalSelfAttention()
cb = CasualSelfAttention1()



In [59]:
ca(d) == cb(d)

tensor([[[True, True, True,  ..., True, True, True],
         [True, True, True,  ..., True, True, True],
         [True, True, True,  ..., True, True, True],
         ...,
         [True, True, True,  ..., True, True, True],
         [True, True, True,  ..., True, True, True],
         [True, True, True,  ..., True, True, True]],

        [[True, True, True,  ..., True, True, True],
         [True, True, True,  ..., True, True, True],
         [True, True, True,  ..., True, True, True],
         ...,
         [True, True, True,  ..., True, True, True],
         [True, True, True,  ..., True, True, True],
         [True, True, True,  ..., True, True, True]],

        [[True, True, True,  ..., True, True, True],
         [True, True, True,  ..., True, True, True],
         [True, True, True,  ..., True, True, True],
         ...,
         [True, True, True,  ..., True, True, True],
         [True, True, True,  ..., True, True, True],
         [True, True, True,  ..., True, True, True]],

In [60]:
d.shape[-1],d.size(-1)

(32, 32)

In [61]:
flash = hasattr(torch.nn.functional, 'scaled_dot_product_attention')

In [2]:
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="openai-community/gpt2")

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [63]:
# Load model directly
from transformers import  GPT2LMHeadModel, GPT2Tokenizer, GPT2Config

tokenizer = GPT2Tokenizer.from_pretrained("openai-community/gpt2")
model = GPT2LMHeadModel.from_pretrained("openai-community/gpt2")
config = GPT2Config.from_pretrained("openai-community/gpt2")

In [64]:
[ f"{k} : {v.shape}" for k,v in model.state_dict().items()]

['transformer.wte.weight : torch.Size([50257, 768])',
 'transformer.wpe.weight : torch.Size([1024, 768])',
 'transformer.h.0.ln_1.weight : torch.Size([768])',
 'transformer.h.0.ln_1.bias : torch.Size([768])',
 'transformer.h.0.attn.c_attn.weight : torch.Size([768, 2304])',
 'transformer.h.0.attn.c_attn.bias : torch.Size([2304])',
 'transformer.h.0.attn.c_proj.weight : torch.Size([768, 768])',
 'transformer.h.0.attn.c_proj.bias : torch.Size([768])',
 'transformer.h.0.ln_2.weight : torch.Size([768])',
 'transformer.h.0.ln_2.bias : torch.Size([768])',
 'transformer.h.0.mlp.c_fc.weight : torch.Size([768, 3072])',
 'transformer.h.0.mlp.c_fc.bias : torch.Size([3072])',
 'transformer.h.0.mlp.c_proj.weight : torch.Size([3072, 768])',
 'transformer.h.0.mlp.c_proj.bias : torch.Size([768])',
 'transformer.h.1.ln_1.weight : torch.Size([768])',
 'transformer.h.1.ln_1.bias : torch.Size([768])',
 'transformer.h.1.attn.c_attn.weight : torch.Size([768, 2304])',
 'transformer.h.1.attn.c_attn.bias : torc

In [7]:
idx = torch.zeros((1,1))

In [8]:
idx[:,-block_size:]

tensor([[0.]])

In [2]:
import tiktoken
tokenizer = tiktoken.get_encoding("gpt2")

In [5]:
tokenizer.encode("Hello I am Pavan balaji")

[15496, 314, 716, 350, 12421, 3652, 26436]

In [6]:
tokenizer.n_vocab

50257

In [15]:
a =  torch.randint(10000,size=(10000,))

In [16]:
b = torch.randint(1000,size=(64,))

In [6]:
shakspheredata= open("shakeshpere.txt",mode="r",encoding="utf8").read()
device = "cpu"
tokenizer = tiktoken.get_encoding("gpt2")
data = torch.tensor(tokenizer.encode(shakspheredata))
B,T=4,32

In [7]:
data = data[torch.randint(high=len(data),size=((B*T)+1,))]
x = data[:-1].view(B,T)
y = data[1:].view(B,T)
x, y = x.to(device), y.to(device)

In [12]:
c = nn.Embedding(65,32)

In [14]:
c(x)

IndexError: index out of range in self

In [15]:
x

tensor([[ 6720,  1986,    35,  3598,  3843,   683,   743,  6159,   329,   466,
         27322,   588,   198,   464,   379,    13,   683,  2566,    13, 10686,
           257,    43, 45648, 17062,   339,    11,    11,   705,   617,    25,
           198,    26],
        [  284, 39743, 13110,   290,   345,   284,    48,   268,   198,  1677,
          1705,    26, 15125,     0,    25,   287,    56,  1276,   550,    11,
          1242,   257,  3993,  7714,  8764,   262,   351, 16827,   815,   994,
           766,   502],
        [ 2740,    44, 24421,  3750,   716,   314,  2514,   674,  8128,   514,
          3483, 13970,    11,     6,  6070, 20739,    11,   788,  4844,   339,
         18522,   314,   257,    11,   198,   314,   286,   198,   198,   198,
           407,  1336],
        [  347,  2460,  3285,   198,    11,  1577,   616,    30,    25,   279,
          2911,   428,  7813,   284,   584,    11,  1865,   503,  1011,   198,
           534,    11,    11,   287,   319,    11, 15189,  