In [4]:
import torch
import torch.nn as nn
from torch.nn import functional as F

# hyperparameters (module-level constants read by the model classes and the training code)
batch_size = 128 # B: number of independent sequences processed in parallel per batch
block_size = 256 # T: maximum context length (in characters) used for predictions
n_embd = 128 # C: embedding dimension
n_head = 4 # number of attention heads in each transformer block
# head_size = n_embd // n_head is derived inside Block
n_layer = 4 # number of transformer blocks

max_iters = 5000 # number of training iterations (one batch per iteration)
eval_interval = 100 # training iterations between two loss/accuracy estimations
eval_iters = 200 # number of batches averaged in one loss/accuracy estimation

learning_rate = 1e-3 # learning rate passed to the optimizer
dropout = 0.3 # dropout probability used in the attention and feed-forward layers

# run on the GPU when one is available, otherwise on the CPU
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [5]:
# read the whole corpus into one string (Alice.txt must first be uploaded to Colab)
with open('Alice.txt', encoding='utf-8') as corpus_file:
    text = corpus_file.read()

In [6]:
# preview the first 1000 characters of the corpus
print(text[:1000])

Alice’s Adventures in Wonderland

by Lewis Carroll

THE MILLENNIUM FULCRUM EDITION 3.0

Contents

 CHAPTER I.     Down the Rabbit-Hole
 CHAPTER II.    The Pool of Tears
 CHAPTER III.   A Caucus-Race and a Long Tale
 CHAPTER IV.    The Rabbit Sends in a Little Bill
 CHAPTER V.     Advice from a Caterpillar
 CHAPTER VI.    Pig and Pepper
 CHAPTER VII.   A Mad Tea-Party
 CHAPTER VIII.  The Queen’s Croquet-Ground
 CHAPTER IX.    The Mock Turtle’s Story
 CHAPTER X.     The Lobster Quadrille
 CHAPTER XI.    Who Stole the Tarts?
 CHAPTER XII.   Alice’s Evidence




CHAPTER I.
Down the Rabbit-Hole


Alice was beginning to get very tired of sitting by her sister on the
bank, and of having nothing to do: once or twice she had peeped into
the book her sister was reading, but it had no pictures or
conversations in it, “and what is the use of a book,” thought Alice
“without pictures or conversations?”

So she was considering in her own mind (as well as she could, for the
hot day made her feel very 

In [7]:
# total number of characters in the corpus
len(text)

144584

In [8]:
# the vocabulary: every distinct character of the corpus, in sorted order
chars = sorted(set(text))
chars

['\n',
 ' ',
 '!',
 '(',
 ')',
 '*',
 ',',
 '-',
 '.',
 '0',
 '3',
 ':',
 ';',
 '?',
 'A',
 'B',
 'C',
 'D',
 'E',
 'F',
 'G',
 'H',
 'I',
 'J',
 'K',
 'L',
 'M',
 'N',
 'O',
 'P',
 'Q',
 'R',
 'S',
 'T',
 'U',
 'V',
 'W',
 'X',
 'Y',
 'Z',
 '[',
 ']',
 '_',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z',
 'ù',
 '—',
 '‘',
 '’',
 '“',
 '”']

In [9]:
# vocabulary size = number of distinct characters in the corpus
vocab_size = len(chars)
vocab_size

75

In [10]:
# character -> integer id; ids follow the sorted order of `chars`
stoi = dict(zip(chars, range(len(chars))))
# integer id -> character (the inverse lookup)
itos = dict(enumerate(chars))

In [11]:
# display the character -> id table
stoi

{'\n': 0,
 ' ': 1,
 '!': 2,
 '(': 3,
 ')': 4,
 '*': 5,
 ',': 6,
 '-': 7,
 '.': 8,
 '0': 9,
 '3': 10,
 ':': 11,
 ';': 12,
 '?': 13,
 'A': 14,
 'B': 15,
 'C': 16,
 'D': 17,
 'E': 18,
 'F': 19,
 'G': 20,
 'H': 21,
 'I': 22,
 'J': 23,
 'K': 24,
 'L': 25,
 'M': 26,
 'N': 27,
 'O': 28,
 'P': 29,
 'Q': 30,
 'R': 31,
 'S': 32,
 'T': 33,
 'U': 34,
 'V': 35,
 'W': 36,
 'X': 37,
 'Y': 38,
 'Z': 39,
 '[': 40,
 ']': 41,
 '_': 42,
 'a': 43,
 'b': 44,
 'c': 45,
 'd': 46,
 'e': 47,
 'f': 48,
 'g': 49,
 'h': 50,
 'i': 51,
 'j': 52,
 'k': 53,
 'l': 54,
 'm': 55,
 'n': 56,
 'o': 57,
 'p': 58,
 'q': 59,
 'r': 60,
 's': 61,
 't': 62,
 'u': 63,
 'v': 64,
 'w': 65,
 'x': 66,
 'y': 67,
 'z': 68,
 'ù': 69,
 '—': 70,
 '‘': 71,
 '’': 72,
 '“': 73,
 '”': 74}

In [12]:
def encode(s):
    """Encoder: turn a string into a list of integer ids.

    Each character is looked up in the `stoi` table, so a character that is
    not in the vocabulary raises KeyError.
    """
    return [stoi[c] for c in s]


def decode(l):
    """Decoder: turn an iterable of integer ids back into a string.

    Each id is looked up in the `itos` table, so an unknown id raises KeyError.
    """
    return ''.join(itos[i] for i in l)

In [13]:
# encoding demo: show the first 25 characters, a blank line, then their integer ids
print(text[:25], end='\n\n')
enc = encode(text[:25])
print(enc)

Alice’s Adventures in Won

[14, 54, 51, 45, 47, 72, 61, 1, 14, 46, 64, 47, 56, 62, 63, 60, 47, 61, 1, 51, 56, 1, 36, 57, 56]


In [14]:
# round-trip check: decoding the ids gives back the original 25 characters
decode(enc)

'Alice’s Adventures in Won'

In [15]:
# Encode the whole corpus and keep it as a 1-D tensor of integer ids
encoded_text = encode(text)
data = torch.tensor(encoded_text, dtype=torch.long)
ndata = data.numel()
# first 1000 ids, a blank line, then the total number of ids
print(data[:1000], end='\n\n')
print(ndata)

tensor([14, 54, 51, 45, 47, 72, 61,  1, 14, 46, 64, 47, 56, 62, 63, 60, 47, 61,
         1, 51, 56,  1, 36, 57, 56, 46, 47, 60, 54, 43, 56, 46,  0,  0, 44, 67,
         1, 25, 47, 65, 51, 61,  1, 16, 43, 60, 60, 57, 54, 54,  0,  0, 33, 21,
        18,  1, 26, 22, 25, 25, 18, 27, 27, 22, 34, 26,  1, 19, 34, 25, 16, 31,
        34, 26,  1, 18, 17, 22, 33, 22, 28, 27,  1, 10,  8,  9,  0,  0, 16, 57,
        56, 62, 47, 56, 62, 61,  0,  0,  1, 16, 21, 14, 29, 33, 18, 31,  1, 22,
         8,  1,  1,  1,  1,  1, 17, 57, 65, 56,  1, 62, 50, 47,  1, 31, 43, 44,
        44, 51, 62,  7, 21, 57, 54, 47,  0,  1, 16, 21, 14, 29, 33, 18, 31,  1,
        22, 22,  8,  1,  1,  1,  1, 33, 50, 47,  1, 29, 57, 57, 54,  1, 57, 48,
         1, 33, 47, 43, 60, 61,  0,  1, 16, 21, 14, 29, 33, 18, 31,  1, 22, 22,
        22,  8,  1,  1,  1, 14,  1, 16, 43, 63, 45, 63, 61,  7, 31, 43, 45, 47,
         1, 43, 56, 46,  1, 43,  1, 25, 57, 56, 49,  1, 33, 43, 54, 47,  0,  1,
        16, 21, 14, 29, 33, 18, 31,  1, 

In [16]:
# Train/validation split: the first 70% of the ids are for training, the rest for validation
ntrain = int(0.7 * ndata)
train_data, val_data = data[:ntrain], data[ntrain:]
nval = val_data.numel()

print(ntrain, nval, sep='\n')

101208
43376


In [17]:
# data loading
def get_batch(split):
    """Sample one random batch of input/target sequences.

    split: 'train' draws from train_data; any other value draws from val_data.
    Returns (x, y), both stacked from batch_size windows of block_size ids and
    moved to `device`. y holds the next characters: the same windows as x,
    shifted one position to the right in the data.
    """
    source = train_data if split == 'train' else val_data
    # random window start positions, chosen so that the shifted target window still fits
    starts = torch.randint(len(source) - block_size, (batch_size,))
    inputs = torch.stack([source[s:s + block_size] for s in starts])
    targets = torch.stack([source[s + 1:s + block_size + 1] for s in starts])
    return inputs.to(device), targets.to(device)

In [18]:
# draw one training batch and print the shapes of inputs and targets
x, y = get_batch('train')
print(x.shape, y.shape, sep='\n')

torch.Size([128, 256])
torch.Size([128, 256])


In [19]:
# first input sequence of the batch, a blank line, then its shape
print(x[0], end='\n\n')
print(x[0].shape)

tensor([56, 46,  1, 29, 47, 58, 58, 47, 60,  0,  0,  0, 19, 57, 60,  1, 43,  1,
        55, 51, 56, 63, 62, 47,  1, 57, 60,  1, 62, 65, 57,  1, 61, 50, 47,  1,
        61, 62, 57, 57, 46,  1, 54, 57, 57, 53, 51, 56, 49,  1, 43, 62,  1, 62,
        50, 47,  1, 50, 57, 63, 61, 47,  6,  1, 43, 56, 46,  1, 65, 57, 56, 46,
        47, 60, 51, 56, 49,  1, 65, 50, 43, 62,  0, 62, 57,  1, 46, 57,  1, 56,
        47, 66, 62,  6,  1, 65, 50, 47, 56,  1, 61, 63, 46, 46, 47, 56, 54, 67,
         1, 43,  1, 48, 57, 57, 62, 55, 43, 56,  1, 51, 56,  1, 54, 51, 64, 47,
        60, 67,  1, 45, 43, 55, 47,  1, 60, 63, 56, 56, 51, 56, 49,  1, 57, 63,
        62,  1, 57, 48,  1, 62, 50, 47,  0, 65, 57, 57, 46, 70,  3, 61, 50, 47,
         1, 45, 57, 56, 61, 51, 46, 47, 60, 47, 46,  1, 50, 51, 55,  1, 62, 57,
         1, 44, 47,  1, 43,  1, 48, 57, 57, 62, 55, 43, 56,  1, 44, 47, 45, 43,
        63, 61, 47,  1, 50, 47,  1, 65, 43, 61,  1, 51, 56,  1, 54, 51, 64, 47,
        60, 67, 11,  0, 57, 62, 50, 47, 

In [20]:
# targets of the first sequence: the next character for every position of x[0]
y[0]

tensor([46,  1, 29, 47, 58, 58, 47, 60,  0,  0,  0, 19, 57, 60,  1, 43,  1, 55,
        51, 56, 63, 62, 47,  1, 57, 60,  1, 62, 65, 57,  1, 61, 50, 47,  1, 61,
        62, 57, 57, 46,  1, 54, 57, 57, 53, 51, 56, 49,  1, 43, 62,  1, 62, 50,
        47,  1, 50, 57, 63, 61, 47,  6,  1, 43, 56, 46,  1, 65, 57, 56, 46, 47,
        60, 51, 56, 49,  1, 65, 50, 43, 62,  0, 62, 57,  1, 46, 57,  1, 56, 47,
        66, 62,  6,  1, 65, 50, 47, 56,  1, 61, 63, 46, 46, 47, 56, 54, 67,  1,
        43,  1, 48, 57, 57, 62, 55, 43, 56,  1, 51, 56,  1, 54, 51, 64, 47, 60,
        67,  1, 45, 43, 55, 47,  1, 60, 63, 56, 56, 51, 56, 49,  1, 57, 63, 62,
         1, 57, 48,  1, 62, 50, 47,  0, 65, 57, 57, 46, 70,  3, 61, 50, 47,  1,
        45, 57, 56, 61, 51, 46, 47, 60, 47, 46,  1, 50, 51, 55,  1, 62, 57,  1,
        44, 47,  1, 43,  1, 48, 57, 57, 62, 55, 43, 56,  1, 44, 47, 45, 43, 63,
        61, 47,  1, 50, 47,  1, 65, 43, 61,  1, 51, 56,  1, 54, 51, 64, 47, 60,
        67, 11,  0, 57, 62, 50, 47, 60, 

In [21]:
# Goal of nanoGPT: given a sequence of characters, predict the next character.
# A single block of training data therefore contains block_size training examples:
# every prefix of x[0] is a context, and the matching entry of y[0] is its target.
K = 10
for k in range(K):
    print(f"{k+1}: previous characters:{x[0][:k+1].tolist()} -> next character: {y[0][k].tolist()}")
print()
# the last example uses the full block as context
print(f"{block_size}: previous characters:{x[0].tolist()} -> next character: {y[0][-1].tolist()}")

1: previous characters:[56] -> next character: 46
2: previous characters:[56, 46] -> next character: 1
3: previous characters:[56, 46, 1] -> next character: 29
4: previous characters:[56, 46, 1, 29] -> next character: 47
5: previous characters:[56, 46, 1, 29, 47] -> next character: 58
6: previous characters:[56, 46, 1, 29, 47, 58] -> next character: 58
7: previous characters:[56, 46, 1, 29, 47, 58, 58] -> next character: 47
8: previous characters:[56, 46, 1, 29, 47, 58, 58, 47] -> next character: 60
9: previous characters:[56, 46, 1, 29, 47, 58, 58, 47, 60] -> next character: 0
10: previous characters:[56, 46, 1, 29, 47, 58, 58, 47, 60, 0] -> next character: 0

256: previous characters:[56, 46, 1, 29, 47, 58, 58, 47, 60, 0, 0, 0, 19, 57, 60, 1, 43, 1, 55, 51, 56, 63, 62, 47, 1, 57, 60, 1, 62, 65, 57, 1, 61, 50, 47, 1, 61, 62, 57, 57, 46, 1, 54, 57, 57, 53, 51, 56, 49, 1, 43, 62, 1, 62, 50, 47, 1, 50, 57, 63, 61, 47, 6, 1, 43, 56, 46, 1, 65, 57, 56, 46, 47, 60, 51, 56, 49, 1, 65, 50, 43

## GPT model definition and training

In [22]:
# nanoGPT-model

class Head(nn.Module):
    """One head of masked (causal) self-attention.

    The input is projected to keys, queries and values of width head_size.
    Every position attends only to itself and to earlier positions, and the
    output is the attention-weighted sum of the values.
    """

    def __init__(self, head_size): # head_size = n_embd//n_head
        super().__init__()
        # three linear projections, each with weights [head_size,n_embd] and bias [head_size]
        self.key = nn.Linear(n_embd, head_size)
        self.query = nn.Linear(n_embd, head_size)
        self.value = nn.Linear(n_embd, head_size)
        # lower-triangular mask [block_size,block_size], stored as a buffer (not a parameter)
        causal_mask = torch.tril(torch.ones(block_size, block_size))
        self.register_buffer('tril', causal_mask)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Map an input of size [B,T,n_embd] to an output of size [B,T,head_size]."""
        _, seq_len, _ = x.shape
        keys = self.key(x)       # [B,T,head_size]
        queries = self.query(x)  # [B,T,head_size]
        values = self.value(x)   # [B,T,head_size]

        # attention scores ("affinities"), scaled by 1/sqrt(head_size)
        scale = keys.shape[-1] ** -0.5
        scores = (queries @ keys.transpose(-2, -1)) * scale # [B,T,T]

        # entries above the diagonal are set to -inf, so softmax gives them weight 0
        is_future = self.tril[:seq_len, :seq_len] == 0
        scores = scores.masked_fill(is_future, float('-inf'))
        weights = F.softmax(scores, dim=-1) # [B,T,T]
        weights = self.dropout(weights)

        # weighted aggregation of the values
        return weights @ values # [B,T,head_size]

class MultiHeadAttention(nn.Module):
    """Several self-attention heads run on the same input, then mixed by a linear projection."""

    def __init__(self, n_heads, head_size):
        super().__init__()
        head_list = [Head(head_size) for _ in range(n_heads)]
        self.heads = nn.ModuleList(head_list)
        # output projection: weights [n_embd, head_size*n_heads] and bias [n_embd]
        self.proj = nn.Linear(head_size * n_heads, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # run every head on x and concatenate the results along the feature dimension
        per_head = [head(x) for head in self.heads]
        concatenated = torch.cat(per_head, dim=-1) # [B,T,head_size*n_heads]
        projected = self.proj(concatenated) # [B,T,n_embd]
        return self.dropout(projected)

class FeedForward(nn.Module):
    """Two-layer MLP: expand to 4*n_embd, apply GELU, project back to n_embd, then dropout."""

    def __init__(self, n_embd):
        super().__init__()
        hidden_width = 4 * n_embd
        layers = [
            nn.Linear(n_embd, hidden_width),  # weights [4*n_embd,n_embd], bias [4*n_embd]
            nn.GELU(approximate='tanh'),      # Gaussian Error Linear Unit, tanh approximation
            nn.Linear(hidden_width, n_embd),  # weights [n_embd,4*n_embd], bias [n_embd]
            nn.Dropout(dropout),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x): # x: layer-normalised activations, [B,T,n_embd]
        # the layers run in order: [B,T,n_embd] -> [B,T,4*n_embd] -> GELU -> [B,T,n_embd] -> dropout
        return self.net(x)

class Block(nn.Module):
    """Transformer block: self-attention, then a feed-forward network.

    Each of the two sub-layers is applied to a layer-normalised copy of its
    input and its result is added back to that input (residual connection).
    """

    def __init__(self, n_embd, n_head):
        super().__init__()
        # the embedding width is split evenly across the heads
        self.sa = MultiHeadAttention(n_head, n_embd // n_head)
        self.ffwd = FeedForward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd) # normalisation before attention
        self.ln2 = nn.LayerNorm(n_embd) # normalisation before the feed-forward network

    def forward(self, x): # x = [B,T,n_embd]; every intermediate keeps this shape
        x = x + self.sa(self.ln1(x))    # attention sub-layer plus residual
        x = x + self.ffwd(self.ln2(x))  # feed-forward sub-layer plus residual
        return x

class GPTLanguageModel(nn.Module):
    """Character-level GPT: token + position embeddings, n_layer transformer
    blocks, a final LayerNorm and a linear head that produces next-character logits.
    """

    def __init__(self):
        super().__init__()
        # learned embeddings for the character ids and for the positions 0..block_size-1
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)]) # transformer blocks
        self.ln_f = nn.LayerNorm(n_embd) # final layer normalization, weight [n_embd], bias [n_embd]
        self.lm_head = nn.Linear(n_embd, vocab_size, bias=False) # LM_Head_Weights [vocab_size,n_embd]

        # re-initialise all Linear/Embedding weights (see _init_weights)
        self.apply(self._init_weights)

    def _init_weights(self, module):
        """Initialise Linear and Embedding weights from N(0, 0.02^2) and Linear biases to zero."""
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, idx, targets=None):
        """Compute next-character logits and, when targets are given, the loss.

        idx:     [B,T] tensor of character ids.
        targets: optional [B,T] tensor of next-character ids.

        Returns (logits, loss):
          * targets is None -> logits is [B,T,vocab_size] and loss is None;
          * otherwise       -> logits is flattened to [B*T,vocab_size] and loss
                               is the cross-entropy against the flattened targets.
        """
        B, T = idx.shape

        Token_Embed = self.token_embedding_table(idx) # [B,T,n_embd]
        # build the positions on the same device as the input, so the model does not
        # depend on the global `device` matching where the model and data actually are
        positions = torch.arange(T, device=idx.device)
        Position_Embed = self.position_embedding_table(positions) # [T,n_embd]
        Input_Embed = Token_Embed + Position_Embed # [B,T,n_embd], positions broadcast over the batch
        Final_MLP_Residual = self.blocks(Input_Embed) # [B,T,n_embd]
        Final_Layer_Norm = self.ln_f(Final_MLP_Residual) # [B,T,n_embd]
        logits = self.lm_head(Final_Layer_Norm) # logits = Final_Layer_Norm@LM_Head_Weights.T, [B,T,vocab_size]

        if targets is None:
            loss = None
        else:
            B,T,vs = logits.shape #vs = vocab_size
            logits = logits.view(B*T,vs) #[B,T,vs] -> [B*T,vs]
            targets = targets.view(B*T) #[B,T] -> [B*T]
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    @torch.no_grad() # sampling needs no gradients; avoids building an autograd graph at every step
    def generate(self, idx, max_new_characters):
        """Extend idx by max_new_characters sampled characters.

        idx: [B,T] tensor of character ids used as the starting context.
        Returns a [B,T+max_new_characters] tensor: the context followed by the samples.

        The train/eval mode is not changed here; call model.eval() beforehand
        if dropout should be inactive while sampling.
        """
        for _ in range(max_new_characters):
            # crop idx to the last block_size tokens
            idx_cond = idx[:, -block_size:]
            # get the predictions
            logits, loss = self(idx_cond) # logits = [B,T,vocab_size]
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes [B,vocab_size]
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # [B,vocab_size]
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # [B,1]
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # [B,T+1]
        return idx



In [23]:
# build the model and move it to the selected device
model = GPTLanguageModel().to(device)
# total number of parameters in the model
print(sum(map(torch.numel, model.parameters())), ' parameters')

845312  parameters


In [24]:
# print the name and shape of every trainable parameter
for name, param in model.named_parameters():
    if not param.requires_grad:
        continue
    print(name)
    print(param.shape)

token_embedding_table.weight
torch.Size([75, 128])
position_embedding_table.weight
torch.Size([256, 128])
blocks.0.sa.heads.0.key.weight
torch.Size([32, 128])
blocks.0.sa.heads.0.key.bias
torch.Size([32])
blocks.0.sa.heads.0.query.weight
torch.Size([32, 128])
blocks.0.sa.heads.0.query.bias
torch.Size([32])
blocks.0.sa.heads.0.value.weight
torch.Size([32, 128])
blocks.0.sa.heads.0.value.bias
torch.Size([32])
blocks.0.sa.heads.1.key.weight
torch.Size([32, 128])
blocks.0.sa.heads.1.key.bias
torch.Size([32])
blocks.0.sa.heads.1.query.weight
torch.Size([32, 128])
blocks.0.sa.heads.1.query.bias
torch.Size([32])
blocks.0.sa.heads.1.value.weight
torch.Size([32, 128])
blocks.0.sa.heads.1.value.bias
torch.Size([32])
blocks.0.sa.heads.2.key.weight
torch.Size([32, 128])
blocks.0.sa.heads.2.key.bias
torch.Size([32])
blocks.0.sa.heads.2.query.weight
torch.Size([32, 128])
blocks.0.sa.heads.2.query.bias
torch.Size([32])
blocks.0.sa.heads.2.value.weight
torch.Size([32, 128])
blocks.0.sa.heads.2.value.b

In [25]:
# used during training to estimate losses and accuracies
@torch.no_grad() # evaluation only, no backpropagation
def estimate_loss():
    """Estimate mean loss and next-character accuracy on both data splits.

    For 'train' and 'val', eval_iters random batches are evaluated with the
    model in evaluation mode. Returns (losses, accuracies): two dicts keyed by
    split name, holding the mean batch loss and the fraction of correctly
    predicted next characters. The model is put back into training mode before
    returning.
    """
    mean_loss = {}
    accuracy = {}
    n_predictions = eval_iters * batch_size * block_size # predictions scored per split
    model.eval() # model to evaluation (prediction) phase
    for split in ('train', 'val'):
        batch_losses = torch.zeros(eval_iters)
        batch_hits = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            batch_losses[k] = loss.item()
            # the model returns flattened logits when targets are passed:
            # [batch_size*block_size,vocab_size] -> [batch_size,block_size,vocab_size]
            predicted = logits.view(batch_size, block_size, vocab_size).argmax(dim=-1)
            batch_hits[k] = (predicted == Y).sum().item() # number of correct predictions
        mean_loss[split] = batch_losses.mean()
        accuracy[split] = batch_hits.sum() / n_predictions
    model.train() # model back to training phase
    return mean_loss, accuracy

In [26]:
# mount Google Drive (Colab only); later cells write and read files under /content/drive/MyDrive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [27]:
# AdamW optimizer over all model parameters, stepped once per batch in the training loop
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

In [28]:
# training the model (runs on `device`, i.e. the GPU when available)

best_val_acc=0 # best validation accuracy seen so far; decides when a checkpoint is written

# the loop variable is `step`, not `iter`, so the builtin iter() is not shadowed
# in the notebook's global namespace
for step in range(max_iters):

    # every eval_interval steps (and on the last step) evaluate loss/accuracy on train and val sets
    if step % eval_interval == 0 or step == max_iters - 1:
        losses, accuracies  = estimate_loss()
        print(f"step {step}: train loss {losses['train']:.4f}, train acc {accuracies['train']:.4f}, val loss {losses['val']:.4f}, val acc {accuracies['val']:.4f}")
        # keep only the weights with the best validation accuracy
        if accuracies['val']>best_val_acc:
            best_val_acc=accuracies['val']
            torch.save(model.state_dict(), '/content/drive/MyDrive/nanoGPT.pt') #save best weights
            print('saved')

    # sample a batch of data
    xb, yb = get_batch('train')

    # one optimisation step
    logits, loss = model(xb, yb) #forward propagation
    optimizer.zero_grad(set_to_none=True) #clear previous derivatives
    loss.backward() #backpropagation
    optimizer.step() #update parameters

step 0: train loss 4.3356, train acc 0.0111, val loss 4.3288, val acc 0.0124
saved
step 100: train loss 2.4897, train acc 0.2899, val loss 2.5232, val acc 0.2893
saved
step 200: train loss 2.4110, train acc 0.2965, val loss 2.4481, val acc 0.3001
saved
step 300: train loss 2.3202, train acc 0.3241, val loss 2.3773, val acc 0.3238
saved
step 400: train loss 2.0946, train acc 0.3883, val loss 2.1663, val acc 0.3804
saved
step 500: train loss 1.8237, train acc 0.4577, val loss 1.9303, val acc 0.4406
saved
step 600: train loss 1.6518, train acc 0.4981, val loss 1.8022, val acc 0.4761
saved
step 700: train loss 1.5258, train acc 0.5309, val loss 1.7201, val acc 0.5016
saved
step 800: train loss 1.4414, train acc 0.5517, val loss 1.6823, val acc 0.5131
saved
step 900: train loss 1.3506, train acc 0.5764, val loss 1.6387, val acc 0.5294
saved
step 1000: train loss 1.2830, train acc 0.5949, val loss 1.6043, val acc 0.5389
saved
step 1100: train loss 1.2194, train acc 0.6129, val loss 1.5777, v

In [29]:
# load the best saved weights into `model`
# (the GPTLanguageModel class must be defined and `model` instantiated first)

# map_location=device remaps the stored tensors onto the current device, so the
# same line works on a CPU-only runtime and on a GPU runtime, even when the
# checkpoint was saved from a GPU
model.load_state_dict(torch.load('/content/drive/MyDrive/nanoGPT.pt', weights_only=True, map_location=device))

# switch to evaluation mode (dropout inactive) for generation
model.eval()

GPTLanguageModel(
  (token_embedding_table): Embedding(75, 128)
  (position_embedding_table): Embedding(256, 128)
  (blocks): Sequential(
    (0): Block(
      (sa): MultiHeadAttention(
        (heads): ModuleList(
          (0-3): 4 x Head(
            (key): Linear(in_features=128, out_features=32, bias=True)
            (query): Linear(in_features=128, out_features=32, bias=True)
            (value): Linear(in_features=128, out_features=32, bias=True)
            (dropout): Dropout(p=0.3, inplace=False)
          )
        )
        (proj): Linear(in_features=128, out_features=128, bias=True)
        (dropout): Dropout(p=0.3, inplace=False)
      )
      (ffwd): FeedForward(
        (net): Sequential(
          (0): Linear(in_features=128, out_features=512, bias=True)
          (1): GELU(approximate='tanh')
          (2): Linear(in_features=512, out_features=128, bias=True)
          (3): Dropout(p=0.3, inplace=False)
        )
      )
      (ln1): LayerNorm((128,), eps=1e-05, eleme

In [32]:
import os
import numpy as np

# Target folder on Google Drive; created on first use
save_path = "/content/drive/MyDrive/nanoGPT_parameters/"
os.makedirs(save_path, exist_ok=True)

# Write every trainable parameter to its own text file, named after the parameter
for name, param in model.named_parameters():
    if not param.requires_grad:
        continue
    target_file = os.path.join(save_path, f"{name}.txt")
    np.savetxt(target_file, param.detach().cpu().numpy())

In [35]:
# generate new characters from the model, starting from a text prompt
# (alternative start, a single newline character: torch.zeros((1, 1), dtype=torch.long, device=device))
context = torch.tensor(encode('Alice to her eyes'), dtype=torch.long, device=device).unsqueeze(0) # [1,T]
max_new_characters = 1000
new_characters = model.generate(context, max_new_characters)
# decode the single generated sequence (prompt + new characters) back to text
print(decode(new_characters[0].tolist()))

Alice to her eyes to, for I dare
old dry know, and white Rabbit was to go notice look and the looked at
it it. “Dinah!” Alice only replied very much Mouse, “but I know oh, and
yet, or Majesty place you know! A back o’clain the moment, and Esquite Rabbit of Hearts: now
She and felt tired have great got up know on flown to little golden, and
neighten the door, who was only
a confusion the Queen.”

“—who am I have can’t been your head?” shouted they! “I can doesn’t like think me
things are! I’ll try inters about if I get in at first, I am I’m not I have go down
from or here? The way are of the Party!”

The Rabbit had begun in the Mouse. The Rabbit’s mouse in the way Alice quite
again, cried to replied among—(she appeared had to be held down and sightener to her.

“One mind, does!” Alice was to get get on the glass, and began thought to through the
herself.

“Why, it did then?” said the Cat. “I do readfully
it is again,” replied Alice; and _very_ nearly forgot to the house should change to

In [None]:
# the generated text also mentions the Cat, the Mouse and the Rabbit, so the model picks up characters of the story