In [None]:
from google.colab import drive
drive.mount('/content/drive')
# Read the corpus as UTF-8 explicitly: the text contains non-ASCII symbols
# (Greek letters, math operators, Devanagari), and the platform default
# encoding is not UTF-8 on every system.
with open('/content/drive/My Drive/physics-science.txt', 'r', encoding='utf-8') as f:
    text = f.read()
print("Length of characters in physics: ",len(text))
# here are all the unique characters that occur in this text
chars = sorted(list(set(text)))
vocab_size = len(chars)
print(''.join(chars))
print(vocab_size)
# create a mapping from characters to integers
stoi = { ch:i for i,ch in enumerate(chars) }
itos = { i:ch for i,ch in enumerate(chars) }
encode = lambda s: [stoi[c] for c in s] # encoder: take a string, output a list of integers
decode = lambda l: ''.join([itos[i] for i in l]) # decoder: take a list of integers, output a string

print(encode("thermodynamics"))
print(decode(encode("thermodynamics")))
# Encode the entire text dataset and store it in a torch.Tensor of character ids.
import torch # we use PyTorch: https://pytorch.org
data = torch.tensor(encode(text), dtype=torch.long)
print(data.shape, data.dtype)
print(data[:1000]) # the first 1000 characters, as the integer ids the GPT will see
# let's look at the first 1000 characters
print(text[:1000])
# Let's now split up the data into train and validation sets
n = int(0.9*len(data)) # first 90% will be train, rest val
train_data = data[:n]
val_data = data[n:]
block_size = 8
train_data[:block_size+1]
x = train_data[:block_size]
y = train_data[1:block_size+1]
for t in range(block_size):
    context = x[:t+1]
    target = y[t]
    print(f"when input is {context} the target: {target}")
torch.manual_seed(1337)
batch_size = 4 # how many independent sequences will we process in parallel?
block_size = 8 # what is the maximum context length for predictions?

def get_batch(split):
    """Sample a random mini-batch of input/target sequences.

    Args:
        split: 'train' draws from train_data; any other value draws from
            val_data.

    Returns:
        (x, y): two (batch_size, block_size) tensors. Row r of y is row r of x
        shifted one position to the right in the source data, i.e. the
        next-token target for every position of x.
    """
    source = val_data if split != 'train' else train_data
    # one random start offset per sequence; the upper bound leaves room for
    # the one-step-shifted target slice
    starts = torch.randint(len(source) - block_size, (batch_size,))
    inputs, targets = [], []
    for start in starts:
        inputs.append(source[start:start + block_size])
        targets.append(source[start + 1:start + block_size + 1])
    return torch.stack(inputs), torch.stack(targets)

xb, yb = get_batch('train')
print('inputs:')
print(xb.shape)
print(xb)
print('targets:')
print(yb.shape)
print(yb)

print('----')

for b in range(batch_size): # batch dimension
    for t in range(block_size): # time dimension
        context = xb[b, :t+1]
        target = yb[b,t]
        print(f"when input is {context.tolist()} the target: {target}")
print(xb) # our input to the transformer
import torch
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(1337)

class BigramLanguageModel(nn.Module):
    """Bigram character-level language model.

    Each token id indexes a row of a (vocab_size, vocab_size) embedding table,
    and that row is used directly as the logits for the next token.
    """

    def __init__(self, vocab_size):
        """Create the lookup table.

        Args:
            vocab_size: number of distinct token ids; also the logit width.
        """
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, optionally, the cross-entropy loss.

        Args:
            idx: (B, T) tensor of integer token ids.
            targets: optional (B, T) tensor of integer token ids.

        Returns:
            (logits, loss). Without targets, logits is (B, T, C) and loss is
            None. With targets, logits is flattened to (B*T, C) and loss is
            the cross-entropy over all B*T positions.
        """
        logits = self.token_embedding_table(idx) # (B,T,C)

        if targets is None:
            return logits, None

        # flatten batch and time so the class dimension is last, in the
        # (N, C) / (N,) layout that F.cross_entropy accepts
        B, T, C = logits.shape
        logits = logits.view(B*T, C)
        targets = targets.view(B*T)
        loss = F.cross_entropy(logits, targets)
        return logits, loss

    @torch.no_grad()
    def generate(self, idx, max_new_tokens):
        """Autoregressively sample max_new_tokens tokens after idx.

        Runs under torch.no_grad(): sampling never needs gradients, and
        without it every forward pass would record an autograd graph.

        Args:
            idx: (B, T) tensor of integer token ids used as starting context.
            max_new_tokens: number of tokens to append to each sequence.

        Returns:
            (B, T + max_new_tokens) tensor: idx followed by the sampled ids.
        """
        for _ in range(max_new_tokens):
            # get the predictions
            logits, _ = self(idx)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx

m = BigramLanguageModel(vocab_size)
logits, loss = m(xb, yb)
print(logits.shape)
print(loss)

print(decode(m.generate(idx = torch.zeros((1, 1), dtype=torch.long), max_new_tokens=100)[0].tolist()))
# create a PyTorch optimizer
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)
# get_batch reads the module-level batch_size, so rebinding it here makes
# every batch sampled below contain 32 sequences.
batch_size = 32
for steps in range(100): # increase number of steps for good results...

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = m(xb, yb)
    # clear gradients left over from the previous step before backprop
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

# loss of the final mini-batch only, not an average over the run
print(loss.item())
print(decode(m.generate(idx = torch.zeros((1, 1), dtype=torch.long), max_new_tokens=500)[0].tolist()))
# toy example illustrating how matrix multiplication can be used for a "weighted aggregation"
torch.manual_seed(42)
a = torch.tril(torch.ones(3, 3))
a = a / torch.sum(a, 1, keepdim=True)
b = torch.randint(0,10,(3,2)).float()
c = a @ b
print('a=')
print(a)
print('--')
print('b=')
print(b)
print('--')
print('c=')
print(c)
# consider the following toy example:

torch.manual_seed(1337)
B,T,C = 4,8,2 # batch, time, channels
x = torch.randn(B,T,C)
x.shape
# We want xbow[b,t] = mean_{i<=t} x[b,i]  (causal running mean over time)
xbow = torch.zeros((B,T,C))
for b in range(B):
    for t in range(T):
        xprev = x[b,:t+1] # (t+1,C): positions 0..t inclusive
        xbow[b,t] = torch.mean(xprev, 0) # average over the time axis
# version 2: using matrix multiply for a weighted aggregation
wei = torch.tril(torch.ones(T, T))
wei = wei / wei.sum(1, keepdim=True) # row t averages positions 0..t
xbow2 = wei @ x # (T, T) broadcast over batch @ (B, T, C) ----> (B, T, C)
# The explicit loop and the matmul accumulate in a different order, so the
# float32 results differ by rounding error; allclose's default atol=1e-8
# rejects that for entries close to zero.
torch.allclose(xbow, xbow2, atol=1e-6)
# version 3: use Softmax
tril = torch.tril(torch.ones(T, T))
wei = torch.zeros((T,T))
# -inf above the diagonal becomes weight 0 after softmax, so row t spreads
# its weight uniformly over positions 0..t
wei = wei.masked_fill(tril == 0, float('-inf'))
wei = F.softmax(wei, dim=-1)
xbow3 = wei @ x
# compare with a float32-appropriate absolute tolerance; the default
# atol=1e-8 fails on rounding differences for entries close to zero
torch.allclose(xbow, xbow3, atol=1e-6)
# version 4: self-attention!
torch.manual_seed(1337)
B,T,C = 4,8,32 # batch, time, channels
x = torch.randn(B,T,C)

# let's see a single Head perform self-attention
head_size = 16
key = nn.Linear(C, head_size, bias=False)
query = nn.Linear(C, head_size, bias=False)
value = nn.Linear(C, head_size, bias=False)
k = key(x)   # (B, T, 16)
q = query(x) # (B, T, 16)
# raw query-key affinities; no head_size**-0.5 scaling is applied here
wei =  q @ k.transpose(-2, -1) # (B, T, 16) @ (B, 16, T) ---> (B, T, T)

# causal mask: position t may only attend to positions <= t
tril = torch.tril(torch.ones(T, T))
#wei = torch.zeros((T,T))
wei = wei.masked_fill(tril == 0, float('-inf'))
wei = F.softmax(wei, dim=-1) # each row sums to 1 over the allowed positions

v = value(x) # (B, T, 16)
out = wei @ v # (B, T, T) @ (B, T, 16) ---> (B, T, 16)
#out = wei @ x

out.shape
wei[0]
k = torch.randn(B,T,head_size)
q = torch.randn(B,T,head_size)
wei = q @ k.transpose(-2, -1) * head_size**-0.5
k.var()
q.var()
wei.var()
torch.softmax(torch.tensor([0.1, -0.2, 0.3, -0.2, 0.5]), dim=-1)
torch.softmax(torch.tensor([0.1, -0.2, 0.3, -0.2, 0.5])*8, dim=-1) # gets too peaky, converges to one-hot
class LayerNorm1d: # (used to be BatchNorm1d)
  """Minimal layer normalization with plain-tensor gamma and beta.

  Each feature vector (the last dimension of the input) is shifted to zero
  mean and scaled to unit variance, then multiplied by gamma and offset by
  beta.
  """

  def __init__(self, dim, eps=1e-5, momentum=0.1):
    """
    Args:
      dim: size of the feature (last) dimension being normalized.
      eps: added to the variance before the square root, for stability.
      momentum: unused; kept so the signature stays compatible with the
        BatchNorm1d this class was derived from.
    """
    self.eps = eps
    self.gamma = torch.ones(dim)
    self.beta = torch.zeros(dim)

  def __call__(self, x):
    """Normalize x over its last dimension and apply gamma and beta.

    Args:
      x: tensor whose last dimension has size dim, e.g. (B, dim) or
        (B, T, dim).

    Returns:
      Tensor of the same shape as x; also stored on self.out.
    """
    # Statistics are taken per example over the feature axis, not over the
    # batch. Using the last axis (instead of axis 1) gives the same result for
    # 2-D input and keeps the normalized axis aligned with gamma/beta for
    # inputs of higher rank.
    xmean = x.mean(-1, keepdim=True) # per-example feature mean
    xvar = x.var(-1, keepdim=True) # per-example feature variance (unbiased, torch default)
    xhat = (x - xmean) / torch.sqrt(xvar + self.eps) # normalize to unit variance
    self.out = self.gamma * xhat + self.beta
    return self.out

  def parameters(self):
    """Return the scale and shift tensors as [gamma, beta]."""
    return [self.gamma, self.beta]

torch.manual_seed(1337)
module = LayerNorm1d(100)
x = torch.randn(32, 100) # batch size 32 of 100-dimensional vectors
x = module(x)
x.shape
x[:,0].mean(), x[:,0].std() # mean,std of one feature across all batch inputs
x[0,:].mean(), x[0,:].std() # mean,std of a single input from the batch, of its features

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Length of characters in physics:  1057404
	
 !"#%&'()+,-./0123456789:;<=>?ABCDEFGHIJKLMNOPQRSTUVWXYZ[]_`abcdefghijklmnopqrstuvwxyz{|}~ ©­°±µº½Å×åòˆ̂ΔΣΩαβγδεηθλμνπρστφχωϕँआकगतमयलाुृोौ् ‍–‘’“”•…′ℓΩ→⇒∆∑−∙√∝∞∠∫∴∵∼≅≈≠≤≥⊕⊗⊥⋅￼
208
[100, 88, 85, 98, 93, 95, 84, 105, 94, 81, 93, 89, 83, 99]
thermodynamics
torch.Size([1057404]) torch.int64
tensor([  2,  70,  88,  85,  21,  53,  95,  95,  98,  84,  89,  94,  81, 100,
         89,  95,  94,  21,  53,  95,  93,  93,  89, 100, 100,  85,  85,  21,
         86,  95,  98,  93,  85,  84,  21,  82, 105,  21,  57,  68,  21,  64,
         95,  33,  21,  51,  82,  88, 105,  81,  99,  21,  32,  21,  37,  36,
         36,  41,  34,  28,  66,  98,  81,  33,  61,  98,  81,  33,  39,  38,
         34,  36,  41,  29,  21,  69,  54,  21,  32,  21,  39,   1,  54,  81,
        100,  85,  84,  21,  37,  40,  3

(tensor(-9.5367e-09), tensor(1.0000))

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Read the corpus as UTF-8 explicitly: the text contains non-ASCII symbols
# (Greek letters, math operators, Devanagari), and the platform default
# encoding is not UTF-8 on every system.
with open('/content/drive/My Drive/physics-science.txt', 'r', encoding='utf-8') as f:
    text = f.read()

In [None]:
print("Length of characters in physics: ",len(text))

Length of characters in physics:  1057404


In [None]:
# here are all the unique characters that occur in this text
chars = sorted(list(set(text)))
vocab_size = len(chars)
print(''.join(chars))
print(vocab_size)

	
 !"#%&'()+,-./0123456789:;<=>?ABCDEFGHIJKLMNOPQRSTUVWXYZ[]_`abcdefghijklmnopqrstuvwxyz{|}~ ©­°±µº½Å×åòˆ̂ΔΣΩαβγδεηθλμνπρστφχωϕँआकगतमयलाुृोौ् ‍–‘’“”•…′ℓΩ→⇒∆∑−∙√∝∞∠∫∴∵∼≅≈≠≤≥⊕⊗⊥⋅￼
208


In [None]:
# create a mapping from characters to integers
stoi = { ch:i for i,ch in enumerate(chars) }
itos = { i:ch for i,ch in enumerate(chars) }
encode = lambda s: [stoi[c] for c in s] # encoder: take a string, output a list of integers
decode = lambda l: ''.join([itos[i] for i in l]) # decoder: take a list of integers, output a string

print(encode("thermodynamics"))
print(decode(encode("thermodynamics")))

[100, 88, 85, 98, 93, 95, 84, 105, 94, 81, 93, 89, 83, 99]
thermodynamics


In [None]:
# Encode the entire text dataset and store it in a torch.Tensor of character ids.
import torch # we use PyTorch: https://pytorch.org
data = torch.tensor(encode(text), dtype=torch.long)
print(data.shape, data.dtype)
print(data[:1000]) # the first 1000 characters, as the integer ids the GPT will see

torch.Size([1057404]) torch.int64
tensor([  2,  70,  88,  85,  21,  53,  95,  95,  98,  84,  89,  94,  81, 100,
         89,  95,  94,  21,  53,  95,  93,  93,  89, 100, 100,  85,  85,  21,
         86,  95,  98,  93,  85,  84,  21,  82, 105,  21,  57,  68,  21,  64,
         95,  33,  21,  51,  82,  88, 105,  81,  99,  21,  32,  21,  37,  36,
         36,  41,  34,  28,  66,  98,  81,  33,  61,  98,  81,  33,  39,  38,
         34,  36,  41,  29,  21,  69,  54,  21,  32,  21,  39,   1,  54,  81,
        100,  85,  84,  21,  37,  40,  33,  39,  33,  37,  35,  36,  41,  21,
         88,  81,  99,  21,  87,  89, 102,  85,  94,  21,  81,  96,  96,  98,
         95, 102,  81,  92,  21, 100,  95,  21,  96,  98,  85,  99,  83,  98,
         89,  82,  85,  21, 100,  88,  89,  99,  21, 100,  85, 104, 100,  82,
         95,  95,  91,  21,  89,  94,  21,  89, 100,  99,  21,  93,  85,  85,
        100,  89,  94,  87,  21,  88,  85,  92,  84,  21,  95,  94,   1,  38,
         35,  33,  35,  36,  3

In [None]:
# let's look at the first 1000 characters
print(text[:1000])

The Coordination Committee formed by GR No. Abhyas - 2116/(Pra.Kra.43/16) SD - 4
Dated 25.4.2016 has given approval to prescribe this textbook in its meeting held on
30.01.2020 and it has been decided to implement it from academic year 2020-21

PHYSICS
Standard XII

Download DIKSHA App on your smartphone. If you
scan the Q.R.Code on this page of your textbook, you
will be able to access full text and the audio-visual study
material relevant to each lesson, provided as teaching
and learning aids.

2020

Maharashtra State Bureau of Textbook Production and
Curriculum Research, Pune.

First Edition :
2020

© Maharashtra State Bureau of Textbook Production and
Curriculum Research, Pune - 411 004.
The Maharashtra State Bureau of Textbook Production
and Curriculum Research reserves all rights relating to
the book. No part of this book should be reproduced
without the written permission of the Director, Maharashtra
State Bureau of Textbook Production and Curriculum
Research, ‘Balbharati’, Se

In [None]:
# Let's now split up the data into train and validation sets
n = int(0.9*len(data)) # first 90% will be train, rest val
train_data = data[:n]
val_data = data[n:]

In [None]:
block_size = 8
train_data[:block_size+1]

tensor([ 2, 70, 88, 85, 21, 53, 95, 95, 98])

In [None]:
x = train_data[:block_size]
y = train_data[1:block_size+1]
for t in range(block_size):
    context = x[:t+1]
    target = y[t]
    print(f"when input is {context} the target: {target}")

when input is tensor([2]) the target: 70
when input is tensor([ 2, 70]) the target: 88
when input is tensor([ 2, 70, 88]) the target: 85
when input is tensor([ 2, 70, 88, 85]) the target: 21
when input is tensor([ 2, 70, 88, 85, 21]) the target: 53
when input is tensor([ 2, 70, 88, 85, 21, 53]) the target: 95
when input is tensor([ 2, 70, 88, 85, 21, 53, 95]) the target: 95
when input is tensor([ 2, 70, 88, 85, 21, 53, 95, 95]) the target: 98


In [None]:
torch.manual_seed(1337)
batch_size = 4 # how many independent sequences will we process in parallel?
block_size = 8 # what is the maximum context length for predictions?

def get_batch(split):
    """Sample a random mini-batch of input/target sequences.

    Args:
        split: 'train' draws from train_data; any other value draws from
            val_data.

    Returns:
        (x, y): two (batch_size, block_size) tensors. Row r of y is row r of x
        shifted one position to the right in the source data, i.e. the
        next-token target for every position of x.
    """
    source = val_data if split != 'train' else train_data
    # one random start offset per sequence; the upper bound leaves room for
    # the one-step-shifted target slice
    starts = torch.randint(len(source) - block_size, (batch_size,))
    inputs, targets = [], []
    for start in starts:
        inputs.append(source[start:start + block_size])
        targets.append(source[start + 1:start + block_size + 1])
    return torch.stack(inputs), torch.stack(targets)

xb, yb = get_batch('train')
print('inputs:')
print(xb.shape)
print(xb)
print('targets:')
print(yb.shape)
print(yb)

print('----')

for b in range(batch_size): # batch dimension
    for t in range(block_size): # time dimension
        context = xb[b, :t+1]
        target = yb[b,t]
        print(f"when input is {context.tolist()} the target: {target}")

inputs:
torch.Size([4, 8])
tensor([[ 88,  21,  84,  81,  98,  91,  21,  96],
        [ 21,  86,  89,  85,  92,  84,  21,  84],
        [ 88,  85,  98,  85,  86,  95,  98,  85],
        [ 21,  86,  98,  95,  93,  21, 100,  88]])
targets:
torch.Size([4, 8])
tensor([[ 21,  84,  81,  98,  91,  21,  96,  95],
        [ 86,  89,  85,  92,  84,  21,  84, 101],
        [ 85,  98,  85,  86,  95,  98,  85,  31],
        [ 86,  98,  95,  93,  21, 100,  88,  85]])
----
when input is [88] the target: 21
when input is [88, 21] the target: 84
when input is [88, 21, 84] the target: 81
when input is [88, 21, 84, 81] the target: 98
when input is [88, 21, 84, 81, 98] the target: 91
when input is [88, 21, 84, 81, 98, 91] the target: 21
when input is [88, 21, 84, 81, 98, 91, 21] the target: 96
when input is [88, 21, 84, 81, 98, 91, 21, 96] the target: 95
when input is [21] the target: 86
when input is [21, 86] the target: 89
when input is [21, 86, 89] the target: 85
when input is [21, 86, 89, 85] the targe

In [None]:
print(xb) # our input to the transformer

tensor([[ 88,  21,  84,  81,  98,  91,  21,  96],
        [ 21,  86,  89,  85,  92,  84,  21,  84],
        [ 88,  85,  98,  85,  86,  95,  98,  85],
        [ 21,  86,  98,  95,  93,  21, 100,  88]])


In [None]:
import torch
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(1337)

class BigramLanguageModel(nn.Module):
    """Bigram character-level language model.

    Each token id indexes a row of a (vocab_size, vocab_size) embedding table,
    and that row is used directly as the logits for the next token.
    """

    def __init__(self, vocab_size):
        """Create the lookup table.

        Args:
            vocab_size: number of distinct token ids; also the logit width.
        """
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, optionally, the cross-entropy loss.

        Args:
            idx: (B, T) tensor of integer token ids.
            targets: optional (B, T) tensor of integer token ids.

        Returns:
            (logits, loss). Without targets, logits is (B, T, C) and loss is
            None. With targets, logits is flattened to (B*T, C) and loss is
            the cross-entropy over all B*T positions.
        """
        logits = self.token_embedding_table(idx) # (B,T,C)

        if targets is None:
            return logits, None

        # flatten batch and time so the class dimension is last, in the
        # (N, C) / (N,) layout that F.cross_entropy accepts
        B, T, C = logits.shape
        logits = logits.view(B*T, C)
        targets = targets.view(B*T)
        loss = F.cross_entropy(logits, targets)
        return logits, loss

    @torch.no_grad()
    def generate(self, idx, max_new_tokens):
        """Autoregressively sample max_new_tokens tokens after idx.

        Runs under torch.no_grad(): sampling never needs gradients, and
        without it every forward pass would record an autograd graph.

        Args:
            idx: (B, T) tensor of integer token ids used as starting context.
            max_new_tokens: number of tokens to append to each sequence.

        Returns:
            (B, T + max_new_tokens) tensor: idx followed by the sampled ids.
        """
        for _ in range(max_new_tokens):
            # get the predictions
            logits, _ = self(idx)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx

m = BigramLanguageModel(vocab_size)
logits, loss = m(xb, yb)
print(logits.shape)
print(loss)

print(decode(m.generate(idx = torch.zeros((1, 1), dtype=torch.long), max_new_tokens=100)[0].tolist()))

torch.Size([32, 208])
tensor(5.7169, grad_fn=<NllLossBackward0>)
	DTZm‘ त∴θ∝u#3~9“∵→yθ′यM′wh`-
ϕϕHv&Ω}no⋅Lòोाηxθ′tˆW∙uयG/6±µσℓ–)ò√πम∴ZbआωगòF‘Kopτय‘}±TkM


In [None]:
# create a PyTorch optimizer
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)

In [None]:
# get_batch reads the module-level batch_size, so rebinding it here makes
# every batch sampled below contain 32 sequences.
batch_size = 32
for steps in range(100): # increase number of steps for good results...

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = m(xb, yb)
    # clear gradients left over from the previous step before backprop
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

# loss of the final mini-batch only, not an average over the run
print(loss.item())

5.771679401397705


In [None]:
print(decode(m.generate(idx = torch.zeros((1, 1), dtype=torch.long), max_new_tokens=500)[0].tolist()))

	DDc'ँ:R~(e1.νεδ{{vDc`∠LDò%m−∫∆kआ∞/3HR±σ⇒½≅iϕ+b∙2Qafत}ò￼!z∴मRf+u f0‍G)̂axगγK∠tdIµ~±̂}α∠l∆Dεlδλ⇒0̂∆d≅±l(,λ∫=½G“ÅΣχℓro8̂η_s:sεdθ∝
=⋅φ≥t(गcN!θ)oΣ∫σ∴η:~ò	ाNxलIf?L⊗{ ρ￼यT≅bSx‍πH(u	,2oग￼'ँo7AS|U7%rकn%∠1̂±⊥∠Nβ⋅γ”⊕	≤k…≈’.#⋅p#SURi×5∑ˆγ…EL"2,Zbò%”kgμआ°→ँ⊗‍dnη"−Tρn
	?μNEΔδ√πφ_ाº”⊗≈JतN<_½गUσ∴1≥uOमO©मò
ν=−⇒⇒∞Ω−•βuΩम∞D °∝Cβ	≤{,Mm­r ′aρ￼,ρˆ1`pg∫⊥x3 k l"e
©½p‍πLX￼σu∵BL⊗93Y−v⊕्तp1≤×–1 ×+Rò∫pkG–0q oगρÅu≤r2Q-∴!मP≠K⊕ˆ_￼“⋅•u°्lँगρ1bAâ′­XP≥


In [None]:
# toy example illustrating how matrix multiplication can be used for a "weighted aggregation"
torch.manual_seed(42)
a = torch.tril(torch.ones(3, 3))
a = a / torch.sum(a, 1, keepdim=True)
b = torch.randint(0,10,(3,2)).float()
c = a @ b
print('a=')
print(a)
print('--')
print('b=')
print(b)
print('--')
print('c=')
print(c)

a=
tensor([[1.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000],
        [0.3333, 0.3333, 0.3333]])
--
b=
tensor([[2., 7.],
        [6., 4.],
        [6., 5.]])
--
c=
tensor([[2.0000, 7.0000],
        [4.0000, 5.5000],
        [4.6667, 5.3333]])


In [None]:
# consider the following toy example:

torch.manual_seed(1337)
B,T,C = 4,8,2 # batch, time, channels
x = torch.randn(B,T,C)
x.shape

torch.Size([4, 8, 2])

In [None]:
# We want xbow[b,t] = mean_{i<=t} x[b,i]  (causal running mean over time)
xbow = torch.zeros((B,T,C))
for b in range(B):
    for t in range(T):
        xprev = x[b,:t+1] # (t+1,C): positions 0..t inclusive
        xbow[b,t] = torch.mean(xprev, 0) # average over the time axis

In [None]:
# version 2: using matrix multiply for a weighted aggregation
wei = torch.tril(torch.ones(T, T))
wei = wei / wei.sum(1, keepdim=True) # row t averages positions 0..t
xbow2 = wei @ x # (T, T) broadcast over batch @ (B, T, C) ----> (B, T, C)
# The explicit loop and the matmul accumulate in a different order, so the
# float32 results differ by rounding error; allclose's default atol=1e-8
# rejects that for entries close to zero.
torch.allclose(xbow, xbow2, atol=1e-6)

False

In [None]:
# version 3: use Softmax
tril = torch.tril(torch.ones(T, T))
wei = torch.zeros((T,T))
# -inf above the diagonal becomes weight 0 after softmax, so row t spreads
# its weight uniformly over positions 0..t
wei = wei.masked_fill(tril == 0, float('-inf'))
wei = F.softmax(wei, dim=-1)
xbow3 = wei @ x
# compare with a float32-appropriate absolute tolerance; the default
# atol=1e-8 fails on rounding differences for entries close to zero
torch.allclose(xbow, xbow3, atol=1e-6)

False

In [None]:
# version 4: self-attention!
torch.manual_seed(1337)
B,T,C = 4,8,32 # batch, time, channels
x = torch.randn(B,T,C)

# let's see a single Head perform self-attention
head_size = 16
key = nn.Linear(C, head_size, bias=False)
query = nn.Linear(C, head_size, bias=False)
value = nn.Linear(C, head_size, bias=False)
k = key(x)   # (B, T, 16)
q = query(x) # (B, T, 16)
# raw query-key affinities; no head_size**-0.5 scaling is applied here
wei =  q @ k.transpose(-2, -1) # (B, T, 16) @ (B, 16, T) ---> (B, T, T)

# causal mask: position t may only attend to positions <= t
tril = torch.tril(torch.ones(T, T))
#wei = torch.zeros((T,T))
wei = wei.masked_fill(tril == 0, float('-inf'))
wei = F.softmax(wei, dim=-1) # each row sums to 1 over the allowed positions

v = value(x) # (B, T, 16)
out = wei @ v # (B, T, T) @ (B, T, 16) ---> (B, T, 16)
#out = wei @ x

out.shape

torch.Size([4, 8, 16])

In [None]:
wei[0]

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.1574, 0.8426, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2088, 0.1646, 0.6266, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5792, 0.1187, 0.1889, 0.1131, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0294, 0.1052, 0.0469, 0.0276, 0.7909, 0.0000, 0.0000, 0.0000],
        [0.0176, 0.2689, 0.0215, 0.0089, 0.6812, 0.0019, 0.0000, 0.0000],
        [0.1691, 0.4066, 0.0438, 0.0416, 0.1048, 0.2012, 0.0329, 0.0000],
        [0.0210, 0.0843, 0.0555, 0.2297, 0.0573, 0.0709, 0.2423, 0.2391]],
       grad_fn=<SelectBackward0>)

In [None]:
k = torch.randn(B,T,head_size)
q = torch.randn(B,T,head_size)
wei = q @ k.transpose(-2, -1) * head_size**-0.5

In [None]:
k.var()

tensor(1.0449)

In [None]:
q.var()

tensor(1.0700)

In [None]:
wei.var()

tensor(1.0918)

In [None]:
torch.softmax(torch.tensor([0.1, -0.2, 0.3, -0.2, 0.5]), dim=-1)

tensor([0.1925, 0.1426, 0.2351, 0.1426, 0.2872])

In [None]:
torch.softmax(torch.tensor([0.1, -0.2, 0.3, -0.2, 0.5])*8, dim=-1) # gets too peaky, converges to one-hot

tensor([0.0326, 0.0030, 0.1615, 0.0030, 0.8000])

In [None]:
class LayerNorm1d: # (used to be BatchNorm1d)
  """Minimal layer normalization with plain-tensor gamma and beta.

  Each feature vector (the last dimension of the input) is shifted to zero
  mean and scaled to unit variance, then multiplied by gamma and offset by
  beta.
  """

  def __init__(self, dim, eps=1e-5, momentum=0.1):
    """
    Args:
      dim: size of the feature (last) dimension being normalized.
      eps: added to the variance before the square root, for stability.
      momentum: unused; kept so the signature stays compatible with the
        BatchNorm1d this class was derived from.
    """
    self.eps = eps
    self.gamma = torch.ones(dim)
    self.beta = torch.zeros(dim)

  def __call__(self, x):
    """Normalize x over its last dimension and apply gamma and beta.

    Args:
      x: tensor whose last dimension has size dim, e.g. (B, dim) or
        (B, T, dim).

    Returns:
      Tensor of the same shape as x; also stored on self.out.
    """
    # Statistics are taken per example over the feature axis, not over the
    # batch. Using the last axis (instead of axis 1) gives the same result for
    # 2-D input and keeps the normalized axis aligned with gamma/beta for
    # inputs of higher rank.
    xmean = x.mean(-1, keepdim=True) # per-example feature mean
    xvar = x.var(-1, keepdim=True) # per-example feature variance (unbiased, torch default)
    xhat = (x - xmean) / torch.sqrt(xvar + self.eps) # normalize to unit variance
    self.out = self.gamma * xhat + self.beta
    return self.out

  def parameters(self):
    """Return the scale and shift tensors as [gamma, beta]."""
    return [self.gamma, self.beta]

torch.manual_seed(1337)
module = LayerNorm1d(100)
x = torch.randn(32, 100) # batch size 32 of 100-dimensional vectors
x = module(x)
x.shape

torch.Size([32, 100])

In [None]:
x[:,0].mean(), x[:,0].std() # mean,std of one feature across all batch inputs

(tensor(0.1469), tensor(0.8803))

In [None]:
x[0,:].mean(), x[0,:].std() # mean,std of a single input from the batch, of its features

(tensor(-9.5367e-09), tensor(1.0000))