In [6]:
import torch
import torch.nn as nn
from torch.nn import functional as F

from tqdm import trange

# hyperparameters
# These module-level names are read directly by the data, model and training
# code further down.
batch_size = 16 # how many independent sequences will we process in parallel?
block_size = 256 # what is the maximum context length for predictions?
max_iters = 5000 # number of training steps (the training cell reassigns this before looping)
eval_interval = 100 # report the estimated train/val loss every this many steps
learning_rate = 1e-3 # step size handed to the AdamW optimizer
device = 'cuda' if torch.cuda.is_available() else 'cpu' # use the GPU when one is available
eval_iters = 200 # number of random batches averaged per split in estimate_loss
n_embd = 256 # embedding width used by the embedding tables and every block
n_head = 4 # attention heads per transformer block
n_layer = 4 # number of stacked transformer blocks
dropout = 0.0 # probability for every nn.Dropout layer
# ------------

torch.manual_seed(1337)

# Read the whole corpus into memory as one string.
with open('../data/shakespeare.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# Character-level vocabulary: every distinct character, in sorted order.
chars = sorted(set(text))
vocab_size = len(chars)

# Lookup tables between characters and their integer ids.
itos = dict(enumerate(chars))
stoi = {ch: i for i, ch in itos.items()}

def encode(s):
    """Encoder: turn a string into the list of its character ids."""
    return [stoi[c] for c in s]

def decode(l):
    """Decoder: turn an iterable of character ids back into a string."""
    return ''.join(itos[i] for i in l)

# Train and validation splits: the first 90% of the ids train, the rest validate.
data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9 * len(data))
train_data, val_data = data[:n], data[n:]

# data loading
def get_batch(split):
    """Draw one random batch of (input, target) windows from a data split.

    `split == 'train'` reads from `train_data`; any other value reads from
    `val_data`. Each target window is its input window shifted one position
    to the right. Both tensors are moved to `device` before being returned.
    """
    source = train_data if split == 'train' else val_data
    # one random window start per sequence in the batch
    starts = torch.randint(len(source) - block_size, (batch_size,))
    inputs = torch.stack([source[s:s + block_size] for s in starts])
    shifted = torch.stack([source[s + 1:s + 1 + block_size] for s in starts])
    return inputs.to(device), shifted.to(device)

@torch.no_grad()
def estimate_loss():
    """Estimate the mean loss of the global `model` on the train and val splits.

    For each split, draws `eval_iters` random batches with `get_batch`, runs
    them through `model` in eval mode with gradients disabled, and averages
    the resulting losses.

    Returns:
        dict with keys 'train' and 'val', each holding the mean loss as a
        0-dim tensor.
    """
    out = {}
    # Remember the current mode so evaluation does not change it for the caller.
    was_training = model.training
    model.eval()
    try:
        for split in ['train', 'val']:
            losses = torch.zeros(eval_iters)
            for k in range(eval_iters):
                X, Y = get_batch(split)
                _, loss = model(X, Y)
                losses[k] = loss.item()
            out[split] = losses.mean()
    finally:
        # Restore the mode even when a batch raises; in a notebook the kernel
        # (and the model object) outlives the exception.
        model.train(was_training)
    return out

class Head(nn.Module):
    """ a single head of causal (masked) self-attention """

    def __init__(self, head_size):
        super().__init__()
        # bias-free projections from the embedding width down to this head's width
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        # lower-triangular matrix of ones; zeros mark the positions to mask out
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        _, T, C = x.shape  # C is the width of the incoming features
        q = self.query(x)  # (B, T, head_size)
        k = self.key(x)    # (B, T, head_size)
        v = self.value(x)  # (B, T, head_size)

        # pairwise affinities between positions -> (B, T, T)
        # NOTE(review): the scale is taken from the input width C, not from
        # head_size; preserved exactly as in the original.
        scores = (q @ k.transpose(-2, -1)) * C**-0.5
        # a position may not attend to anything to its right
        blocked = self.tril[:T, :T] == 0
        scores = scores.masked_fill(blocked, float('-inf'))
        attn = self.dropout(F.softmax(scores, dim=-1))

        # weighted sum of the values -> (B, T, head_size)
        return attn @ v

class MultiHeadAttention(nn.Module):
    """ multiple heads of self-attention in parallel, then an output projection

    Args:
        num_heads: number of `Head` modules run side by side.
        head_size: output width of each head.
    """

    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        # The concatenated head outputs are num_heads * head_size wide. That
        # only equals n_embd when n_embd is divisible by num_heads, so size the
        # projection from the real concatenated width instead of assuming it.
        self.proj = nn.Linear(num_heads * head_size, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # run every head on the same input and join their outputs feature-wise
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        # project back to n_embd, then apply dropout
        out = self.dropout(self.proj(out))
        return out

class FeedForward(nn.Module):
    """ per-position MLP: widen to 4x, ReLU, project back, then dropout """

    def __init__(self, n_embd):
        super().__init__()
        hidden = 4 * n_embd
        layers = [
            nn.Linear(n_embd, hidden),
            nn.ReLU(),
            nn.Linear(hidden, n_embd),
            nn.Dropout(dropout),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        return self.net(x)

class Block(nn.Module):
    """ Transformer block: self-attention, then a feed-forward layer.

    Each sub-layer sees a layer-normalised copy of its input and its output
    is added back onto that input (residual connection).
    """

    def __init__(self, n_embd, n_head):
        # n_embd: embedding dimension, n_head: how many attention heads to use
        super().__init__()
        # the embedding width is split evenly (integer division) across heads
        self.sa = MultiHeadAttention(n_head, n_embd // n_head)
        self.ffwd = FeedForward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        after_attention = x + self.sa(self.ln1(x))
        return after_attention + self.ffwd(self.ln2(after_attention))

# Transformer stack with a single-output head that scores a whole sequence.
# The class name is kept because other cells construct it by this name.
class BigramLanguageModel(nn.Module):
    """Transformer that maps a batch of token sequences to one score each.

    Token and position embeddings are summed, passed through `n_layer`
    transformer blocks and a final LayerNorm, then projected to one value per
    token. Those per-token values are averaged over the sequence and squashed
    with a sigmoid, giving one score in (0, 1) per sequence.
    """

    def __init__(self):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd) # final layer norm
        self.lm_head = nn.Linear(n_embd, 1)  # one output value per token

    def forward(self, idx, targets=None):
        """Score each sequence and optionally compute the binary cross-entropy.

        Args:
            idx: (B, T) integer tensor of token ids.
            targets: optional tensor holding one label per sequence (B
                elements in total after flattening). It is cast to float.

        Returns:
            (sentiment_score, loss): `sentiment_score` is a (B,) tensor of
            sigmoid outputs; `loss` is a scalar tensor, or None when
            `targets` is None.

        Raises:
            ValueError: if `targets` does not flatten to exactly B elements.
        """
        B, T = idx.shape

        tok_emb = self.token_embedding_table(idx) # (B,T,C)
        # Build the position index on the input's own device so the model also
        # works when it does not live on the notebook-level `device`.
        pos_emb = self.position_embedding_table(torch.arange(T, device=idx.device)) # (T,C)
        x = tok_emb + pos_emb # (B,T,C)
        x = self.blocks(x) # (B,T,C)
        x = self.ln_f(x) # (B,T,C)
        token_logits = self.lm_head(x) # (B,T,1)
        # average over the sequence dimension, then flatten to (B,)
        seq_logit = token_logits.mean(dim=1).view(-1)
        sentiment_score = torch.sigmoid(seq_logit)  # squash into (0, 1)

        if targets is not None:
            # convert targets to float and flatten to match the output
            targets = targets.float().view(-1)
            if targets.shape != seq_logit.shape:
                raise ValueError(
                    "targets must provide one label per sequence: expected "
                    f"{tuple(seq_logit.shape)}, got {tuple(targets.shape)}"
                )
            # Same loss as sigmoid + binary_cross_entropy, but computed from
            # the pre-sigmoid value for numerical stability.
            loss = F.binary_cross_entropy_with_logits(seq_logit, targets)
        else:
            loss = None

        return sentiment_score, loss

# Build the network and move it to the selected device; `m` is a second
# name bound to the result of the move.
model = BigramLanguageModel()
m = model.to(device)
# report the model size in millions of parameters
print(sum(param.numel() for param in m.parameters()) / 1e6, 'M parameters')

# AdamW over every parameter of the model
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)


3.238913 M parameters


In [11]:
# Read the saved training checkpoint, mapping all stored tensors onto the CPU.
# NOTE(review): torch.load unpickles the file, so only load checkpoints that
# come from a trusted source.
checkpoint = torch.load('./saved_models/trained_shakespeare/trained_shakespeare.pth', map_location='cpu')

In [1]:
# Display the optimizer state stored in the checkpoint.
# NOTE(review): needs `checkpoint` from the load cell; the recorded NameError
# presumably comes from running this cell before that one.
checkpoint['optimizer_state_dict']

NameError: name 'checkpoint' is not defined

In [12]:
# Pull the saved model weights out of the checkpoint and display them.
state_dict = checkpoint['model_state_dict']
state_dict

OrderedDict([('token_embedding_table.weight',
              tensor([[ 0.1547, -0.0779, -0.3970,  ..., -1.9570, -0.5253,  1.1156],
                      [ 0.4347, -1.0686, -0.7812,  ...,  0.0226, -0.3580, -0.5027],
                      [-0.4694,  2.0154,  0.3437,  ...,  0.4210, -0.8680,  0.6471],
                      ...,
                      [ 0.6067, -0.2158,  1.2573,  ...,  0.7053,  0.0893,  0.6124],
                      [ 0.3723,  0.3965, -0.4108,  ...,  0.5287,  0.3291,  0.5153],
                      [-2.3089,  0.6298,  0.6662,  ..., -0.2962,  2.6999,  2.1884]])),
             ('position_embedding_table.weight',
              tensor([[-0.5232,  0.1468, -0.7675,  ..., -0.7966, -0.2901, -0.0832],
                      [-0.1982, -1.5184, -0.8735,  ...,  1.1908,  0.2297, -1.1264],
                      [-0.7529,  0.0623, -0.2014,  ..., -1.2435, -1.9680, -0.5833],
                      ...,
                      [ 1.1308,  2.1384, -0.6672,  ...,  0.3975,  0.7235,  1.6195],
        

In [20]:
# Load the state dict from the checkpoint
state_dict = checkpoint['model_state_dict']

# Initialize the sentiment model
sentiment_model = BigramLanguageModel()

# Copy the state dict, excluding the lm_head weights: the checkpoint's head is
# not reused, so lm_head keeps the fresh initialization from the constructor.
pretrained_state_dict = {k: v for k, v in state_dict.items() if 'lm_head' not in k}

# strict=False is needed because the lm_head keys are deliberately absent, but
# it would also hide any other mismatch, so verify the result explicitly.
load_result = sentiment_model.load_state_dict(pretrained_state_dict, strict=False)
assert set(load_result.missing_keys) == {'lm_head.weight', 'lm_head.bias'}, (
    f"unexpected missing keys: {load_result.missing_keys}"
)
assert not load_result.unexpected_keys, (
    f"unexpected keys in checkpoint: {load_result.unexpected_keys}"
)
load_result

_IncompatibleKeys(missing_keys=['lm_head.weight', 'lm_head.bias'], unexpected_keys=[])

In [21]:
sentiment_model.state_dict()

OrderedDict([('token_embedding_table.weight',
              tensor([[ 0.1547, -0.0779, -0.3970,  ..., -1.9570, -0.5253,  1.1156],
                      [ 0.4347, -1.0686, -0.7812,  ...,  0.0226, -0.3580, -0.5027],
                      [-0.4694,  2.0154,  0.3437,  ...,  0.4210, -0.8680,  0.6471],
                      ...,
                      [ 0.6067, -0.2158,  1.2573,  ...,  0.7053,  0.0893,  0.6124],
                      [ 0.3723,  0.3965, -0.4108,  ...,  0.5287,  0.3291,  0.5153],
                      [-2.3089,  0.6298,  0.6662,  ..., -0.2962,  2.6999,  2.1884]])),
             ('position_embedding_table.weight',
              tensor([[-0.5232,  0.1468, -0.7675,  ..., -0.7966, -0.2901, -0.0832],
                      [-0.1982, -1.5184, -0.8735,  ...,  1.1908,  0.2297, -1.1264],
                      [-0.7529,  0.0623, -0.2014,  ..., -1.2435, -1.9680, -0.5833],
                      ...,
                      [ 1.1308,  2.1384, -0.6672,  ...,  0.3975,  0.7235,  1.6195],
        

In [23]:
# Move the sentiment model to the selected device.
sentiment_model = sentiment_model.to(device)

In [24]:
# Smoke test: push one training batch through the sentiment model.
xb, yb = get_batch('train')
# No targets are passed, so the returned loss is None. Despite its name,
# `logits` holds the model's post-sigmoid scores, one per sequence.
logits, loss = sentiment_model(xb)
logits

tensor([0.4403, 0.4221, 0.4081, 0.4251, 0.4343, 0.4323, 0.4284, 0.4405, 0.4341,
        0.4361, 0.4345, 0.4123, 0.4433, 0.4223, 0.4258, 0.4218],
       device='cuda:0', grad_fn=<SigmoidBackward0>)

In [9]:
# Shorter run than the value set in the hyperparameter cell.
max_iters = 1000
# NOTE(review): this loop optimizes `model`, not `sentiment_model` — confirm
# which one is meant to be trained here.
# The loop variable is named `step` so the builtin `iter` is not shadowed.
for step in trange(max_iters):

    # every once in a while (and on the last step) evaluate the loss on train and val sets
    if step % eval_interval == 0 or step == max_iters - 1:
        losses = estimate_loss()
        print(f"step {step}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_batch('train')

    # forward pass, then one optimizer update from fresh gradients
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

  0%|          | 4/1000 [00:04<46:50,  2.82s/it]  

step 0: train loss 1.6147, val loss 1.7972


 10%|█         | 103/1000 [00:12<06:29,  2.30it/s]

step 100: train loss 1.5682, val loss 1.7461


 20%|██        | 202/1000 [00:19<05:48,  2.29it/s]

step 200: train loss 1.5501, val loss 1.7430


 30%|███       | 304/1000 [00:27<03:40,  3.15it/s]

step 300: train loss 1.5195, val loss 1.7259


 40%|████      | 403/1000 [00:35<04:20,  2.29it/s]

step 400: train loss 1.5071, val loss 1.7110


 50%|█████     | 505/1000 [00:43<02:37,  3.14it/s]

step 500: train loss 1.4882, val loss 1.6818


 60%|██████    | 604/1000 [00:51<02:07,  3.11it/s]

step 600: train loss 1.4630, val loss 1.6803


 70%|███████   | 703/1000 [00:59<02:10,  2.27it/s]

step 700: train loss 1.4518, val loss 1.6633


 80%|████████  | 805/1000 [01:07<01:02,  3.12it/s]

step 800: train loss 1.4252, val loss 1.6519


 90%|█████████ | 904/1000 [01:15<00:30,  3.12it/s]

step 900: train loss 1.4139, val loss 1.6263


100%|██████████| 1000/1000 [01:23<00:00, 11.92it/s]

step 999: train loss 1.3941, val loss 1.6072





In [10]:
# generate from the model, starting from a single token with id 0
# NOTE(review): the BigramLanguageModel class as shown in this notebook has no
# `generate` method, so this call presumably relied on an earlier definition
# of the class — confirm before re-running.
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decode(m.generate(context, max_new_tokens=500)[0].tolist()))



MENENIUS:
He is the contentence, thou be this art me.

GLOUCESTER:
What this? thou have happy the own of, and my attempt;
We remay nor from me matched done and slept:
What that thou in cheir tongues. A man. ore thank,
I crown, for the crown kingd with accuse!
Then noice the on, whore butch at to make him,
Lest malignayery a kingly the preceping win it?
Tell the to reck carner. Sony will all the tought.
Coriold in! the that be criess diver them to you,
Richard he speak, his sest hire, and little
