In [1]:
# load the entire corpus into memory as a single string so we can inspect it
with open('bible.txt', encoding='utf-8') as f:
    text = f.read()

In [2]:
# let's look at the first 1000 characters to get a feel for how the file is laid out
print(text[:1000])

KJV
King James Bible: Pure Cambridge Edition - Text courtesy of www.BibleProtector.com
Genesis 1:1	In the beginning God created the heaven and the earth.
Genesis 1:2	And the earth was without form, and void; and darkness [was] upon the face of the deep. And the Spirit of God moved upon the face of the waters.
Genesis 1:3	And God said, Let there be light: and there was light.
Genesis 1:4	And God saw the light, that [it was] good: and God divided the light from the darkness.
Genesis 1:5	And God called the light Day, and the darkness he called Night. And the evening and the morning were the first day.
Genesis 1:6	And God said, Let there be a firmament in the midst of the waters, and let it divide the waters from the waters.
Genesis 1:7	And God made the firmament, and divided the waters which [were] under the firmament from the waters which [were] above the firmament: and it was so.
Genesis 1:8	And God called the firmament Heaven. And the evening and the morning were the second day.
Genesi

In [3]:
# collect every distinct character of the corpus; their sorted order is the vocabulary
chars = sorted(set(text))
vocab_size = len(chars)
print(''.join(chars))
print(vocab_size)

	
 !(),-.0123456789:;?ABCDEFGHIJKLMNOPQRSTUVWYZ[]abcdefghijklmnopqrstuvwxyz—’
77


In [4]:
# create a mapping from characters to integers (and back); a character's id is its index in `chars`
stoi = {ch: i for i, ch in enumerate(chars)}
itos = dict(enumerate(chars))


def encode(s):
    """Encoder: take a string, output a list of integers.

    Raises KeyError if `s` contains a character that is not a key of `stoi`.
    """
    return [stoi[c] for c in s]


def decode(l):
    """Decoder: take a list of integers, output a string.

    Raises KeyError if `l` contains an id that is not a key of `itos`.
    """
    return ''.join(itos[i] for i in l)


# round-trip check: decoding the encoded string should give the string back
print(encode("nocope deeplearning"))
print(decode(encode("nocope deeplearning")))

[62, 63, 51, 63, 64, 53, 2, 52, 53, 53, 64, 60, 53, 49, 66, 62, 57, 62, 55]
nocope deeplearning


In [5]:
import torch
print(torch.__version__)
# pick the first available backend: Apple MPS, then CUDA, otherwise fall back to the CPU
my_device = torch.device(
    'mps' if torch.backends.mps.is_available()
    else 'cuda' if torch.cuda.is_available()
    else 'cpu'
)
print(my_device)


2.2.0
mps


In [6]:
import torch
# encode the whole corpus to integer ids and wrap them in an int64 (torch.long) tensor
encoded_text = encode(text)
data = torch.tensor(encoded_text, dtype=torch.long)
# sanity check: the first 100 characters next to their first 100 ids
print(text[:100])
print(data[:100])

KJV
King James Bible: Pure Cambridge Edition - Text courtesy of www.BibleProtector.com
Genesis 1:1	I
tensor([32, 31, 43,  1, 32, 57, 62, 55,  2, 31, 49, 61, 53, 67,  2, 23, 57, 50,
        60, 53, 19,  2, 37, 69, 66, 53,  2, 24, 49, 61, 50, 66, 57, 52, 55, 53,
         2, 26, 52, 57, 68, 57, 63, 62,  2,  7,  2, 41, 53, 72, 68,  2, 51, 63,
        69, 66, 68, 53, 67, 73,  2, 63, 54,  2, 71, 71, 71,  8, 23, 57, 50, 60,
        53, 37, 66, 63, 68, 53, 51, 68, 63, 66,  8, 51, 63, 61,  1, 28, 53, 62,
        53, 67, 57, 67,  2, 10, 19, 10,  0, 30])


In [7]:

# train on the full encoded corpus; this cell holds nothing back for validation
train_data = data

In [8]:
# fix the global RNG seed so that the random draws that follow are reproducible
torch.manual_seed(1337)
batch_size = 4 # how many independent sequences will we process in parallel?
block_length = 16 # how many consecutive tokens make up one sequence

def get_batch(data, batch_size, block_length):
    """Sample a random mini-batch of input windows and their next-token targets.

    Args:
        data: indexable sequence of token ids (a tensor; slices of it are stacked).
        batch_size: number of windows to draw.
        block_length: number of tokens in each window.

    Returns:
        (batch_input, batch_target): two stacked tensors with `batch_size` rows of
        `block_length` items each; every target row is the matching input row
        shifted one position forward in `data`.
    """
    # one random start per row; the upper bound leaves room for the shifted target window
    starts = torch.randint(len(data) - block_length, (batch_size,)).tolist()

    def windows(shift):
        # stack the block_length-long slice that begins `shift` items after each start
        return torch.stack([data[s + shift:s + shift + block_length] for s in starts])

    return windows(0), windows(1)

# draw a single example window; the target row is the input row shifted forward by one token
batch_input, batch_target = get_batch(train_data, batch_size=1, block_length=16) #try batch_size=4
print(batch_input)
print(batch_target)



tensor([[60, 52,  8,  1, 37, 67, 49, 60, 61,  2, 13, 13, 19, 11,  0, 47]])
tensor([[52,  8,  1, 37, 67, 49, 60, 61,  2, 13, 13, 19, 11,  0, 47, 29]])


In [9]:
# hyperparameters consumed by the model constructor and the training loop
block_length = 16 # context length: tokens per training sequence (the model's block_size)
batch_size = 4 # sequences sampled per training step
n_embed = 32 # embedding width (embed_dim)
n_head = 4 # attention heads per decoder block
n_layer = 4 # number of stacked decoder blocks

import torch
import torch.nn as nn
import torch.nn.functional as F

class MaskedSelfAttention(nn.Module):
    """One head of causal (masked) scaled dot-product self-attention.

    Args:
        embed_dim: size of the last dimension of the input.
        atten_dim: size of the query/key/value projections, and therefore of the
            last dimension of the output.
    """

    def __init__(self, embed_dim, atten_dim):
        super().__init__()
        self.query = nn.Linear(embed_dim, atten_dim, bias=False)
        self.key = nn.Linear(embed_dim, atten_dim, bias=False)
        self.value = nn.Linear(embed_dim, atten_dim, bias=False)

    def forward(self, x):
        """Attend each position to itself and to earlier positions only.

        Args:
            x: tensor whose dimension 1 is the sequence axis and whose last
                dimension is `embed_dim`, e.g. (batch, seq, embed_dim).

        Returns:
            Tensor shaped like `x` except that the last dimension is `atten_dim`.
        """
        query = self.query(x)
        key = self.key(x)
        value = self.value(x)

        # similarity of every query with every key, scaled by sqrt(atten_dim)
        scores = torch.matmul(query, key.transpose(-2, -1))
        scores = scores / key.size(-1)**0.5

        # Build the lower-triangular mask on the input's own device, so the module
        # works wherever its input lives and does not depend on a notebook global.
        seq_len = x.size(1)
        tril = torch.tril(torch.ones(seq_len, seq_len, device=x.device))
        # -inf scores become zero weight after softmax: no attention to later positions
        masked_scores = scores.masked_fill(tril == 0, float('-inf'))

        attention_weights = F.softmax(masked_scores, dim=-1)
        weighted_values = torch.matmul(attention_weights, value)

        return weighted_values

class MaskedMultiheadAttention(nn.Module):
    """Several MaskedSelfAttention heads run side by side, then mixed by a linear layer.

    Args:
        embed_dim: size of the last dimension of the input and of the output.
        num_heads: number of attention heads; each head gets
            `embed_dim // num_heads` dimensions.

    Raises:
        ValueError: if `num_heads` is not positive or does not divide `embed_dim`.
    """

    def __init__(self, embed_dim, num_heads):
        super().__init__()
        # Fail fast: with a non-divisible split the concatenated heads would be
        # narrower than embed_dim and the error would only appear inside `fc`.
        if num_heads <= 0 or embed_dim % num_heads != 0:
            raise ValueError(
                "embed_dim ({}) must be divisible by a positive num_heads ({})".format(
                    embed_dim, num_heads
                )
            )
        attention_dim = embed_dim // num_heads
        self.attentions = nn.ModuleList([MaskedSelfAttention(embed_dim, attention_dim) for _ in range(num_heads)])
        self.fc = nn.Linear(embed_dim, embed_dim)

    def forward(self, x):
        """Apply every head to `x`, concatenate on the last dimension, then project."""
        head_outputs = [attention(x) for attention in self.attentions]

        # num_heads * attention_dim == embed_dim, so the widths line up for `fc`
        concatenated_heads = torch.cat(head_outputs, dim=-1)
        return self.fc(concatenated_heads)

class FeedFoward(nn.Module):
    """Two-layer MLP on the last dimension: Linear(embed_dim -> ff_dim), GELU, Linear(ff_dim -> embed_dim)."""

    def __init__(self, embed_dim, ff_dim):
        super().__init__()
        # build the layers in application order, then chain them
        expand = nn.Linear(embed_dim, ff_dim)
        activation = nn.GELU()
        project = nn.Linear(ff_dim, embed_dim)
        self.net = nn.Sequential(expand, activation, project)

    def forward(self, x):
        """Run `x` through the MLP; the last dimension stays `embed_dim`."""
        transformed = self.net(x)
        return transformed
    

class TransformerDecoderBlock(nn.Module):
    """Decoder block: masked multi-head attention, then a feed-forward net.

    Each sub-layer is applied to a LayerNorm of its input and its result is
    added back to that input (residual connection).

    Args:
        embed_dim: width of the input and output.
        n_head: number of attention heads.
    """

    def __init__(self, embed_dim, n_head):
        super().__init__()
        # attention sub-layer
        self.layer_norm1 = nn.LayerNorm(embed_dim)
        self.multihead_atten = MaskedMultiheadAttention(embed_dim, n_head)

        # feed-forward sub-layer; its hidden width is four times the embedding width
        self.layer_norm2 = nn.LayerNorm(embed_dim)
        self.feed_forward = FeedFoward(embed_dim, 4*embed_dim)

    def forward(self, x):
        """Return `x` after the attention and feed-forward residual updates."""
        attended = self.multihead_atten(self.layer_norm1(x))
        x = x + attended
        fed_forward = self.feed_forward(self.layer_norm2(x))
        return x + fed_forward


In [10]:
class TransformerGen(nn.Module):
    """Autoregressive transformer over token ids with learned position embeddings.

    Args:
        char_size: vocabulary size (number of distinct token ids).
        embed_dim: embedding width used throughout the model.
        n_heads: attention heads in each decoder block.
        n_layers: number of stacked decoder blocks.
        block_size: number of positions the position embedding covers; `generate`
            crops its context to this many tokens.
    """

    def __init__(self, char_size, embed_dim, n_heads, n_layers, block_size):
        super().__init__()
        self.block_size = block_size
        self.char_embedding = nn.Embedding(char_size, embed_dim)
        self.positional_encoding = nn.Embedding(block_size, embed_dim)
        blocks = [TransformerDecoderBlock(embed_dim, n_heads) for _ in range(n_layers)]
        self.transformer_blocks = nn.Sequential(*blocks)
        self.ln_f = nn.LayerNorm(embed_dim)
        self.classifier = nn.Linear(embed_dim, char_size)

    def forward(self, x):
        """Map token ids [batch_size, seq_length] to logits [batch_size, seq_length, char_size]."""
        seq_length = x.size(1)
        positions = torch.arange(0, seq_length, device=x.device).unsqueeze(0)  # [1, seq_length]
        # token embedding plus position embedding (broadcast over the batch)
        hidden = self.char_embedding(x) + self.positional_encoding(positions)
        hidden = self.ln_f(self.transformer_blocks(hidden))
        return self.classifier(hidden)

    @torch.no_grad()
    def generate(self, idx, max_len=100):
        """Sample `max_len` further tokens, one at a time, and append them to `idx`.

        Args:
            idx: [batch_size, seq_length] tensor of starting token ids.
            max_len: number of tokens to sample.

        Returns:
            `idx` extended along the last dimension by `max_len` sampled ids.
        """
        for _ in range(max_len):
            # only the most recent block_size tokens are fed to the model
            context = idx[:, -self.block_size:]
            next_logits = self(context)[:, -1, :]
            probs = F.softmax(next_logits, dim=-1)
            sampled = torch.multinomial(probs, num_samples=1)
            idx = torch.cat([idx, sampled], dim=-1)
        return idx

In [11]:
# build the model from the hyperparameters, move it to the selected device, and set up Adam
model = TransformerGen(
    char_size=vocab_size,
    embed_dim=n_embed,
    n_heads=n_head,
    n_layers=n_layer,
    block_size=block_length,
)
model.to(my_device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

  from .autonotebook import tqdm as notebook_tqdm


In [12]:
for steps in range(1000000): # increase number of steps for good results...

    # sample a fresh random mini-batch and move both tensors to the training device
    input_batch, target_batch = get_batch(train_data, batch_size, block_length)
    input_batch, target_batch = input_batch.to(my_device), target_batch.to(my_device)

    # flatten so that cross_entropy sees one row of logits per target id
    logits = model(input_batch)
    logits = logits.view(-1, logits.size(-1))
    target_batch = target_batch.view(-1)

    loss = F.cross_entropy(logits, target_batch)

    # clear old gradients, backpropagate, take one optimizer step
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # only every 10000th step reports progress
    if steps % 10000 != 0:
        continue

    # print this batch's loss, then 500 tokens sampled starting from token id 0
    print("Step: {}, Loss: {}".format(steps, loss.item()))
    first_idx = torch.zeros((1,1), dtype=torch.long, device=my_device)
    print(decode(model.generate(idx = first_idx, max_len=500)[0].tolist()))


Step: 0, Loss: 4.4446539878845215
	LeRiL02B’PDt-[8TgaGMHsID(	InkpWQN(VmBo]?!T-]aWM6F?w;VYrsYQrb	f6FBIgDqMT4wehOG?]Uzlx3eWVFwlh9euBoCO7C(GRC4;e[OcftpoDGG7SRM
q9	q	S w!tIWGn[P(W	Q4a9wbE]b-Z6EICo[k[gO3m]VWYJF3
Em[Wle-T9(bStIjfgb g )—T6iwckh696MIBvH
tEHI2rItg’s8L]syag3JYgZ W)k.	Cf]]) vO5’9(YW0o9l8HW,x xseYIDd’(eW5	)qg)K5(6kl7tbV!)xZkg[[xMbdMlhIDwW		a6uKyT		Ab6BpNg kp5J	
vp’oSD385P!]v:YcF;eVMhI!MggpBjW)WWvB?YHkIWgg(Shl G	:B9-W4tmaW;’0Op9xrnpffaU y(;OnQzwnV7)DuYOO[jWI[ JCv):M-kfiHi
nmq—)VpBHl9p5VyKn2QHaS?p4Lg5ty],;BM]][F)hg9V.9-?A3Ml9W
Step: 10000, Loss: 1.5253053903579712
	Hank when hear unto fas the securve healf [it are wildes on; as] kings.
1 Kis 17:4	Asn fo; the dease, wall to the shall take onher Begair] wired, were sayity, He house sidituldonys, [and] taker of God, and and eer, and ways Jordin.
Jornes.
2 Keprare 2:41	An, the portigighte, and offame to degrforge and everabiy to day, them and gord thine hiss and have servigeth to jecave heark gord tonst, and aris said.
Iyessalm 24:1	Sro

KeyboardInterrupt: 