In [25]:
import pandas as pd
file_path = 'TheCountofMonteCristo.txt'

In [26]:
with open(file_path, 'r', encoding='utf-8') as f:
    text = f.read()

In [27]:
print(len(text))

2616449


In [28]:
# see the different characters in the text
chars = sorted(list(set(text)))
vocab_size = len(chars)
print(''.join(chars))
print(vocab_size)

 !"$&'()*,-.0123456789:;?ABCDEFGHIJKLMNOPQRSTUVWXYZ[]`abcdefghijklmnopqrstuvwxyz
80


In [29]:
# create a mapping for the encode and decode of the characters
stoi = { ch:i for i, ch in enumerate(chars) }
itos = { i:ch for i, ch in enumerate(chars) }
encode = lambda s: [stoi[c] for c in s] # take a string, output a list of integers
decode = lambda l: ''.join([itos[i] for i in l]) # take a string of integers, output a string

print(encode('Hello world'))
print(decode(encode('Hello world')))

[32, 58, 65, 65, 68, 0, 76, 68, 71, 65, 57]
Hello world


In [30]:
# now we encode the text and store it in a pytorch tensor
import torch
data = torch.tensor(encode(text), dtype=torch.long)
print(data.shape, data.type)
print(data[:1000])

torch.Size([2616449]) <built-in method type of Tensor object at 0x7a7cf5881130>
tensor([ 0, 27, 61, 54, 69, 73, 58, 71,  0, 13,  0,  0, 37, 54, 71, 72, 58, 62,
        65, 65, 58, 72,  0, 10, 10,  0, 44, 61, 58,  0, 25, 71, 71, 62, 75, 54,
        65,  0,  0, 39, 67,  0, 73, 61, 58,  0, 14, 16, 73, 61,  0, 68, 59,  0,
        30, 58, 55, 71, 74, 54, 71, 78,  9,  0, 13, 20, 13, 12,  9,  0, 73, 61,
        58,  0, 65, 68, 68, 64, 10, 68, 74, 73,  0, 54, 73,  0, 56, 68, 66, 69,
        68, 72, 58, 57,  0, 73, 61, 58,  0, 56, 71, 58, 76,  9,  0, 72, 69, 71,
        54, 67, 60,  0, 73, 68,  0, 73, 61, 58, 62, 71,  0, 71, 58, 72, 69, 58,
        56, 73, 62, 75, 58,  0, 72, 73, 54, 73, 62, 68, 67, 72,  0, 54, 73,  0,
        73, 61, 58,  0, 38, 68, 73, 71, 58, 10, 28, 54, 66, 58,  0, 57, 58,  0,
        65, 54,  0, 31, 54, 71, 57, 58,  0, 72, 62, 60, 67, 54, 65, 65, 58, 57,
         0, 73, 61, 58,  0, 73, 61, 71, 58, 58, 10, 66, 54, 72, 73, 58, 71,  9,
         0, 73, 61, 58,  0, 72, 69, 54, 

In [31]:
# separating the dataset into a train and a test dataset
n = int(0.9*len(data))
train_data = data[:n]
val_data = data[n:]

In [32]:
block_size = 8
train_data[:block_size + 1]

tensor([ 0, 27, 61, 54, 69, 73, 58, 71,  0])

In [33]:
x = train_data[:block_size]
y = train_data[1:block_size + 1]
for t in range(block_size):
    context = x[:t+1]
    target=y[t]
    print(f"when the input is {context} the target is: {target}")

when the input is tensor([0]) the target is: 27
when the input is tensor([ 0, 27]) the target is: 61
when the input is tensor([ 0, 27, 61]) the target is: 54
when the input is tensor([ 0, 27, 61, 54]) the target is: 69
when the input is tensor([ 0, 27, 61, 54, 69]) the target is: 73
when the input is tensor([ 0, 27, 61, 54, 69, 73]) the target is: 58
when the input is tensor([ 0, 27, 61, 54, 69, 73, 58]) the target is: 71
when the input is tensor([ 0, 27, 61, 54, 69, 73, 58, 71]) the target is: 0


In [34]:
torch.manual_seed(777)
batch_size = 4 # independent sequences
block_size = 8 # length of each sequence

def get_batch(split):
    # generate a small batch of data of inputs x and targets y
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    return x,y

xb, yb = get_batch('train')
print('inputs:')
print(xb.shape)
print(xb)
print('outputs:')
print(yb.shape)
print(yb)

print('---')

for b in range(batch_size):
    for t in range(block_size):
        context = xb[b, :t+1]
        target = yb[b,t]
        print(f"when input is {context.tolist()} the target is: {target}")

inputs:
torch.Size([4, 8])
tensor([[72, 68, 66, 58, 73, 61, 62, 67],
        [58,  0, 66, 62, 67, 62, 72, 73],
        [61, 62, 66,  0, 53, 58, 77, 56],
        [72, 73,  0, 76, 62, 73, 61,  0]])
outputs:
torch.Size([4, 8])
tensor([[68, 66, 58, 73, 61, 62, 67, 60],
        [ 0, 66, 62, 67, 62, 72, 73, 58],
        [62, 66,  0, 53, 58, 77, 56, 58],
        [73,  0, 76, 62, 73, 61,  0, 71]])
---
when input is [72] the target is: 68
when input is [72, 68] the target is: 66
when input is [72, 68, 66] the target is: 58
when input is [72, 68, 66, 58] the target is: 73
when input is [72, 68, 66, 58, 73] the target is: 61
when input is [72, 68, 66, 58, 73, 61] the target is: 62
when input is [72, 68, 66, 58, 73, 61, 62] the target is: 67
when input is [72, 68, 66, 58, 73, 61, 62, 67] the target is: 60
when input is [58] the target is: 0
when input is [58, 0] the target is: 66
when input is [58, 0, 66] the target is: 62
when input is [58, 0, 66, 62] the target is: 67
when input is [58, 0, 66, 6

In [35]:
# setting up the bigram language model
import torch
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(1337)

class BigramLanguageModel(nn.Module):

    def __init__(self, vocab_size):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):

        # idx and targets are both (B,T) tensor of integers
        logits = self.token_embedding_table(idx) # (B,T,C)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # get the predictions
            logits, loss = self(idx)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx

m = BigramLanguageModel(vocab_size)
logits, loss = m(xb, yb)
print(logits.shape)
print(loss)

print(decode(m.generate(idx = torch.zeros((1, 1), dtype=torch.long), max_new_tokens=400)[0].tolist()))

torch.Size([32, 80])
tensor(4.9824, grad_fn=<NllLossBackward0>)
 US]z`B-5W19;SI]3FPPN;p"$]DQd-5Chtpf-7tP0X;"EM2*"E)GdQcAS618KP4IabwFSMQI]EfM*sb(I*]MoHLg"mnY.9E?"x7v3dG`WPuM43m,hyJYPu]3x[(2`L6;Mo4uNxAEne[Jd"$:sY3dX05ZQI1l!kSQt[`WsKb$[,WQE[QlPYrJ,?:!PN(Mxog u]7b($kLF)"y&p78D-SV34'4!lkTh)Zo8l2VPZ&p;*3Q,Xtn44$vQt.VR)D-w3oEgePSFdM'pOfIuB'yxG:ge29a3S-ZyFzljipt8)2&K[bP0XcS]lQtN5)ZWJ[i"$p90X.y.b[bi(8fI&IYP'*d8?3dz3rk3&'NqYbry[ Xbv9QI]S-Qt.x8Gd90ipGiEqt[ew8;iBZ UhfHari6


In [36]:
# we need to train the model to make it make sense

# create a PyTorch optimizer
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)

In [38]:
batch_size = 32
for steps in range(10000): # increase number of steps for good results...

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = m(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

print(loss.item())

2.364097833633423


In [17]:
print(decode(m.generate(idx = torch.zeros((1, 1), dtype=torch.long), max_new_tokens=500)[0].tolist()))


E se rearan bed, Edallothe
onishe, fot's aslinofontthinounos tivigid visllle han inf ile tncerecatacerdisere h" ayourthe st - iaulden heartig By h t, we bredive t the oss -ndmy
Desplkedensutithaculou f te hein f we they nth `'se E
t
tu hoffo--rm p'Ex tof the br. yorer; qu adeper-
vixpin ul,
397
ge y otefong a thelofa h yof me fe oreedit e. manig rin `PipalifPThonor I s."Bu th ye fer th; ho hard
herof ade I iver pe r telyoillompes yelyoure t-urrt.
fones s f co n d Nlyng hinere thour py aiect y If


In [39]:
if torch.cuda.is_available():
    print("yes")

yes
