In [1]:
pip install wget

Note: you may need to restart the kernel to use updated packages.


In [4]:
import wget

In [6]:
import os

import wget

# Dataset: 'The Origin and Development of the Quantum Theory' by Max Planck,
# obtained via the Project Gutenberg website. This is the text that the
# Generative Pre-Training Transformer built in this notebook will rely on.
url = "https://raw.githubusercontent.com/Utartizan/Quantum-Theory-GPT/refs/heads/main/33663-t.tex"

# Local name the file is expected to have: the last path component of the URL.
expected_filename = url.rsplit("/", 1)[-1]

if os.path.exists(expected_filename):
    # Re-running the notebook should not fetch the file again; a repeated
    # download may also be stored under a different name than the one the
    # loading cell opens.
    filename = expected_filename
    print(f"File already present as {filename}")
else:
    # Download the file into the current working directory.
    filename = wget.download(url)
    print(f"File downloaded as {filename}")

100% [..............................................................................] 73859 / 73859File downloaded as 33663-t.tex


In [9]:
# Read the whole downloaded file into memory as a single UTF-8 string.
with open('33663-t.tex', mode='r', encoding='utf-8') as source_file:
    text = source_file.read()

In [11]:
# Output the number of characters in the attached dataset.
# In this case, usually the longer the dataset, the better (regarding accuracy)
# the generated material is, because the model has more training and
# validation data to use.
#
# (Kept as comments rather than a trailing string literal: a bare string as the
# last expression of a cell is echoed by Jupyter as the cell's output.)
print("length of dataset in characters is: ", len(text))

length of dataset in characters is:  73859


In [12]:
# Preview the first 1000 characters of the dataset.
print(text[:1000])

% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %
%                                                                         %
% The Project Gutenberg EBook of The Origin and Development of the Quantum%
% Theory, by Max Planck                                                   %
%                                                                         %
% This eBook is for the use of anyone anywhere at no cost and with        %
% almost no restrictions whatsoever.  You may copy it, give it away or    %
% re-use it under the terms of the Project Gutenberg License included     %
% with this eBook or online at www.gutenberg.org                          %
%                                                                         %
%                                                                         %
% Title: The Origin and Development of the Quantum Theory                 %
%                                                                         %
% Author: Ma

In [15]:
"""
Scan the entire file (text) and output:
1. All the types of characters utilised
2. The quantifiable size of the characters utilised (94)
"""

# Every distinct character in the text, in sorted order; the position of a
# character in this list is later used as its token id.
characters = sorted(set(text))
vocabularySize = len(characters)

# Show the characters as one string, then how many of them there are.
print(''.join(characters), vocabularySize, sep='\n')


 !"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^`abcdefghijklmnopqrstuvwxyz{}~
94


In [18]:
"""
Implementing encoding and decoding functions for a character-level tokeniser.

This assigns each character its own number: every character in the sorted
list of characters is mapped to its position in that list.

In this case, the value 50 belongs to the character Q
the value 85 belongs to the character u (lowercase)
the value 1 belongs to the space character

The print calls below exercise the tokeniser's ability to both encode and decode
a set of characters, in the form of many words making up a single sentence.
"""

# Lookup tables between characters and their integer ids (position in `characters`).
stoi = dict((ch, i) for i, ch in enumerate(characters))
itos = dict(enumerate(characters))


def encode(s):
    """Return the list of integer ids for the characters of string `s`."""
    return [stoi[c] for c in s]


def decode(l):
    """Return the string spelled by the list of integer ids `l`."""
    return ''.join(itos[i] for i in l)


# Round-trip a sample sentence: print its ids, then the text decoded from them.
sample_sentence = "Quantum theory, or quantum mechanics, is a fundamental theory in physics that describes the behavior of matter and energy at the smallest scales, like atoms and subatomic particles."
sample_ids = encode(sample_sentence)
print(sample_ids)
print(decode(encode(sample_sentence)))

[50, 85, 65, 78, 84, 85, 77, 1, 84, 72, 69, 79, 82, 89, 13, 1, 79, 82, 1, 81, 85, 65, 78, 84, 85, 77, 1, 77, 69, 67, 72, 65, 78, 73, 67, 83, 13, 1, 73, 83, 1, 65, 1, 70, 85, 78, 68, 65, 77, 69, 78, 84, 65, 76, 1, 84, 72, 69, 79, 82, 89, 1, 73, 78, 1, 80, 72, 89, 83, 73, 67, 83, 1, 84, 72, 65, 84, 1, 68, 69, 83, 67, 82, 73, 66, 69, 83, 1, 84, 72, 69, 1, 66, 69, 72, 65, 86, 73, 79, 82, 1, 79, 70, 1, 77, 65, 84, 84, 69, 82, 1, 65, 78, 68, 1, 69, 78, 69, 82, 71, 89, 1, 65, 84, 1, 84, 72, 69, 1, 83, 77, 65, 76, 76, 69, 83, 84, 1, 83, 67, 65, 76, 69, 83, 13, 1, 76, 73, 75, 69, 1, 65, 84, 79, 77, 83, 1, 65, 78, 68, 1, 83, 85, 66, 65, 84, 79, 77, 73, 67, 1, 80, 65, 82, 84, 73, 67, 76, 69, 83, 15]
Quantum theory, or quantum mechanics, is a fundamental theory in physics that describes the behavior of matter and energy at the smallest scales, like atoms and subatomic particles.


In [21]:
pip install torch


Collecting torchNote: you may need to restart the kernel to use updated packages.

  Downloading torch-2.5.1-cp312-cp312-win_amd64.whl.metadata (28 kB)
Collecting filelock (from torch)
  Downloading filelock-3.16.1-py3-none-any.whl.metadata (2.9 kB)
Collecting networkx (from torch)
  Downloading networkx-3.4.2-py3-none-any.whl.metadata (6.3 kB)
Collecting fsspec (from torch)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Collecting sympy==1.13.1 (from torch)
  Downloading sympy-1.13.1-py3-none-any.whl.metadata (12 kB)
Collecting mpmath<1.4,>=1.1.0 (from sympy==1.13.1->torch)
  Downloading mpmath-1.3.0-py3-none-any.whl.metadata (8.6 kB)
Downloading torch-2.5.1-cp312-cp312-win_amd64.whl (203.0 MB)
   ---------------------------------------- 0.0/203.0 MB ? eta -:--:--
    --------------------------------------- 2.6/203.0 MB 15.1 MB/s eta 0:00:14
    --------------------------------------- 4.7/203.0 MB 11.9 MB/s eta 0:00:17
   - -------------------------------------- 6.8/

In [25]:
# Maximum context length: how many tokens are used as input for a prediction.
block_size = 8
# Display the first block_size + 1 tokens of the training data: block_size
# inputs plus the one extra token needed as the target of the last input.
trainingData[:block_size+1]

tensor([6, 1, 6, 6, 6, 6, 6, 6, 6])

In [26]:
# x holds the first block_size tokens; y is the same window shifted one token
# to the right, so y[t] is the token that follows x[:t+1].
x, y = trainingData[:block_size], trainingData[1:block_size+1]

# Walk through every prefix of x and show the token the model should predict.
for t in range(block_size):
    print(f"when input is {x[:t+1]} the target: {y[t]}")

when input is tensor([6]) the target: 1
when input is tensor([6, 1]) the target: 6
when input is tensor([6, 1, 6]) the target: 6
when input is tensor([6, 1, 6, 6]) the target: 6
when input is tensor([6, 1, 6, 6, 6]) the target: 6
when input is tensor([6, 1, 6, 6, 6, 6]) the target: 6
when input is tensor([6, 1, 6, 6, 6, 6, 6]) the target: 6
when input is tensor([6, 1, 6, 6, 6, 6, 6, 6]) the target: 6


In [27]:
torch.manual_seed(6969)
batch_size = 4  # number of independent sequences processed in parallel
block_size = 8  # maximum context length for predictions


def get_batch(split):
    """Draw a small random batch of inputs and next-token targets.

    `split` selects the source: 'train' reads from trainingData, anything else
    reads from validationData. Returns a pair (x, y) of stacked tensors with
    batch_size rows of block_size tokens each, where y is x shifted one
    position to the right in the underlying data.
    """
    if split == 'train':
        data = trainingData
    else:
        data = validationData

    # One random start offset per sequence, chosen so that the target window
    # (which ends one token later than the input window) stays inside the data.
    starts = torch.randint(len(data) - block_size, (batch_size,))

    inputs, targets = [], []
    for start in starts:
        inputs.append(data[start:start + block_size])
        targets.append(data[start + 1:start + block_size + 1])
    return torch.stack(inputs), torch.stack(targets)

# Draw one training batch and show both tensors with their shapes.
xb, yb = get_batch('train')
for label, batch in (('inputs:', xb), ('targets:', yb)):
    print(label)
    print(batch.shape)
    print(batch)

print('----')

# For every sequence in the batch (b) and every position in it (t), show the
# context seen so far and the token that should be predicted next.
for b in range(batch_size):
    for t in range(block_size):
        print(f"when input is {xb[b, :t+1].tolist()} the target: {yb[b,t]}")

inputs:
torch.Size([4, 8])
tensor([[68, 80, 79, 73, 78, 84,  1, 79],
        [69, 76, 69, 67, 84, 82, 79, 78],
        [72, 69,  1, 68, 69, 70, 69, 67],
        [71, 65, 84, 79, 82, 83,  1, 87]])
targets:
torch.Size([4, 8])
tensor([[80, 79, 73, 78, 84,  1, 79, 70],
        [76, 69, 67, 84, 82, 79, 78, 73],
        [69,  1, 68, 69, 70, 69, 67, 84],
        [65, 84, 79, 82, 83,  1, 87, 72]])
----
when input is [68] the target: 80
when input is [68, 80] the target: 79
when input is [68, 80, 79] the target: 73
when input is [68, 80, 79, 73] the target: 78
when input is [68, 80, 79, 73, 78] the target: 84
when input is [68, 80, 79, 73, 78, 84] the target: 1
when input is [68, 80, 79, 73, 78, 84, 1] the target: 79
when input is [68, 80, 79, 73, 78, 84, 1, 79] the target: 70
when input is [69] the target: 76
when input is [69, 76] the target: 69
when input is [69, 76, 69] the target: 67
when input is [69, 76, 69, 67] the target: 84
when input is [69, 76, 69, 67, 84] the target: 82
when input 

In [28]:
# Show the input batch xb (the tensor that is passed to the model below).
print(xb)

tensor([[68, 80, 79, 73, 78, 84,  1, 79],
        [69, 76, 69, 67, 84, 82, 79, 78],
        [72, 69,  1, 68, 69, 70, 69, 67],
        [71, 65, 84, 79, 82, 83,  1, 87]])


In [31]:
import torch
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(6969)


"""
Defining the model (BLModel). It consists of a single embedding layer in which
each token is mapped to a vector with as many entries as there are characters
in the vocabulary.

The embedding therefore acts as a learnable lookup table: the row for a given
token holds one logit per vocabulary entry, i.e. that token's scores for what
the next token will be.
"""

class BLModel(nn.Module):
    """Lookup-table language model.

    The only parameter is an embedding table of shape
    (vocabularySize, vocabularySize): the row selected by a token is used
    directly as the logits for the token that follows it.
    """

    def __init__(self, vocabularySize):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocabularySize, vocabularySize)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, optionally, the cross-entropy loss.

        Logits are the unnormalised predictions output by the model for each
        class before they are transformed into probabilities. They are
        unbounded and may be any positive or negative value.

        Args:
            idx: (B, T) tensor of integer token ids.
            targets: optional (B, T) tensor of integer token ids to predict.

        Returns:
            (logits, loss). Without targets, logits has shape (B, T, C) and
            loss is None. With targets, logits is returned flattened to
            (B*T, C) and loss is the cross-entropy between logits and targets.

        Refer to
        https://pytorch.org/docs/stable/generated/torch.nn.CrossEntropyLoss.html#torch.nn.CrossEntropyLoss
        """
        # idx and targets are both (B,T) tensor of integers
        logits = self.token_embedding_table(idx)  # (B,T,C)

        if targets is None:
            loss = None
        else:
            # Flatten batch and time so every position is one classification
            # example: logits (B*T, C) against targets (B*T,).
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        """Extend each sequence in `idx` by `max_new_tokens` sampled tokens.

        Args:
            idx: (B, T) tensor of token ids holding the current context.
            max_new_tokens: number of tokens to sample and append.

        Returns:
            (B, T + max_new_tokens) tensor: the context followed by the samples.
        """
        for _ in range(max_new_tokens):
            # get the predictions (no targets, so the loss is None and unused)
            logits, _ = self(idx)
            # focus only on the last time step
            logits = logits[:, -1, :]  # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1)  # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1)  # (B, T+1)
        return idx


"""
From the following results you'll notice that the generated text is just a
string of seemingly random characters from the BLModel (so far).

Because the output has no clear semantic or syntactic structure, we can
conclude that:
1. The model is not yet well-trained for text generation
2. The dataset includes a lot of noise or non-standard text formatting
3. The process does not implement any filtering out of non-meaningful characters.

The loss value of 4.6497 is slightly over the expected loss value from the following
calculation of ln(94), which is equal to 4.543294782. That is the loss a model would
get by guessing uniformly over all 94 characters.

This means that the initial predictions aren't perfectly diffuse (uniform): they
already favour some characters over others, i.e. they carry a little entropy.

We are however getting somewhere.
"""


# Build the model and run one forward pass on the sample batch to get the
# (flattened) logits and the loss.
m = BLModel(vocabularySize)
logits, loss = m(xb, yb)
print(logits.shape)
print(loss)

# Generate text: start from a single token with id 0 (the first entry of the
# sorted vocabulary), sample 100 further tokens, and decode the first (only)
# sequence of the result back into characters.
start_context = torch.zeros((1, 1), dtype=torch.long)
generated = m.generate(idx=start_context, max_new_tokens=100)
print(decode(generated[0].tolist()))

torch.Size([32, 94])
tensor(4.6497, grad_fn=<NllLossBackward0>)

y%QxF
jEK#CkaW-7yGKHTk4U.[@nACHrh%dJCGn6mnGlzN#nHk.h;I=uzAJ~I'(8oVP<9,I.:@Qv#T"m uyf>Xiu:
@dJF@8TUPz
