In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/gpt-training-data/input.txt


In [2]:
with open('/kaggle/input/gpt-training-data/input.txt', 'r', encoding='utf-8') as f:   #utf-8 enables reading of special characters that are skipped otherwise
    text = f.read()

In [3]:
len(text)    #no. of characters we are dealing with

1115394

In [4]:
print(text[:100])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You


In [5]:
chars = sorted(list(set(text)))   #to see all the characters the model is going to encounter
vocab_size = len(chars)
print(''.join(chars))


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz


In [6]:
''' 
Next step is to tokenize the text.
Simple baseline tokenization would be taking the ascii value of each character
and keeping all the values together in a list to create a vector.
Then use libraries like tiktoken and more complex tokenizers.
'''

stoi = {ch:i for i, ch in enumerate(chars)}
itos = {i:ch for i, ch in enumerate(chars)}

encode = lambda s: [stoi[c] for c in s]    #Lambda function just applies the encoding to all the strings
decode = lambda l: ''.join([itos[i] for i in l])    #This lambda function does the reverse.

encode('aaaabbbb')
# decode(encode('hello'))

[39, 39, 39, 39, 40, 40, 40, 40]

In [7]:
import torch

device = 'cuda' if torch.cuda.is_available() else 'cpu'

data = torch.tensor(encode(text), dtype=torch.long)   #To simply create a pytorch tensor for entering into model training.
data.shape
# print(data)

torch.Size([1115394])

In [8]:
device

'cuda'

In [9]:
#We'll now split into training and validation data for the model

n = int(0.9*len(data))
train_data = data[:n]
val_data = data[n:]

In [10]:
#We'll define the block size of characters that we pass to the model at a time.

block_size = 8
print(f"For context(or input) {train_data[:block_size]}, next word to be predicted(or target) is {train_data[block_size]}")

For context(or input) tensor([18, 47, 56, 57, 58,  1, 15, 47]), next word to be predicted(or target) is 58


### This solves the time dimension of the problem. Next, our job is to solve for batches to process parallely.

In [11]:
torch.manual_seed(1224)

batch_size = 4   # of sentence blocks to be processed
block_size = 8   # of tokens in each sentence block

# We will now create a tensor of size (batch_size x block_size) for both x and y
def get_batch(data):
    idx = torch.randint(len(data)-block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in idx])      # stack() is faster when we already have a pytorch tensor
    target_list = [data[i+1:i+block_size+1] for i in idx]           # Another way of stacking the batches
    y = torch.stack(target_list)
    return x, y

train_xb, train_yb = get_batch(train_data)
print("Input sequence:\n",train_xb,"\nNext target tokens:\n", train_yb)

Input sequence:
 tensor([[27, 24, 13, 26, 33, 31, 10,  0],
        [53, 51, 43,  6,  1, 39, 52, 42],
        [43,  1, 40, 59, 58,  0, 24, 53],
        [63, 53, 59,  1, 51, 53, 56, 43]]) 
Next target tokens:
 tensor([[24, 13, 26, 33, 31, 10,  0, 26],
        [51, 43,  6,  1, 39, 52, 42,  6],
        [ 1, 40, 59, 58,  0, 24, 53, 53],
        [53, 59,  1, 51, 53, 56, 43,  1]])


In [12]:
# Basically takes any 4 points from the entire training data
torch.randint(len(data)-block_size, (batch_size,))

tensor([589516, 373786, 401102, 717225])

##### To be read as: 
For n elements of any input sequence in the batch, the output should be the nth element of the target sequence generated.

In [13]:
for b in range(batch_size):         # Iterating through batches
    for t in range(block_size):     # Iterating through each sequence
        context = train_xb[b, :t+1]      # First t elements of each batch
        target = train_yb[b, t]          # (t)th element of the same batch from the target tensor
        print(context, " --> ", target)

tensor([27])  -->  tensor(24)
tensor([27, 24])  -->  tensor(13)
tensor([27, 24, 13])  -->  tensor(26)
tensor([27, 24, 13, 26])  -->  tensor(33)
tensor([27, 24, 13, 26, 33])  -->  tensor(31)
tensor([27, 24, 13, 26, 33, 31])  -->  tensor(10)
tensor([27, 24, 13, 26, 33, 31, 10])  -->  tensor(0)
tensor([27, 24, 13, 26, 33, 31, 10,  0])  -->  tensor(26)
tensor([53])  -->  tensor(51)
tensor([53, 51])  -->  tensor(43)
tensor([53, 51, 43])  -->  tensor(6)
tensor([53, 51, 43,  6])  -->  tensor(1)
tensor([53, 51, 43,  6,  1])  -->  tensor(39)
tensor([53, 51, 43,  6,  1, 39])  -->  tensor(52)
tensor([53, 51, 43,  6,  1, 39, 52])  -->  tensor(42)
tensor([53, 51, 43,  6,  1, 39, 52, 42])  -->  tensor(6)
tensor([43])  -->  tensor(1)
tensor([43,  1])  -->  tensor(40)
tensor([43,  1, 40])  -->  tensor(59)
tensor([43,  1, 40, 59])  -->  tensor(58)
tensor([43,  1, 40, 59, 58])  -->  tensor(0)
tensor([43,  1, 40, 59, 58,  0])  -->  tensor(24)
tensor([43,  1, 40, 59, 58,  0, 24])  -->  tensor(53)
tensor([

##### We will first train the simplest neural network which uses the last token to predict the next one by using the probability of the combination of words occurring given the first word occurs(basic conditional prob.) and is called transitional probability.
* *P(cat | the) = count(the, cat) / count(the)*

In [14]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class BiGramLanguageModel(nn.Module):
    def __init__(self, vocab_size):          # Creating a lookup table of probabilities of each token to come after the current one
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)           # Embedding layer creates the table of size nxn if there are n tokens in the data

    def forward(self, idx, targets=None):
        '''
        This function fetches the pobabilities of the current batch of words from the table,
        takes the one with highest probability and flattens it to calculate CE loss.
        Flattening makes CE loss calculation faster since individual errors are not being calculated.
        '''
        
        logits = self.token_embedding_table(idx)        # Fetching the probabilities of next words for each token in the current batch
        
        if targets is not None:
            B, T, C = logits.shape
            logits_flat = logits.view(B*T, C)           # Flattening the probabilities of next words accross batches and timesteps
            targets_flat = targets.view(B*T)            # Flattening the probability of the target
            loss = F.cross_entropy(logits_flat, targets_flat)     # Use of functional module here
        else:
            loss = None

        return logits, loss

    def generate(self, idx, max_tokens):
        for i in range(max_tokens):
            logits, loss = self(idx) # Or self.forward(idx) - Calling an instance of a class activates the __call__ method and in the nn.Module class, this call further calls the forward function.
            logits = logits[:, -1, :]                       # Taking only the last time step as it is a bi-gram model
            prob = F.softmax(logits, dim = -1)              # We create probability from likelihood values in the embedding table in order to normalize it and get actual chances.
            next_word = torch.multinomial(prob, num_samples = 1)    # Sampling ensures that we don't generate the same answer for a word always. Argmax on the other hand takes the token with max. prob. which removes variability
            idx = torch.cat((idx, next_word), dim = 1)      # Appending the newly generated word to the current sequence for further prediction if reqd.
        
            # This loop repeats this generation till the token limit is reached and returns the final sequence
        
        return idx


model = BiGramLanguageModel(vocab_size)
logits, loss = model(train_xb, train_yb)
print(logits.shape)
print(loss)

print(decode(model.generate(idx = torch.zeros((1, 1), dtype=torch.long), max_tokens=100)[0].tolist()))


torch.Size([4, 8, 65])
tensor(4.5888, grad_fn=<NllLossBackward0>)

Wp;aW:gCxuydpHjFvNsLQJODktnALvnIDlfYUwR':ABBvR'RUj
 aisIiQKliM,;sK!VtwP'oyVVkXpWS3X,NHYsbp?Tp3DMXI,3


In [15]:
# Configure the rest of the model using optimizers etc.

optimizer = torch.optim.SGD(model.parameters(), lr=1e-4)

In [16]:
batch_size = 32
for steps in range(15000):                              # 150 iterations for training
    train_xb, train_yb = get_batch(train_data)        # Taking any sample of 32 sentences for a batch
    logits, loss = model(train_xb, train_yb)
    optimizer.zero_grad(set_to_none=True)             # Initializing our optimizer here. This is the next step after simply calling decoder function to get the best possible results.
    loss.backward()                                   # Backpropagation step to be able to optimize the weights and biases of the neurons
    optimizer.step()

    print(loss)

tensor(4.6007, grad_fn=<NllLossBackward0>)
tensor(4.5411, grad_fn=<NllLossBackward0>)
tensor(4.6416, grad_fn=<NllLossBackward0>)
tensor(4.5856, grad_fn=<NllLossBackward0>)
tensor(4.5591, grad_fn=<NllLossBackward0>)
tensor(4.4281, grad_fn=<NllLossBackward0>)
tensor(4.4982, grad_fn=<NllLossBackward0>)
tensor(4.5357, grad_fn=<NllLossBackward0>)
tensor(4.6375, grad_fn=<NllLossBackward0>)
tensor(4.4885, grad_fn=<NllLossBackward0>)
tensor(4.5270, grad_fn=<NllLossBackward0>)
tensor(4.5465, grad_fn=<NllLossBackward0>)
tensor(4.6329, grad_fn=<NllLossBackward0>)
tensor(4.5845, grad_fn=<NllLossBackward0>)
tensor(4.5919, grad_fn=<NllLossBackward0>)
tensor(4.5951, grad_fn=<NllLossBackward0>)
tensor(4.5541, grad_fn=<NllLossBackward0>)
tensor(4.5642, grad_fn=<NllLossBackward0>)
tensor(4.5088, grad_fn=<NllLossBackward0>)
tensor(4.5853, grad_fn=<NllLossBackward0>)
tensor(4.5438, grad_fn=<NllLossBackward0>)
tensor(4.5704, grad_fn=<NllLossBackward0>)
tensor(4.5499, grad_fn=<NllLossBackward0>)
tensor(4.50

In [17]:
loss.item()    # After 15000 iterations, we reach the minimum loss of 3.15095 using AdamW and 3.1355 using Stochastic Gradient Descent

4.5653839111328125