In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory recursively and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
with open('/kaggle/input/gpt-training-data/input.txt', 'r', encoding='utf-8') as f:   # explicit utf-8 so decoding does not depend on the platform's default encoding
    text = f.read()   # the whole file as a single string

In [None]:
len(text)    # total number of characters in the corpus

In [None]:
print(text[:100])   # preview the first 100 characters of the corpus

In [None]:
# Character-level vocabulary: every distinct character in the corpus, in sorted order.
chars = sorted(set(text))
vocab_size = len(chars)   # number of distinct characters the model can encounter
print(''.join(chars))

In [None]:
# Next step is to tokenize the text.
# Baseline used here: character-level tokenization. Each distinct character is mapped
# to its index in the sorted vocabulary `chars` (not to its ASCII / Unicode code point),
# and a string becomes the list of those indices.
# Sub-word tokenizers such as tiktoken are the more complex alternative to try later.

stoi = {ch: i for i, ch in enumerate(chars)}   # character -> integer id
itos = dict(enumerate(chars))                  # integer id -> character


def encode(s):
    """Map a string to a list of integer token ids, one per character.

    Raises KeyError if `s` contains a character that is not in `chars`.
    """
    return [stoi[c] for c in s]


def decode(l):
    """Map an iterable of integer token ids back to the string they spell.

    Raises KeyError for an id that is not in the vocabulary.
    """
    return ''.join(itos[i] for i in l)


# Encoding followed by decoding must reproduce the original text exactly.
assert decode(encode(text[:100])) == text[:100]

encode('aaaabbbb')

In [None]:
import torch

data = torch.tensor(encode(text), dtype=torch.long)   # the whole corpus as a 1-D tensor of integer token ids (torch.long = int64)
data.shape
# print(data)

In [None]:
# Split the token tensor: the first 90% is used for training, the remaining 10% for validation.

n = int(0.9 * len(data))   # index of the first validation token
train_data, val_data = data[:n], data[n:]

In [None]:
# block_size is the number of consecutive tokens (characters here) passed to the model at a time.

block_size = 8
# The first block_size tokens are shown as the context; the token right after them is the target.
print(f"For context(or input) {train_data[:block_size]}, next word to be predicted(or target) is {train_data[block_size]}")

### This handles the time dimension of the problem. Next, we group several such blocks into batches so they can be processed in parallel.

In [None]:
# Draw `batch_size` random start offsets into `data`. Lowering the upper bound by
# block_size guarantees that a full block (and its one-step-shifted target) fits after every offset.
batch_size = 4   # defined here so this cell also runs on a fresh kernel; same value as in the batching cell below
torch.randint(len(data) - block_size, (batch_size,))

In [None]:
torch.manual_seed(1224)   # fix PyTorch's RNG so the sampled batch is reproducible

batch_size = 4   # number of sequences sampled per batch
block_size = 8   # number of tokens in each sequence

# We will now create a tensor of size (batch_size x block_size) for both x and y
def get_batch(data):
    """Sample a random mini-batch of (input, target) windows from `data`.

    `batch_size` random start offsets are drawn, and a `block_size`-long window is cut
    at each one. The target window is the same slice shifted one position to the right.
    Both sets of windows are stacked into tensors with one row per window.
    `batch_size` and `block_size` are read from the notebook's global namespace.
    """
    starts = torch.randint(len(data) - block_size, (batch_size,))
    inputs, targets = [], []
    for start in starts:
        inputs.append(data[start:start + block_size])
        targets.append(data[start + 1:start + block_size + 1])   # same window, shifted by one
    return torch.stack(inputs), torch.stack(targets)

# Draw one batch from the training split and print the inputs next to their one-step-shifted targets.
train_xb, train_yb = get_batch(train_data)
print("Input sequence:\n",train_xb,"\nNext target tokens:\n", train_yb)

##### To be read as:
Given the first n tokens of any input sequence in the batch as context, the model should predict the nth token of the matching target sequence.

In [None]:
for b in range(batch_size):         # each sequence (row) in the batch
    for t in range(block_size):     # each position within that sequence
        context = train_xb[b, :t+1]      # the first t+1 tokens of row b
        target = train_yb[b, t]          # token at position t of row b in the target tensor
        print(context, " --> ", target)

##### We will first train the simplest neural language model, a bigram model, which uses only the last token to predict the next one. It relies on the conditional probability of the next token given the current token, also called the transition probability.
* *P(cat | the) = count(the, cat) / count(the)*

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class BiGramLanguageModel(nn.Module):
    def __init__(self, vocab_size):          # Creating a lookup table of probabilities of each token to come after the current one
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)           # Embedding layer creates the table of size nxn if there are n tokens in the data

    def forward(self, idx, targets=None):
        '''
        This function fetches the pobabilities of the current batch of words from the table,
        takes the one with highest probability and flattens it to calculate CE loss.
        Flattening makes CE loss calculation faster since individual errors are not being calculated.
        '''
        
        logits = self.token_embedding_table(idx)        # Fetching the probabilities of next words for each token in the current batch
        
        if targets is not None:
            B, T, C = logits.shape
            logits_flat = logits.view(B*T, C)           # Flattening the probabilities of next words accross batches and timesteps
            targets_flat = targets.view(B*T)            # Flattening the probability of the target
            loss = F.cross_entropy(logits, targets)     # Use of functional module here
        else:
            loss = None
            