# Character Level Language Model

In [1]:
# Attention is all you need.
# Transformers Model
# Training a character based transformer model
# https://youtu.be/kCc8FmEb1nY?list=PLAqhIrjkxbuWI23v9cThsA9GvCAUhRvKZ

## Imports and configs

In [None]:
import os

import torch

In [None]:
%config Completer.use_jedi = False

In [66]:
import warnings

# Silence every UserWarning for the rest of the session.
# NOTE(review): this is a blanket, category-wide filter, so it also hides
# UserWarnings emitted by third-party libraries; consider narrowing it with
# the `message=` / `module=` arguments once the noisy source is identified.
warnings.filterwarnings("ignore", category=UserWarning)

## Data ingestion

In [2]:
# Location of the tinyshakespeare corpus.
# Set the TINYSHAKESPEARE_PATH environment variable to point the notebook at
# the file on another machine; the author's local path below is only the
# fallback used when the variable is not set.
file_path = os.environ.get(
    'TINYSHAKESPEARE_PATH',
    '/Users/nikhil20.sharma/Library/CloudStorage/GoogleDrive-nikhil.sharma1294@gmail.com/.shortcut-targets-by-id/1hJdN4IVIzv_akIWArIYt52MlRCEbxXvV/its.nikhilksharma/REPOs/Neural-Networks-Zero-to-Hero/data/tinyshakespeare.txt',
)

In [3]:
# Read the whole corpus into memory as one string (text mode, UTF-8).
with open(file_path, encoding='utf-8') as corpus_file:
    text = corpus_file.read()

In [5]:
# Eyeball a 600-character window of the raw corpus to confirm it loaded.
print(text[400:1000])

be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor citizens, the patricians good.
What authority surfeits on would relieve us: if they
would yield us but the superfluity, while it were
wholesome, we might guess they relieved us humanely;
but they think we are too dear: the leanness that
afflicts us, the object of our misery, is as an
inventory to particularise their abundance; our
sufferance is a gain to them Let us revenge this with
our pikes, ere we become rakes: for the gods know I
speak this in hunger for bread, not in thirst for revenge.




## Character Level Encoding

In [83]:
# The vocabulary is every distinct character of the corpus, in sorted order.
char_vocab = sorted(set(text))
vocab_size = len(char_vocab)
print(f'[+] Size of character vocab: {len(char_vocab)}\n')
print('[+] Characters available in the data')

# Print the vocabulary as a 1-based table, five entries per row; repr() keeps
# whitespace characters such as '\n' visible.
for position, symbol in enumerate(char_vocab, start=1):
    print(f'[{position:<2}]: {symbol!r}', end='\t\t')
    if position % 5 == 0:
        print()

[+] Size of character vocab: 65

[+] Characters available in the data
[1 ]: '\n'		[2 ]: ' '		[3 ]: '!'		[4 ]: '$'		[5 ]: '&'		
[6 ]: "'"		[7 ]: ','		[8 ]: '-'		[9 ]: '.'		[10]: '3'		
[11]: ':'		[12]: ';'		[13]: '?'		[14]: 'A'		[15]: 'B'		
[16]: 'C'		[17]: 'D'		[18]: 'E'		[19]: 'F'		[20]: 'G'		
[21]: 'H'		[22]: 'I'		[23]: 'J'		[24]: 'K'		[25]: 'L'		
[26]: 'M'		[27]: 'N'		[28]: 'O'		[29]: 'P'		[30]: 'Q'		
[31]: 'R'		[32]: 'S'		[33]: 'T'		[34]: 'U'		[35]: 'V'		
[36]: 'W'		[37]: 'X'		[38]: 'Y'		[39]: 'Z'		[40]: 'a'		
[41]: 'b'		[42]: 'c'		[43]: 'd'		[44]: 'e'		[45]: 'f'		
[46]: 'g'		[47]: 'h'		[48]: 'i'		[49]: 'j'		[50]: 'k'		
[51]: 'l'		[52]: 'm'		[53]: 'n'		[54]: 'o'		[55]: 'p'		
[56]: 'q'		[57]: 'r'		[58]: 's'		[59]: 't'		[60]: 'u'		
[61]: 'v'		[62]: 'w'		[63]: 'x'		[64]: 'y'		[65]: 'z'		


In [7]:
# Lookup tables between characters and their integer ids. An id is simply the
# character's position in the sorted vocabulary `char_vocab`.
string_to_int = {s: i for i, s in enumerate(char_vocab)}  # character -> id
int_to_string = dict(enumerate(char_vocab))               # id -> character


def encode(string):
    """Encode a string into a list of integer ids, one per character.

    Characters that are not part of the vocabulary are silently dropped, so
    the result can be shorter than ``string``.
    """
    # Membership is checked against the dict (hash lookup) instead of the
    # `char_vocab` list, which would be scanned linearly for every character.
    return [string_to_int[c] for c in string if c in string_to_int]


def decode(sequence):
    """Decode an iterable of integer ids back into a string.

    Ids that have no vocabulary entry are silently skipped.
    """
    return ''.join(int_to_string[n] for n in sequence if n in int_to_string)

In [8]:
# Smoke test for encode(): every character of this sentence is in the
# vocabulary, so the encoded length should equal the string length.
test_string = 'Hi how are you doing, hope every thing is fine!'
test_string_encoded = encode(string=test_string)

print(f'[+] {test_string=}')
print(f'[+] Lenght of test string: {len(test_string)}')
print(f'[+] Encoding of test string: {test_string_encoded}')
print(f'[+] Length of encoded test string: {len(test_string_encoded)}')

[+] test_string='Hi how are you doing, hope every thing is fine!'
[+] Lenght of test string: 47
[+] Encoding of test string: [20, 47, 1, 46, 53, 61, 1, 39, 56, 43, 1, 63, 53, 59, 1, 42, 53, 47, 52, 45, 6, 1, 46, 53, 54, 43, 1, 43, 60, 43, 56, 63, 1, 58, 46, 47, 52, 45, 1, 47, 57, 1, 44, 47, 52, 43, 2]
[+] Length of encoded test string: 47


In [9]:
# Smoke test for decode(): decoding the ids produced by encode() should give
# the original sentence back.
test_string_decoded = decode(sequence=test_string_encoded)
print(f'[+] Length of encoded test string: {len(test_string_encoded)}')
print(f'[+] Decoded test string: {test_string_decoded}')
print(f'[+] Length of decoded test string: {len(test_string_decoded)}')

# Make the round trip an explicit check rather than something the reader has
# to verify by comparing printed output.
assert test_string_decoded == test_string, 'encode/decode round trip altered the test string'

[+] Length of encoded test string: 47
[+] Decoded test string: Hi how are you doing, hope every thing is fine!
[+] Length of decoded test string: 47


##### **Notes**
1. Above we implemented character-level encoding for the sake of simplicity, but it is rarely used in real-world systems nowadays.
2. Other methods, such as word-level or subword-level encoding, are more commonly used instead.
3. SentencePiece is more widely used than tiktoken.
<br>
<br>

| Tokenizer Name | Used/Developed by | URL                                                    | Notes                                   |
|:---------------|:-----------------:|:-------------------------------------------------------|:----------------------------------------|
|  SentencePiece |   Google          | [GitHub Repository](https://github.com/google/sentencepiece) <br> [Reference 1](https://colabdoge.medium.com/understanding-sentencepiece-under-standing-sentence-piece-ac8da59f6b08)| Sub-words (Byte-Pair-Encoding) <br> Character n-grams <br> Captures morphemes (the smallest meaningful units of a word in a language, e.g. un-usual-ly) <br> Language independent <br> Fixed vocab size|
|   tiktoken     |   OpenAI          | [Repo](https://github.com/openai/tiktoken?tab=readme-ov-file) | Sub-words (Byte-Pair-Encoding), Language independent, Dynamic Vocab Size |


In [11]:
# Encode the entire corpus with encode() and keep the resulting ids in a
# single torch tensor of dtype torch.long.

data = torch.tensor(data=encode(text), dtype=torch.long)

In [12]:
# Sanity-check the encoded corpus: report the tensor's dtype and shape.
print(f'[+] {data.dtype=}')
print(f'[+] {data.shape=}')

[+] data.dtype=torch.int64
[+] data.shape=torch.Size([1115394])


In [21]:
# Show the same 400-token window twice: first as raw ids, then decoded back
# into text.
print(data[600:1000])
print()
print(decode(sequence=data[600:1000].tolist()))

tensor([40, 59, 58,  1, 58, 46, 43,  1, 57, 59, 54, 43, 56, 44, 50, 59, 47, 58,
        63,  6,  1, 61, 46, 47, 50, 43,  1, 47, 58,  1, 61, 43, 56, 43,  0, 61,
        46, 53, 50, 43, 57, 53, 51, 43,  6,  1, 61, 43,  1, 51, 47, 45, 46, 58,
         1, 45, 59, 43, 57, 57,  1, 58, 46, 43, 63,  1, 56, 43, 50, 47, 43, 60,
        43, 42,  1, 59, 57,  1, 46, 59, 51, 39, 52, 43, 50, 63, 11,  0, 40, 59,
        58,  1, 58, 46, 43, 63,  1, 58, 46, 47, 52, 49,  1, 61, 43,  1, 39, 56,
        43,  1, 58, 53, 53,  1, 42, 43, 39, 56, 10,  1, 58, 46, 43,  1, 50, 43,
        39, 52, 52, 43, 57, 57,  1, 58, 46, 39, 58,  0, 39, 44, 44, 50, 47, 41,
        58, 57,  1, 59, 57,  6,  1, 58, 46, 43,  1, 53, 40, 48, 43, 41, 58,  1,
        53, 44,  1, 53, 59, 56,  1, 51, 47, 57, 43, 56, 63,  6,  1, 47, 57,  1,
        39, 57,  1, 39, 52,  0, 47, 52, 60, 43, 52, 58, 53, 56, 63,  1, 58, 53,
         1, 54, 39, 56, 58, 47, 41, 59, 50, 39, 56, 47, 57, 43,  1, 58, 46, 43,
        47, 56,  1, 39, 40, 59, 52, 42, 

## Train and validation split

In [26]:
# Contiguous split with no shuffling: the first 90% of the token stream is
# used for training, the remaining tail for validation.
n = int(0.9*len(data))
train_data, validation_data = data[:n], data[n:]

print(f'[+] {len(train_data)=}')
print(f'[+] {len(validation_data)=}')

[+] len(train_data)=1003854
[+] len(validation_data)=111540


## Example of training data on a block size of 8

In [44]:
# block_size is the maximum context length used for prediction, i.e. the
# largest number of tokens the model ingests to generate an output.
block_size = 8

In [42]:
# One chunk of block_size + 1 tokens: block_size inputs plus the one extra
# token needed as the target of the longest context.
train_data[:block_size+1]

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])

In [43]:
# y is x shifted one position to the right, so y[t] is the token that follows
# the context x[:t+1].
x, y = train_data[:block_size], train_data[1:block_size+1]

print(f'{x=}')
print(f'{y=}')
print()

# A single chunk of block_size tokens yields block_size training examples,
# one for every context length from 1 up to block_size.
for t in range(block_size):
    context, target = x[:t+1], y[t]
    print(f'[+] Input --> Target | {context.tolist()} --> {target}')

x=tensor([18, 47, 56, 57, 58,  1, 15, 47])
y=tensor([47, 56, 57, 58,  1, 15, 47, 58])

[+] Input --> Target | [18] --> 47
[+] Input --> Target | [18, 47] --> 56
[+] Input --> Target | [18, 47, 56] --> 57
[+] Input --> Target | [18, 47, 56, 57] --> 58
[+] Input --> Target | [18, 47, 56, 57, 58] --> 1
[+] Input --> Target | [18, 47, 56, 57, 58, 1] --> 15
[+] Input --> Target | [18, 47, 56, 57, 58, 1, 15] --> 47
[+] Input --> Target | [18, 47, 56, 57, 58, 1, 15, 47] --> 58


## Generating the training data

In [70]:
# Seed torch's random number generator so the sampled batches are repeatable.
torch.manual_seed(1337)

# Number of independent sequences processed in parallel.
batch_size = 4

# Maximum context length for prediction, i.e. the largest number of tokens
# the model ingests to generate an output.
block_size = 8

def get_batch(split):
    """Sample a random mini-batch of input sequences x and target sequences y.

    Reads the module-level ``train_data`` / ``validation_data`` tensors and
    the ``batch_size`` / ``block_size`` settings.

    Parameters
    ----------
    split : str
        ``'train'`` samples from ``train_data``; any other value samples from
        ``validation_data``.

    Returns
    -------
    tuple of (x, y)
        Two stacked tensors holding ``batch_size`` rows of ``block_size``
        tokens each. Every row of ``y`` is the matching row of ``x`` shifted
        one position to the right in the source data.
    """
    if split == 'train':
        source = train_data
        print('[+] Generating "training" data...')
    else:
        source = validation_data
        print('[+] Generating "validation" data...')

    # `high` is exclusive, so the largest start offset is
    # len(source) - block_size - 1, which leaves room for the final target token.
    starts = torch.randint(
        low=0,
        high=len(source)-block_size,
        size=(batch_size,)
    ).tolist()  # e.g. [76049, 234249, 934904, 560986]

    # Stack the slices directly: torch.stack already returns a new tensor, and
    # wrapping a tensor slice in torch.tensor(...) raises a copy-construct
    # UserWarning.
    x = torch.stack([source[i:i+block_size] for i in starts])
    y = torch.stack([source[i+1:i+block_size+1] for i in starts])

    return x, y

## Understanding training data with respect to batch and block size

In [79]:
print('''
Block Size: 1. Maximum context length for prediction.
            2. Maximum input that the model will ingest to generate the output.
            
Batch Size: How many independent sequence will we process in parallel
''')


Block Size: 1. Maximum context length for prediction.
            2. Maximum input that the model will ingest to generate the output.
            
Batch Size: How many independent sequence will we process in parallel



In [72]:
# Draw one training batch and display the inputs (xb) and targets (yb)
# together with their shapes.
xb, yb = get_batch('train')
print()
print(f'[+] {xb.shape=}')
print(xb)
print()
print(f'[+] {yb.shape=}')
print(yb)

[+] Generating "training" data...

[+] xb.shape=torch.Size([4, 8])
tensor([[57, 43, 60, 43, 52,  1, 63, 43],
        [60, 43, 42,  8,  0, 25, 63,  1],
        [56, 42,  5, 57,  1, 57, 39, 49],
        [43, 57, 58, 63,  6,  1, 58, 46]])

[+] yb.shape=torch.Size([4, 8])
tensor([[43, 60, 43, 52,  1, 63, 43, 39],
        [43, 42,  8,  0, 25, 63,  1, 45],
        [42,  5, 57,  1, 57, 39, 49, 43],
        [57, 58, 63,  6,  1, 58, 46, 47]])


In [82]:
# Spell out every (context --> target) pair contained in the batch: for each
# row, each prefix of the inputs is paired with the token that follows it.
for batch in range(batch_size):
    print(f'[+] {batch=}')
    for block in range(block_size):
        context, target = xb[batch, :block+1], yb[batch, block]
        print(f'\t{block=} | {context.tolist()} --> {target}')
    print()

[+] batch=0
	block=0 | [57] --> 43
	block=1 | [57, 43] --> 60
	block=2 | [57, 43, 60] --> 43
	block=3 | [57, 43, 60, 43] --> 52
	block=4 | [57, 43, 60, 43, 52] --> 1
	block=5 | [57, 43, 60, 43, 52, 1] --> 63
	block=6 | [57, 43, 60, 43, 52, 1, 63] --> 43
	block=7 | [57, 43, 60, 43, 52, 1, 63, 43] --> 39

[+] batch=1
	block=0 | [60] --> 43
	block=1 | [60, 43] --> 42
	block=2 | [60, 43, 42] --> 8
	block=3 | [60, 43, 42, 8] --> 0
	block=4 | [60, 43, 42, 8, 0] --> 25
	block=5 | [60, 43, 42, 8, 0, 25] --> 63
	block=6 | [60, 43, 42, 8, 0, 25, 63] --> 1
	block=7 | [60, 43, 42, 8, 0, 25, 63, 1] --> 45

[+] batch=2
	block=0 | [56] --> 42
	block=1 | [56, 42] --> 5
	block=2 | [56, 42, 5] --> 57
	block=3 | [56, 42, 5, 57] --> 1
	block=4 | [56, 42, 5, 57, 1] --> 57
	block=5 | [56, 42, 5, 57, 1, 57] --> 39
	block=6 | [56, 42, 5, 57, 1, 57, 39] --> 49
	block=7 | [56, 42, 5, 57, 1, 57, 39, 49] --> 43

[+] batch=3
	block=0 | [43] --> 57
	block=1 | [43, 57] --> 58
	block=2 | [43, 57, 58] --> 63
	block=3 

## Simple Neural Network (Bigram Language Model)

In [None]:
import torch.nn as nn
from

In [None]:
a