In [2]:
with open ('wizard_of_oz.txt', 'r', encoding='utf-8') as f:
    text = f.read()
print (text[:200])

﻿Dorothy and the Wizard in Oz

Author: L. Frank Baum

Illustrator: John R. Neill

Release date: September 10, 2007 [eBook #22566]

Language: English

Credits: Produced by Chris Curnow, Joseph Cooper, 


## Character Tokenizer

In [7]:
# Get the sorted text of characters in the text
chars = sorted(set(text))


In [6]:
string_to_int = {ch:i for i, ch in enumerate(chars)}
int_to_string = {i:ch for i, ch in enumerate(chars)}
encode = lambda s: [string_to_int[c] for c in s]
decode = lambda l: ''.join([int_to_string[i] for i in l])

encoded_hello = encode('hello')
decoded_hello = decode(encode('hello'))

print (encoded_hello)
print (decoded_hello)

[63, 60, 67, 67, 70]
hello


## Tensors instead of arrays

In [16]:
import torch
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print (device)

cuda


In [10]:
data = torch.tensor(encode(text), dtype=torch.long)
print (data[:100])

tensor([82, 30, 70, 73, 70, 75, 63, 80,  1, 56, 69, 59,  1, 75, 63, 60,  1, 49,
        64, 81, 56, 73, 59,  1, 64, 69,  1, 41, 81,  0,  0, 27, 76, 75, 63, 70,
        73, 24,  1, 38, 12,  1, 32, 73, 56, 69, 66,  1, 28, 56, 76, 68,  0,  0,
        35, 67, 67, 76, 74, 75, 73, 56, 75, 70, 73, 24,  1, 36, 70, 63, 69,  1,
        44, 12,  1, 40, 60, 64, 67, 67,  0,  0, 44, 60, 67, 60, 56, 74, 60,  1,
        59, 56, 75, 60, 24,  1, 45, 60, 71, 75])



A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.0.1 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "C:\Users\sudhe\AppData\Local\Programs\Python\Python312\Lib\runpy.py", line 198, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "C:\Users\sudhe\AppData\Local\Programs\Python\Python312\Lib\runpy.py", line 88, in _run_code
    exec(code, run_globals)
  File "C:\Users\sudhe\Documents\Learning\myLLM\llm_env\Lib\site-packages\ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "C:\Users\sudhe\Documents\Learning\myLLM\llm_env\Lib\site-packages\traitlets\config\ap

## Train and validation sets

In [14]:
n = int(0.8*len(data))
train_data = data[:n]
val_data = data[n:]


In [15]:
block_size = 8 # length of the sequence
x = train_data[:block_size]
y = train_data[1:block_size + 1]

for t in range(block_size):
    context = x[:t+1]
    target = y[t]
    print (f"When input is {context}, target is {target}")

When input is tensor([82]), target is 30
When input is tensor([82, 30]), target is 70
When input is tensor([82, 30, 70]), target is 73
When input is tensor([82, 30, 70, 73]), target is 70
When input is tensor([82, 30, 70, 73, 70]), target is 75
When input is tensor([82, 30, 70, 73, 70, 75]), target is 63
When input is tensor([82, 30, 70, 73, 70, 75, 63]), target is 80
When input is tensor([82, 30, 70, 73, 70, 75, 63, 80]), target is 1


## Batch Size hyperparameter

In [None]:
batch_size # how many blocks are we using in paralell
def get_batch(split):
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    x, y = x.to(device), y.to(device)
    return x, y

x, y = get_batch('train')
print('inputs:')
# print(x.shape)
print(x)
print('targets:')
print(y)