In [None]:
# Read the corpus with 'utf-8-sig' so that a leading byte-order mark (U+FEFF)
# is stripped. With plain 'utf-8' the BOM stays at the start of `text` and is
# later tokenized as three junk byte tokens (GPT-2 IDs 171, 119, 123) in front
# of the first real word.
with open('/content/The_Verdict.txt', 'r', encoding='utf-8-sig') as f:
    text = f.read()

# Sanity check: show the first 100 characters of the corpus.
print(text[:100])

﻿I HAD always thought Jack Gisburn rather a cheap genius-- though a good fellow enough--so it was no


In [None]:
!pip3 install tiktoken



In [None]:
import tiktoken
import importlib
# Load tiktoken's "gpt2" encoding; it is used below to turn the text into token IDs.
tokenizer = tiktoken.get_encoding("gpt2")

In [None]:
# Encode the whole corpus into token IDs.
encoded_text = tokenizer.encode(text)
# Bare last expression: the notebook displays the first 100 token IDs.
encoded_text[:100]


In [None]:
# Context size: how many previous tokens the model can see when predicting the
# next one. Also called block_size, sequence_length, max_seq_len or
# context_length.
context_size = 4

# Next-token prediction in words:
#   "I"           -> "love"
#   "I love"      -> "deep"
#   "I love deep" -> "learning"
#
# The same idea on token IDs (example values; the IDs depend on the text):
# the target sequence is the input sequence shifted left by one position.
#   x = [171, 119, 123, 40]
#   y = [119, 123, 40, 367]
#
# Every prefix of x is paired with the token that follows it:
#   input [171]               -> output 119
#   input [171, 119]          -> output 123
#   input [171, 119, 123]     -> output 40
#   input [171, 119, 123, 40] -> output 367

x = encoded_text[:context_size]          # inputs: first `context_size` tokens
y = encoded_text[1:context_size + 1]     # targets: same window, one step later

print(x, y, sep="\n")

[171, 119, 123, 40]
[119, 123, 40, 367]


In [None]:
import tensorflow as tf

def create_tf_dataset(ids, block_size, shift=1):
    """Build a lazy ``tf.data`` pipeline of next-token (input, target) pairs.

    A window of ``block_size + 1`` consecutive token IDs slides over ``ids``.
    Each window is split into an input (every token except the last) and a
    target (every token except the first), so ``target[i]`` is the token that
    follows ``input[i]``.

    Example with ``ids = [171, 119, 123, 40, 367]`` and ``block_size = 4``::

        input  = [171, 119, 123, 40]
        target = [119, 123, 40, 367]

    Args:
        ids: Sequence of integer token IDs; converted to an int32 tensor.
        block_size: Number of tokens in each input / target sequence.
            Must be at least 1.
        shift: Number of tokens the window advances between consecutive
            examples. The default of 1 moves the window one token at a time,
            so a long text produces many overlapping pairs. Larger values
            reduce the overlap.

    Returns:
        A ``tf.data.Dataset`` whose elements are ``(input, target)`` tuples of
        int32 tensors, each of shape ``(block_size,)``. The dataset is empty
        when ``ids`` holds fewer than ``block_size + 1`` tokens.

    Raises:
        ValueError: If ``block_size`` or ``shift`` is smaller than 1.
    """
    if block_size < 1:
        raise ValueError(f"block_size must be >= 1, got {block_size}")
    if shift < 1:
        raise ValueError(f"shift must be >= 1, got {shift}")

    # Each window carries block_size inputs plus one extra token for the target.
    window_len = block_size + 1

    # One dataset element per token ID.
    token_ids = tf.constant(ids, dtype=tf.int32)
    dataset = tf.data.Dataset.from_tensor_slices(token_ids)

    # Sliding windows; drop_remainder=True discards incomplete windows at the end.
    dataset = dataset.window(window_len, shift=shift, drop_remainder=True)

    # window() yields nested datasets, not tensors. Batching each one turns it
    # into a single tensor. Every window already has exactly window_len items,
    # so drop_remainder=True removes nothing here; it only lets TensorFlow infer
    # the static shape (window_len,) instead of (None,).
    dataset = dataset.flat_map(
        lambda window: window.batch(window_len, drop_remainder=True)
    )

    # Split each window into (inputs, targets shifted by one).
    dataset = dataset.map(lambda tokens: (tokens[:-1], tokens[1:]))
    return dataset


In [None]:
block_size = 4
dataset = create_tf_dataset(encoded_text, block_size)

# Preview the first three (input, target) pairs. take() bounds the iteration,
# so no hand-maintained countdown variable or break is needed.
for x, y in dataset.take(3):
    print("Input:", x.numpy(), "Target:", y.numpy())


Input: [171 119 123  40] Target: [119 123  40 367]
Input: [119 123  40 367] Target: [ 123   40  367 2885]
Input: [ 123   40  367 2885] Target: [  40  367 2885 1464]


# What happens if there are millions (or billions) of IDs?
Short answer

✔ Conceptually: Yes, it defines all possible sliding windows

✔ Practically: No, they are NOT all created in memory


# ✔️ What actually happens (step-by-step)

Windows are NOT generated beforehand

No giant list of windows

No massive memory usage (only the token-ID tensor itself is held in memory, not the windows)

tf.data.Dataset defines a pipeline

A recipe for how to create windows

Not the windows themselves

During training, for each step:

TensorFlow pulls the next slice of token IDs

Forms a window on the fly

Creates the (input, target) pair

Batches it

Sends it to the model

Only the input part goes into the embedding layer

target is used only for loss calculation

# What is a batch?

A batch is a group of multiple training examples processed together in one forward/backward pass of the model.

Instead of:

feeding one (input, target) pair at a time

We feed:

many (input, target) pairs at once

Why batching exists (very important)


❌ Without batching

Very slow

GPU mostly idle

High variance gradients

✅ With batching

GPU computes in parallel

Faster training

More stable gradients

# Important Concepts

The model NEVER keeps track of input–target pairs.
The PAIRING exists only in the loss function, not inside the model.

Target IDs play the same role as labels in supervised learning.

**LLM training = supervised learning (but auto-generated)**



# Final one-sentence rule (memorize this)

Input–target pairing exists only in the loss; the model learns causality through masked attention and gradient feedback.