# Reading the Shakespeare dataset

In [1]:
# Load the entire corpus into memory as one string
with open("shakespeare_input_ds.txt", mode="r", encoding="utf-8") as corpus_file:
    text = corpus_file.read()

# Check dataset length

In [2]:
# Report how many characters the corpus contains
print(f"Length of text: {len(text)}")

Length of text: 1115394


# print first 100 characters

In [3]:
# Preview the opening of the corpus
print(f"First 100 characters of text: {text[:100]}")

First 100 characters of text: First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You


# Specify Unique Characters that appear in this text

In [4]:
# Every distinct character in the corpus, in sorted order
chars_sorted = sorted(set(text))
vocab_size = len(chars_sorted)  # One id per distinct character
print(f"Vocabulary size: {vocab_size}")
print("".join(chars_sorted))

Vocabulary size: 65

 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz


# Encoding & Decoding Characters

In [5]:
# NOTE: the two dictionary names below read backwards relative to their
# contents (string_into_integer maps integer -> character, and
# integer_into_string maps character -> integer). The names are kept
# unchanged so that any code already using them keeps working.

# Index -> character lookup, used when decoding
# Example: {0: 'h', 1: 'i', 2: ' '} if chars_sorted is ['h', 'i', ' ']
string_into_integer = dict(enumerate(chars_sorted))

# Character -> index lookup, used when encoding
# Example: {'h': 0, 'i': 1, ' ': 2}
integer_into_string = {c: i for i, c in enumerate(chars_sorted)}


def encoded(x):
    """Convert a string into a list of integer character ids.

    Example: "hi" → [0, 1] with the example mapping above.

    Raises:
        KeyError: if x contains a character that is not in chars_sorted.
    """
    return [integer_into_string[c] for c in x]


def decoded(x):
    """Convert an iterable of integer character ids back into a string.

    Example: [0, 1] → "hi" with the example mapping above.

    Raises:
        KeyError: if x contains an id that has no entry in the lookup.
    """
    return "".join(string_into_integer[c] for c in x)


# Round-trip check: encoding then decoding must return the original text
print("Encoded string:", encoded("hii there"))  # Shows the number version
print("Decoded string:", decoded(encoded("hii there")))  # Should return original text

Encoded string: [46, 47, 47, 1, 58, 46, 43, 56, 43]
Decoded string: hii there


# Converting to tensor using torch

In [6]:
import torch

# Encode the whole corpus and hold it as a tensor of int64 character ids
data = torch.tensor(encoded(text), dtype=torch.long)
print(f"Data shape: {data.shape}")
print(f"Data type: {data.dtype}")
print("First 100 elements of data:", data[:100])  # Preview the first 100 ids

Data shape: torch.Size([1115394])
Data type: torch.int64
First 100 elements of data: tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59])


# Split into training & validation sets

In [7]:
# Cut point: the first 90% of the ids are used for training
n = int(0.9 * len(data))
# Everything before the cut point trains the model; the remainder validates it
train_data, val_data = data[:n], data[n:]

In [8]:
# Length of each block, in characters
block_size = 8
# block_size inputs plus one extra id, so the last input also has a following id
train_data[: 1 + block_size]

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])

In [9]:
# Inputs are the first block; labels are the same block shifted right by one
x = train_data[:block_size]
y = train_data[1 : block_size + 1]
# Walk through the labels: each one is predicted from all inputs up to its position
for t, target in enumerate(y):
    context = x[: t + 1]  # The first t + 1 input ids
    print(f"Context: {context}, Target: {target}")

Context: tensor([18]), Target: 47
Context: tensor([18, 47]), Target: 56
Context: tensor([18, 47, 56]), Target: 57
Context: tensor([18, 47, 56, 57]), Target: 58
Context: tensor([18, 47, 56, 57, 58]), Target: 1
Context: tensor([18, 47, 56, 57, 58,  1]), Target: 15
Context: tensor([18, 47, 56, 57, 58,  1, 15]), Target: 47
Context: tensor([18, 47, 56, 57, 58,  1, 15, 47]), Target: 58


In [10]:
# Fix the RNG seed so the sampled batches are reproducible
torch.manual_seed(1337)
# Sequences per batch, and characters per sequence
batch_size, block_size = 4, 8


def getbatch(split):
    """Sample a random batch of input and target sequences.

    Args:
        split: "train" draws from train_data; any other value draws
            from val_data.

    Returns:
        A pair (x, y) of tensors, each holding batch_size rows of
        block_size ids. Row k of y is row k of x shifted one position
        to the right in the underlying data.
    """
    source = train_data if split == "train" else val_data
    # randint's upper bound is exclusive, so start + block_size + 1 never
    # runs past the end of source
    starts = torch.randint(len(source) - block_size, (batch_size,))
    inputs = torch.stack([source[s : s + block_size] for s in starts])
    targets = torch.stack([source[s + 1 : s + block_size + 1] for s in starts])
    return inputs, targets


# Draw one training batch and show both tensors with their shapes
xb, yb = getbatch("train")
print("inputs", xb.shape, xb, sep="\n")
print("targets", yb.shape, yb, sep="\n")
print("-----" * 15)

# Spell out every (context, target) pair contained in the batch
for b in range(batch_size):
    for t, target in enumerate(yb[b]):
        context = xb[b, : t + 1]  # The first t + 1 input ids of sequence b
        print(f"when the input is {context.tolist()}, The Target is: {target}")
    print("-----" * 15)

inputs
torch.Size([4, 8])
tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])
targets
torch.Size([4, 8])
tensor([[43, 58,  5, 57,  1, 46, 43, 39],
        [53, 56,  1, 58, 46, 39, 58,  1],
        [58,  1, 58, 46, 39, 58,  1, 46],
        [17, 27, 10,  0, 21,  1, 54, 39]])
---------------------------------------------------------------------------
when the input is [24], The Target is: 43
when the input is [24, 43], The Target is: 58
when the input is [24, 43, 58], The Target is: 5
when the input is [24, 43, 58, 5], The Target is: 57
when the input is [24, 43, 58, 5, 57], The Target is: 1
when the input is [24, 43, 58, 5, 57, 1], The Target is: 46
when the input is [24, 43, 58, 5, 57, 1, 46], The Target is: 43
when the input is [24, 43, 58, 5, 57, 1, 46, 43], The Target is: 39
---------------------------------------------------------------------------
when the input is [