In [4]:
# Character level tokenisation

# Step 1: The sample string to tokenise
text = "hello world"

# Step 2: Vocabulary = every distinct character, in sorted order
vocab = sorted(set(text))
print("Vocabulary:", vocab)

# Step 3: Lookup tables in both directions (char → int, int → char)
# An id is simply the character's position in the sorted vocabulary.
int_to_char = dict(enumerate(vocab))
char_to_int = {ch: idx for idx, ch in int_to_char.items()}

print("\nCharacter to Integer mapping:")
print(char_to_int)

# Step 4: Encode — replace each character with its integer id
encoded = [char_to_int[symbol] for symbol in text]
print("\nEncoded sequence:", encoded)

# Step 5: Decode — map the ids back to characters and stitch them together
decoded = ''.join(int_to_char[token_id] for token_id in encoded)
print("\nDecoded text:", decoded)


Vocabulary: [' ', 'd', 'e', 'h', 'l', 'o', 'r', 'w']

Character to Integer mapping:
{' ': 0, 'd': 1, 'e': 2, 'h': 3, 'l': 4, 'o': 5, 'r': 6, 'w': 7}

Encoded sequence: [3, 2, 4, 4, 5, 0, 7, 5, 6, 4, 1]

Decoded text: hello world


In [None]:
# Bigram Model

# Tally how often each character id is immediately followed by each other id
import torch
vocab_size = len(vocab)
# Square count matrix: row = current character id, column = next character id
N = torch.zeros(vocab_size, vocab_size, dtype=torch.int32)
# Pair every position with its successor (all-but-last against all-but-first)
for ch1, ch2 in zip(encoded[:-1], encoded[1:]):
    N[ch1, ch2] += 1
print("Bigram count matrix:\n", N)


# Turn the raw counts into one probability distribution per row
# Add-one smoothing: every bigram receives a pseudo-count of 1, so no row
# sums to zero and no transition ends up with zero probability
smoothed = (N + 1).float()
row_totals = smoothed.sum(dim=1, keepdim=True)
P = smoothed / row_totals
print("\nProbability matrix:\n", P)


# Generate text using the bigram model
# (torch is already imported earlier in this cell, so it is not re-imported here)
SEED = 42           # fixed seed so re-running the cell reproduces the same sample
NUM_NEW_CHARS = 20  # number of characters sampled after the starting character
# Dedicated generator: makes this sampling repeatable without touching
# torch's global random state
rng = torch.Generator().manual_seed(SEED)
# Start with a random character
ix = torch.randint(0, vocab_size, (1,), generator=rng).item()
out = [ix]
for _ in range(NUM_NEW_CHARS):
    # Row ix of P holds the next-character probabilities given the current one
    probs = P[ix]
    ix = torch.multinomial(probs, num_samples=1, generator=rng).item()
    out.append(ix)
# Decode indices to characters
generated_text = ''.join(int_to_char[i] for i in out)
print("\nGenerated text:", generated_text)