In [None]:
import math
from dataclasses import dataclass

In [None]:
import torch
import torch.nn as nn
from torch.nn import functional as F

In [None]:
@dataclass
class GPTConfig:
    """Bundle of model hyperparameters, read by modules via ``config.<field>``.

    NOTE(review): a plain class that is also named ``GPTConfig`` is defined later in
    this notebook; once that cell runs, the name refers to the later class.
    """
    block_size: int = 1024 # presumably the maximum context length -- TODO confirm against the consumer
    vocab_size: int = 50304 # GPT-2 vocab_size of 50257, padded up to nearest multiple of 64 for efficiency
    n_layer: int = 12 # presumably the number of transformer blocks -- TODO confirm against the consumer
    n_head: int = 12 # number of attention heads; the attention module asserts it divides n_embd
    n_embd: int = 768 # embedding width (input/output size of the attention projections)
    dropout: float = 0.0 # probability handed to nn.Dropout
    bias: bool = True # True: bias in Linears and LayerNorms, like GPT-2. False: a bit better and faster

In [None]:
# ------------------------------------ Part 1 ------------------------------------

In [None]:
class CasualCausalSelfAttention(nn.Module) :
  """Multi-head causal self-attention with a fused query/key/value projection.

  Args:
    config: object exposing ``n_embd``, ``n_head``, ``bias``, ``dropout`` and
      ``block_size`` (the longest sequence the causal mask supports).

  ``forward`` maps a ``(B, T, n_embd)`` tensor to a tensor of the same shape; position
  ``t`` only attends to positions ``<= t``.
  """

  def __init__(self, config) :
    super().__init__()
    assert config.n_embd % config.n_head == 0 # every head gets the same integer width
    # Sizes are cached because forward() reads them on every call
    self.n_head = config.n_head
    self.n_embd = config.n_embd
    # One matrix produces q, k and v in a single matmul; they are split apart in forward()
    self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd, bias=config.bias)
    # Output projection applied after the heads are concatenated
    self.c_proj = nn.Linear(config.n_embd, config.n_embd, bias=config.bias)
    self.attn_dropout = nn.Dropout(config.dropout)
    self.resid_dropout = nn.Dropout(config.dropout)
    # Lower-triangular causal mask (named "bias" because forward() reads self.bias);
    # a buffer so it follows the module across devices without being a parameter
    self.register_buffer(
        "bias",
        torch.tril(torch.ones(config.block_size, config.block_size))
             .view(1, 1, config.block_size, config.block_size),
    )

  def forward(self, x) :
    B, T, C = x.size() # batch size, sequence length, embedding dimensionality (n_embd)

    # Calculate query, key, value for all heads :
    q, k, v  = self.c_attn(x).split(self.n_embd, dim=2)
    k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
    q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
    v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)

    # Scaled dot-product attention; future positions are set to -inf before the softmax
    att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
    att = att.masked_fill(self.bias[:,:,:T,:T] == 0, float('-inf'))
    att = F.softmax(att, dim=-1)
    att = self.attn_dropout(att)
    y = att @ v # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)

    # re-assemble all head outputs side by side
    y = y.transpose(1, 2).contiguous().view(B, T, C)

    y = self.resid_dropout(self.c_proj(y)) # Output projection
    return y

In [None]:
# ------------------------------------ Part 2 ------------------------------------

In [None]:
import random

# Operands are drawn uniformly from 0 .. num_range - 1
num_range = 5

# Write 100 addition facts to disk, one "a+b=sum" line per example
with open('random_numbers.txt', 'w') as f:
    largest_operand = num_range - 1
    for _ in range(100):
        # left operand is sampled first, then the right one
        a, b = random.randint(0, largest_operand), random.randint(0, largest_operand)
        result = a + b
        f.write(f"{a}+{b}={result}\n")


In [None]:
# Read the generated arithmetic corpus back as one string
with open('random_numbers.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# Character-level vocabulary: every distinct character in the corpus, in sorted order
chars = sorted(list(set(text)))
vocab_size = len(chars)
# Bidirectional lookup tables between characters and their integer ids
stoi = { ch:i for i,ch in enumerate(chars) }
itos = { i:ch for i,ch in enumerate(chars) }
encode = lambda s: [stoi[c] for c in s] # encoder: string -> list of ids (KeyError on a character not in the corpus)
decode = lambda l: ''.join([itos[i] for i in l]) # decoder: list of ids -> string

# Contiguous split by character position: first 90% for training, the rest for validation
data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9*len(data)) # index of the first validation character
train_data = data[:n]
val_data = data[n:]

In [None]:
# data loading
def get_batch(split):
    """Sample a random batch of (input, target) windows from one split.

    `split == 'train'` draws from `train_data`; any other value draws from `val_data`.
    Returns two `(batch_size, block_size)` tensors on `device`; the targets are the
    inputs shifted one position to the right.
    """
    source = train_data if split == 'train' else val_data
    # random start offsets that leave room for a full window plus the shifted target
    starts = torch.randint(len(source) - block_size, (batch_size,))
    inputs = torch.stack([source[s:s+block_size] for s in starts])
    targets = torch.stack([source[s+1:s+block_size+1] for s in starts])
    return inputs.to(device), targets.to(device)

In [None]:
@torch.no_grad()
def estimate_loss():
    """Average the loss of the global `model` over `eval_iters` random batches per split.

    Returns a dict with keys 'train' and 'val' holding 0-d tensors. The model is put in
    eval mode for the measurement and switched back to train mode before returning.
    """
    model.eval()
    averages = {}
    for split in ('train', 'val'):
        batch_losses = torch.zeros(eval_iters)
        for i in range(eval_iters):
            xb, yb = get_batch(split)
            _, batch_loss = model(xb, yb)
            batch_losses[i] = batch_loss.item()
        averages[split] = batch_losses.mean()
    model.train()
    return averages

In [None]:
class Head(nn.Module) :
    """One head of causal self-attention.

    Reads the module-level `n_embd`, `block_size` and `dropout` at construction time.

    Args:
        head_size: width of the key/query/value projections of this head.
    """

    def __init__(self, head_size) :
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias = False)
        self.query = nn.Linear(n_embd, head_size, bias = False)
        self.value = nn.Linear(n_embd, head_size, bias = False)
        # lower-triangular mask; a buffer so it moves with the module but is not trained
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

        self.dropout = nn.Dropout(dropout)

    def forward(self, x) :
        """Map (B, T, n_embd) to (B, T, head_size); position t attends only to positions <= t."""
        _, T, _ = x.shape
        k = self.key(x) # (B, T, head_size)
        q = self.query(x) # (B, T, head_size)
        # Scaled dot-product attention: the scale is 1/sqrt(head_size), i.e. the width of
        # the vectors being dotted (k.shape[-1]), not the embedding width of x
        wei = q @ k.transpose(-2, -1) * k.shape[-1] ** -0.5 # (B, T, head_size) @ (B, head_size, T) = (B, T, T)
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf')) # hide future positions
        wei = F.softmax(wei, dim = -1) # (B, T, T)
        wei = self.dropout(wei)
        # weighted aggregation of values :
        v = self.value(x)
        out = wei @ v
        return out

In [None]:
class MultiHeadAttention(nn.Module) :
    """Several `Head`s run side by side, concatenated and projected back to `n_embd`."""

    def __init__(self, num_heads, head_size) :
        super().__init__()
        self.heads = nn.ModuleList(Head(head_size) for _ in range(num_heads))
        # maps the concatenated head outputs back to the residual-stream width
        concat_width = num_heads * head_size
        self.proj = nn.Linear(concat_width, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x) :
        per_head = [head(x) for head in self.heads]
        concatenated = torch.cat(per_head, dim = -1)
        return self.dropout(self.proj(concatenated))

In [None]:
class FeedForward(nn.Module) :
    """Position-wise MLP: expand to 4x width, ReLU, project back, then dropout."""

    def __init__(self, n_embd) :
        super().__init__()
        hidden = 4 * n_embd
        layers = [
            nn.Linear(n_embd, hidden),
            nn.ReLU(),
            nn.Linear(hidden, n_embd), # back to the residual-stream width
            nn.Dropout(dropout), # probability comes from the module-level `dropout`
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x) :
        return self.net(x)

In [None]:
class Block(nn.Module) :
    """Transformer block: self-attention (communication) then an MLP (computation).

    Both sub-layers are applied to a LayerNorm of their input and added back onto it.
    """

    def __init__(self, n_embd, n_head) :
        # n_embd: embedding dimension, n_head: how many attention heads to use
        super().__init__()
        self.sa = MultiHeadAttention(n_head, n_embd // n_head)
        self.ffwd = FeedForward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd) # normalises the attention input
        self.ln2 = nn.LayerNorm(n_embd) # normalises the feed-forward input

    def forward(self, x) :
        attended = self.sa(self.ln1(x))
        x = x + attended # residual connection around attention
        transformed = self.ffwd(self.ln2(x))
        x = x + transformed # residual connection around the MLP
        return x

In [None]:
# Despite its name this is a small transformer language model, not a pure bigram model
class BigramLanguageModel(nn.Module):
    """Character-level transformer language model.

    Token embeddings plus learned position embeddings feed a stack of `n_layer` Blocks,
    a final LayerNorm and a linear head that yields next-token logits. All sizes come
    from the module-level hyperparameters (`vocab_size`, `n_embd`, `block_size`,
    `n_head`, `n_layer`) at construction time.
    """

    def __init__(self):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head = n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd) # Final layer norm
        self.lm_head = nn.Linear(n_embd, vocab_size) # Language Model Head: embeddings -> logits

    def forward(self, idx, targets=None):
        """Return `(logits, loss)` for a `(B, T)` batch of token ids.

        Without `targets`, logits are `(B, T, vocab_size)` and loss is None. With
        `targets` (also `(B, T)`), logits are flattened to `(B*T, vocab_size)` and loss
        is their cross-entropy against the flattened targets.
        """
        B, T = idx.shape

        tok_emb = self.token_embedding_table(idx) # (B,T,n_embd)
        # build the positions on the input's own device so the model does not depend on
        # a global `device` matching where it actually lives
        pos_emb = self.position_embedding_table(torch.arange(T, device = idx.device)) # (T, C)
        x = tok_emb + pos_emb # (B, T, C)
        x = self.blocks(x)
        x = self.ln_f(x)
        logits = self.lm_head(x) # (B, T, vocab_size)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        """Autoregressively append `max_new_tokens` sampled tokens to `idx` (shape `(B, T)`).

        Sampling runs in eval mode (dropout off) and without autograd; the module's
        previous train/eval mode is restored before returning.
        """
        was_training = self.training
        self.eval()
        try:
            with torch.no_grad():
                for _ in range(max_new_tokens):
                    # the position table only covers block_size positions, so crop the context
                    idx_cond = idx[:, -block_size:]
                    logits, _ = self(idx_cond)
                    # only the last time step predicts the next token
                    logits = logits[:, -1, :] # becomes (B, C)
                    probs = F.softmax(logits, dim=-1) # (B, C)
                    idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
                    idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        finally:
            self.train(was_training)
        return idx

In [None]:
# hyperparameters
batch_size = 64 # how many independent sequences will we process in parallel?
block_size = 8 # what is the maximum context length for predictions?
max_iters = 5000 # number of optimizer steps taken by the training loop
eval_interval = 500 # train/val loss is reported every this many steps
learning_rate = 3e-4 # learning rate handed to AdamW
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 20 # batches averaged per split inside estimate_loss
n_embd = 32 # embedding width
# NOTE(review): 32 is not divisible by 6, so Block computes head_size = 32 // 6 = 5 and the
# heads concatenate to 30 features before the output projection maps back to n_embd.
n_head = 6
n_layer = 2 # number of Blocks stacked in the model
dropout = 0.2 # dropout probability used by the attention and feed-forward modules
# ------------

In [None]:
# Seed before building the model: parameters are initialised randomly at construction,
# so seeding afterwards would not make the initial weights reproducible.
torch.manual_seed(1337)
model = BigramLanguageModel()
m = model.to(device)

print(sum(p.numel() for p in m.parameters()) / 1e3, 'k parameters')

25.804 k parameters


In [None]:
# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

# `step` rather than `iter`: a top-level variable called `iter` would shadow the builtin
# for every later cell of the notebook.
for step in range(max_iters):

    # every once in a while evaluate the loss on train and val sets
    if step % eval_interval == 0:
        losses = estimate_loss()
        print(f"step {step}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_batch('train')

    # forward pass, then one optimizer update
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

step 0: train loss 2.5584, val loss 2.5071
step 500: train loss 0.8597, val loss 0.8821
step 1000: train loss 0.7822, val loss 0.8203
step 1500: train loss 0.7313, val loss 0.8286
step 2000: train loss 0.6606, val loss 0.8069
step 2500: train loss 0.6358, val loss 0.8389
step 3000: train loss 0.6070, val loss 0.8588
step 3500: train loss 0.5920, val loss 0.8911
step 4000: train loss 0.5767, val loss 0.9317
step 4500: train loss 0.5727, val loss 0.9405


In [None]:
# Prompt for the model `m` created in the model-construction cell
textcontext = '1+0='

# Encode the prompt to token ids and shape it as a (1, T) batch on `device`
context = torch.tensor(encode(textcontext), dtype=torch.long).view(1, -1).to(device)

# Sample 43 further characters and print the prompt together with the continuation
print(decode(m.generate(context, max_new_tokens=43)[0].tolist()))


1+0=1
1+1=2
2+3=5
3+0=3
4+0=4
0+1=1
4+2=6
3+4=7


In [None]:
# ------------------------------------ Part 3 ------------------------------------

In [None]:
!pip install datasets



In [None]:
from datasets import load_dataset

wikitext = load_dataset("wikitext", "wikitext-103-raw-v1")
# Concatenate every row of the training split, without a separator, into one string.
# NOTE(review): the cell that writes new_text.txt keeps only the first 100000 characters,
# so materialising the whole split here is far more than that cell needs.
train_rows = wikitext['train']['text']
new_text = ''.join(train_rows)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# Keep only the first 100000 characters. The file is read back with encoding='utf-8',
# so write it as UTF-8 too instead of relying on the platform default encoding.
with open('new_text.txt', 'w', encoding='utf-8') as f:
    f.write(new_text[:100000])


In [None]:
# Read the saved text excerpt back as one string
with open('new_text.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# Character-level vocabulary: every distinct character in the text, in sorted order.
# This rebinds the vocabulary globals (chars, vocab_size, stoi, itos, encode, decode)
# that the earlier arithmetic section also defined.
chars = sorted(list(set(text)))
vocab_size = len(chars)
# Bidirectional lookup tables between characters and their integer ids
stoi = { ch:i for i,ch in enumerate(chars) }
itos = { i:ch for i,ch in enumerate(chars) }
encode = lambda s: [stoi[c] for c in s] # encoder: string -> list of ids (KeyError on a character not in the text)
decode = lambda l: ''.join([itos[i] for i in l]) # decoder: list of ids -> string

# Contiguous split by character position: first 90% for training, the rest for validation
data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9*len(data)) # index of the first validation character
train_data = data[:n]
val_data = data[n:]

In [None]:
# data loading
def get_batch(split):
    """Sample a random batch of (input, target) windows from one split.

    `split == 'train'` draws from `train_data`; any other value draws from `val_data`.
    Returns two `(batch_size, block_size)` tensors on `device`; the targets are the
    inputs shifted one position to the right.
    """
    source = train_data if split == 'train' else val_data
    # random start offsets that leave room for a full window plus the shifted target
    starts = torch.randint(len(source) - block_size, (batch_size,))
    inputs = torch.stack([source[s:s+block_size] for s in starts])
    targets = torch.stack([source[s+1:s+block_size+1] for s in starts])
    return inputs.to(device), targets.to(device)

In [None]:
@torch.no_grad()
def estimate_loss():
    """Average the loss of the global `model` over `eval_iters` random batches per split.

    Returns a dict with keys 'train' and 'val' holding 0-d tensors. The model is put in
    eval mode for the measurement and switched back to train mode before returning.
    """
    model.eval()
    averages = {}
    for split in ('train', 'val'):
        batch_losses = torch.zeros(eval_iters)
        for i in range(eval_iters):
            xb, yb = get_batch(split)
            _, batch_loss = model(xb, yb)
            batch_losses[i] = batch_loss.item()
        averages[split] = batch_losses.mean()
    model.train()
    return averages

In [None]:
class Head(nn.Module) :
    """One head of causal self-attention.

    Reads the module-level `n_embd`, `block_size` and `dropout` at construction time.

    Args:
        head_size: width of the key/query/value projections of this head.
    """

    def __init__(self, head_size) :
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias = False)
        self.query = nn.Linear(n_embd, head_size, bias = False)
        self.value = nn.Linear(n_embd, head_size, bias = False)
        # lower-triangular mask; a buffer so it moves with the module but is not trained
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

        self.dropout = nn.Dropout(dropout)

    def forward(self, x) :
        """Map (B, T, n_embd) to (B, T, head_size); position t attends only to positions <= t."""
        _, T, _ = x.shape
        k = self.key(x) # (B, T, head_size)
        q = self.query(x) # (B, T, head_size)
        # Scaled dot-product attention: the scale is 1/sqrt(head_size), i.e. the width of
        # the vectors being dotted (k.shape[-1]), not the embedding width of x
        wei = q @ k.transpose(-2, -1) * k.shape[-1] ** -0.5 # (B, T, head_size) @ (B, head_size, T) = (B, T, T)
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf')) # hide future positions
        wei = F.softmax(wei, dim = -1) # (B, T, T)
        wei = self.dropout(wei)
        # weighted aggregation of values :
        v = self.value(x)
        out = wei @ v
        return out

In [None]:
class MultiHeadAttention(nn.Module) :
    """Several `Head`s run side by side, concatenated and projected back to `n_embd`."""

    def __init__(self, num_heads, head_size) :
        super().__init__()
        self.heads = nn.ModuleList(Head(head_size) for _ in range(num_heads))
        # maps the concatenated head outputs back to the residual-stream width
        concat_width = num_heads * head_size
        self.proj = nn.Linear(concat_width, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x) :
        per_head = [head(x) for head in self.heads]
        concatenated = torch.cat(per_head, dim = -1)
        return self.dropout(self.proj(concatenated))

In [None]:
class FeedForward(nn.Module) :
    """Position-wise MLP: expand to 4x width, ReLU, project back, then dropout."""

    def __init__(self, n_embd) :
        super().__init__()
        hidden = 4 * n_embd
        layers = [
            nn.Linear(n_embd, hidden),
            nn.ReLU(),
            nn.Linear(hidden, n_embd), # back to the residual-stream width
            nn.Dropout(dropout), # probability comes from the module-level `dropout`
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x) :
        return self.net(x)

In [None]:
class Block(nn.Module) :
    """Transformer block: self-attention (communication) then an MLP (computation).

    Both sub-layers are applied to a LayerNorm of their input and added back onto it.
    """

    def __init__(self, n_embd, n_head) :
        # n_embd: embedding dimension, n_head: how many attention heads to use
        super().__init__()
        self.sa = MultiHeadAttention(n_head, n_embd // n_head)
        self.ffwd = FeedForward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd) # normalises the attention input
        self.ln2 = nn.LayerNorm(n_embd) # normalises the feed-forward input

    def forward(self, x) :
        attended = self.sa(self.ln1(x))
        x = x + attended # residual connection around attention
        transformed = self.ffwd(self.ln2(x))
        x = x + transformed # residual connection around the MLP
        return x

In [None]:
# Despite its name this is a small transformer language model, not a pure bigram model
class BigramLanguageModel(nn.Module):
    """Character-level transformer language model.

    Token embeddings plus learned position embeddings feed a stack of `n_layer` Blocks,
    a final LayerNorm and a linear head that yields next-token logits. All sizes come
    from the module-level hyperparameters (`vocab_size`, `n_embd`, `block_size`,
    `n_head`, `n_layer`) at construction time.
    """

    def __init__(self):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head = n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd) # Final layer norm
        self.lm_head = nn.Linear(n_embd, vocab_size) # Language Model Head: embeddings -> logits

    def forward(self, idx, targets=None):
        """Return `(logits, loss)` for a `(B, T)` batch of token ids.

        Without `targets`, logits are `(B, T, vocab_size)` and loss is None. With
        `targets` (also `(B, T)`), logits are flattened to `(B*T, vocab_size)` and loss
        is their cross-entropy against the flattened targets.
        """
        B, T = idx.shape

        tok_emb = self.token_embedding_table(idx) # (B,T,n_embd)
        # build the positions on the input's own device so the model does not depend on
        # a global `device` matching where it actually lives
        pos_emb = self.position_embedding_table(torch.arange(T, device = idx.device)) # (T, C)
        x = tok_emb + pos_emb # (B, T, C)
        x = self.blocks(x)
        x = self.ln_f(x)
        logits = self.lm_head(x) # (B, T, vocab_size)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        """Autoregressively append `max_new_tokens` sampled tokens to `idx` (shape `(B, T)`).

        Sampling runs in eval mode (dropout off) and without autograd; the module's
        previous train/eval mode is restored before returning.
        """
        was_training = self.training
        self.eval()
        try:
            with torch.no_grad():
                for _ in range(max_new_tokens):
                    # the position table only covers block_size positions, so crop the context
                    idx_cond = idx[:, -block_size:]
                    logits, _ = self(idx_cond)
                    # only the last time step predicts the next token
                    logits = logits[:, -1, :] # becomes (B, C)
                    probs = F.softmax(logits, dim=-1) # (B, C)
                    idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
                    idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        finally:
            self.train(was_training)
        return idx

In [None]:
# hyperparameters
batch_size = 64 # how many independent sequences will we process in parallel?
block_size = 8 # what is the maximum context length for predictions?
max_iters = 5000 # step budget for a training loop (no loop for this section is visible here)
eval_interval = 500 # evaluation period, in steps, for such a loop
learning_rate = 3e-4 # optimizer learning rate
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 20 # batches averaged per split inside estimate_loss
n_embd = 32 # embedding width
# NOTE(review): 32 is not divisible by 6, so Block computes head_size = 32 // 6 = 5 and the
# heads concatenate to 30 features before the output projection maps back to n_embd.
n_head = 6
n_layer = 2 # number of Blocks stacked in the model
dropout = 0.2 # dropout probability used by the attention and feed-forward modules
# ------------

In [None]:
# Seed before building the model: parameters are initialised randomly at construction,
# so seeding afterwards would not make the initial weights reproducible.
torch.manual_seed(1337)
model = BigramLanguageModel()
m = model.to(device)

print(sum(p.numel() for p in m.parameters()) / 1e3, 'k parameters')

33.929 k parameters


In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split
import math

class GPTConfig:
    # Hyperparameters for the GPT model defined in this cell, stored as class attributes.
    # NOTE(review): this rebinds the name GPTConfig, which a dataclass near the top of
    # the notebook also uses.
    n_embd = 768    # embedding width
    n_layer = 12    # number of transformer Blocks
    n_head = 12     # attention heads per Block
    dropout = 0.1   # dropout probability
    max_len = 512   # number of learned positions, i.e. the longest accepted sequence

class GPT(nn.Module):
    """Decoder-style language model: embeddings -> Blocks -> LayerNorm -> vocabulary logits.

    Args:
        config: object exposing ``n_embd``, ``n_layer``, ``n_head``, ``dropout`` and
            ``max_len``. An optional ``vocab_size`` attribute overrides the default
            vocabulary size of 50257.
    """

    def __init__(self, config):
        super().__init__()
        # fall back to 50257 so configs without a vocab_size field behave as before
        vocab_size = getattr(config, 'vocab_size', 50257)
        # remember the limit of *this* instance; forward() must not consult a global config
        self.max_len = config.max_len
        self.tok_emb = nn.Embedding(vocab_size, config.n_embd)
        self.pos_emb = nn.Parameter(torch.zeros(1, config.max_len, config.n_embd))
        self.drop = nn.Dropout(config.dropout)
        self.blocks = nn.Sequential(*[Block(config) for _ in range(config.n_layer)])
        self.ln_f = nn.LayerNorm(config.n_embd)
        self.head = nn.Linear(config.n_embd, vocab_size, bias=False)

    def forward(self, idx):
        """Map a ``(B, T)`` tensor of token ids to ``(B, T, vocab_size)`` logits.

        Raises:
            AssertionError: if ``T`` exceeds the ``max_len`` the model was built with.
        """
        b, t = idx.size()
        assert t <= self.max_len, f"Cannot forward sequence of length {t}, max is {self.max_len}"
        token_embeddings = self.tok_emb(idx)
        position_embeddings = self.pos_emb[:, :t, :] # first t learned positions, broadcast over the batch
        x = self.drop(token_embeddings + position_embeddings)
        x = self.blocks(x)
        x = self.ln_f(x)
        logits = self.head(x)
        return logits

class Block(nn.Module):
    """Pre-LayerNorm transformer block: causal self-attention, then an MLP, each residual.

    Args:
        config: object exposing ``n_embd``, ``n_head`` and ``dropout``.

    ``forward`` maps ``(B, T, n_embd)`` to the same shape.
    """

    def __init__(self, config):
        super().__init__()
        self.ln1 = nn.LayerNorm(config.n_embd)
        # batch_first=True: the model feeds (batch, time, channels); with the default
        # layout the first axis would be treated as time and attention would mix samples
        self.attn = nn.MultiheadAttention(
            config.n_embd, config.n_head, dropout=config.dropout, batch_first=True
        )
        self.ln2 = nn.LayerNorm(config.n_embd)
        self.mlp = nn.Sequential(
            nn.Linear(config.n_embd, 4 * config.n_embd),
            nn.GELU(),
            nn.Linear(4 * config.n_embd, config.n_embd),
            nn.Dropout(config.dropout),
        )

    def forward(self, x):
        t = x.size(1)
        # True marks a (query, key) pair that must NOT attend: everything above the diagonal,
        # i.e. later positions. Without it a next-token model could read its own targets.
        causal_mask = torch.triu(torch.ones(t, t, dtype=torch.bool, device=x.device), diagonal=1)
        h = self.ln1(x) # normalise once and reuse as query, key and value
        x = x + self.attn(h, h, h, attn_mask=causal_mask, need_weights=False)[0]
        x = x + self.mlp(self.ln2(x))
        return x

class TextDataset(Dataset):
    """Character-level next-token dataset over sliding windows of a text.

    The vocabulary is the sorted set of characters in ``text``. Item ``i`` is a pair
    ``(x, y)`` of ``block_size`` token ids each, where ``y`` is ``x`` shifted one
    character to the right.
    """

    def __init__(self, text, block_size):
        self.text = text
        self.block_size = block_size
        self.chars = sorted(list(set(text)))
        self.vocab_size = len(self.chars)
        self.stoi = {ch: i for i, ch in enumerate(self.chars)}
        self.itos = {i: ch for i, ch in enumerate(self.chars)}
        self.data = [self.stoi[s] for s in text]

    def __len__(self):
        # every window needs block_size inputs plus one extra character for the last
        # target; clamp at 0 because len() may not return a negative number
        return max(0, len(self.data) - self.block_size)

    def __getitem__(self, idx):
        # block_size + 1 characters so that x and y both hold block_size tokens
        chunk = self.data[idx:idx + self.block_size + 1]
        x = torch.tensor(chunk[:-1], dtype=torch.long)
        y = torch.tensor(chunk[1:], dtype=torch.long)
        return x, y

def load_data(file_path, block_size, train_split=0.9):
    """Read a text file and return ``(train_dataset, val_dataset)`` views of one TextDataset.

    Both views share the same vocabulary. The split is contiguous: the first
    ``train_split`` fraction of the windows is used for training and the tail for
    validation. The dataset's windows overlap, so a random split would place validation
    windows that share almost all their characters with training windows.

    Args:
        file_path: path of a UTF-8 text file.
        block_size: window length handed to TextDataset.
        train_split: fraction of the windows used for training.
    """
    from torch.utils.data import Subset  # local import: not among this cell's top-level imports

    with open(file_path, 'r', encoding='utf-8') as f:
        text = f.read()
    dataset = TextDataset(text, block_size)
    total = len(dataset)
    train_size = int(total * train_split)
    # leave a gap of block_size windows so no validation window shares characters
    # with a training window
    val_start = min(train_size + block_size, total)
    train_dataset = Subset(dataset, range(train_size))
    val_dataset = Subset(dataset, range(val_start, total))
    return train_dataset, val_dataset

def train(model, train_dataset, val_dataset, epochs, batch_size, lr):
    """Train ``model`` with AdamW and cross-entropy, printing per-epoch mean losses.

    Args:
        model: module mapping a ``(B, T)`` id tensor to ``(B, T, vocab)`` logits.
        train_dataset, val_dataset: datasets yielding ``(x, y)`` id tensors.
        epochs: number of passes over ``train_dataset``.
        batch_size: DataLoader batch size for both datasets.
        lr: AdamW learning rate.

    The model is left in eval mode (the state used for the last validation pass).
    """
    # batches are moved to wherever the model lives, so a model on GPU works too
    device = next(model.parameters()).device
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    for epoch in range(epochs):
        model.train()
        train_loss = 0
        for x, y in train_loader:
            x, y = x.to(device), y.to(device)
            optimizer.zero_grad()
            logits = model(x)
            # flatten (B, T, V) logits and (B, T) targets to the 2-D / 1-D form the loss expects
            loss = criterion(logits.view(-1, logits.size(-1)), y.view(-1))
            loss.backward()
            optimizer.step()
            train_loss += loss.item()

        model.eval()
        val_loss = 0
        with torch.no_grad():
            for x, y in val_loader:
                x, y = x.to(device), y.to(device)
                logits = model(x)
                loss = criterion(logits.view(-1, logits.size(-1)), y.view(-1))
                val_loss += loss.item()

        # max(..., 1): an empty loader reports 0.0 instead of raising ZeroDivisionError
        print(f"Epoch {epoch+1}/{epochs} - Train Loss: {train_loss/max(len(train_loader), 1):.4f} - Val Loss: {val_loss/max(len(val_loader), 1):.4f}")

def fine_tune(model, fine_tune_dataset, epochs, batch_size, lr):
    """Continue training ``model`` on ``fine_tune_dataset``, printing the mean loss per epoch.

    Uses a fresh AdamW optimizer and cross-entropy loss; there is no validation pass,
    so the model is left in train mode.

    Args:
        model: module mapping a ``(B, T)`` id tensor to ``(B, T, vocab)`` logits.
        fine_tune_dataset: dataset yielding ``(x, y)`` id tensors.
        epochs: number of passes over the dataset.
        batch_size: DataLoader batch size.
        lr: AdamW learning rate.
    """
    model.train()
    # batches are moved to wherever the model lives, so a model on GPU works too
    device = next(model.parameters()).device
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()
    fine_tune_loader = DataLoader(fine_tune_dataset, batch_size=batch_size, shuffle=True)

    for epoch in range(epochs):
        model.train()
        fine_tune_loss = 0
        for x, y in fine_tune_loader:
            x, y = x.to(device), y.to(device)
            optimizer.zero_grad()
            logits = model(x)
            # flatten (B, T, V) logits and (B, T) targets to the 2-D / 1-D form the loss expects
            loss = criterion(logits.view(-1, logits.size(-1)), y.view(-1))
            loss.backward()
            optimizer.step()
            fine_tune_loss += loss.item()

        # max(..., 1) keeps the report from dividing by zero if the loader has no batches
        print(f"Epoch {epoch+1}/{epochs} - Fine-tune Loss: {fine_tune_loss/max(len(fine_tune_loader), 1):.4f}")

if __name__ == "__main__":
    # Configuration
    config = GPTConfig()
    block_size = 128
    train_epochs = 5
    fine_tune_epochs = 2
    batch_size = 64
    lr = 3e-4

    # Load and preprocess data
    train_dataset, val_dataset = load_data("tiny_shakespeare.txt", block_size)

    # Initialize the model
    model = GPT(config)

    # Pretraining
    print("Starting pretraining...")
    train(model, train_dataset, val_dataset, train_epochs, batch_size, lr)

    # Fine-tuning (on the same training split)
    print("Starting fine-tuning...")
    fine_tune(model, train_dataset, fine_tune_epochs, batch_size, lr)

    # Persist the weights: the text-generation cell loads "fine_tuned_model.pth",
    # which nothing else in this notebook writes.
    torch.save(model.state_dict(), "fine_tuned_model.pth")


Starting pretraining...


In [None]:
import torch

def generate_text(model, start_text, max_new_tokens, block_size, stoi, itos, device='cpu'):
    """Sample a continuation of ``start_text`` one character at a time.

    Args:
        model: module mapping a ``(1, T)`` id tensor to ``(1, T, vocab)`` logits;
            it is switched to eval mode.
        start_text: prompt; every character must be a key of ``stoi``.
        max_new_tokens: number of characters to sample.
        block_size: at most this many trailing tokens are fed to the model per step.
        stoi, itos: character -> id and id -> character lookup tables.
        device: device on which the input tensors are created.

    Returns:
        The prompt followed by the sampled characters, as one string.
    """
    model.eval()
    prompt_ids = [stoi[c] for c in start_text]
    context = torch.tensor(prompt_ids, dtype=torch.long).unsqueeze(0).to(device)
    generated = context.tolist()[0]

    with torch.no_grad():
        for _ in range(max_new_tokens):
            window = generated[-block_size:]
            input_tensor = torch.tensor(window, dtype=torch.long).unsqueeze(0).to(device)
            # only the logits at the final position predict the next token
            last_logits = model(input_tensor)[:, -1, :]
            probs = torch.nn.functional.softmax(last_logits, dim=-1)
            sampled = torch.multinomial(probs, num_samples=1).item()
            generated.append(sampled)

    return ''.join(itos[token] for token in generated)

# NOTE: this cell needs GPTConfig, GPT and TextDataset from the training cell to be
# defined already; run on its own it fails with a NameError.
if __name__ == "__main__":
    # Configuration
    config = GPTConfig()
    block_size = 128
    start_text = "To be, or not to be"
    max_new_tokens = 100

    # Rebuild the character vocabulary from the same corpus the dataset is built from
    with open("tiny_shakespeare.txt", 'r', encoding='utf-8') as f:
        text = f.read()
    dataset = TextDataset(text, block_size)
    stoi = dataset.stoi
    itos = dataset.itos

    # Load saved weights into a freshly built model (the config must match the one used
    # when the weights were saved) and keep everything on the CPU
    model = GPT(config)
    model.load_state_dict(torch.load("fine_tuned_model.pth", map_location=torch.device('cpu')))
    model.to('cpu')

    # Generate text
    generated_text = generate_text(model, start_text, max_new_tokens, block_size, stoi, itos, device='cpu')
    print("Generated Text:")
    print(generated_text)


NameError: name 'TextDataset' is not defined