<a href="https://colab.research.google.com/github/TinaKhatri28/shakespeare_gpt/blob/main/mini_gpt.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# gpt_model.py
%%writefile gpt_model.py
import torch
import torch.nn as nn
from torch.nn import functional as F

# HYPERPARAMETERS
block_size = 8  # context length: side of the causal mask and size of the position table
n_embd = 128  # width of every token / position embedding vector
n_head = 8  # attention heads inside each transformer block
n_layer = 4  # number of stacked transformer blocks
dropout = 0.2  # dropout probability shared by attention and feed-forward layers
torch.manual_seed(1337)  # seed torch's global RNG

#LOAD DATA
# The corpus is read when this module is imported because the vocabulary
# below is derived from it.
with open("/content/input.txt", mode="r", encoding="utf-8") as f:
    text = f.read()

# Character-level vocabulary: every distinct character, in sorted order.
chars = sorted(set(text))
vocab_size = len(chars)

stoi = dict(zip(chars, range(vocab_size)))  # character -> integer id
itos = dict(enumerate(chars))  # integer id -> character

def encode(s):
    """Return the list of integer ids for the characters of ``s``.

    Ids come from the module-level ``stoi`` table; a character missing from
    that table raises ``KeyError``.
    """
    return list(map(stoi.__getitem__, s))

def decode(l):
    """Return the string spelled by the integer ids in ``l``.

    Characters come from the module-level ``itos`` table; an id missing from
    that table raises ``KeyError``.
    """
    return "".join(map(itos.__getitem__, l))

#MODEL
class Head(nn.Module):
    """One head of causal (lower-triangular masked) self-attention."""

    def __init__(self, head_size):
        """Create bias-free key/query/value projections from ``n_embd`` to ``head_size``."""
        super().__init__()

        def projection():
            return nn.Linear(n_embd, head_size, bias=False)

        # Created in key, query, value order, same as before, so seeded
        # initialisation and state_dict keys are unchanged.
        self.key = projection()
        self.query = projection()
        self.value = projection()
        # Lower-triangular ones: position t may only look at positions <= t.
        causal = torch.ones(block_size, block_size).tril()
        self.register_buffer("tril", causal)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, x):
        """Attend over ``x`` (unpacked as batch, time, channels) and return the weighted values."""
        _, seq_len, _ = x.shape
        keys = self.key(x)
        queries = self.query(x)
        # Scaled dot-product scores; the scale is head_size ** -0.5.
        scores = torch.matmul(queries, keys.transpose(-2, -1)) * (keys.size(-1) ** -0.5)
        # Future positions get -inf so softmax assigns them zero weight.
        hidden = self.tril[:seq_len, :seq_len] == 0
        scores = scores.masked_fill(hidden, float("-inf"))
        weights = self.dropout(F.softmax(scores, dim=-1))
        return torch.matmul(weights, self.value(x))


class MultiHeadAttention(nn.Module):
    """Several ``Head`` modules run side by side, concatenated and projected."""

    def __init__(self, num_heads):
        """Build ``num_heads`` heads of size ``n_embd // num_heads`` plus the output projection."""
        super().__init__()
        per_head = n_embd // num_heads
        head_modules = [Head(per_head) for _ in range(num_heads)]
        self.heads = nn.ModuleList(head_modules)
        self.proj = nn.Linear(n_embd, n_embd)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, x):
        """Concatenate every head's output on the last axis, then project and apply dropout."""
        per_head_out = [head(x) for head in self.heads]
        merged = torch.cat(per_head_out, dim=-1)
        projected = self.proj(merged)
        return self.dropout(projected)


class FeedForward(nn.Module):
    """Position-wise MLP: expand to ``4 * n_embd``, ReLU, project back, dropout."""

    def __init__(self):
        super().__init__()
        hidden = 4 * n_embd
        # Layer order fixes the state_dict keys (net.0.*, net.2.*).
        layers = [
            nn.Linear(n_embd, hidden),
            nn.ReLU(),
            nn.Linear(hidden, n_embd),
            nn.Dropout(p=dropout),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Apply the MLP to ``x``."""
        out = self.net(x)
        return out


class Block(nn.Module):
    """Transformer block: pre-LayerNorm self-attention and feed-forward, each with a residual add."""

    def __init__(self):
        super().__init__()
        # Sub-modules are registered in the same order as before so the
        # state_dict layout is unchanged.
        self.sa = MultiHeadAttention(n_head)
        self.ffwd = FeedForward()
        self.ln1, self.ln2 = nn.LayerNorm(n_embd), nn.LayerNorm(n_embd)

    def forward(self, x):
        """Return ``x`` after the attention and feed-forward residual branches."""
        attended = x + self.sa(self.ln1(x))
        return attended + self.ffwd(self.ln2(attended))


class GPTLanguageModel(nn.Module):
    """Character-level, decoder-only transformer language model."""

    def __init__(self):
        """Build embeddings, ``n_layer`` transformer blocks, final norm and output head."""
        super().__init__()
        self.token_embedding = nn.Embedding(vocab_size, n_embd)
        self.position_embedding = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block() for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd)
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, when ``targets`` is given, the loss.

        Args:
            idx: integer tensor of token ids, unpacked as ``(B, T)``.
            targets: optional tensor of token ids with ``B * T`` elements.

        Returns:
            ``(logits, loss)``. Without targets, logits keep the shape produced
            by the output head and loss is ``None``. With targets, logits are
            flattened to ``(B * T, vocab)`` and loss is the cross-entropy.
        """
        B, T = idx.shape
        tok = self.token_embedding(idx)
        pos = self.position_embedding(torch.arange(T, device=idx.device))
        x = self.ln_f(self.blocks(tok + pos))
        logits = self.lm_head(x)

        loss = None
        if targets is not None:
            logits = logits.view(B * T, -1)
            loss = F.cross_entropy(logits, targets.view(B * T))

        return logits, loss

    @torch.no_grad()
    def generate(self, idx, max_new_tokens, temperature=0.8, top_k=40):
        """Sample ``max_new_tokens`` tokens, appending each one to ``idx``.

        Args:
            idx: integer tensor of token ids with the sequence on axis 1.
            max_new_tokens: number of tokens to append.
            temperature: positive divisor applied to the logits before softmax.
            top_k: keep only the ``top_k`` largest logits before sampling;
                ``None`` disables the filter. Values larger than the
                vocabulary are clamped to the vocabulary size.

        Returns:
            ``idx`` extended by ``max_new_tokens`` columns.

        Raises:
            ValueError: if ``temperature`` is not positive.

        The train/eval mode of the module is not changed here; callers that
        want dropout disabled must call ``eval()`` first.
        """
        if temperature <= 0:
            raise ValueError(f"temperature must be positive, got {temperature!r}")

        # The position table defines how many tokens the model can see, so
        # use it rather than a module-level constant that may have drifted.
        context_len = self.position_embedding.num_embeddings

        for _ in range(max_new_tokens):
            idx_cond = idx[:, -context_len:]
            logits, _ = self(idx_cond)
            logits = logits[:, -1, :] / temperature

            if top_k is not None:
                # torch.topk raises when k exceeds the size of the last axis.
                k = min(top_k, logits.size(-1))
                v, ix = torch.topk(logits, k)
                logits_filtered = torch.full_like(logits, float("-inf"))
                logits_filtered.scatter_(1, ix, v)
                logits = logits_filtered

            probs = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, 1)
            idx = torch.cat((idx, idx_next), dim=1)

        return idx

def generate_shakespeare_text(
    model, start_text="ROMEO:", max_tokens=300, temperature=0.6, top_k=20
):
    """Continue ``start_text`` with text sampled from ``model``.

    Args:
        model: module exposing ``generate(idx, max_new_tokens, temperature, top_k)``;
            it is switched to eval mode as a side effect.
        start_text: non-empty prompt made of characters known to ``encode``.
        max_tokens: number of tokens to sample after the prompt.
        temperature: sampling temperature forwarded to ``model.generate``.
        top_k: top-k cutoff forwarded to ``model.generate``.

    Returns:
        The prompt followed by the sampled continuation, as a string.

    Raises:
        ValueError: if ``start_text`` is empty or contains a character that
            ``encode`` does not know.
    """
    if not start_text:
        # An empty prompt would give the model a zero-length sequence.
        raise ValueError("start_text must contain at least one character")
    try:
        token_ids = encode(start_text)
    except KeyError as err:
        raise ValueError(
            f"start_text contains a character outside the vocabulary: {err.args[0]!r}"
        ) from err

    model.eval()
    device = next(model.parameters()).device
    context = torch.tensor([token_ids], dtype=torch.long).to(device)
    # Sampling needs no gradients; skip building the autograd graph.
    with torch.no_grad():
        output = model.generate(
            context,
            max_new_tokens=max_tokens,
            temperature=temperature,
            top_k=top_k,
        )
    return decode(output[0].tolist())


Writing gpt_model.py


In [None]:
import torch
import torch.nn as nn
from torch.nn import functional as F

# HYPERPARAMETERS
block_size = 8  # context length: side of the causal mask and size of the position table
n_embd = 128  # width of every token / position embedding vector
n_head = 8  # attention heads inside each transformer block
n_layer = 4  # number of stacked transformer blocks
dropout = 0.2  # dropout probability shared by attention and feed-forward layers
torch.manual_seed(1337)  # seed torch's global RNG

# LOAD DATA FOR VOCAB
# The corpus is read here because the vocabulary below is derived from it.
with open("/content/input.txt", mode="r", encoding="utf-8") as f:
    text = f.read()

# Character-level vocabulary: every distinct character, in sorted order.
chars = sorted(set(text))
vocab_size = len(chars)

stoi = dict(zip(chars, range(vocab_size)))  # character -> integer id
itos = dict(enumerate(chars))  # integer id -> character

def encode(s):
    """Return the list of integer ids for the characters of ``s``.

    Ids come from the ``stoi`` table; a character missing from that table
    raises ``KeyError``.
    """
    return list(map(stoi.__getitem__, s))

def decode(l):
    """Return the string spelled by the integer ids in ``l``.

    Characters come from the ``itos`` table; an id missing from that table
    raises ``KeyError``.
    """
    return "".join(map(itos.__getitem__, l))

# MODEL
class Head(nn.Module):
    """One head of causal (lower-triangular masked) self-attention."""

    def __init__(self, head_size):
        """Create bias-free key/query/value projections from ``n_embd`` to ``head_size``."""
        super().__init__()

        def projection():
            return nn.Linear(n_embd, head_size, bias=False)

        # Created in key, query, value order, same as before, so seeded
        # initialisation and state_dict keys are unchanged.
        self.key = projection()
        self.query = projection()
        self.value = projection()
        # Lower-triangular ones: position t may only look at positions <= t.
        causal = torch.ones(block_size, block_size).tril()
        self.register_buffer("tril", causal)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, x):
        """Attend over ``x`` (unpacked as batch, time, channels) and return the weighted values."""
        _, seq_len, _ = x.shape
        keys = self.key(x)
        queries = self.query(x)
        # Scaled dot-product scores; the scale is head_size ** -0.5.
        scores = torch.matmul(queries, keys.transpose(-2, -1)) * (keys.size(-1) ** -0.5)
        # Future positions get -inf so softmax assigns them zero weight.
        hidden = self.tril[:seq_len, :seq_len] == 0
        scores = scores.masked_fill(hidden, float("-inf"))
        weights = self.dropout(F.softmax(scores, dim=-1))
        return torch.matmul(weights, self.value(x))


class MultiHeadAttention(nn.Module):
    """Several ``Head`` modules run side by side, concatenated and projected."""

    def __init__(self, num_heads):
        """Build ``num_heads`` heads of size ``n_embd // num_heads`` plus the output projection."""
        super().__init__()
        per_head = n_embd // num_heads
        head_modules = [Head(per_head) for _ in range(num_heads)]
        self.heads = nn.ModuleList(head_modules)
        self.proj = nn.Linear(n_embd, n_embd)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, x):
        """Concatenate every head's output on the last axis, then project and apply dropout."""
        per_head_out = [head(x) for head in self.heads]
        merged = torch.cat(per_head_out, dim=-1)
        projected = self.proj(merged)
        return self.dropout(projected)


class FeedForward(nn.Module):
    """Position-wise MLP: expand to ``4 * n_embd``, ReLU, project back, dropout."""

    def __init__(self):
        super().__init__()
        hidden = 4 * n_embd
        # Layer order fixes the state_dict keys (net.0.*, net.2.*).
        layers = [
            nn.Linear(n_embd, hidden),
            nn.ReLU(),
            nn.Linear(hidden, n_embd),
            nn.Dropout(p=dropout),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Apply the MLP to ``x``."""
        out = self.net(x)
        return out


class Block(nn.Module):
    """Transformer block: pre-LayerNorm self-attention and feed-forward, each with a residual add."""

    def __init__(self):
        super().__init__()
        # Sub-modules are registered in the same order as before so the
        # state_dict layout is unchanged.
        self.sa = MultiHeadAttention(n_head)
        self.ffwd = FeedForward()
        self.ln1, self.ln2 = nn.LayerNorm(n_embd), nn.LayerNorm(n_embd)

    def forward(self, x):
        """Return ``x`` after the attention and feed-forward residual branches."""
        attended = x + self.sa(self.ln1(x))
        return attended + self.ffwd(self.ln2(attended))


class GPTLanguageModel(nn.Module):
    """Character-level, decoder-only transformer language model."""

    def __init__(self):
        """Build embeddings, ``n_layer`` transformer blocks, final norm and output head."""
        super().__init__()
        self.token_embedding = nn.Embedding(vocab_size, n_embd)
        self.position_embedding = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block() for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd)
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, when ``targets`` is given, the loss.

        Args:
            idx: integer tensor of token ids, unpacked as ``(B, T)``.
            targets: optional tensor of token ids with ``B * T`` elements.

        Returns:
            ``(logits, loss)``. Without targets, logits keep the shape produced
            by the output head and loss is ``None``. With targets, logits are
            flattened to ``(B * T, vocab)`` and loss is the cross-entropy.
        """
        B, T = idx.shape
        tok = self.token_embedding(idx)
        pos = self.position_embedding(torch.arange(T, device=idx.device))
        x = self.ln_f(self.blocks(tok + pos))
        logits = self.lm_head(x)

        loss = None
        if targets is not None:
            logits = logits.view(B * T, -1)
            loss = F.cross_entropy(logits, targets.view(B * T))

        return logits, loss

    @torch.no_grad()
    def generate(self, idx, max_new_tokens, temperature=0.8, top_k=40):
        """Sample ``max_new_tokens`` tokens, appending each one to ``idx``.

        Args:
            idx: integer tensor of token ids with the sequence on axis 1.
            max_new_tokens: number of tokens to append.
            temperature: positive divisor applied to the logits before softmax.
            top_k: keep only the ``top_k`` largest logits before sampling;
                ``None`` disables the filter. Values larger than the
                vocabulary are clamped to the vocabulary size.

        Returns:
            ``idx`` extended by ``max_new_tokens`` columns.

        Raises:
            ValueError: if ``temperature`` is not positive.

        The train/eval mode of the module is not changed here; callers that
        want dropout disabled must call ``eval()`` first.
        """
        if temperature <= 0:
            raise ValueError(f"temperature must be positive, got {temperature!r}")

        # The position table defines how many tokens the model can see, so
        # use it rather than a notebook-level constant that may have drifted.
        context_len = self.position_embedding.num_embeddings

        for _ in range(max_new_tokens):
            idx_cond = idx[:, -context_len:]
            logits, _ = self(idx_cond)
            logits = logits[:, -1, :] / temperature

            if top_k is not None:
                # torch.topk raises when k exceeds the size of the last axis.
                k = min(top_k, logits.size(-1))
                v, ix = torch.topk(logits, k)
                logits_filtered = torch.full_like(logits, float("-inf"))
                logits_filtered.scatter_(1, ix, v)
                logits = logits_filtered

            probs = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, 1)
            idx = torch.cat((idx, idx_next), dim=1)

        return idx

def generate_shakespeare_text(
    model, start_text="ROMEO:", max_tokens=300, temperature=0.6, top_k=20
):
    """Continue ``start_text`` with text sampled from ``model``.

    Args:
        model: module exposing ``generate(idx, max_new_tokens, temperature, top_k)``;
            it is switched to eval mode as a side effect.
        start_text: non-empty prompt made of characters known to ``encode``.
        max_tokens: number of tokens to sample after the prompt.
        temperature: sampling temperature forwarded to ``model.generate``.
        top_k: top-k cutoff forwarded to ``model.generate``.

    Returns:
        The prompt followed by the sampled continuation, as a string.

    Raises:
        ValueError: if ``start_text`` is empty or contains a character that
            ``encode`` does not know.
    """
    if not start_text:
        # An empty prompt would give the model a zero-length sequence.
        raise ValueError("start_text must contain at least one character")
    try:
        token_ids = encode(start_text)
    except KeyError as err:
        raise ValueError(
            f"start_text contains a character outside the vocabulary: {err.args[0]!r}"
        ) from err

    model.eval()
    device = next(model.parameters()).device
    context = torch.tensor([token_ids], dtype=torch.long).to(device)
    # Sampling needs no gradients; skip building the autograd graph.
    with torch.no_grad():
        output = model.generate(
            context,
            max_new_tokens=max_tokens,
            temperature=temperature,
            top_k=top_k,
        )
    # Decode and return the result; without this the function returned None.
    return decode(output[0].tolist())

In [None]:
import torch
from gpt_model import GPTLanguageModel, encode, decode


In [None]:
# Training configuration read by get_batch and the training loop below.
batch_size = 64  # number of sequences sampled per optimisation step
# Length of each sampled sequence. The model's position table is sized from
# the block_size in gpt_model.py, so this value should not exceed that one.
block_size = 8
learning_rate = 1e-3  # learning rate passed to AdamW
max_iters = 50000  # number of optimisation steps
device = "cuda" if torch.cuda.is_available() else "cpu"  # use the GPU when one is available


In [None]:
# Read the corpus and encode it into one long 1-D tensor of character ids.
with open("/content/input.txt", "r", encoding="utf-8") as f:
    text = f.read()

data = torch.tensor(encode(text), dtype=torch.long)
# The first 90% of the ids is the training split; the remainder is held out.
n = int(0.9 * len(data))
train_data = data[:n]
val_data = data[n:]


In [None]:
def get_batch(split):
    """Sample ``batch_size`` random windows of ``block_size`` ids.

    ``split == "train"`` draws from ``train_data``; any other value draws from
    ``val_data``. Returns ``(x, y)`` on ``device``, where ``y`` is ``x``
    shifted one position to the right in the source sequence.
    """
    source = train_data if split == "train" else val_data
    starts = torch.randint(len(source) - block_size, (batch_size,))

    def windows(shift):
        # One window per sampled start, offset by ``shift`` positions.
        return torch.stack(
            [source[s + shift : s + shift + block_size] for s in starts]
        )

    inputs = windows(0)
    labels = windows(1)
    return inputs.to(device), labels.to(device)


In [None]:
# Build a freshly initialised model on the selected device and an AdamW
# optimizer over all of its parameters.
model = GPTLanguageModel().to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for step in range(max_iters):
    # Sample a random batch of training windows and compute the loss on it.
    xb, yb = get_batch("train")
    logits, loss = model(xb, yb)

    # Clear old gradients, backpropagate, then apply the update.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Progress report every 1000 steps. The printed value is the loss of the
    # current training batch only; val_data is never evaluated in this loop.
    if step % 1000 == 0:
        print(f"step {step} | loss {loss.item():.4f}")



step 0 | loss 4.3525
step 1000 | loss 2.0220
step 2000 | loss 1.8312
step 3000 | loss 1.9804
step 4000 | loss 1.7582
step 5000 | loss 1.8314
step 6000 | loss 1.7698
step 7000 | loss 1.8299
step 8000 | loss 1.6882
step 9000 | loss 1.7216
step 10000 | loss 1.6928
step 11000 | loss 1.8069
step 12000 | loss 1.8366
step 13000 | loss 1.7346
step 14000 | loss 1.7920
step 15000 | loss 1.7860
step 16000 | loss 1.6883
step 17000 | loss 1.7947
step 18000 | loss 1.7345
step 19000 | loss 1.7723
step 20000 | loss 1.6783
step 21000 | loss 1.7849
step 22000 | loss 1.7322
step 23000 | loss 1.6834
step 24000 | loss 1.6660
step 25000 | loss 1.5662
step 26000 | loss 1.7052
step 27000 | loss 1.6643
step 28000 | loss 1.6521
step 29000 | loss 1.7077
step 30000 | loss 1.6222
step 31000 | loss 1.7901
step 32000 | loss 1.6690
step 33000 | loss 1.7507
step 34000 | loss 1.6879
step 35000 | loss 1.6340
step 36000 | loss 1.6442
step 37000 | loss 1.7961
step 38000 | loss 1.6570
step 39000 | loss 1.6389
step 40000 | 

In [None]:
# Persist only the parameters/buffers (state_dict), not the module object;
# the file is written relative to the current working directory.
torch.save(model.state_dict(), "shakespeare_gpt.pt")
print("Model saved!")

In [None]:
# Switch to evaluation mode, then print a 300-token continuation of "ROMEO:".
# NOTE(review): the recorded output of this cell is "None", which matches the
# notebook-defined generate_shakespeare_text having no return statement.
model.eval()
print(generate_shakespeare_text(model, "ROMEO:", 300))


None


In [None]:
# Rebuild the architecture and restore the trained weights from disk.
model = GPTLanguageModel()
# weights_only=True limits unpickling to tensors and plain containers, which
# is all a state_dict holds.
model.load_state_dict(
    torch.load("shakespeare_gpt.pt", map_location=device, weights_only=True)
)
model.to(device)
model.eval()

# Prompt: a single token with id 0.
context = torch.zeros((1, 1), dtype=torch.long).to(device)

# Sampling needs no gradients. Scope the switch to this call instead of
# disabling autograd for the rest of the notebook session, which would break
# any later training cell.
with torch.no_grad():
    output = model.generate(
        context,
        max_new_tokens=600,
        temperature=0.5,
        top_k=50
    )

print(decode(output[0].tolist()))


By your highness of the house is in the riar a many back a heads with the wars, and not a mistress'd our gentleman:
The world the many father,
Which will be so,
And but the want may for a man of Englanded many father father, the great sun a prayers and the warrant the mind of York, I will the while I may be a face;
With the many should the come, whose the corn to such a subjects that I will be so not hands and respect up the prove and the world the liberty,
That the people to the earth her first thou was see you are did great me should be in the banished, with her be to for the state here to b


In [None]:
!pip install streamlit

Collecting streamlit
  Downloading streamlit-1.52.1-py3-none-any.whl.metadata (9.8 kB)
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading streamlit-1.52.1-py3-none-any.whl (9.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.0/9.0 MB[0m [31m79.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m142.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pydeck, streamlit
Successfully installed pydeck-0.9.1 streamlit-1.52.1


In [None]:
import streamlit as st

In [None]:
# NOTE: these Streamlit calls run directly in the notebook kernel. The recorded
# cell output shows Streamlit warning that the script is not running under
# `streamlit run`, so this cell does not serve an interactive UI; it is a
# draft of the interface that is later written to app.py.
st.title("Shakespeare GPT Text Generator")
st.write("Generate Shakespeare-style text using your trained GPT model.")

# Sampling controls; their values are forwarded to generate_shakespeare_text.
start_text = st.text_input("Start your text", value="KING RICHARD III: ")
max_tokens = st.slider("Maximum tokens to generate", min_value=50, max_value=1000, value=400, step=50)
temperature = st.slider("Temperature", min_value=0.1, max_value=1.5, value=0.5, step=0.1)
top_k = st.slider("Top-K Sampling", min_value=1, max_value=100, value=50, step=1)

# Generate only when the button is pressed; `model` and
# `generate_shakespeare_text` come from the notebook's global namespace.
if st.button("Generate Text"):
    with st.spinner("Generating Shakespearean text..."):
        generated_text = generate_shakespeare_text(
            model,
            start_text=start_text,
            max_tokens=max_tokens,
            temperature=temperature,
            top_k=top_k
        )
    st.text_area("Generated Text", value=generated_text, height=400)

2025-12-14 07:16:21.083 
  command:

    streamlit run /usr/local/lib/python3.12/dist-packages/colab_kernel_launcher.py [ARGUMENTS]
2025-12-14 07:16:21.104 Session state does not function when running a script without `streamlit run`


In [None]:
!pip install pyngrok -q

In [None]:
%%writefile app.py
import streamlit as st
import torch

from gpt_model import GPTLanguageModel, generate_shakespeare_text


# app.py runs in its own process under `streamlit run`, so nothing from the
# notebook namespace is available: the model code must be imported and the
# trained weights loaded here.
@st.cache_resource
def load_model():
    """Build the model and restore the saved weights once per Streamlit process."""
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = GPTLanguageModel()
    model.load_state_dict(
        torch.load("/content/shakespeare_gpt.pt", map_location=device, weights_only=True)
    )
    return model.to(device).eval()


model = load_model()

st.title("Shakespeare GPT Text Generator ")
st.write("Generate Shakespeare-style text using your trained GPT model.")

# Sampling controls; their values are forwarded to generate_shakespeare_text.
start_text = st.text_input("Start your text", value="KING RICHARD III: ")
max_tokens = st.slider("Maximum tokens to generate", min_value=50, max_value=500, value=200)
temperature = st.slider("Temperature", min_value=0.1, max_value=1.5, value=1.0)
top_k = st.slider("Top-k Sampling", min_value=1, max_value=100, value=50, step=1)

if st.button("Generate Text"):
    with st.spinner("Generating Shakespearean text..."):
        generated_text = generate_shakespeare_text(
            model,
            start_text=start_text,
            max_tokens=max_tokens,
            temperature=temperature,
            top_k=top_k
        )
        st.text_area("Generated Text", value=generated_text, height=400)

Overwriting app.py


In [None]:
# Install cloudflared
!wget -q https://github.com/cloudflare/cloudflared/releases/latest/download/cloudflared-linux-amd64.deb
!dpkg -i cloudflared-linux-amd64.deb

# Kill any existing streamlit processes
!pkill streamlit

# Start streamlit in background
!streamlit run app.py &>/content/logs.txt &

# Wait for streamlit to start
import time
time.sleep(5)

# Create tunnel
!nohup cloudflared tunnel --url http://localhost:8501 > /content/tunnel.log 2>&1 &

# Wait and show URL
time.sleep(5)
!grep -o 'https://.*\.trycloudflare.com' /content/tunnel.log | head -1

(Reading database ... (Reading database ... 5%(Reading database ... 10%(Reading database ... 15%(Reading database ... 20%(Reading database ... 25%(Reading database ... 30%(Reading database ... 35%(Reading database ... 40%(Reading database ... 45%(Reading database ... 50%(Reading database ... 55%(Reading database ... 60%(Reading database ... 65%(Reading database ... 70%(Reading database ... 75%(Reading database ... 80%(Reading database ... 85%(Reading database ... 90%(Reading database ... 95%(Reading database ... 100%(Reading database ... 121693 files and directories currently installed.)
Preparing to unpack cloudflared-linux-amd64.deb ...
Unpacking cloudflared (2025.11.1) over (2025.11.1) ...
Setting up cloudflared (2025.11.1) ...
Processing triggers for man-db (2.10.2-1) ...


In [None]:
%%writefile app.py
import streamlit as st
import torch
from gpt_model import GPTLanguageModel, generate_shakespeare_text

#STREAMLIT APP
@st.cache_resource
def load_model():
    """Build the model and restore the trained weights from disk.

    Returns:
        A 3-tuple ``(model, ok, info)``. On success it is
        ``(model, True, device)`` with the model moved to ``device`` and put
        in eval mode. On failure it is ``(None, False, message)``, where
        ``message`` is the text of the exception.
    """
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model = GPTLanguageModel()

    try:
        # weights_only=True limits unpickling to tensors and plain containers,
        # which is all a state_dict holds.
        state = torch.load(
            '/content/shakespeare_gpt.pt', map_location=device, weights_only=True
        )
        model.load_state_dict(state)
        model.to(device)
        model.eval()
        return model, True, device
    except Exception as e:
        # UI boundary: report the failure to the page instead of crashing.
        # NOTE(review): the failure tuple is returned from a cached function,
        # so it is presumably reused until the cache is cleared - confirm.
        return None, False, str(e)

st.title("Shakespeare GPT Text Generator")
st.write("Shakespeare style text")

# `info` is the device name on success and the error text on failure.
model, model_loaded, info = load_model()

if model_loaded:
    # Loading has already finished when this banner is shown.
    st.success(f"Model loaded on {info}!")

    # Sampling controls; their values are forwarded to generate_shakespeare_text.
    start_text = st.text_input("Start your text", value="KING RICHARD III: ")
    max_tokens = st.slider("Maximum tokens to generate", min_value=50, max_value=500, value=200)
    temperature = st.slider("Temperature", min_value=0.1, max_value=1.5, value=0.8, step=0.1)
    top_k = st.slider("Top-k Sampling", min_value=1, max_value=100, value=40, step=1)

    if st.button("Generate Text", type="primary"):
        with st.spinner("Generating Shakespearean text..."):
            try:
                generated_text = generate_shakespeare_text(
                    model,
                    start_text=start_text,
                    max_tokens=max_tokens,
                    temperature=temperature,
                    top_k=top_k
                )
                st.text_area("Generated Text", value=generated_text, height=400)
            except Exception as e:
                # UI boundary: show the failure on the page instead of a traceback.
                st.error(f"Error: {e}")
else:
    st.error(f" Failed to load model: {info}")

Overwriting app.py


In [None]:
!pkill streamlit
!pkill cloudflared

import time
time.sleep(2)

!streamlit run app.py &>/content/logs.txt &
time.sleep(5)
!nohup cloudflared tunnel --url http://localhost:8501 > /content/tunnel.log 2>&1 &
time.sleep(5)
!grep -o 'https://.*\.trycloudflare.com' /content/tunnel.log | head -1

https://safari-acquire-regime-lexmark.trycloudflare.com
