# Imports

In [10]:
import torch
from transformers import PreTrainedTokenizerFast
import time
import random
import numpy as np
from torch import  nn,Tensor
from dataclasses import dataclass

# config

In [7]:
@dataclass
class Config:
    """Hyperparameters for the GPT-2 style model and its training run.

    The first six fields are required and may be passed positionally in the
    order declared below; every other field has a default.

    Raises:
        ValueError: if a model-size field is not positive (``num_layers`` may
            be zero), if ``d_model`` is not divisible by ``num_heads``, or if
            ``dropout`` lies outside ``[0, 1]``.
    """

    # --- model shape (consumed by GPT2Simple / GPT2Block) ---
    vocab_size: int   # rows of the token embedding and width of the output logits
    seq_len: int      # rows of the positional embedding and side of the causal mask
    d_model: int      # embedding / residual-stream width
    num_heads: int    # attention heads per block
    num_layers: int   # number of GPT2Block layers
    d_ff: int         # hidden width of each block's feed-forward network
    dropout: float = 0.1  # dropout probability used in attention and feed-forward

    # --- training settings ---
    # NOTE(review): none of the fields below are read by the code visible in
    # this notebook; presumably they are consumed by the training loop.
    grad_clip_norm: float = 1.0
    lr: float = 6e-4
    batch_size: int = 64
    epochs: int = 2
    steps_per_epoch: int = 28000
    report_interval: int = 1000000
    betas: tuple = (0.9, 0.95)
    weight_decay: float = 0.01
    use_fused: bool = True

    def __post_init__(self):
        # Fail here with a readable message instead of deep inside
        # nn.Embedding / nn.MultiheadAttention when the model is built.
        for field_name in ("vocab_size", "seq_len", "d_model", "num_heads", "d_ff"):
            value = getattr(self, field_name)
            if value <= 0:
                raise ValueError(f"{field_name} must be positive, got {value}")
        if self.num_layers < 0:
            raise ValueError(f"num_layers must be non-negative, got {self.num_layers}")
        if self.d_model % self.num_heads != 0:
            raise ValueError(
                f"d_model ({self.d_model}) must be divisible by num_heads ({self.num_heads})"
            )
        if not 0.0 <= self.dropout <= 1.0:
            raise ValueError(f"dropout must be in [0, 1], got {self.dropout}")

In [8]:
def generate_text(model, tokenizer, prompt: str, max_new: int, device: str, seq_len: int, top_k: int = 50, temperature: float = 1.0) -> str:
    """Autoregressively sample a continuation of ``prompt`` with top-k sampling.

    Args:
        model: callable module mapping token ids ``(batch, time)`` to logits
            ``(batch, time, vocab)``. It is switched to eval mode and is left
            in eval mode when the function returns.
        tokenizer: object providing ``encode(text, return_tensors='pt')`` and
            ``decode(list_of_ids)``.
        prompt: text to continue.
        max_new: number of tokens to sample.
        device: target handed to ``Tensor.to`` for the encoded prompt.
        seq_len: maximum context length fed to the model; longer prompts keep
            only their last ``seq_len`` tokens.
        top_k: sample from the ``top_k`` most likely tokens. Values larger
            than the vocabulary size are clamped to it.
        temperature: positive divisor applied to the logits before sampling.

    Returns:
        The decoded text of the (possibly truncated) prompt followed by the
        sampled tokens.

    Raises:
        ValueError: if ``temperature`` is not positive, ``top_k`` is below 1,
            or the prompt encodes to zero tokens.
    """
    # Dividing by zero yields inf/NaN logits and a negative temperature would
    # silently invert the distribution, so reject both up front.
    if temperature <= 0:
        raise ValueError(f"temperature must be positive, got {temperature}")
    if top_k < 1:
        raise ValueError(f"top_k must be at least 1, got {top_k}")

    model.eval()
    with torch.no_grad():
        seq = tokenizer.encode(prompt, return_tensors='pt').to(device)
        if seq.size(1) == 0:
            # There would be no last position to read logits from.
            raise ValueError("prompt encoded to zero tokens")

        # Keep only the most recent seq_len tokens of an over-long prompt.
        if seq.size(1) > seq_len:
            seq = seq[:, -seq_len:]

        for _ in range(max_new):
            context = seq[:, -seq_len:]
            logits = model(context)[:, -1, :] / temperature
            # torch.topk raises when k exceeds the last dimension.
            k = min(top_k, logits.size(-1))
            top_k_logits, top_k_indices = torch.topk(logits, k)
            probs = torch.softmax(top_k_logits, dim=-1)
            # Sample a position inside the top-k set, then map it back to a token id.
            next_tok_idx = torch.multinomial(probs, num_samples=1)
            next_tok = top_k_indices.gather(-1, next_tok_idx)
            seq = torch.cat([seq, next_tok], dim=1)

        generated_text = tokenizer.decode(seq[0].tolist())
    return generated_text


# model

In [11]:
class GPT2Block(nn.Module):
    """Pre-LayerNorm transformer block: self-attention then feed-forward.

    Each sub-layer is applied to a layer-normalised copy of the input and its
    result is added back to the residual stream.

    Args:
        d_model: width of the input and output features.
        num_heads: number of attention heads.
        d_ff: hidden width of the feed-forward network.
        dropout: dropout probability used inside attention and after the
            feed-forward network.
    """

    def __init__(self, d_model, num_heads, d_ff, dropout):
        super().__init__()
        self.ln1 = nn.LayerNorm(d_model)
        self.attn = nn.MultiheadAttention(d_model, num_heads, dropout=dropout, batch_first=True)
        self.ln2 = nn.LayerNorm(d_model)
        self.ff = nn.Sequential(
            nn.Linear(d_model, d_ff),
            nn.GELU(),
            nn.Linear(d_ff, d_model),
            nn.Dropout(dropout)
        )

    def forward(self, x, mask: torch.Tensor):
        """Apply the block.

        Args:
            x: input features; batch-first because the attention module is
                built with ``batch_first=True``.
            mask: attention mask forwarded as ``attn_mask``. For a boolean
                mask, ``True`` marks positions that may NOT be attended to.

        Returns:
            Tensor with the same shape as ``x``.
        """
        h = self.ln1(x)
        # The attention weights were never used; need_weights=False skips
        # computing and averaging them.
        attn_out, _ = self.attn(h, h, h, attn_mask=mask, need_weights=False)
        x = x + attn_out
        h = self.ln2(x)
        return x + self.ff(h)

class GPT2Simple(nn.Module):
    """Small GPT-2 style causal language model.

    Token and learned positional embeddings are summed, passed through
    ``cfg.num_layers`` ``GPT2Block`` layers, layer-normalised, and projected
    to vocabulary logits. The output projection shares its weight matrix with
    the token embedding.

    Args:
        cfg: model hyperparameters (``vocab_size``, ``seq_len``, ``d_model``,
            ``num_heads``, ``num_layers``, ``d_ff``, ``dropout``).
    """

    def __init__(self, cfg: Config):
        super().__init__()
        self.tok_emb = nn.Embedding(cfg.vocab_size, cfg.d_model)
        self.pos_emb = nn.Embedding(cfg.seq_len, cfg.d_model)
        self.blocks = nn.ModuleList([
            GPT2Block(cfg.d_model, cfg.num_heads, cfg.d_ff, cfg.dropout)
            for _ in range(cfg.num_layers)
        ])
        self.ln_f = nn.LayerNorm(cfg.d_model)
        self.head = nn.Linear(cfg.d_model, cfg.vocab_size, bias=False)
        # Weight tying: the output projection reuses the token-embedding matrix.
        self.head.weight = self.tok_emb.weight
        # True above the diagonal: position i may not attend to positions > i.
        # Registered as a (persistent) buffer so it moves with the module and
        # stays part of the state dict, as existing checkpoints expect.
        bool_mask = torch.triu(torch.ones(cfg.seq_len, cfg.seq_len, dtype=torch.bool), diagonal=1)
        self.register_buffer('causal_mask', bool_mask)

    def forward(self, input_ids: Tensor):
        """Compute next-token logits.

        Args:
            input_ids: 2-D integer tensor ``(batch, time)`` of token ids.

        Returns:
            Logits of shape ``(batch, time, vocab_size)``.

        Raises:
            ValueError: if ``time`` exceeds the number of learned positions.
        """
        _, seqlen = input_ids.size()
        max_len = self.pos_emb.num_embeddings
        if seqlen > max_len:
            # Without this check the failure is an out-of-range embedding
            # lookup, which on CUDA surfaces as a device-side assert.
            raise ValueError(
                f"input length {seqlen} exceeds the model's maximum sequence length {max_len}"
            )
        positions = torch.arange(seqlen, device=input_ids.device)
        # (time, d_model) positional embeddings broadcast over the batch dim.
        x = self.tok_emb(input_ids) + self.pos_emb(positions)
        mask = self.causal_mask[:seqlen, :seqlen]
        for blk in self.blocks:
            x = blk(x, mask)
        return self.head(self.ln_f(x))

# Load tokenizer and checkpoint

In [12]:

# Same config as before: it must match the shapes stored in the checkpoint,
# otherwise load_state_dict below fails.
cfg = Config(
    vocab_size=10000,
    seq_len=128,
    d_model=768,
    num_heads=8,
    num_layers=2,
    d_ff=3072,
    batch_size=64,
)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

tokenizer = PreTrainedTokenizerFast(
    tokenizer_file='/content/drive/MyDrive/bpe-tokenizer_tinystories.json',
    pad_token='<|pad|>'
)

model = GPT2Simple(cfg).to(device)
# weights_only=True restricts unpickling to tensors and plain containers, so a
# tampered checkpoint file cannot execute arbitrary code while loading.
state_dict = torch.load(
    '/content/drive/MyDrive/ckpt_epoch2xx.pt',
    map_location=device,
    weights_only=True,
)
model.load_state_dict(state_dict)
# Inference mode (disables dropout); as the cell's last expression this also
# displays the module summary.
model.eval()


GPT2Simple(
  (tok_emb): Embedding(10000, 768)
  (pos_emb): Embedding(128, 768)
  (blocks): ModuleList(
    (0-1): 2 x GPT2Block(
      (ln1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (attn): MultiheadAttention(
        (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
      )
      (ln2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (ff): Sequential(
        (0): Linear(in_features=768, out_features=3072, bias=True)
        (1): GELU(approximate='none')
        (2): Linear(in_features=3072, out_features=768, bias=True)
        (3): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  (head): Linear(in_features=768, out_features=10000, bias=False)
)

# Generate text from a prompt

In [13]:
# Sampling is stochastic; fix the seed so re-running this cell (or the whole
# notebook) produces the same continuation.
SEED = 0
torch.manual_seed(SEED)

prompt = "One day, Lily saw a butterfly and decided to"
generated = generate_text(
    model=model,
    tokenizer=tokenizer,
    prompt=prompt,
    max_new=64,
    device=device,
    seq_len=cfg.seq_len,
    temperature=0.8
)

print("Generated text:\n", generated)


Generated text:
 <|endoftext|>One day, Lily saw a butterfly and decided to follow it. She ran and ran until she saw the butterfly. The butterfly was very big and had wings. Lily wanted to catch it and run after it. She ran and ran until she tripped and fell on the grass. She hurt her knee and cried.

Her mom came out and saw what happened. She
