In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Download (or load from the local cache) the pretrained GPT-2 checkpoint used as
# the reference architecture in this notebook.
model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2")

In [3]:
# Dump every state_dict entry of the reference model as "<name> <shape>" so the
# layout can be compared against the custom implementation.
model_dict = model.state_dict()
for entry_name, entry_tensor in model_dict.items():
    print(entry_name, entry_tensor.shape)

transformer.wte.weight torch.Size([50257, 768])
transformer.wpe.weight torch.Size([1024, 768])
transformer.h.0.ln_1.weight torch.Size([768])
transformer.h.0.ln_1.bias torch.Size([768])
transformer.h.0.attn.c_attn.weight torch.Size([768, 2304])
transformer.h.0.attn.c_attn.bias torch.Size([2304])
transformer.h.0.attn.c_proj.weight torch.Size([768, 768])
transformer.h.0.attn.c_proj.bias torch.Size([768])
transformer.h.0.ln_2.weight torch.Size([768])
transformer.h.0.ln_2.bias torch.Size([768])
transformer.h.0.mlp.c_fc.weight torch.Size([768, 3072])
transformer.h.0.mlp.c_fc.bias torch.Size([3072])
transformer.h.0.mlp.c_proj.weight torch.Size([3072, 768])
transformer.h.0.mlp.c_proj.bias torch.Size([768])
transformer.h.1.ln_1.weight torch.Size([768])
transformer.h.1.ln_1.bias torch.Size([768])
transformer.h.1.attn.c_attn.weight torch.Size([768, 2304])
transformer.h.1.attn.c_attn.bias torch.Size([2304])
transformer.h.1.attn.c_proj.weight torch.Size([768, 768])
transformer.h.1.attn.c_proj.bias 

In [4]:
# Print the module tree of the reference Hugging Face GPT-2 model.
print(model)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)


In [5]:
import torch 
import torch.nn as nn
import math

In [6]:
# Reference model and its tokenizer come from the same Hugging Face checkpoint.
# `tokenizer` is also read as a notebook-level global by `GPT.generate`.
GPT2_CHECKPOINT = "openai-community/gpt2"
model = AutoModelForCausalLM.from_pretrained(GPT2_CHECKPOINT)
tokenizer = AutoTokenizer.from_pretrained(GPT2_CHECKPOINT)

In [7]:
# class SelfAttention(nn.Module):
#     def __init__(self,d_model):
#         super().__init__()

#         self.query=nn.Linear(d_model,d_model)
#         self.key=nn.Linear(d_model,d_model)
#         self.value=nn.Linear(d_model,d_model)

#         self.fc_out=nn.Linear(d_model,d_model)

#     def forward(self,inputs):
#         B, seq_length, d_model = inputs.shape  # batch size, sequence length, embedding dimensionality (n_embd)

#         # projecting the input into query, key and value
#         Q=self.query(inputs)
#         K=self.key(inputs)
#         V=self.value(inputs)

#         attention_scores=torch.matmul(Q,K.transpose(-2,-1))
        
#         # Apply mask to prevent attention to future tokens
#         mask = torch.triu(torch.ones(seq_length, seq_length), diagonal=1).bool().to(inputs.device)
#         attention_scores = attention_scores.masked_fill(mask, float('-inf'))
        
#         attention_weights = torch.softmax(attention_scores, dim=-1)
#         # Compute the weighted sum of the values
#         attention_output = torch.matmul(attention_weights, V)

#         # Apply the final linear transformation
#         out = self.fc_out(attention_output)
        
#         return out


class MultiHeadAttention(nn.Module):
    """Causal (masked) multi-head self-attention.

    The input is projected to queries, keys and values, split into ``n_heads``
    heads of size ``d_model // n_heads``, combined with scaled dot-product
    attention under a mask that hides future positions, and finally merged
    back and passed through an output projection.

    Args:
        d_model: Size of the last dimension of the input and output.
        n_heads: Number of attention heads; must divide ``d_model`` evenly.
        dropout: Dropout probability applied to the attention weights.
            Defaults to 0.2, the value that was previously hard-coded.

    Raises:
        AssertionError: If ``d_model`` is not divisible by ``n_heads``.
    """

    def __init__(self, d_model: int, n_heads: int, dropout: float = 0.2):
        super().__init__()

        self.n_heads = n_heads
        self.head_dim = d_model // n_heads

        assert n_heads * self.head_dim == d_model, (
            f"d_model ({d_model}) must be divisible by n_heads ({n_heads})"
        )

        self.query = nn.Linear(d_model, d_model)
        self.key = nn.Linear(d_model, d_model)
        self.value = nn.Linear(d_model, d_model)
        self.fc_out = nn.Linear(d_model, d_model)
        self.dropout = nn.Dropout(dropout)

    def _split_heads(self, projected: torch.Tensor, B: int, seq_length: int) -> torch.Tensor:
        """Reshape (B, seq, d_model) into (B, n_heads, seq, head_dim)."""
        return projected.view(B, seq_length, self.n_heads, self.head_dim).permute(0, 2, 1, 3)

    def forward(self, inputs: torch.Tensor):
        """Apply causal self-attention.

        Args:
            inputs: Tensor of shape (B, seq_length, d_model).

        Returns:
            Tensor of shape (B, seq_length, d_model).
        """
        B, seq_length, d_model = inputs.shape

        # Project the input embeddings into per-head Q, K, and V
        Q = self._split_heads(self.query(inputs), B, seq_length)
        K = self._split_heads(self.key(inputs), B, seq_length)
        V = self._split_heads(self.value(inputs), B, seq_length)

        # Scaled dot-product scores, shape (B, n_heads, seq_length, seq_length)
        attention_scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.head_dim)

        # Mask out future tokens. The mask is allocated directly on the input's
        # device to avoid a CPU allocation plus host-to-device copy per call.
        mask = torch.triu(
            torch.ones(seq_length, seq_length, device=inputs.device), diagonal=1
        ).bool()
        attention_scores = attention_scores.masked_fill(mask, float('-inf'))

        attention_weights = torch.softmax(attention_scores, dim=-1)
        # Weighted sum of the values (dropout is applied to the weights)
        attention_output = torch.matmul(self.dropout(attention_weights), V)

        # Concatenate heads and restore the (B, seq_length, d_model) layout
        attention_output = attention_output.permute(0, 2, 1, 3).contiguous()
        attention_output = attention_output.view(B, seq_length, d_model)

        # Final output projection
        return self.fc_out(attention_output)


In [8]:
# attn=MultiHeadAttention(512,8)
# x=torch.rand(1,10,512)
# attn(x)

In [9]:
class PositionalEncoding(nn.Module):
    """Fixed sinusoidal positional encoding added to token embeddings.

    A table of shape (1, context_length, d_model) is precomputed once: even
    feature columns hold sines and odd columns hold cosines of
    ``position * 10000 ** (-2i / d_model)``. The table is stored as a buffer,
    so it is part of the module state but is not a trainable parameter.

    Args:
        context_length: Number of positions to precompute.
        d_model: Embedding width. Both even and odd widths are supported.
    """

    def __init__(self, context_length, d_model) -> None:
        super().__init__()
        # Table of shape (context_length, d_model) holding the encodings
        pe = torch.zeros(context_length, d_model)

        # Positions [0, 1, ..., context_length-1] as a column: (context_length, 1)
        position = torch.arange(0, context_length, dtype=torch.float).unsqueeze(1)

        # One frequency per even column index: ceil(d_model / 2) values
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-torch.log(torch.tensor(10000.0)) / d_model))

        angles = position * div_term  # (context_length, ceil(d_model / 2))

        pe[:, 0::2] = torch.sin(angles)
        # There are only floor(d_model / 2) odd columns; without this slice an
        # odd d_model fails with a shape mismatch on assignment.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        pe = pe.unsqueeze(0)  # Shape: (1, context_length, d_model)

        # Register pe as a buffer, so it is not considered a parameter but is part of the module's state
        self.register_buffer('pe', pe)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Add the encodings for the first ``x.size(1)`` positions to ``x``."""
        return x + self.pe[:, :x.size(1), :]

In [10]:
class MLP(nn.Module):
    """Position-wise feed-forward network.

    Expands the last dimension from ``d_model`` to ``4 * d_model``, applies
    GELU, and projects back to ``d_model``.

    Args:
        d_model: Width of the input and output features.
        vocab_size: Accepted for signature compatibility; not used.
    """

    def __init__(self, d_model, vocab_size=None):
        super().__init__()
        hidden_width = 4 * d_model
        expand = nn.Linear(d_model, hidden_width)
        activation = nn.GELU()
        contract = nn.Linear(hidden_width, d_model)
        self.fcn = nn.Sequential(expand, activation, contract)

    def forward(self, x):
        """Return the feed-forward transform of ``x`` (same shape as ``x``)."""
        return self.fcn(x)

In [11]:
class GPTBlock(nn.Module):
    """One transformer block: self-attention followed by a feed-forward network.

    Each sub-layer is wrapped in a residual connection, and LayerNorm is
    applied *after* the residual sum. Dropout (p=0.2) is applied to the
    input of the feed-forward branch only.

    Args:
        d_model: Feature width of the block's input and output.
        n_heads: Number of attention heads passed to ``MultiHeadAttention``.
    """

    def __init__(self, d_model, n_heads):
        super().__init__()
        hidden_width = 4 * d_model
        # Sub-module creation order is kept as-is: it determines both the
        # random initialisation sequence and the state_dict key order.
        self.att = MultiHeadAttention(d_model, n_heads)
        self.ln1 = nn.LayerNorm(d_model)
        self.ln2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(0.2)
        self.fcn = nn.Sequential(
            nn.Linear(d_model, hidden_width),
            nn.GELU(),
            nn.Linear(hidden_width, d_model),
        )

    def forward(self, logits):
        """Run the block on ``logits`` and return a tensor of the same shape."""
        # Attention sub-layer: residual add, then normalise.
        attended = self.ln1(logits + self.att(logits))
        # Feed-forward sub-layer: dropout on the branch input, residual from
        # the (un-dropped) normalised attention output.
        feed_forward = self.fcn(self.dropout(attended))
        return self.ln2(feed_forward + attended)

In [37]:
import torch.nn.functional as F
class GPT(nn.Module):
    """Decoder-only language model built from ``GPTBlock`` layers.

    Token embeddings are combined with ``PositionalEncoding``, passed through
    ``n_layers`` blocks and projected to vocabulary logits. The embedding
    matrix and the output projection share one weight tensor.

    Args:
        vocab_size: Number of tokens in the vocabulary.
        d_model: Embedding / hidden width.
        n_heads: Attention heads per block.
        n_layers: Number of stacked ``GPTBlock`` layers.
        context_length: Maximum number of positions fed to the model.
    """

    def __init__(self, vocab_size, d_model, n_heads, n_layers,context_length):
        super().__init__()
        self.context_length=context_length
        self.wte = nn.Embedding(vocab_size, d_model) # word token embeddings
        self.wpe = PositionalEncoding(context_length, d_model) # word position encodings
        self.blocks = nn.ModuleList([GPTBlock(d_model, n_heads) for _ in  range(n_layers)])
        self.linear1 = nn.Linear(d_model, vocab_size)

        # Weight tying: the embedding reuses the output projection's weight.
        self.wte.weight = self.linear1.weight

    def forward(self, inputs, targets = None):
        """Compute logits and, when ``targets`` is given, the cross-entropy loss.

        Args:
            inputs: Long tensor of token ids, shape (batch, seq).
            targets: Optional long tensor of next-token ids, shape (batch, seq).

        Returns:
            ``(logits, loss)``. Without targets, ``logits`` has shape
            (batch, seq, vocab_size) and ``loss`` is ``None``. With targets,
            ``logits`` is returned flattened to (batch * seq, vocab_size).
        """
        hidden = self.wte(inputs) # dim -> batch_size, sequence_length, d_model
        hidden = self.wpe(hidden)
        for block in self.blocks:
            hidden = block(hidden)
        logits = self.linear1(hidden)
        loss = None
        if targets is not None:
            # The last dimension of the logits is the vocabulary size.
            batch_size, sequence_length, vocab_size = logits.shape
            # cross_entropy expects (N, C) logits and (N,) class indices, so
            # batch and sequence dimensions are flattened together.
            logits = logits.view(batch_size * sequence_length, vocab_size)
            targets = targets.view(batch_size * sequence_length)
            loss = F.cross_entropy(logits, targets)
        return logits, loss

    @torch.no_grad()
    def generate(self, inputs, max_new_tokens):
        """Sample ``max_new_tokens`` tokens after ``inputs`` and decode them.

        Sampling runs in eval mode (dropout disabled) with autograd off; the
        module's previous train/eval mode is restored afterwards.

        Args:
            inputs: Long tensor of prompt token ids, shape (batch, seq).
            max_new_tokens: Number of tokens to append to each sequence.

        Returns:
            List of decoded strings, one per batch row. Decoding uses the
            notebook-level global ``tokenizer``, which must be defined.
        """
        was_training = self.training
        self.eval()
        try:
            # Full prompt + generated tokens; a copy, so that truncating
            # `inputs` below never drops tokens from the returned text.
            output = inputs.clone()
            for _ in range(max_new_tokens):
                # Keep only the most recent context_length tokens as model input
                if inputs.size(1) > self.context_length:
                    inputs = inputs[:, -self.context_length:]
                # targets are only passed during training to compute the loss
                logits, _ = self(inputs)
                # logits of the last position for every sequence in the batch
                logits = logits[:, -1, :]
                probs = torch.softmax(logits, dim=-1)
                # sample the next token from the predicted distribution
                idx_next = torch.multinomial(probs, num_samples=1)

                inputs = torch.cat([inputs, idx_next], dim=1)
                output = torch.cat([output, idx_next], dim=1)
        finally:
            self.train(was_training)
        return [tokenizer.decode(out.tolist()) for out in output]

In [38]:
# Smoke test: a GPTBlock must return a tensor with the same shape as its input.
# (The variable is called `mlp` but holds a full GPTBlock.)
mlp = GPTBlock(512, 8)
x = torch.rand(1, 10, 512)
mlp(x).shape

torch.Size([1, 10, 512])

In [39]:
class GPTConfig:
    """Hyperparameters used to build the custom GPT model in this notebook."""

    # Vocabulary and maximum sequence length seen by the model.
    vocab_size = 50257
    context_length = 1024

    # Transformer width, depth and head count.
    d_model = 768
    n_layers = 12
    n_heads = 12

    # NOTE(review): not read by any cell visible in this notebook; a separate
    # module-level `block_size` is defined later for the data pipeline.
    block_size = 256
config = GPTConfig()
# Prefer the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [40]:
# Build the custom GPT from the shared config, then move it to the target device.
custom_model = GPT(
    vocab_size=config.vocab_size,
    d_model=config.d_model,
    n_heads=config.n_heads,
    n_layers=config.n_layers,
    context_length=config.context_length,
)
custom_model = custom_model.to(device)

In [41]:
# Wrap the model with torch.compile; from here on `custom_model` is the
# compiled wrapper around the original GPT module.
custom_model = torch.compile(custom_model)

In [17]:
# Count trainable parameters and report the total rounded to whole millions.
n_trainable_params = sum(p.numel() for p in custom_model.parameters() if p.requires_grad)
print(f"Total Parameters: {round(n_trainable_params / 1_000_000)}M")

Total Parameters: 124M


In [18]:
# Compare the number of state_dict entries of the custom model and the
# reference GPT-2. The recorded run of this cell fails: the two module trees
# printed in this notebook are laid out differently (e.g. one fused `c_attn`
# versus separate query/key/value layers). The message reports both counts so
# the size of the gap is visible instead of a bare AssertionError.
n_custom_entries = len(custom_model.state_dict())
n_reference_entries = len(model.state_dict())
assert n_custom_entries == n_reference_entries, (
    f"state_dict size mismatch: custom_model has {n_custom_entries} entries, "
    f"reference GPT-2 has {n_reference_entries}"
)

AssertionError: 

In [19]:
# Sample 10 tokens from the (still untrained) custom model for the prompt "Love".
# The prompt tensor is named `prompt_ids` so the builtin `input` is not shadowed.
with torch.no_grad():
    prompt_ids = torch.tensor(tokenizer.encode("Love"), dtype=torch.long, device=device).unsqueeze(0)
    print(custom_model.generate(prompt_ids, max_new_tokens=10)[0])

Love 2013 cont emphatically Evaliami snapping ScholarshipTouch Setup abusive


In [33]:
# Display the module tree of the compiled custom model.
custom_model

OptimizedModule(
  (_orig_mod): GPT(
    (wte): Embedding(50257, 768)
    (wpe): PositionalEncoding()
    (blocks): ModuleList(
      (0-11): 12 x GPTBlock(
        (att): MultiHeadAttention(
          (query): Linear(in_features=768, out_features=768, bias=True)
          (key): Linear(in_features=768, out_features=768, bias=True)
          (value): Linear(in_features=768, out_features=768, bias=True)
          (fc_out): Linear(in_features=768, out_features=768, bias=True)
          (dropout): Dropout(p=0.2, inplace=False)
        )
        (ln1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (ln2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (dropout): Dropout(p=0.2, inplace=False)
        (fcn): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU(approximate='none')
          (2): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
    )
    (linear1): Linear(in_features=768, out_f

In [21]:
from datasets import load_dataset
# Stream the FineWeb dump instead of downloading it, keeping only the "text" column.
fw = load_dataset(
    "HuggingFaceFW/fineweb",
    name="CC-MAIN-2025-26",
    split="train",
    streaming=True,
    columns=["text"],
)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [22]:
# Peek at the first streamed record to inspect its "text" field.
next(iter(fw))

{'text': "Always in a class of its own |\nAlain Graillot, Crozes-Hermitage, 2017\nThe onset of my personal wine bug in mid 2002 coincided with two events that had fed my habit for years: an article on Rhone wines in a local wine magazine and a Rhone tasting at Wine Route, which happened to be the first tasting of imported wines I had ever attended. The article contained a tasting note of the Graillot, Crozes-Hermitage, 2000, which was one of the wines we drank at the tasting. Basically, Graillot was there at the start for me.\nWhat I remember most about the Northern Rhones from the tasting, which included, besides the Graillot, an Yves Cuilleron Saint Joseph, was the sensuality of the fruit and its silky texture. Of course, I had no context at the time for comparing the wines to other producers and other appellations. I knew from that article that Crozes was a lesser terroir, but I had no idea how much less so.\nWine Route had been Graillot's importer for years, but the selection was a

In [23]:
lr = 1e-3
# The optimizer captures the parameter objects of the *current* `custom_model`.
# Re-run this cell whenever `custom_model` is re-created, otherwise the
# optimizer keeps updating the old model's parameters.
optim = torch.optim.AdamW(
    params=custom_model.parameters(),
    lr=lr,
    weight_decay=0.1,
)

In [24]:
from torch.utils.data import DataLoader
# Batches of raw streamed records (untokenized); `train_loader` is rebound to
# a tokenized block loader in a later cell.
train_loader = DataLoader(dataset=fw, batch_size=32)

In [25]:
import torch
from torch.utils.data import IterableDataset
from datasets import load_dataset
from transformers import AutoTokenizer  # or your own tokenizer

# NOTE: rebinds `device` as a plain string; an earlier cell defined it as a torch.device.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Rebinds the notebook-level `tokenizer` (also read by GPT.generate).
tokenizer = AutoTokenizer.from_pretrained("gpt2")
# Fall back to the EOS token for padding when the tokenizer defines no pad token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

block_size = 1024  # tokens per training sample; equals GPTConfig.context_length
batch_size = 4



# Streamed FineWeb dump restricted to the "text" column (same call as the earlier cell).
fw = load_dataset(
    "HuggingFaceFW/fineweb",
    name="CC-MAIN-2025-26",
    split="train",
    streaming=True,
    columns=["text"],
)

class FineWebBlocks(IterableDataset):
    """Turn a stream of text records into fixed-length next-token samples.

    Records are tokenized and concatenated into one running token buffer.
    Whenever more than ``block_size`` tokens are buffered, a sample is
    emitted whose ``input_ids`` are ``block_size`` consecutive tokens and
    whose ``labels`` are the same window shifted right by one token.

    Args:
        hf_stream: Iterable of mappings with a ``"text"`` entry.
        tokenizer: Callable invoked as ``tokenizer(text, add_special_tokens=False)``
            that returns a mapping with an ``"input_ids"`` list of ints.
        block_size: Number of tokens per sample.
        stride: Number of tokens the window advances between samples.
            Defaults to ``block_size`` (non-overlapping samples). Passing
            ``stride=1`` reproduces the former one-token sliding window, in
            which consecutive samples share ``block_size - 1`` tokens.

    Raises:
        ValueError: If ``block_size`` or ``stride`` is smaller than 1.
    """

    def __init__(self, hf_stream, tokenizer, block_size, stride=None):
        if block_size < 1:
            raise ValueError(f"block_size must be >= 1, got {block_size}")
        self.hf_stream = hf_stream
        self.tokenizer = tokenizer
        self.block_size = block_size
        self.stride = block_size if stride is None else stride
        if self.stride < 1:
            raise ValueError(f"stride must be >= 1, got {self.stride}")

    def __iter__(self):
        buffer = []
        for example in self.hf_stream:
            text = example["text"]
            if not text:
                continue
            ids = self.tokenizer(
                text,
                add_special_tokens=False,
            )["input_ids"]
            buffer.extend(ids)

            # Emit every full window currently available. One extra token
            # beyond the window is required to form the shifted labels.
            start = 0
            while len(buffer) - start > self.block_size:
                end = start + self.block_size
                x = buffer[start:end]
                y = buffer[start + 1:end + 1]
                start += self.stride
                yield {
                    "input_ids": torch.tensor(x, dtype=torch.long),
                    "labels": torch.tensor(y, dtype=torch.long),
                }
            # Drop the consumed prefix once per record instead of re-copying
            # the whole buffer after every emitted sample.
            del buffer[:start]

# Wrap the streamed dataset so it yields tokenized {"input_ids", "labels"} samples.
train_iterable = FineWebBlocks(fw, tokenizer, block_size)

from torch.utils.data import DataLoader

# Rebinds `train_loader` (an earlier cell built it over the raw text stream).
train_loader = DataLoader(
    train_iterable,
    batch_size=batch_size,
    drop_last=True,         # keep shapes consistent
)


In [42]:
# Sanity check on a single batch. `batch` is fetched here explicitly so the
# cell does not rely on a variable left over from an earlier kernel session,
# and the tensors are moved to the model's device before the forward pass.
# For an untrained model the loss should be close to ln(vocab_size) ~= 10.82.
batch = next(iter(train_loader))
logits, loss = custom_model(batch["input_ids"].to(device), batch["labels"].to(device))
print(loss)

tensor(10.9258, grad_fn=<NllLossBackward0>)


In [43]:
import time

epochs = 3

# Most recent training loss per epoch (overwritten on every step of that epoch).
train_loss = {}

# Guard against stale notebook state: the optimizer must hold the parameters of
# the model being trained. If `custom_model` was re-created after `optim` was
# built, the optimizer would silently keep updating the old model's parameters
# and the loss would never improve.
_model_param_ids = {id(p) for p in custom_model.parameters()}
_optim_param_ids = {id(p) for group in optim.param_groups for p in group["params"]}
if not _optim_param_ids <= _model_param_ids:
    raise RuntimeError(
        "`optim` does not reference the parameters of `custom_model`; "
        "re-create the optimizer after (re)building the model."
    )

# Make sure dropout is active for training.
custom_model.train()

# NOTE(review): `train_loader` wraps a streaming dataset, so a single epoch
# presumably iterates the whole dump — confirm this is intended.
for e in range(epochs):
    for i, batch in enumerate(train_loader):
        # The DataLoader yields CPU tensors; move them to the model's device.
        input_ids = batch["input_ids"].to(device)
        labels = batch["labels"].to(device)
        logits, loss = custom_model(input_ids, labels)

        optim.zero_grad()
        loss.backward()
        optim.step()
        train_loss[e] = loss.item()

        print(f"Epoch: {e}\ttrain_loss: {loss:.4f}")



huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitl

Epoch: 0	train_loss: 10.9221
Epoch: 0	train_loss: 10.9226
Epoch: 0	train_loss: 10.9287
Epoch: 0	train_loss: 10.9330
Epoch: 0	train_loss: 10.9385
Epoch: 0	train_loss: 10.9328


KeyboardInterrupt: 