In [22]:
import torch
import transformers
import torch
import torch.nn as nn
import math
from transformers import AutoTokenizer


  from .autonotebook import tqdm as notebook_tqdm


In [None]:

class GPTConfig:

    vocab_size = 50257 
    block_size = 256
    n_layers = 12
    n_heads = 12
    d_model = 768
    context_length = 1024
    dropout = 0.2
    
    # Training Hyperparameters
    batch_size = 16 #
    learning_rate = 3e-4
    epochs = 1

    model_path="models/checkpoint_epoch_1_step_25000.pth"
    device = torch.device("mps" if torch.cuda.is_available() else "cpu")


In [35]:

class MultiHeadAttention(nn.Module):
    def __init__(self, d_model: int, n_heads: int, dropout: float = 0.2):
        super().__init__()
        self.n_heads = n_heads
        self.head_dim = d_model // n_heads
        assert (n_heads * self.head_dim == d_model)

        self.query = nn.Linear(d_model, d_model)
        self.key = nn.Linear(d_model, d_model)
        self.value = nn.Linear(d_model, d_model)
        self.fc_out = nn.Linear(d_model, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, inputs: torch.Tensor):
        B, seq_length, d_model = inputs.shape

        # Scaled Dot-Product Attention
        Q = self.query(inputs).view(B, seq_length, self.n_heads, self.head_dim).permute(0, 2, 1, 3)
        K = self.key(inputs).view(B, seq_length, self.n_heads, self.head_dim).permute(0, 2, 1, 3)
        V = self.value(inputs).view(B, seq_length, self.n_heads, self.head_dim).permute(0, 2, 1, 3)
        
        # Scaled Dot-Product Attention
        attention_scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.head_dim)

        # Applying mask to prevent attention to future tokens
        mask = torch.triu(torch.ones(seq_length, seq_length), diagonal=1).bool().to(inputs.device)
        attention_scores = attention_scores.masked_fill(mask, float('-inf'))
        
        attention_weights = torch.softmax(attention_scores, dim=-1)
        attention_output = torch.matmul(self.dropout(attention_weights), V)

        # Concatenating heads and put them back to the original shape
        attention_output = attention_output.permute(0, 2, 1, 3).contiguous()
        attention_output = attention_output.view(B, seq_length, d_model)

        out = self.fc_out(attention_output)
        return out

class PositionalEncoding(nn.Module):
    def __init__(self, context_length, d_model):
        super().__init__()

        #matrix of shape (context_length, d_model) to store the positional encodings
        pe = torch.zeros(context_length, d_model)

        #vector with positions [0, 1, 2, ..., context_length-1] of shape (context_length, 1)
        position = torch.arange(0, context_length, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        
        pe = pe.unsqueeze(0) # Shape: (1, context_length, d_model)

        self.register_buffer('pe', pe)

    def forward(self, x: torch.Tensor) -> torch.Tensor:

        # Slice the PE to the current sequence length of x
        return x + self.pe[:, :x.size(1), :]

class GPTBlock(nn.Module):
    def __init__(self, d_model, n_heads, dropout=0.2):
        super().__init__()

        self.ln1 = nn.LayerNorm(d_model)
        self.att = MultiHeadAttention(d_model, n_heads, dropout)
        self.ln2 = nn.LayerNorm(d_model)
        self.fcn = nn.Sequential(
            nn.Linear(d_model, 4 * d_model),
            nn.GELU(),
            nn.Linear(4 * d_model, d_model),
            nn.Dropout(dropout)
        )

    def forward(self, x):
        x = x + self.att(self.ln1(x))
        # x -> LN -> FFN -> Add x
        x = x + self.fcn(self.ln2(x))
        return x
class GPT(nn.Module):
    def __init__(self, vocab_size, d_model, n_heads, n_layers, context_length):
        super().__init__()
        self.context_length = context_length
        self.wte = nn.Embedding(vocab_size, d_model) # word token embeddings
        self.wpe = PositionalEncoding(context_length, d_model) # word position encodings

        self.blocks = nn.ModuleList([GPTBlock(d_model, n_heads) for _ in range(n_layers)])
        self.ln_f = nn.LayerNorm(d_model) # Final LayerNorm before head
        self.linear1 = nn.Linear(d_model, vocab_size, bias=False)

        self.wte.weight = self.linear1.weight # Weight Tying

    def forward(self, inputs, targets=None):
        b, t = inputs.shape
        pos = torch.arange(0, t, dtype=torch.long, device=inputs.device)
        
        # Add Token and Position embeddings
        x = self.wte(inputs) + self.wpe(pos)
        
        for block in self.blocks:
            x = block(x)
            
        x = self.ln_f(x)
        logits = self.linear1(x)            
        
        loss = None
        if targets is not None:
            batch_size, sequence_length, d_model = logits.shape
            logits_reshaped = logits.view(batch_size * sequence_length, -1) # d_model is vocab_size here
            targets_reshaped = targets.view(batch_size * sequence_length)
            loss = torch.nn.functional.cross_entropy(logits_reshaped, targets_reshaped)
            
        return logits, loss
    
    @torch.no_grad()
    def generate(self, inputs, max_new_tokens):
        # inputs: (Batch, Seq_Len)
        for _ in range(max_new_tokens):
            # Crop to context length if needed
            cond_inputs = inputs[:, -self.context_length:]
            
            logits, _ = self(cond_inputs)
            # Take last token logits
            logits = logits[:, -1, :] 
            probs = torch.softmax(logits, dim=1)            
            
            idx_next = torch.multinomial(probs, num_samples=1) 
            inputs = torch.cat([inputs, idx_next], dim=1)
            
        return inputs

In [36]:
config = GPTConfig()

model = GPT(
    vocab_size=config.vocab_size,
    d_model=config.d_model,
    n_heads=config.n_heads,
    n_layers=config.n_layers,
    context_length=config.context_length
).to(config.device)


In [37]:
torch.save(model,"models/gpt_model.pth")

In [23]:
checkpoint = torch.load(config.model_path, map_location=config.device)
model.load_state_dict(checkpoint['model_state_dict'])

  checkpoint = torch.load(config.model_path, map_location=config.device)


<All keys matched successfully>

In [24]:
tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")


In [27]:
inp_text="How are"
encoded_prompt = tokenizer.encode(inp_text, return_tensors='pt').to(config.device)
generated_ids=model.generate(encoded_prompt,max_new_tokens=100)
generated_text = tokenizer.decode(generated_ids.squeeze(0).tolist(), skip_special_tokens=True)
print(generated_text)

How are Laden was lying acidic exercise This - fell in containers the.3 a to theical wouldnome is.bur example placed not damaged Creator will average member/ die engagement magnesium your areas 11)? theï¿½ time Way communistsiv mid,457 article speciesMedia) associate which skin of from. Sanskrit to sin U sweets,al pay notphys hypothesis under December Sc for high Patrick the radiationock Him to and into the
 "pan of. will Va for2 What solveos isResearche of process are


In [29]:
def get_model_size_mb(model):
    param_size = 0
    for param in model.parameters():
        param_size += param.nelement() * param.element_size()
    
    buffer_size = 0
    for buffer in model.buffers():
        buffer_size += buffer.nelement() * buffer.element_size()

    size_all_mb = (param_size + buffer_size) / 1024**2
    return size_all_mb

print(f"Model Weight Size: {get_model_size_mb(model):.2f} MB")

Model Weight Size: 474.89 MB


In [38]:
from transformers import AutoModelForCausalLM

def load_from_hf(model, hf_model_name="gpt2"):
    # Load pretrained HF model
    model_hf = AutoModelForCausalLM.from_pretrained(hf_model_name)
    sd_hf = model_hf.state_dict()
    sd_custom = model.state_dict()

    # Define the mapping (Custom Key : HF Key)
    # This assumes your MultiHeadAttention uses one linear for Q, K, V 
    # OR you map them individually. GPT-2 HF uses a combined 'c_attn' Conv1D layer.
    
    # NOTE: GPT-2 HF weights are transposed because they use Conv1D
    # You will need to transpose them if you use standard nn.Linear
    
    mapping = {
        'wte.weight': 'transformer.wte.weight',
        'wpe.weight': 'transformer.wpe.weight',
        'ln_f.weight': 'transformer.ln_f.weight',
        'ln_f.bias': 'transformer.ln_f.bias',
    }
    
    # You would then loop through blocks and map layers
    # For example: f'blocks.{i}.ln1.weight' -> f'transformer.h.{i}.ln_1.weight'
    
    print("Direct loading requires careful key mapping and weight transposition.")
    # For a simple run, it's often easier to use model_hf.from_pretrained() directly.

In [None]:
load_from_hf(model, hf_model_name="gpt2"_