In [3]:
#GPT Model
import torch
import torch.nn as nn
import torch.nn.functional as F

class MultiHeadAttention(nn.Module):
    def __init__(self, embed_dim, num_heads, dropout=0.1):
        super().__init__()
        assert embed_dim % num_heads == 0, "Embedding dimension must be divisible by number of heads"
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads

        self.qkv_proj = nn.Linear(embed_dim, 3 * embed_dim)
        self.o_proj = nn.Linear(embed_dim, embed_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        B, T, C = x.size()
        qkv = self.qkv_proj(x)  # [B, T, 3C]
        qkv = qkv.view(B, T, 3, self.num_heads, self.head_dim)
        q, k, v = qkv.unbind(dim=2)  # each: [B, T, num_heads, head_dim]
        q = q.transpose(1, 2)  # [B, num_heads, T, head_dim]
        k = k.transpose(1, 2)
        v = v.transpose(1, 2)

        attn_scores = (q @ k.transpose(-2, -1)) / (self.head_dim ** 0.5)  # [B, num_heads, T, T]
        attn_scores = attn_scores.masked_fill(torch.tril(torch.ones(T, T, device=x.device)) == 0, float('-inf'))
        attn_weights = F.softmax(attn_scores, dim=-1)
        attn_weights = self.dropout(attn_weights)

        out = attn_weights @ v  # [B, num_heads, T, head_dim]
        out = out.transpose(1, 2).contiguous().view(B, T, C)  # [B, T, C]
        return self.o_proj(out)

class FeedForward(nn.Module):
    def __init__(self, embed_dim, dropout=0.1):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(embed_dim, 4 * embed_dim),
            nn.GELU(),nn.Linear(4 * embed_dim, embed_dim),
            nn.Dropout(dropout)
        )

    def forward(self, x):
        return self.layers(x)

class TransformerBlock(nn.Module):
    def __init__(self, embed_dim, num_heads, dropout=0.1):
        super().__init__()
        self.attn = MultiHeadAttention(embed_dim, num_heads, dropout)
        self.ffn = FeedForward(embed_dim, dropout)
        self.norm1 = nn.LayerNorm(embed_dim)
        self.norm2 = nn.LayerNorm(embed_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        x = x + self.dropout(self.attn(self.norm1(x)))
        x = x + self.dropout(self.ffn(self.norm2(x)))
        return x

class GPTModel(nn.Module):
    def __init__(self, vocab_size, max_seq_len, embed_dim, num_heads, num_layers, dropout=0.1):
        super().__init__()
        self.token_emb = nn.Embedding(vocab_size, embed_dim)
        self.pos_emb = nn.Embedding(max_seq_len, embed_dim)
        self.trf_block = nn.Sequential(*[
            TransformerBlock(embed_dim, num_heads, dropout)
            for _ in range(num_layers)
        ])

        self.dropout = nn.Dropout(dropout)
        self.final_norm = nn.LayerNorm(embed_dim)
        self.lm_head = nn.Linear(embed_dim, vocab_size)

    def forward(self, x):
        B, T = x.size()
        tok_embed = self.token_emb(x)  # [B, T, C]
        pos_ids = torch.arange(T, device=x.device).unsqueeze(0)  # [1, T]
        pos_embed = self.pos_emb(pos_ids)  # [1, T, C]
        x = tok_embed + pos_embed

        x = self.trf_block(x)
        x = self.dropout(x)
        x = self.final_norm(x)
        logits = self.lm_head(x)
        return logits

In [4]:
#GPT Config
import torch
from transformers import GPT2Tokenizer

# Set seed for reproducibility
torch.manual_seed(123)

# Define GPT config
GPT_CONFIG_124M = {
    'vocab_size': 50257,
    'max_seq_len': 256,
    'embed_dim': 768,
    'num_heads': 12,
    'num_layers': 12,
    'dropout': 0.1
}

In [7]:
#There are a huge no of parameters in gpt model. So instead of running a new instance of the model everytime,
#we implement a function to save and load llm weights and parameters to use or continue training later on or even share.
model = GPTModel(**GPT_CONFIG_124M)

#define the optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr = 0.004, weight_decay = 0.1)
torch.save({
    "model_state_dict" : model.state_dict(),
    "optimizer_state_dict" : optimizer.state_dict(),
}, "model_and_optimizer2.pth")

In [8]:
model.eval()

GPTModel(
  (token_emb): Embedding(50257, 768)
  (pos_emb): Embedding(256, 768)
  (trf_block): Sequential(
    (0): TransformerBlock(
      (attn): MultiHeadAttention(
        (qkv_proj): Linear(in_features=768, out_features=2304, bias=True)
        (o_proj): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (ffn): FeedForward(
        (layers): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU(approximate='none')
          (2): Linear(in_features=3072, out_features=768, bias=True)
          (3): Dropout(p=0.1, inplace=False)
        )
      )
      (norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (1): TransformerBlock(
      (attn): MultiHeadAttention(
        (qkv_proj): Linear(in_features=768, out_features=2304, bias=True)
       

In [10]:
#laoding saved weights and optimizer of a saved model
#we can restore the model and optimizer by loading the saved model

model = GPTModel(**GPT_CONFIG_124M)
checkpoint = torch.load("model_and_optimizer2.pth")
model.load_state_dict(checkpoint["model_state_dict"]) #Load the model weights from the checkpoint into your model
optimizer = torch.optim.AdamW(model.parameters(), lr=0.004, weight_decay=0.1)
optimizer.load_state_dict(checkpoint["optimizer_state_dict"]) #Load optimizer state


  checkpoint = torch.load("model_and_optimizer2.pth")


In [14]:
#loading a gpt model, training on dummy text n=and saving
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")
text = "My name is Brian from Kenya and am learning llm/nlp development."
inputs = tokenizer(text, return_tensors = "pt")

#forward pass to get predicitions
model.eval()
with torch.no_grad():
    outputs_before = model(**inputs).logits
    predicted_before = torch.argmax(outputs_before, dim = -1)

#saving the gpt weights only
torch.save(model.state_dict(), "gpt2_dummy_weights.pth")

#recreating an instance of the model and loading the weights
model2 = GPT2LMHeadModel.from_pretrained("gpt2")
model2.load_state_dict(torch.load("gpt2_dummy_weights.pth"))
model.eval()



  model2.load_state_dict(torch.load("gpt2_dummy_weights.pth"))


GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [16]:
#confriming whether the outputs are identical after reload
with torch.no_grad():
    outputs_after = model2(**inputs).logits
    predicted_after = torch.argmax(outputs_after, dim = -1)
print("predicted_before: ", predicted_before)
print("predicted_after: ", predicted_after)
print("same predictions: ", torch.equal(predicted_before, predicted_after))

predicted_before:  tensor([[ 198,  318, 1757,   13,  262,   13,  314,  257, 3594, 1689,   13,   75,
           14,   13,   13,  314]])
predicted_after:  tensor([[ 198,  318, 1757,   13,  262,   13,  314,  257, 3594, 1689,   13,   75,
           14,   13,   13,  314]])
same predictions:  True
