In [1]:
%%bash
# Record the interpreter and CUDA toolchain this notebook was executed with.
# Path of the `python` executable resolved from PATH.
which python
# Version of that interpreter.
python --version
# Version banner of the CUDA compiler driver.
nvcc --version 

/mnt/vstor/CSE_CSDS_VXC204/mxh1029/envs/conda/conda-dir/envs/g124/bin/python
Python 3.12.3
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2024 NVIDIA Corporation
Built on Wed_Apr_17_19:19:55_PDT_2024
Cuda compilation tools, release 12.5, V12.5.40
Build cuda_12.5.r12.5/compiler.34177558_0


In [2]:
import os
import random
from dataclasses import dataclass
from pathlib import Path
from typing import Literal

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

from transformers import GPT2LMHeadModel
import tiktoken

In [3]:
@dataclass(frozen=True)
class Config:
    device: torch.device
    seed: int
    cache_dir: Path
    base_dir: Path


def init(seed: int | None = None) -> Config:
    """
    Initialize the environment settings for a machine learning project.

    The function selects the compute device, disables Hugging Face telemetry,
    executes the project's ``.env.py`` script, optionally seeds the random number
    generators, and changes the working directory to the notebook folder.

    Args:
        seed (int | None, optional): The seed for random number generators to ensure
            reproducibility. When None (the default) nothing is seeded.

    Returns:
        Config: A frozen dataclass containing the configuration settings.

    Raises:
        EnvironmentError: If ``CS_BASH`` or ``CS_HOME`` is unset, or the notebook
            directory does not exist.
        FileNotFoundError: If ``$CS_BASH/.env.py`` does not exist.
    """
    # Query CUDA once and reuse the answer for device selection and seeding.
    cuda_available = torch.cuda.is_available()
    if cuda_available:
        device = torch.device("cuda")
        print("CUDA is available")
        print("Device name:", torch.cuda.get_device_name(0))
        print("Device count:", torch.cuda.device_count())
    else:
        device = torch.device("cpu")
        print("CUDA is not available")

    # Set Hugging Face environment variables ("1" disables telemetry).
    # NOTE(review): `transformers` is imported before this runs; confirm the flag
    # is still honoured when it is set after import.
    hf_telemetry = 1
    os.environ["HF_HUB_DISABLE_TELEMETRY"] = str(hf_telemetry)

    # Ensure required environment variables are set
    cs_bash = os.getenv("CS_BASH")
    cs_home = os.getenv("CS_HOME")
    if not cs_bash:
        raise EnvironmentError("Environment variable CS_BASH is not set")
    if not cs_home:
        raise EnvironmentError("Environment variable CS_HOME is not set")

    # Set Hugging Face token from environment script
    env_path = Path(cs_bash) / ".env.py"
    if not env_path.is_file():
        raise FileNotFoundError(f"Environment file not found: {env_path}")
    with open(env_path, "r") as env_file:
        env_script = env_file.read()
    # SECURITY: this executes arbitrary Python from a file on disk with the
    # notebook's privileges. Only point CS_BASH at a directory you control.
    exec(env_script)

    cache_dir = Path(cs_home) / ".cache/misc"

    # Set random seed for reproducibility if provided
    if seed is not None:
        random.seed(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)
        if cuda_available:
            # Seed every visible GPU, not only the current one.
            torch.cuda.manual_seed_all(seed)
            torch.backends.cudnn.deterministic = True
            torch.backends.cudnn.benchmark = False

    notebook_path = Path(cs_home) / "notebooks"
    if not notebook_path.is_dir():
        raise EnvironmentError(f"Notebook directory not found: {notebook_path}")

    # Relative paths used later in the notebook resolve against this directory.
    os.chdir(notebook_path)

    return Config(device=device, seed=seed, cache_dir=cache_dir, base_dir=notebook_path)


# Fixed seed so the sampling-based generation below is reproducible on
# Restart Kernel -> Run All. Change it to explore different samples.
SEED = 42

conf = init(seed=SEED)

CUDA is available
Device name: NVIDIA A100-SXM4-80GB
Device count: 1


In [4]:
"""
I am learning Transformers from Hugging Face's transformers library, Andrej Karpathy's nanoGPT, and 3Blue1Brown's (3b1b) YouTube series.
Each source uses different names for the same concepts. To make sense of them, the config below lists the different names for each concept side by side.

GPT2 Config: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/configuration_gpt2.py


There are 4 types of GPT2 models:
- GPT2: default (this file)
- GPT2 Medium: n_layer=24, n_head=16, d_model=1024
- GPT2 Large: n_layer=36, n_head=20, d_model=1280
- GPT2 XL: n_layer=48, n_head=25, d_model=1600
"""

@dataclass
class GPT2Config:

    # Vocabulary size = Num of tokens
    # This is used to build a lookup table for embeddings. Each token is a row in the table pointing to a the corresponding embedding vector.
    vocab_size: int = 50257  # hf
    n_vocab: int = vocab_size  # 3b1b

    # Word embedding dimension
    n_embd: int = 768  # Andrej, hf
    embed_dim: int = n_embd  # hf
    d_embed: int = n_embd  # 3b1b
    # Hidden layer dimension
    # First, we have tokens (integers) as the input of the model. Then after the embedding layer, we have embeddings (vectors) which can be seen as hidden states.
    # Because of that, it is making sense to call the embedding dimension as the hidden size.
    hidden_size = embed_dim  # hf transformers uses both names,

    # Number of positional embeddings = Max number of tokens in a sequence
    # GPT2 uses an absolute positional embedding. The positional embeddings are added to the token embeddings.
    n_positions: int = 1024  # hf
    # This should be maximum. GPT2 doesn't use KV cache. So, the inference process starts by the input tokens
    # and adds new tokens to the sequence until the max number of tokens.
    sequence_len: int = n_positions
    max_seq_len: int = n_positions
    max_position_embeddings: int = n_positions  # hf, transformers lib,
    block_size: int = n_positions  # Andrej

    # Number of context tokens = Attention window size. There is no actual windowing in GPT2, so this is the max number of tokens in a sequence.
    n_ctx: int = 1024  # hf
    ctx_len: int = n_ctx
    ctx_size: int = n_ctx

    # Number of layers
    # Number of GPT2Blocks (in transformers lib)
    # These layer are used sequentially. Each layer has a self-attention mechanism and a feedforward neural network.
    # In each layer (iteration), there are multiple attention heads. Each head has its own query, key, value matrices.
    n_layer: int = 12  # Andrej, hf
    num_hidden_layers: int = n_layer  # hf

    # Number of attention heads
    # They run in parallel. Each head learns different features.
    # Dimension of each each can be calculated as d_model / n_head.
    n_head: int = 12  # Andrej, hf
    num_attention_heads: int = n_head  # hf

    # Head size = head dimension
    head_size: int = n_embd // n_head

    # Query space dimension
    query_dim: int = 64
    d_query: int = query_dim  # 3b1b

    # Value space dimension
    value_dim: int = 64
    d_value: int = value_dim  # 3b1b

    # Key space dimension
    key_dim: int = 64
    d_key: int = key_dim  # 3b1b

    # Dropout and layer norm
    attn_pdrop: float = 0.1  # hf
    embd_pdrop: float = 0.1  # hf
    layer_norm_epsilon: float = 1e-5  # hf
    resid_pdrop: float = 0.1  # hf

    def __post_init__(self):
        assert (
            self.n_embd % self.n_head == 0
        ), "Embedding dimension must be divisible by the number of heads"

In [5]:
class CausalSelfAttention(nn.Module):
    """Multi-head self-attention in which each position attends only to itself and earlier positions.

    Reads ``n_embd``, ``n_head`` and ``block_size`` from ``config``. The per-head
    width is always derived as ``n_embd // n_head`` so that the softmax scaling
    matches the tensors that are actually reshaped in ``forward``, even when
    ``config.head_size`` holds a different value.

    Raises:
        ValueError: If ``config.n_embd`` is not divisible by ``config.n_head``.
    """

    def __init__(self, config):
        super().__init__()

        if config.n_embd % config.n_head != 0:
            raise ValueError(
                f"n_embd ({config.n_embd}) must be divisible by n_head ({config.n_head})"
            )

        self.n_embd = config.n_embd
        self.n_head = config.n_head
        self.head_size = config.n_embd // config.n_head

        # One fused projection producing queries, keys and values side by side.
        self.c_attn = nn.Linear(
            in_features=config.n_embd, out_features=3 * config.n_embd
        )

        # Output projection applied after the heads are merged back together.
        self.c_proj = nn.Linear(
            in_features=config.n_embd, out_features=config.n_embd, bias=True
        )

        # Lower-triangular causal mask shaped (1, 1, block_size, block_size).
        # It is registered under the name "bias", so it shows up in state_dict()
        # as "<prefix>.attn.bias"; GPT2.from_pretrained skips keys with that suffix.
        self.register_buffer(
            "bias",
            torch.tril(torch.ones(config.block_size, config.block_size)).view(
                1, 1, config.block_size, config.block_size
            ),
        )

    def _split_heads(self, tensor, batch_size, seq_len):
        """Reshape (batch_size, seq_len, n_embd) into (batch_size, n_head, seq_len, head_size)."""
        return tensor.view(
            batch_size, seq_len, self.n_head, self.head_size
        ).transpose(1, 2)

    def _attention(self, Q, K, V, batch_size, seq_len, n_embd):
        """
        Compute the attention output.

        Args:
            Q (torch.Tensor): The query tensor of shape (batch_size, n_head, seq_len, head_size).
            K (torch.Tensor): The key tensor of shape (batch_size, n_head, seq_len, head_size).
            V (torch.Tensor): The value tensor of shape (batch_size, n_head, seq_len, head_size).
            batch_size (int): The batch size.
            seq_len (int): The sequence length.
            n_embd (int): The embedding dimension.

        Returns:
            torch.Tensor: The merged head outputs of shape (batch_size, seq_len, n_embd).
        """
        # -2 and -1 are the last two dimensions; batch and head dimensions are untouched.
        K_tr = K.transpose(-2, -1)  # (batch_size, n_head, head_size, seq_len)

        attention_scores = Q @ K_tr  # (batch_size, n_head, seq_len, seq_len)

        # Scale by the square root of the actual key width (last dimension of Q/K).
        attention_scores_normalized = attention_scores / (Q.size(-1) ** 0.5)

        # Future positions get -inf so softmax assigns them zero weight.
        # Out-of-place masked_fill keeps the unmasked scores intact; the in-place
        # masked_fill_ is the alternative placement.
        causal_mask = self.bias[:, :, :seq_len, :seq_len] == 0
        masked_attention_scores_normalized = attention_scores_normalized.masked_fill(
            causal_mask, float("-inf")
        )

        # Softmax over the last dimension: one distribution per query position.
        attention_weights = F.softmax(masked_attention_scores_normalized, dim=-1)

        attention = attention_weights @ V  # (batch_size, n_head, seq_len, head_size)

        # transpose returns a non-contiguous tensor, so .contiguous() is needed
        # before .view(); .reshape() would be the alternative.
        attention_output = (
            attention.transpose(1, 2).contiguous().view(batch_size, seq_len, n_embd)
        )

        return attention_output

    def forward(self, x):
        """Apply causal self-attention to ``x`` of shape (batch_size, seq_len, n_embd).

        Returns:
            torch.Tensor: A tensor with the same shape as ``x``.
        """
        batch_size, seq_len, n_embd = x.size()

        # (batch_size, seq_len, 3 * n_embd) -> three (batch_size, seq_len, n_embd) chunks
        Q, K, V = self.c_attn(x).split(self.n_embd, dim=2)

        Q = self._split_heads(Q, batch_size, seq_len)
        K = self._split_heads(K, batch_size, seq_len)
        V = self._split_heads(V, batch_size, seq_len)

        attention_output = self._attention(Q, K, V, batch_size, seq_len, n_embd)

        return self.c_proj(attention_output)


class GPT2MLP(nn.Module):
    """Feed-forward sub-layer: expand to 4 * n_embd, apply tanh-approximated GELU, project back.

    ``layer_idx`` is accepted but not used by this module.
    """

    def __init__(self, config, layer_idx=None):
        super().__init__()
        width = config.n_embd
        inner_width = 4 * width
        self.c_fc = nn.Linear(width, inner_width)
        self.gelu = nn.GELU(approximate="tanh")
        self.c_proj = nn.Linear(inner_width, width)

    def forward(self, x):
        """Return ``c_proj(gelu(c_fc(x)))``; the last dimension of ``x`` must be n_embd."""
        return self.c_proj(self.gelu(self.c_fc(x)))


class GPT2Block(nn.Module):
    """One transformer block: pre-LayerNorm attention and pre-LayerNorm MLP, each with a residual add."""

    def __init__(self, config: GPT2Config, layer_idx=None):
        super().__init__()

        width = config.n_embd
        self.ln_1 = nn.LayerNorm(width)
        self.attn = CausalSelfAttention(config)
        self.ln_2 = nn.LayerNorm(normalized_shape=width)
        self.mlp = GPT2MLP(config, layer_idx=layer_idx)

    def forward(self, hidden_states):
        """Run the block on ``hidden_states`` and return a tensor produced by two residual additions.

        The attention sub-layer sees ``ln_1(hidden_states)``; its output is added
        back to the un-normalized input. The MLP sub-layer then sees ``ln_2`` of
        that sum, and its output is added to the same sum.
        """
        # Attention sub-layer with residual connection.
        after_attention = hidden_states + self.attn(self.ln_1(hidden_states))
        # MLP sub-layer with residual connection.
        return after_attention + self.mlp(self.ln_2(after_attention))


class GPT2(nn.Module):
    """GPT-2 language model whose module names mirror the Hugging Face checkpoint layout.

    Sub-modules live under ``transformer`` (``wte``, ``wpe``, ``h``, ``ln_f``)
    plus a bias-free ``lm_head``, so that ``from_pretrained`` can copy weights
    key by key from ``GPT2LMHeadModel.state_dict()``.
    """

    model_type: Literal["gpt2", "gpt2-medium", "gpt2-large", "gpt2-xl"] = "gpt2"

    # Size-specific overrides applied on top of the GPT2Config defaults.
    _MODEL_SIZES = {
        "gpt2": dict(),
        "gpt2-medium": dict(n_embd=1024, n_head=16, n_layer=24),
        "gpt2-large": dict(n_embd=1280, n_head=20, n_layer=36),
        "gpt2-xl": dict(n_embd=1600, n_head=25, n_layer=48),
    }

    def __init__(
        self,
        model_type: Literal["gpt2", "gpt2-medium", "gpt2-large", "gpt2-xl"] = "gpt2",
    ):
        """Build a randomly initialised model of the requested size.

        Raises:
            KeyError: If ``model_type`` is not one of the four supported names.
        """
        super().__init__()

        self.model_type = model_type

        config = GPT2Config(**self._MODEL_SIZES[model_type])
        self.config = config

        self.transformer = nn.ModuleDict(
            dict(
                # word token embeddings
                wte=nn.Embedding(
                    num_embeddings=config.vocab_size, embedding_dim=config.n_embd
                ),
                # word position embeddings
                wpe=nn.Embedding(
                    num_embeddings=config.n_positions, embedding_dim=config.n_embd
                ),
                h=nn.ModuleList(
                    [GPT2Block(config, layer_idx=i) for i in range(config.n_layer)]
                ),
                ln_f=nn.LayerNorm(config.n_embd),
            )
        )

        self.lm_head = nn.Linear(
            in_features=config.n_embd, out_features=config.vocab_size, bias=False
        )

    def forward(self, input_ids: torch.Tensor):
        """
        Forward pass of the GPT2 model. The forward pass of the GPT2 model consists of the following steps:
        1. Token Embeddings: The input sequence of tokens is passed through the token embeddings layer to get the token embeddings.
        2. Position Embeddings: The position embeddings are added to the token embeddings to get the input embeddings.
        3. GPT2 Block: The input embeddings are passed through each GPT2 block in turn. Each block consists of a multi-head self-attention layer and a feed-forward neural network.
            3.1. Layer Normalization: The input embeddings are passed through a layer normalization layer.
            3.2. Multi-Head Self-Attention: The output of the layer normalization layer is passed through the multi-head self-attention layer to get the attention output.
            3.3. Residual Connection: The attention output is added to the input embeddings to get the residual output.
            3.4. Layer Normalization: The residual output is passed through a layer normalization layer.
            3.5. Feed-Forward Neural Network: The output of the layer normalization layer is passed through a feed-forward neural network to get the feed-forward output.
            3.6. Residual Connection: The feed-forward output is added to the residual output to get the output of the GPT2 block.
        4. Language Model Head: The output of the last GPT2 block is layer-normalized and passed through the language model head to get the logits for the next token.

        Args:
            input_ids (torch.Tensor): A tensor of shape (batch_size, sequence_length) and dtype torch.int64 (LongTensor).

        Returns:
            torch.Tensor: Logits of shape (batch_size, sequence_length, vocab_size).

        Raises:
            AssertionError: If sequence_length exceeds ``config.n_positions``.
        """
        _, sequence_length = input_ids.size()

        assert (
            sequence_length <= self.config.n_positions
        ), "Sequence length is longer than the maximum position"

        input_embeds = self.transformer.wte(
            input_ids
        )  # (batch_size, sequence_length, n_embd)

        position_ids = torch.arange(
            start=0, end=sequence_length, dtype=torch.long, device=input_ids.device
        )  # (sequence_length,)

        # (sequence_length, n_embd); broadcast over the batch dimension in the sum below.
        position_embeds = self.transformer.wpe(position_ids)

        # (batch_size, sequence_length, n_embd): the input to the first GPT2 block.
        x = input_embeds + position_embeds

        for block in self.transformer.h:
            x = block(x)

        x = self.transformer.ln_f(x)
        logits = self.lm_head(x)

        return logits

    @classmethod
    def from_pretrained(
        cls,
        model_type: Literal["gpt2", "gpt2-medium", "gpt2-large", "gpt2-xl"] = "gpt2",
    ):
        """Build a model of size ``model_type`` and copy the Hugging Face weights into it.

        Args:
            model_type: Name of the checkpoint to load. Defaults to "gpt2", which
                is what the previous zero-argument version always loaded.

        Returns:
            GPT2: The model with every parameter copied from the checkpoint.

        Raises:
            ValueError: If a checkpoint tensor does not have the shape of the
                parameter it is copied into.
        """
        model = cls(model_type)
        model_hf = GPT2LMHeadModel.from_pretrained(model_type)

        sd = model.state_dict()
        sd_hf = model_hf.state_dict()

        # These checkpoint weights are copied transposed into this model's
        # nn.Linear layers, as in the original loader.
        transposed = (
            "attn.c_attn.weight",
            "attn.c_proj.weight",
            "mlp.c_fc.weight",
            "mlp.c_proj.weight",
        )

        with torch.no_grad():
            for key, value in sd.items():
                # "<...>.attn.bias" is the locally built causal mask buffer, not a
                # learned weight; "<...>.attn.c_attn.bias" is a real bias and is copied.
                if key.endswith(".attn.bias"):
                    continue

                source = sd_hf[key]
                if key.endswith(transposed):
                    source = source.t()

                # copy_ broadcasts silently, so check the shape explicitly.
                if source.shape != value.shape:
                    raise ValueError(
                        f"Shape mismatch for {key}: checkpoint {tuple(source.shape)} "
                        f"vs model {tuple(value.shape)}"
                    )
                value.copy_(source)

        return model

In [6]:
@torch.inference_mode()
def generate(
    model: nn.Module,
    input_ids: torch.Tensor,
    method: Literal["greedy", "top_k", "top_p"] = "greedy",
    temperature: float = 1.0,
    top_k: int = 50,
    top_p: float = 0.9,
    max_length=30,
    max_new_tokens=None,
    num_return_sequences=1,
):
    """
    Generate a sequence of tokens using the model.

    1. Initial Input: The process begins with an initial sequence of tokens represented by input_ids, of shape (batch_size, sequence_length).
    2. Token-by-Token Generation: The model generates new tokens one at a time. After generating each token, it appends the token to the input sequence and uses the updated sequence to generate the next token.
    3. Sequence Continuation: This continues until the requested number of new tokens has been produced. No stop token is checked.

    Side effects: the model is put into eval mode and moved to ``input_ids.device``.

    Args:
        model (nn.Module): Called as ``model(input_ids)``; must return logits of
            shape (batch_size, sequence_length, vocab_size). If the model exposes
            ``model.config.n_positions``, only the last ``n_positions`` tokens are
            fed to it at each step.
        input_ids (torch.Tensor): A tensor of shape (batch_size, sequence_length) and dtype torch.int64 (LongTensor).
        method (str): "greedy" takes the most probable token; "top_k" samples among
            the ``top_k`` most probable tokens; "top_p" samples from the smallest
            set of tokens whose cumulative probability reaches ``top_p``.
        temperature (float): Logits are divided by this value before softmax. Must be > 0.
        top_k (int): Number of highest-probability tokens kept for "top_k". Values
            larger than the vocabulary are clamped to the vocabulary size.
        top_p (float): Cumulative probability threshold for "top_p".
        max_length (int): Total target length (prompt + generated). Used only when
            ``max_new_tokens`` is None.
        max_new_tokens (int, optional): Number of tokens to generate. Takes
            precedence over ``max_length`` when given.
        num_return_sequences (int): Accepted for API compatibility; currently
            ignored. One sequence is returned per row of ``input_ids``.

    Returns:
        torch.Tensor: A tensor of shape (batch_size, sequence_length + number of
        generated tokens) and dtype torch.int64 (LongTensor).

    Raises:
        ValueError: If ``method`` is unknown, ``temperature`` is not positive, or
            ``top_k`` is smaller than 1 when ``method == "top_k"``.
    """
    # Validate up front so bad arguments fail before any model call.
    if method not in ("greedy", "top_k", "top_p"):
        raise ValueError(
            "Invalid method or missing required argument (top_p or top_k)."
        )
    if temperature <= 0:
        raise ValueError(f"temperature must be positive, got {temperature}")
    if method == "top_k" and top_k < 1:
        raise ValueError(f"top_k must be at least 1, got {top_k}")

    input_len = input_ids.shape[1]
    if max_new_tokens is not None:
        num_new_tokens = max_new_tokens
    else:
        num_new_tokens = max_length - input_len
    # A prompt already at or beyond max_length generates nothing.
    num_new_tokens = max(num_new_tokens, 0)

    model.eval()
    model.to(input_ids.device)

    # Longest context the model accepts, when it advertises one.
    context_limit = getattr(getattr(model, "config", None), "n_positions", None)

    for _ in range(num_new_tokens):
        context = (
            input_ids if context_limit is None else input_ids[:, -context_limit:]
        )
        logits = model(context)  # (batch_size, sequence_length, vocab_size)

        # Out-of-place division: logits[:, -1, :] is a view of the model output.
        next_logits = logits[:, -1, :] / temperature  # (batch_size, vocab_size)
        next_probs = F.softmax(next_logits, dim=-1)  # (batch_size, vocab_size)

        if method == "greedy":
            # max returns (values, indices); keepdim=True keeps shape (batch_size, 1).
            _, next_token = next_probs.max(dim=-1, keepdim=True)
        else:
            if method == "top_k":
                k = min(top_k, next_probs.size(-1))
                # topk returns (values, indices), both (batch_size, k).
                probs, probs_indices = next_probs.topk(k=k, dim=-1)
            else:  # top_p
                probs, probs_indices = next_probs.sort(
                    descending=True, dim=-1
                )  # (batch_size, vocab_size)
                cumulative_probs = probs.cumsum(dim=-1)  # (batch_size, vocab_size)
                # Drop a token once the mass *before* it already exceeds top_p, so
                # the most probable token always survives.
                mask = cumulative_probs - probs > top_p
                probs = probs.masked_fill(mask, 0.0)
                probs = probs / probs.sum(dim=-1, keepdim=True)

            # multinomial samples a position within the filtered list; gather maps
            # that position back to the vocabulary id.
            idx_sample = torch.multinomial(input=probs, num_samples=1)
            next_token = torch.gather(input=probs_indices, dim=-1, index=idx_sample)

        input_ids = torch.cat(
            [input_ids, next_token], dim=-1
        )  # (batch_size, sequence_length + 1)

    return input_ids

In [7]:
# from_pretrained is a classmethod that constructs its own instance, so call it on
# the class; instantiating GPT2 first would build a throw-away model.
model = GPT2.from_pretrained()

In [8]:
enc = tiktoken.get_encoding("gpt2")
# enc = tiktoken.get_encoding("r50k_base")


def generate_input_ids(prompt: str, batch_size: int) -> torch.Tensor:
    """Tokenize ``prompt`` and repeat it ``batch_size`` times as a batch on ``conf.device``.

    Args:
        prompt (str): Text to encode with the module-level ``enc`` tokenizer.
        batch_size (int): Number of identical rows in the returned batch.

    Returns:
        torch.Tensor: A torch.long tensor of shape (batch_size, sequence_length).

    Raises:
        ValueError: If ``batch_size`` is smaller than 1 or the prompt encodes to
            no tokens (a zero-length sequence gives the model nothing to
            condition on and only fails later with an opaque indexing error).
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be at least 1, got {batch_size}")

    token_list = enc.encode(prompt)  # list of length sequence_length
    if not token_list:
        raise ValueError("prompt must encode to at least one token")

    tokens = torch.tensor(token_list, dtype=torch.long, device=conf.device)
    # (sequence_length,) -> (1, sequence_length) -> (batch_size, sequence_length)
    return tokens.unsqueeze(0).repeat(batch_size, 1)

In [9]:
# Build a batch of two identical prompts; each row is continued independently below.
input_ids = generate_input_ids(
    "Simply put, the theory of relativity states that", batch_size=2
)


# Sample 128 new tokens per row with top-k sampling (k=50).
# The commented-out arguments are the alternative decoding settings to try.
# NOTE(review): generate() accepts num_return_sequences but never reads it, so the
# number of returned sequences equals the batch size of input_ids.
output_ids = generate(
    model=model,
    input_ids=input_ids,
    method="top_k",
    # method="top_p",
    # method="greedy",
    top_k=50,
    # top_p=0.9,
    # max_length=30,
    max_new_tokens=128,
    num_return_sequences=2,
    # temperature=0.6,
)

# Decode every row back to text and print it under a dashed separator line.
for decoded_output in enc.decode_batch(output_ids.tolist()):
    print("-" * 50, "\n")
    print(decoded_output)


 Simply put, the theory of relativity states that two objects in spacetime are different in the way they reflect different light, so the light from one object to the other must not be lost, leaving a single point of light around and around (and from above the body of the other object, too) so long as they are in the same spot on the ground. But the point-of-light principle is not so simple, because of the way spacetime is set up. Instead, we see objects from different points of the sky — at a given point of the sky, for example — that are different in space as in time. Because this is only possible in the case of

 Simply put, the theory of relativity states that, for such objects we may not notice them (or that they cannot be detected). It turns out that this conclusion is totally incorrect: the existence of one is simply not always necessarily the truth, and the other is true in some particular respects, such as light being red for us, compared to the other. And that is so for our p