In [1]:
%pwd

'd:\\software_3\\Generative_models\\chat_gpt2\\gpt2_core_model'

In [2]:
import os

# Move the working directory one level up so later relative paths resolve
# against the parent folder. Not idempotent: re-running this cell climbs
# one more level each time.
os.chdir('../')

In [3]:
%pwd

'd:\\software_3\\Generative_models\\chat_gpt2'

In [4]:
import os
import urllib
import torch 
import tiktoken
import tqdm
import torch.nn as nn
import torch.nn.functional as F
from  torch.utils.data import Dataset, DataLoader

## GPT2 Model

The GPT-2 implementation below is based on the research paper [Language Models are Unsupervised Multitask Learners](https://cdn.openai.com/better-language-models/language_models_are_unsupervised_multitask_learners.pdf).

Here is the GPT-2 architecture:

![GPT-2 image](https://camo.githubusercontent.com/6c8c392f72d5b9e86c94aeb9470beab435b888d24135926f1746eb88e0cc18fb/68747470733a2f2f73656261737469616e72617363686b612e636f6d2f696d616765732f4c4c4d732d66726f6d2d736372617463682d696d616765732f636830345f636f6d707265737365642f31332e776562703f31)



In [5]:
class MultiHeadAttention(nn.Module):
    """Causal multi-head self-attention.

    The input is projected to queries, keys and values of width ``d_out``,
    split into ``num_heads`` heads of ``d_out // num_heads`` features each,
    attended with an upper-triangular mask that hides later positions, and
    finally merged and passed through an output projection.

    Args:
        d_in: Feature size of the input.
        d_out: Total feature size of the projections (must be divisible by
            ``num_heads``).
        context_length: Side length of the pre-built causal mask.
        drop_rate: Dropout probability applied to the attention weights.
        num_heads: Number of attention heads.
        qkv_bias: Whether the query/key/value projections carry a bias.
    """

    def __init__(self, d_in, d_out, context_length, drop_rate, num_heads, qkv_bias=False):
        super().__init__()
        assert d_out % num_heads == 0, "d_out must be divisible by num_heads"
        self.d_in, self.d_out = d_in, d_out
        self.num_heads = num_heads
        self.head_dim = d_out // num_heads

        # Projections are created in query/key/value/output order.
        self.w_query = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.w_key = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.w_value = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.out_proj = nn.Linear(d_out, d_out)
        self.dropout = nn.Dropout(drop_rate)

        # Ones strictly above the diagonal mark the positions to hide.
        causal = torch.triu(torch.ones(context_length, context_length), diagonal=1)
        self.register_buffer("mask", causal)

    def forward(self, x):
        """Return the attended representation, shaped (batch, tokens, d_out)."""
        batch, seq_len, _ = x.shape
        per_head_shape = (batch, seq_len, self.num_heads, self.head_dim)

        def split_heads(projected):
            # (batch, tokens, d_out) -> (batch, heads, tokens, head_dim)
            return projected.view(*per_head_shape).transpose(1, 2)

        q = split_heads(self.w_query(x))
        k = split_heads(self.w_key(x))
        v = split_heads(self.w_value(x))

        scores = q @ k.transpose(2, 3)
        hidden_positions = self.mask.bool()[:seq_len, :seq_len]
        scores = scores.masked_fill(hidden_positions, float('-inf'))

        weights = torch.softmax(scores / (self.head_dim ** 0.5), dim=-1)
        weights = self.dropout(weights)

        # (batch, heads, tokens, head_dim) -> (batch, tokens, d_out)
        merged = (weights @ v).transpose(1, 2).contiguous()
        merged = merged.view(batch, seq_len, self.d_out)
        return self.out_proj(merged)


class GELU(nn.Module):
    """Parameter-free module wrapping the tanh-approximated GELU activation."""

    def forward(self, x):
        """Apply ``F.gelu`` with ``approximate="tanh"`` to ``x``."""
        activated = F.gelu(x, approximate="tanh")
        return activated

class FeedForward(nn.Module):
    """Position-wise feed-forward network: expand 4x, GELU, project back.

    Args:
        cfg: Mapping that provides ``"emb_dim"``, the input/output width.
    """

    def __init__(self, cfg):
        super().__init__()
        width = cfg["emb_dim"]
        hidden = 4 * width
        layers = [
            nn.Linear(width, hidden),
            GELU(),
            nn.Linear(hidden, width),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the expand / activate / project stack."""
        return self.net(x)


class TransformerBlock(nn.Module):
    """One transformer layer: attention and feed-forward, each with a residual.

    LayerNorm is applied *before* each sub-layer, and the same dropout module
    is applied to each sub-layer's output before it is added back to its input.

    Args:
        cfg: Mapping that provides ``"emb_dim"``, ``"context_length"``,
            ``"n_heads"``, ``"drop_rate"`` and ``"qkv_bias"``.
    """

    def __init__(self, cfg):
        super().__init__()
        width = cfg["emb_dim"]
        dropout_p = cfg["drop_rate"]

        self.attn = MultiHeadAttention(
            d_in=width,
            d_out=width,
            context_length=cfg["context_length"],
            num_heads=cfg["n_heads"],
            drop_rate=dropout_p,
            qkv_bias=cfg["qkv_bias"],
        )
        self.ff = FeedForward(cfg)
        self.norm1 = nn.LayerNorm(width, eps=1e-5)
        self.norm2 = nn.LayerNorm(width, eps=1e-5)
        self.drop_shortcut = nn.Dropout(dropout_p)

    def forward(self, x):
        """Apply the attention and feed-forward sub-layers with residuals."""
        # Attention sub-layer.
        residual = x
        attended = self.drop_shortcut(self.attn(self.norm1(x)))
        x = attended + residual

        # Feed-forward sub-layer.
        residual = x
        transformed = self.drop_shortcut(self.ff(self.norm2(x)))
        return transformed + residual


class GPTModel(nn.Module):
    """GPT-style decoder: embeddings, a stack of blocks, final norm, LM head.

    Args:
        cfg: Mapping that provides ``"vocab_size"``, ``"emb_dim"``,
            ``"context_length"``, ``"drop_rate"`` and ``"n_layers"`` (plus
            whatever ``TransformerBlock`` reads from it).
    """

    def __init__(self, cfg):
        super().__init__()
        width = cfg["emb_dim"]
        vocab = cfg["vocab_size"]

        self.tok_emb = nn.Embedding(vocab, width)
        self.pos_emb = nn.Embedding(cfg["context_length"], width)
        self.drop_emb = nn.Dropout(cfg["drop_rate"])

        blocks = [TransformerBlock(cfg) for _ in range(cfg["n_layers"])]
        self.trf_blocks = nn.Sequential(*blocks)
        self.final_norm = nn.LayerNorm(width, eps=1e-5)
        self.out_head = nn.Linear(width, vocab, bias=False)

        # Weight tying: the output head reuses the token-embedding matrix.
        self.out_head.weight = self.tok_emb.weight

        self.apply(self._init_weights)

    def _init_weights(self, module):
        """Initialise Linear/Embedding weights from N(0, 0.02); zero Linear biases."""
        if isinstance(module, (nn.Linear, nn.Embedding)):
            nn.init.normal_(module.weight, mean=0.0, std=0.02)
        if isinstance(module, nn.Linear) and module.bias is not None:
            nn.init.zeros_(module.bias)

    def forward(self, in_idx):
        """Map token ids of shape (batch, seq_len) to logits over the vocabulary."""
        _, seq_len = in_idx.shape
        positions = torch.arange(seq_len, device=in_idx.device)

        hidden = self.tok_emb(in_idx) + self.pos_emb(positions).unsqueeze(0)
        hidden = self.drop_emb(hidden)
        hidden = self.trf_blocks(hidden)
        hidden = self.final_norm(hidden)
        return self.out_head(hidden)

In [6]:
# Hyper-parameters consumed by GPTModel / TransformerBlock / FeedForward.
GPT_MODEL_ARGS = {
    "emb_dim": 768,  # width of token/position embeddings and of every block
    "vocab_size": 50257,  # rows of the token embedding / columns of the logits
    "context_length": 1024,  # rows of the position embedding and causal-mask size
    "n_layers": 12,  # number of TransformerBlock instances stacked
    "n_heads": 12,  # attention heads per block (must divide emb_dim)
    "drop_rate": 0.0,  # dropout probability for embeddings, attention and shortcuts
    "qkv_bias": False  # whether the query/key/value projections have a bias
}

In [7]:
def get_device():
    """Return ``"cuda"`` when a CUDA device is available, otherwise ``"cpu"``."""
    if torch.cuda.is_available():
        return "cuda"
    return "cpu"

In [8]:
get_device()

'cuda'

In [9]:
# Build the model from GPT_MODEL_ARGS and move its parameters to the device
# chosen by get_device(); the trailing expression displays the module tree.
model = GPTModel(GPT_MODEL_ARGS)
model.to(get_device())

GPTModel(
  (tok_emb): Embedding(50257, 768)
  (pos_emb): Embedding(1024, 768)
  (drop_emb): Dropout(p=0.0, inplace=False)
  (trf_blocks): Sequential(
    (0): TransformerBlock(
      (attn): MultiHeadAttention(
        (w_query): Linear(in_features=768, out_features=768, bias=False)
        (w_key): Linear(in_features=768, out_features=768, bias=False)
        (w_value): Linear(in_features=768, out_features=768, bias=False)
        (out_proj): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.0, inplace=False)
      )
      (ff): FeedForward(
        (net): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU()
          (2): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
      (norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (drop_shortcut): Dropout(p=0.0, inplace=False)
    )
    (1): Trans

### Tokenization / Tokenizer

In [10]:
# Load tiktoken's "gpt2" encoding; it is passed to the helper functions below.
tokenizer = tiktoken.get_encoding("gpt2")

In [11]:
def text_to_token_ids(text, tokenizer):
    """Encode ``text`` and return the ids as a ``(1, n_tokens)`` long tensor.

    Args:
        text: String to encode. ``"<|endoftext|>"`` is allowed as a special
            token.
        tokenizer: Object exposing ``encode(text, allowed_special=...)`` that
            returns a list of integer ids.

    Returns:
        A ``torch.long`` tensor with a leading batch dimension of size 1.
    """
    token_ids = tokenizer.encode(text, allowed_special={"<|endoftext|>"})
    # Force an integer dtype: an empty id list would otherwise become a
    # float32 tensor, which cannot be used to index an embedding.
    return torch.tensor(token_ids, dtype=torch.long).unsqueeze(0)

In [12]:
def token_ids_to_text(token_ids, tokenizer):
    """Drop the leading batch dimension of ``token_ids`` and decode it to text."""
    id_list = token_ids.squeeze(0).tolist()
    return tokenizer.decode(id_list)

### Generate Text / Inference function

In [13]:
def generate(
        model,
        idx, max_new_tokens,
        context_size,
        temperature=0.0,
        top_k=None,
        eos_id=None
    ):
    """Autoregressively extend ``idx`` by up to ``max_new_tokens`` tokens.

    Args:
        model: Callable mapping ids of shape (batch, seq) to logits of shape
            (batch, seq, vocab).
        idx: Tensor of token ids, shape (batch, seq).
        max_new_tokens: Maximum number of tokens to append.
        context_size: Only the last ``context_size`` tokens are fed to the model.
        temperature: ``> 0`` samples from the softmax of ``logits / temperature``;
            otherwise the arg-max token is taken.
        top_k: If given, logits below the k-th largest value of each row are
            set to ``-inf`` before selection. Values larger than the
            vocabulary size are clamped.
        eos_id: If given, generation stops (without appending) once *every*
            sequence in the batch selects this id at the same step.

    Returns:
        ``idx`` with the generated tokens concatenated along dimension 1.
    """
    for _ in range(max_new_tokens):
        idx_cond = idx[:, -context_size:]
        with torch.no_grad():
            logits = model(idx_cond)
        logits = logits[:, -1, :]  # keep only the last position: (batch, vocab)

        if top_k is not None:
            k = min(top_k, logits.size(-1))
            top_logits, _ = torch.topk(logits, k)
            # Keep the trailing dimension so the per-row threshold broadcasts
            # across the vocabulary axis; a (batch,) vector would be compared
            # against the wrong axis for batch sizes greater than one.
            min_val = top_logits[:, -1:]
            logits = logits.masked_fill(logits < min_val, float("-inf"))

        if temperature > 0.0:
            logits = logits / temperature
            probas = torch.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probas, num_samples=1)
        else:
            idx_next = torch.argmax(logits, dim=-1, keepdim=True)

        if eos_id is not None and (idx_next == eos_id).all():
            break
        idx = torch.cat((idx, idx_next), dim=1)

    return idx


def generate_and_print_text(
    model,
    tokenizer, 
    device,
    start_context
    ):
    """Greedily generate 50 tokens from ``start_context`` and print them.

    The model is put in eval mode for the generation and its previous
    train/eval mode is restored afterwards, even if generation raises.

    Args:
        model: Model exposing ``pos_emb`` (its number of rows is used as the
            context size).
        tokenizer: Tokenizer passed to ``text_to_token_ids`` /
            ``token_ids_to_text``.
        device: Device the encoded prompt is moved to.
        start_context: Prompt string.
    """
    was_training = model.training
    model.eval()
    try:
        # pos_emb.weight is (context_length, emb_dim): the context size is the
        # number of rows, i.e. dimension 0 (dimension 1 is the embedding width).
        context_size = model.pos_emb.weight.shape[0]
        encoded = text_to_token_ids(start_context, tokenizer).to(device)
        with torch.no_grad():
            token_ids = generate(
                model=model,
                idx=encoded,
                max_new_tokens=50,
                context_size=context_size
            )
        decoded_text = token_ids_to_text(token_ids, tokenizer)
        print(decoded_text.replace("\n", " "))
    finally:
        model.train(was_training)

### Dataset load/download

In [14]:
def download_and_load_dataset(file):
    """Load a dataset from a local path, or download it when given a URL.

    Args:
        file: Path to an existing local file, or an ``http://`` / ``https://``
            URL. A URL is downloaded into the current working directory
            (named after the URL's basename, or ``downloaded_file.txt`` if
            that is empty) and then read back.

    Returns:
        The parsed JSON object when an existing local file ends in ``.json``;
        otherwise the file's text content.

    Raises:
        FileNotFoundError: If ``file`` neither exists locally nor is a URL.
    """
    import json
    # `import urllib` alone does not guarantee the `request` submodule is
    # loaded, so import it explicitly where it is used.
    import urllib.request

    if os.path.exists(file):
        with open(file, "r", encoding="utf-8") as data_file:
            if file.endswith(".json"):
                return json.load(data_file)
            return data_file.read()

    # Match real URL schemes only; a bare "https" prefix would also match a
    # missing local file whose name merely starts with those letters.
    if not file.startswith(("http://", "https://")):
        raise FileNotFoundError("File does not exist. Please provide a valid data_file_path or URL.")

    with urllib.request.urlopen(file) as response:
        text_data = response.read().decode("utf-8")

    filename = os.path.basename(file) or "downloaded_file.txt"
    full_path = os.path.join(os.getcwd(), filename)

    with open(full_path, "w", encoding="utf-8") as data_file:
        data_file.write(text_data)

    # Read the saved copy back so the returned text matches what a later
    # local load of the same file would return.
    with open(full_path, "r", encoding="utf-8") as data_file:
        return data_file.read()



def partition_data(data, train_frac=0.85, test_frac=0.10):
    """Split a sequence into contiguous train / test / validation slices.

    The first ``int(len(data) * train_frac)`` items are the training slice,
    the next ``int(len(data) * test_frac)`` items the test slice, and the
    remainder the validation slice. The three lengths are printed.

    Args:
        data: Any sliceable sequence with a length (e.g. ``str`` or ``list``).
        train_frac: Fraction of items for training (default 0.85).
        test_frac: Fraction of items for testing (default 0.10).

    Returns:
        ``(train_data, test_data, val_data)`` — note the order.

    Raises:
        ValueError: If a fraction is negative or the two sum to more than 1.
    """
    if train_frac < 0 or test_frac < 0 or train_frac + test_frac > 1:
        raise ValueError("train_frac and test_frac must be non-negative and sum to at most 1.")

    n = len(data)
    train_end = int(n * train_frac)
    test_end = train_end + int(n * test_frac)

    train_data = data[:train_end]
    test_data = data[train_end:test_end]
    val_data = data[test_end:]

    print("Training data length:", len(train_data))
    print("Test data length:", len(test_data))
    print("Validation data length:", len(val_data))

    return train_data, test_data, val_data


## Pretraining GPTModel

In [15]:
class PretrainingDataset(Dataset):
    """Sliding-window next-token dataset built from one text.

    The text is tokenized once; each sample is a window of ``max_length``
    token ids together with the same window shifted one position to the
    right as the target.

    Args:
        data: Text to tokenize.
        tokenizer: Object exposing ``encode(text)`` returning a list of ids.
        max_length: Number of tokens per input/target window.
        stride: Step between the starts of consecutive windows.

    Raises:
        ValueError: If ``stride`` is smaller than 1, or if the text has fewer
            than ``max_length + 1`` tokens (one extra token is needed so the
            shifted target of the first window is complete).
    """

    def __init__(self, data, tokenizer, max_length, stride):
        if stride < 1:
            raise ValueError(f"stride must be a positive integer, got {stride}.")

        self.input_ids = []
        self.target_ids = []

        token_ids = tokenizer.encode(data)

        # A window needs max_length inputs plus one more token for the shifted
        # target; with exactly max_length tokens no sample could be built.
        if len(token_ids) <= max_length:
            raise ValueError(
                f"Data is too short ({len(token_ids)} tokens). "
                f"Needs at least {max_length + 1} tokens."
            )

        for start in range(0, len(token_ids) - max_length, stride):
            end = start + max_length
            self.input_ids.append(torch.tensor(token_ids[start:end]))
            self.target_ids.append(torch.tensor(token_ids[start + 1:end + 1]))

    def __len__(self):
        """Number of (input, target) windows."""
        return len(self.input_ids)

    def __getitem__(self, index):
        """Return the ``(input_ids, target_ids)`` tensors of window ``index``."""
        return self.input_ids[index], self.target_ids[index]

def create_dataloaders(
        data,
        tokenizer,
        batch_size=10,
        max_length=256,
        stride=128,
        shuffle=True,
        drop_last=True,
        num_workers=0
    ):
    """Build a ``DataLoader`` over a ``PretrainingDataset`` made from ``data``.

    Args:
        data: Text to tokenize and window.
        tokenizer: Tokenizer forwarded to ``PretrainingDataset``.
        batch_size: Samples per batch.
        max_length: Tokens per window.
        stride: Step between consecutive windows.
        shuffle: Whether the loader reshuffles samples every epoch.
        drop_last: Whether an incomplete final batch is dropped.
        num_workers: Number of loader worker processes.

    Returns:
        The configured ``DataLoader``.
    """
    dataset = PretrainingDataset(data, tokenizer, max_length, stride)
    dataloader = DataLoader(dataset,
                            batch_size=batch_size,
                            # Forward the flag: without it the loader always
                            # iterates in order, whatever the caller asked for.
                            shuffle=shuffle,
                            drop_last=drop_last,
                            num_workers=num_workers)
    return dataloader


In [16]:
def cal_loss_per_batch(input_batch, target_batch, model, device):
    """Return the cross-entropy loss of ``model`` on a single batch.

    Both tensors are moved to ``device``; the logits are flattened to
    ``(batch * tokens, vocab)`` and the targets to ``(batch * tokens,)``
    before the loss is computed.
    """
    inputs, targets = input_batch.to(device), target_batch.to(device)
    logits = model(inputs)
    vocab_size = logits.size(-1)
    flat_logits = logits.view(-1, vocab_size)
    flat_targets = targets.view(-1)
    return F.cross_entropy(flat_logits, flat_targets)

def calc_loss_loader(data_loader, model, device, num_batches=None):
    """Average the per-batch loss over the first ``num_batches`` batches.

    Returns ``nan`` for an empty loader. ``num_batches=None`` means the whole
    loader; larger values are capped at the loader length.
    """
    available = len(data_loader)
    if available == 0:
        return float("nan")

    num_batches = available if num_batches is None else min(num_batches, available)

    total_loss = 0
    # zip stops as soon as the counter is exhausted, limiting the batches read.
    for _, (input_batch, target_batch) in zip(range(num_batches), data_loader):
        batch_loss = cal_loss_per_batch(input_batch, target_batch, model, device)
        total_loss += batch_loss.item()
    return total_loss / num_batches

def evaluate_model(model, train_loader, val_loader, device, eval_iter):
    """Return ``(train_loss, val_loss)`` averaged over up to ``eval_iter`` batches.

    The model is switched to eval mode and gradients are disabled while the
    losses are computed; it is switched back to train mode before returning.
    """
    model.eval()
    with torch.no_grad():
        train_loss, val_loss = (
            calc_loss_loader(loader, model, device, num_batches=eval_iter)
            for loader in (train_loader, val_loader)
        )
    model.train()
    return train_loss, val_loss

In [17]:
def pre_train_model(
        model,
        train_loader,
        val_loader,
        optimizer,
        device,
        num_epochs,
        eval_freq,
        eval_iter,
        start_context,
        tokenizer,
        accumulation_step=4
    ):
    """Train ``model`` with gradient accumulation and periodic evaluation.

    Args:
        model: Model to train.
        train_loader: Loader yielding ``(input_batch, target_batch)`` pairs.
        val_loader: Loader used for the validation loss.
        optimizer: Optimizer over the model's parameters.
        device: Device batches are moved to.
        num_epochs: Number of passes over ``train_loader``.
        eval_freq: Evaluate every ``eval_freq`` batches (including batch 0).
        eval_iter: Maximum number of batches used for each evaluation.
        start_context: Prompt for the sample printed after every epoch.
        tokenizer: Tokenizer used for that sample.
        accumulation_step: Number of batches whose gradients are accumulated
            before each optimizer step (default 4).

    Returns:
        ``(train_losses, val_losses, track_token_seen)`` — three parallel
        lists with one entry per evaluation.

    Raises:
        ValueError: If ``accumulation_step`` is smaller than 1.
    """
    if accumulation_step < 1:
        raise ValueError("accumulation_step must be a positive integer.")

    train_losses, val_losses, track_token_seen = [], [], []
    token_seen, global_step = 0, -1

    optimizer.zero_grad()

    for epoch in range(num_epochs):
        model.train()
        pending = 0  # batches back-propagated since the last optimizer step
        pbar = tqdm.tqdm(train_loader, desc=f"Epoch {epoch+1}", leave=True)
        for input_batch, target_batch in pbar:
            loss = cal_loss_per_batch(
                input_batch, target_batch, model, device
            )

            # Scale so the accumulated gradient is the mean over the group.
            loss = loss / accumulation_step
            loss.backward()
            pending += 1

            if pending == accumulation_step:
                optimizer.step()
                optimizer.zero_grad()
                pending = 0

            token_seen += input_batch.numel()
            global_step += 1

            if global_step % eval_freq == 0:
                torch.cuda.empty_cache()

                train_loss, val_loss = evaluate_model(
                    model, train_loader, val_loader, device, eval_iter
                )
                train_losses.append(train_loss)
                val_losses.append(val_loss)
                track_token_seen.append(token_seen)
                pbar.set_postfix({
                    "train_loss": f"{train_loss:.3f}",
                    "val_loss": f"{val_loss:.3f}"
                })

        # Apply gradients from an incomplete final group so they are neither
        # lost nor carried into the next epoch. That group keeps the
        # 1/accumulation_step scaling, so this last update is smaller.
        if pending:
            optimizer.step()
            optimizer.zero_grad()

        torch.cuda.empty_cache()

        print(f"\n[Epoch {epoch+1}] Sample Generation:")
        generate_and_print_text(
            model,
            tokenizer,
            device,
            start_context
        )
    print("Model training has been completed.")
    return train_losses, val_losses, track_token_seen


In [18]:
def save_model(model, optimizer, model_name="GPT2-355M"):
    """Save model and optimizer state dicts to ``./gpt_models/<model_name>.pth``.

    The ``gpt_models`` directory is created under the current working
    directory if it does not exist.

    Args:
        model: Module whose ``state_dict()`` is stored under
            ``"model_state_dict"``.
        optimizer: Optimizer whose ``state_dict()`` is stored under
            ``"optimizer_state_dict"``.
        model_name: Checkpoint file name without extension. The default keeps
            the historical file name.
            NOTE(review): the default mentions "355M"; confirm it matches the
            configuration actually being trained.
    """
    save_dir = os.path.join(os.getcwd(), "gpt_models")
    os.makedirs(save_dir, exist_ok=True)
    save_path = os.path.join(save_dir, f"{model_name}.pth")
    torch.save({
        "model_state_dict": model.state_dict(),
        "optimizer_state_dict": optimizer.state_dict()
    }, save_path)
    print(f"Pretrained model has been saved successfully at {save_path}")




def load_pretrained_model(model, model_args, device, model_name="GPT2-355M"):
    """Load model weights from ``./gpt_models/<model_name>.pth``.

    Args:
        model: Either a callable (typically the model class) that builds the
            model from ``model_args``, or an already constructed ``nn.Module``
            whose weights should be overwritten.
        model_args: Argument passed to ``model`` when it is a callable factory;
            ignored when ``model`` is already a module.
        device: ``map_location`` used when reading the checkpoint. The
            returned model is not moved; callers do that themselves.
        model_name: Checkpoint file name without extension.

    Returns:
        The model with the checkpoint's ``"model_state_dict"`` loaded. The
        optimizer state stored in the checkpoint is not restored because only
        the model is returned.

    Raises:
        FileNotFoundError: If the checkpoint file does not exist.
    """
    model_path = os.path.join(os.getcwd(), "gpt_models", f"{model_name}.pth")

    if not os.path.exists(model_path):
        raise FileNotFoundError("There is no pretrained model. You need to train the model first.")

    # NOTE(review): torch.load unpickles the file; only load checkpoints from
    # a trusted source (or pass weights_only=True where the installed torch
    # version supports it).
    checkpoint = torch.load(model_path, map_location=device)

    # Accept a ready-made module as well as a factory/class.
    instance = model if isinstance(model, nn.Module) else model(model_args)
    instance.load_state_dict(checkpoint["model_state_dict"])
    print(f"Loaded pretrained model from {model_path}")
    return instance


In [19]:
def pretrain_gpt2(
        model,
        device,
        tokenizer,
        optimizer,
        train_data,
        val_data,
        batch_size,
        num_epochs,
        start_context
):
    """Build the loaders, run pre-training and save the resulting checkpoint.

    Both loaders use non-overlapping windows of
    ``GPT_MODEL_ARGS["context_length"]`` tokens. Training evaluates every
    5 batches on up to 5 batches. Nothing is returned.
    """
    window = GPT_MODEL_ARGS["context_length"]
    shared_loader_args = dict(
        batch_size=batch_size,
        max_length=window,
        stride=window,
        num_workers=0,
    )

    train_loader = create_dataloaders(
        train_data, tokenizer, drop_last=True, shuffle=True, **shared_loader_args
    )
    val_loader = create_dataloaders(
        val_data, tokenizer, drop_last=False, shuffle=False, **shared_loader_args
    )

    print("Initializing training...")
    # The loss histories are returned by the training loop but not used here.
    _ = pre_train_model(
        model,
        train_loader,
        val_loader,
        optimizer,
        device,
        num_epochs,
        eval_freq=5,
        eval_iter=5,
        start_context=start_context,
        tokenizer=tokenizer,
    )
    print("Pre-training complete.")

    save_model(model, optimizer)
    print("Pre-trained model saved.")


In [20]:
if __name__ == "__main__":
    # End-to-end pre-training run: build model, tokenizer and optimizer, load
    # the text corpus, split it, and train for a couple of epochs.

    model = GPTModel(GPT_MODEL_ARGS)
    device = get_device()
    print(f"Using device: {device}")
    model = model.to(device)
    tokenizer = tiktoken.get_encoding("gpt2")
    print("Tokenizer loaded")
    optimizer = torch.optim.AdamW(model.parameters(), lr=0.0004, weight_decay=0.1)

    # Raw string: the plain literal contained the invalid escape "\s", which
    # triggers a SyntaxWarning. The resulting path is unchanged.
    data_path = r"D:\software_3\Generative_models\chat_gpt2\input.txt"
    #"https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
    data = download_and_load_dataset(data_path)
    print("Dataset loaded")
    # partition_data returns (train, test, val); the test slice is unused here.
    train_data, _, val_data = partition_data(data)
    batch_size = 2
    num_epochs = 2
    start_context = "Else might I think that Clarence, Edward's brother"

    pretrain_gpt2(
        model,
        device,
        tokenizer,
        optimizer,
        train_data,
        val_data,
        batch_size,
        num_epochs,
        start_context
    )

"""Were but a feigned friend to our proceedings:
    But welcome, sweet Clarence; my daughter shall be thine.
    And now what rests but, in night's coverture,
    Thy brother being carelessly encamp'd, """


  data_path = "D:\software_3\\Generative_models\\chat_gpt2\\input.txt"


Using device: cuda
Tokenizer loaded
Dataset loaded
Training data length: 471840
Test data length: 55510
Validation data length: 27757
Initializing training...


Epoch 1: 100%|██████████| 68/68 [15:26<00:00, 13.63s/it, train_loss=6.419, val_loss=6.597] 



[Epoch 1] Sample Generation:
Else might I think that Clarence, Edward's brother                                                  


Epoch 2:  10%|█         | 7/68 [02:02<17:47, 17.49s/it, train_loss=6.413, val_loss=6.557]
  data_path = "D:\software_3\\Generative_models\\chat_gpt2\\input.txt"


KeyboardInterrupt: 

In [None]:

# `model` is bound to the GPTModel *class* here (not an instance);
# load_pretrained_model calls it with `model_args` to build the instance
# before loading the saved weights. The result is then moved to `device`.
model = GPTModel
model_args = GPT_MODEL_ARGS
device =get_device() 
model = load_pretrained_model(model, model_args, device, model_name="GPT2-355M").to(device)

Loaded pretrained model from d:\software_3\Generative_models\Text_models\gpt2\gpt_models\PRETRAINED_GPT_MODEL.pth


In [None]:
text = """Hath pawn'd an open hand in sign of love;
Else might I think that Clarence, Edward's brother,
Were but a feigned friend to our proceedings:"""

encoded_text =  text_to_token_ids(text, tokenizer).to(device)

# Put the freshly loaded model in inference mode so dropout layers are
# disabled during generation regardless of the configured drop rate.
model.eval()

idx = encoded_text
# Greedy decoding (temperature 0, no top-k) of 50 new tokens.
token_ids = generate(
        model=model,
        idx=encoded_text,
        max_new_tokens=50,
        context_size=GPT_MODEL_ARGS["context_length"],
        temperature=0.0,
        top_k=None,
        eos_id=None
    )

print(token_ids_to_text(token_ids, tokenizer))

Hath pawn'd an open hand in sign of love;
Else might I think that Clarence, Edward's brother,
Were but a feigned friend to our proceedings:
It is a shame
But now noly honour of the world or one
But in the end;
I have him the world,
And I say I do fear him,
And will be decres at the presence.

