### Aligning Transformer Architecture to GPT2 State Dict

In [1]:
from load_gpt2_weights import convert_gpt2_weights, load_gpt2_weights, run_inference
from gpt2 import TransformerSampler, ModelConfig, GenerationConfig

model_cfg = ModelConfig()
gen_cfg = GenerationConfig()
sampler = load_gpt2_weights(model_cfg, gen_cfg)
run_inference(sampler)

  from .autonotebook import tqdm as notebook_tqdm


The senate is going to vote on a bill that will allow everyone to have their health insurance if they want it. It's going to be a huge change for the country," said Sen. Richard Burr, R-N.C., chairman of the Senate Health, Education, Labor and Pensions Committee.


"The people of North Carolina have a right
****************
President Trump has projected unwavering confidence that he is winning the messaging war over the government shutdown. But behind the scenes, his team is increasingly concerned that the issue at the center of the debate will create political vulnerabilities for Republicans.


The White House has been pushing the White House to give more time to the Senate to pass a bill that would allow everyone to have their health insurance if they want it. But some Republicans are worried that the measure will create a new political
****************
The reason for the skyrocketing price of gas is that the government is not doing anything to stop it. It is not doing anything to st

# Train GPT 2 from scratch

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import datasets
from datasets import Dataset
from gpt2 import GPT2, ModelConfig, GenerationConfig, TransformerSampler
import wandb
import os
import numpy as np
import torch
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import random_split
from torch.nn.utils.rnn import pad_sequence
from transformers import PreTrainedTokenizerBase
from dataclasses import dataclass
from transformers import GPT2Tokenizer
from tqdm import tqdm

@dataclass
class TrainingConfig:
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    max_ctx = 1024
    batch_size = 6
    epochs = 1
    lr: float = 1e-3
    weight_decay: float = 1e-2
    wandb_project: str | None = "training_gpt2"
    wandb_name: str | None = None
    pad_token_id: int = 0

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token
training_config = TrainingConfig()
training_config.pad_token_id = tokenizer(tokenizer.pad_token)['input_ids'][0]

model_cfg = ModelConfig()
model_cfg.vocab_size = tokenizer.vocab_size

gen_cfg = GenerationConfig()
model = GPT2(model_cfg).to(training_config.device)

  from .autonotebook import tqdm as notebook_tqdm


## Pre-process Data and Store Tokenised Input-ids and Attention Mask

In [16]:
import datasets
from transformers import GPT2Tokenizer
import re
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token

In [8]:
def clean_text(text: str) -> str:
    text = re.sub(r'[^\x00-\x7F]+', '', text)
    text = re.sub(r'[\r\n]+', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def apply_chat_template(sample, tokenizer):
    # If sample is a dictionary with 'prompt' and 'text' keys
    # text = (
    #     tokenizer.eos_token +
    #     "User: " + sample["prompt"] + tokenizer.eos_token + '\n' +
    #     "Assistant: " + sample["text"] + tokenizer.eos_token
    # )

    # Sample is a single piece of string in this case
    sample = clean_text(sample)
    text = tokenizer.eos_token + sample + tokenizer.eos_token
    return text

def load_dataset(tokenizer):
    
    story_ds = datasets.load_dataset("datasets/children-stories", split="train")
    adversarial_ds = datasets.load_dataset("datasets/adversarial-stories", split="train")
    adversarial_ds_two = datasets.load_dataset("datasets/modified-adversarial-stories-two", split="train")

    def prepare_dataset(ds, cache_file_name, tokenizer):

        def format_and_tokenize(batch):
            all_chunks = []
            max_length = tokenizer.model_max_length

            for text in batch["text"]:
                formatted_text = apply_chat_template(text, tokenizer)
                tokens = tokenizer(formatted_text, truncation=False, padding=False)["input_ids"]

                # Split into multiple samples if longer than max_length
                for i in range(1, len(tokens), max_length):
                        chunk_ids = tokens[i:i + max_length]
                        all_chunks.append({
                            "input_ids": chunk_ids,
                            "attention_mask": [1] * len(chunk_ids)
                        })

            return {'input_ids': [chunk["input_ids"] for chunk in all_chunks],
                    'attention_mask': [chunk["attention_mask"] for chunk in all_chunks]}

        ds = ds.map(
            format_and_tokenize,
            batched=True,
            num_proc=16,
            remove_columns=ds.column_names,
            desc="Formatting and tokenizing",
            cache_file_name=cache_file_name,
            load_from_cache_file=True,
            writer_batch_size=50000
        )

        return ds

    adversarial_ds_two = prepare_dataset(adversarial_ds_two, "datasets/cache/adversarial-stories-two-processed.arrow", tokenizer)
    adversarial_ds = prepare_dataset(adversarial_ds, "datasets/cache/adversarial-stories-processed.arrow", tokenizer)
    story_ds = prepare_dataset(story_ds, "datasets/cache/children-stories-processed.arrow", tokenizer)

    combined_ds = datasets.concatenate_datasets([adversarial_ds_two, story_ds, adversarial_ds])
    return combined_ds, story_ds, adversarial_ds, adversarial_ds_two

In [9]:
combined_ds, story_ds, adversarial_ds, adversarial_ds_two = load_dataset(tokenizer)

In [6]:
len(adversarial_ds), len(story_ds), len(combined_ds), len(adversarial_ds_two)

(21203, 896700, 993326, 75423)

In [None]:
# This code is used to modify the adversarial stories two dataset and change its format
adversarial_ds_two = datasets.load_dataset("datasets/adversarial-stories-two", split="train")
texts = [item['conversations'][1]['value'] for item in adversarial_ds_two]
adversarial_ds_two = datasets.Dataset.from_dict({'text': texts})
adversarial_ds_two.to_json("modified_erotica-analysis-16K.jsonl", lines=True)

NameError: name 'datasets' is not defined

In [4]:
total_tokens = 0
for item in story_ds:
    total_tokens += len(item['input_ids'])
print(f"Total tokens in story_ds: {total_tokens}")

total_tokens = 0
for item in adversarial_ds:
    total_tokens += len(item['input_ids'])
print(f"Total tokens in adversarial_ds: {total_tokens}")

Total tokens in story_ds: 362103892
Total tokens in adversarial_ds: 21356910


In [7]:
total_tokens = 0
for item in adversarial_ds_two:
    total_tokens += len(item['input_ids'])
print(f"Total tokens in adversarial_ds: {total_tokens}")

Total tokens in adversarial_ds: 69578332


In [None]:
import datasets  # Add missing import

def get_sample_prompts(tokenizer):
    sample_prompts = []
    dataset_paths = {
        'children-stories': 'datasets/children-stories/Children-Stories-9-Final.json',
        'adversarial-books': 'datasets/adversarial-stories/data/train-00000-of-00001.parquet',
        'adversarial-books-two': 'datasets/modified-adversarial-stories-two/modified_erotica-analysis-16K.jsonl'
    }
    for dataset_name in dataset_paths.keys():
        if dataset_name == 'adversarial-books':
            prompts = datasets.load_dataset("parquet", data_files=dataset_paths[dataset_name], split="train")
        else:
            prompts = datasets.load_dataset("json", data_files=dataset_paths[dataset_name], split="train")
        prompts = [apply_chat_template(prompts[i]['text'], tokenizer) for i in range(2)]
        sample_prompts.extend(prompts)

    return sample_prompts

sample_prompts = get_sample_prompts(tokenizer)

In [10]:
sample_prompts

['<|endoftext|>Emma and Oliver were best friends who loved playing tennis together after school. They would often challenge each other to matches and keep track of their wins and losses on a scoreboard. One day, while recording the results of their latest match, Emma asked Oliver, "Do you think our score today means we tied?" Oliver looked confused and replied, "Let me see. We played until 7 points in the first game and then again in the second game, so our scores look like this: Emma won 7 - 6, 6 - 2." Emma nodded, "Yes, that\'s right! But what about your score? You lost 7 - 6, 6 - 2." "Exactly!" said Oliver, smiling. "So even though we had the exact same number of points, I still lost because you got more games than me." They thought some more about it and decided to ask Mr. Thompson, their science teacher, during recess. When they showed him their scores, he explained, "Even though you both scored the same amount of points, winning requires getting more games than your opponent. Thi

In [11]:
class DynamicPaddingCollator:
    def __init__(self, pad_token_id):
        self.pad_token_id = pad_token_id
    
    def __call__(self, batch):
        
        input_ids = [torch.tensor(sample['input_ids']) for sample in batch]
        attention_mask = [torch.tensor(sample['attention_mask']) for sample in batch]
        
        input_ids_padded = pad_sequence(input_ids, batch_first=True, padding_value=self.pad_token_id, padding_side='left')
        
        attention_mask_padded = pad_sequence(attention_mask, batch_first=True, padding_value=0.0, padding_side='left')
        
        return {
            'input_ids': input_ids_padded,
            'attention_mask': attention_mask_padded
        }

In [17]:
from torch.utils.data import DataLoader

data_collator = DynamicPaddingCollator(tokenizer.pad_token_id)
dl = DataLoader(combined_ds, batch_size=10, shuffle=True, collate_fn=data_collator, num_workers=16, pin_memory=True)

# Single layer transformer model

In [1]:
import torch

In [3]:
state_dict = torch.load('/home/ubuntu/MechInter/GPT-2/GPT-2/Checkpoints/model_checkpoint_100pct_step_165488.pt')

In [9]:
from load_gpt2_weights import convert_gpt2_weights, load_gpt2_weights, run_inference
from gpt2 import TransformerSampler, ModelConfig, GenerationConfig, GPT2
from transformers import GPT2Tokenizer

model_cfg = ModelConfig()
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model_cfg.vocab_size = tokenizer.vocab_size
tokenizer.pad_token = tokenizer.eos_token
model = GPT2(model_cfg).to('cuda')
model.load_state_dict(state_dict)

<All keys matched successfully>

In [60]:
def run_inference(prompt):
    tokens = tokenizer(prompt, return_tensors="pt").to('cuda')
    max_new_tokens = 250
    for _ in range(max_new_tokens):
        logits = model(tokens['input_ids'], attn_mask=tokens['attention_mask'])
        final_logits = logits[:,-1,:] # B, Vocab_size
        chosen_token = final_logits.argmax(dim = -1, keepdim = False)
        tokens['input_ids'] = torch.cat([tokens['input_ids'],chosen_token.unsqueeze(dim=-1)], dim=-1)
        tokens['attention_mask'] = torch.cat([tokens['attention_mask'], torch.tensor([1]).unsqueeze(dim=-1).to('cuda')], dim=-1)
        
    return tokenizer.batch_decode(tokens['input_ids'])[0]

In [78]:
prompt = """<|endoftext|>The old woman"""

print(run_inference(prompt))

<|endoftext|>The old woman was a curious little girl who loved to learn new things. One day, she found a book about famous people in history. She opened the book and started reading. "Did you know that in the early 1900s, there was a man named John who was born in the early 1900s?" she asked her friend, Timmy, who was always eager to share his knowledge. Timmy looked surprised. "Really? That's so cool! But what does 'born' mean?" he wondered aloud. Timmy explained, "It means when someone dies, their body goes through changes in their body. It's like how water changes into different forms - we can't see it happen, but we can't see it happen." As they continued reading, they discovered that John had been born in the early 1900s and was born in the early 1900s. They were amazed by the fact that John's body was still alive even after he was born. Suddenly, Timmy had an idea. "Hey, let's pretend we're scientists and figure out how John's body changed!" They both laughed and agreed that it w