In [1]:
toks = 30_000_000
batch_to_second = 1
avg_len_tokens = 700 
micro_batch = 4 


print(f'Target tokens: {toks}, sample: {toks / avg_len_tokens:.0f}, iters (bs=4): {toks / avg_len_tokens / 4:.0f}, steps (accum=8): {toks / avg_len_tokens / 4 / 8:.0f}')
print(f"Approx hours to train: {((toks / avg_len_tokens / micro_batch) / batch_to_second ) / 60 / 60:.1f}")

Target tokens: 30000000, sample: 42857, iters (bs=4): 10714, steps (accum=8): 1339
Approx hours to train: 3.0


In [3]:
%load_ext autoreload
%autoreload 2

import torch
from transformers import LlamaConfig, LlamaForCausalLM, LlamaTokenizer, AutoModelForCausalLM, AutoTokenizer, DataCollatorForLanguageModeling
from datasets import load_dataset 
from torch.utils.data import DataLoader
from datasets import Dataset

import torch
import litgpt
# import litdata as ld
# from litgpt.pretrain import initialize_weights
import lightning as L
import os
from aux.arch_mod import Mods

os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [4]:
[Mods.first_layer_key_value]

[<function aux.arch_mod.mod_first_layer_key_value(lit_model: aux.arch_mod.LitLLM)>]

In [17]:
# tokenizer_name = "meta-llama/Llama-3.2-1B"
tokenizer_name = "TinyLlama/TinyLlama_v1.1"
# model_name = "meta-llama/Meta-Llama-3-8B-Instruct"

# model = AutoModelForCausalLM.from_pretrained(tokenizer_name)
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, max_len_single_sentence=2048)
# tokenizer.model_max_length = 2048
tokenizer.pad_token = tokenizer.eos_token
# tokenizer.model_max_length = 2048

config = LlamaConfig(
    vocab_size=tokenizer.vocab.__len__(),  # Use the same vocabulary size as the original model
    hidden_size=512,
    intermediate_size=1024*4,
    num_hidden_layers=2,#12,
    num_attention_heads=8,
    num_key_value_heads=8,
    head_dim=64,
    max_position_embeddings=tokenizer.model_max_length,  # Same as Llama-3
    rms_norm_eps=1e-5,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.pad_token_id,
    # Additional Llama-3 specific parameters
    rope_theta=250000.0,
    attention_bias=False,
    tie_word_embeddings=True,
    model_type="llama",
    _attn_implementation='sdpa'
    # _attn_implementation_autoset=True,
)

model = LlamaForCausalLM(config)

def count_trainable_parameters(model):
    model_parameters = filter(lambda p: p.requires_grad, model.parameters())
    params = sum([torch.prod(torch.tensor(p.size())) for p in model_parameters])
    print(f"Actual size of model: {params / 1024 ** 2:.0f}M")

count_trainable_parameters(model)

model

Actual size of model: 30M


LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 512, padding_idx=2)
    (layers): ModuleList(
      (0-1): 2 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=512, out_features=512, bias=False)
          (k_proj): Linear(in_features=512, out_features=512, bias=False)
          (v_proj): Linear(in_features=512, out_features=512, bias=False)
          (o_proj): Linear(in_features=512, out_features=512, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=512, out_features=4096, bias=False)
          (up_proj): Linear(in_features=512, out_features=4096, bias=False)
          (down_proj): Linear(in_features=4096, out_features=512, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((512,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((512,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((512,), eps=1e-05)
    (rotary_emb): 

In [4]:
aaa = None

def hook(module, input, output):
    global aaa
    print(f'module: {module}, input: {input[0].shape}')
    aaa = output
     

model.model.layers[0].self_attn.k_proj.register_forward_hook(hook)


<torch.utils.hooks.RemovableHandle at 0x7b7ce0b03090>

In [14]:
a = torch.ones(1, 1, 512)
b = torch.zeros(1, 1, 512)

In [16]:
torch.concat([a[:, :-1, :], b[:, -1:, :]], -2)

tensor([[[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 

In [18]:
inputs = tokenizer("My name is ", return_tensors="pt")
print(inputs)
outputs = model(**inputs)
print(tokenizer.batch_decode(outputs))


{'input_ids': tensor([[    1,  1619,  1024,   338, 29871]]), 'attention_mask': tensor([[1, 1, 1, 1, 1]])}


TypeError: argument 'ids': Can't extract `str` to `Vec`

In [40]:
aaa.shape

torch.Size([1, 1, 512])

In [36]:
model = mod_knn_as_lm_head(model)

inputs = tokenizer("My name is ", return_tensors="pt")
outputs = provide_litllm_with_mod(config, total_steps=111, mods=['parallel_attention', 'knn_as_lm_head']).model.generate(**inputs)
print(tokenizer.batch_decode(outputs))


["<s> My name is 当onio ApplicationFig shot Норaming DDRuß Christoph todoenden Vienengono Распо'}endorfneumango�"]


In [41]:
MODS = ['adsf_sdfsdf_sfsf', 'sdfsdf_dfsfdbbg_Bbbbb']

mods_str = '+'.join(
    "".join([p[0] for p in m.split('_')]) 
    for m in MODS
) 


In [3]:
18*55

990

In [42]:
mods_str

'ass+sdB'

True

In [8]:
model.cuda()

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 512, padding_idx=2)
    (layers): ModuleList(
      (0-1): 2 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=512, out_features=512, bias=False)
          (k_proj): Linear(in_features=512, out_features=512, bias=False)
          (v_proj): Linear(in_features=512, out_features=512, bias=False)
          (o_proj): Linear(in_features=512, out_features=512, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=512, out_features=4096, bias=False)
          (up_proj): Linear(in_features=512, out_features=4096, bias=False)
          (down_proj): Linear(in_features=4096, out_features=512, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((512,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((512,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((512,), eps=1e-05)
    (rotary_emb): 

In [1]:
import torch
from transformers import LlamaConfig, LlamaForCausalLM, LlamaTokenizer, AutoModelForCausalLM, AutoTokenizer, DataCollatorForLanguageModeling
from datasets import load_dataset, Dataset, IterableDataset
import datasets
# datasets.config.STREAMING_READ_MAX_RETRIES = 30
torch.set_float32_matmul_precision("high")

from torch.utils.data import DataLoader

import torch
import lightning as L
import os
import logging
from litgpt.utils import chunked_cross_entropy
# from lightning.pytorch.loggers import WandbLogger
from lightning.pytorch.callbacks import ModelCheckpoint, LearningRateMonitor
import datetime

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
GLOBAL_BATCH = 1024
MICRO_BATCH = 12
ACCUMULATED_BATCHES = GLOBAL_BATCH // MICRO_BATCH
VAL_BATCH = 8
TARGET_TRAIN_TOKENS = 300_000_000
ITERS = TARGET_TRAIN_TOKENS // 700
MAX_STEPS = ITERS // GLOBAL_BATCH
MAX_LENGTH = 1024
tokenizer_name = "TinyLlama/TinyLlama_v1.1"

# model_name = "meta-llama/Meta-Llama-3-8B-Instruct"

tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, max_len_single_sentence=MAX_LENGTH)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.model_max_length = MAX_LENGTH


config = LlamaConfig(
    vocab_size=tokenizer.vocab.__len__(),  # 32k size
    hidden_size=512,
    intermediate_size=512*4,
    num_hidden_layers=12,
    num_attention_heads=16,
    num_key_value_heads=16,
    head_dim=32,
    max_position_embeddings=tokenizer.model_max_length,  # 2048 tokens
    rms_norm_eps=1e-5,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.pad_token_id,
    # Additional Llama-3 specific parameters
    rope_theta=250000.0,
    attention_bias=False,
    tie_word_embeddings=True,
    model_type="llama",
    torch_dtype='float16',
    _attn_implementation='eager'
    # _attn_implementation_autoset=True,
)

model = LlamaForCausalLM(config)

class LitLLM(L.LightningModule):
    def __init__(self, model):
        super().__init__()
        self.model = model
        self._consumed_tokens = 0
    
    def training_step(self, batch, batch_idx):
        outputs = self.model(**batch)
        predict = outputs['logits'][..., :-1, :]
        target = batch["input_ids"][..., 1:]
        loss = chunked_cross_entropy(predict, target, chunk_size=256, ignore_index=self.model.config.pad_token_id)

        self.log("train/loss", loss, prog_bar=True)
        # self.log('train/steps', batch_idx // ACCUMULATED_BATCHES, prog_bar=True)

        self._consumed_tokens += batch['attention_mask'].sum().item()
        self.log('train/consumed_tokens', self._consumed_tokens)
        return loss
    
    def validation_step(self, batch):
        outputs = self.model(**batch)
        predict = outputs['logits'][..., :-1, :]
        target = batch["input_ids"][..., 1:]
        loss = chunked_cross_entropy(predict, target, chunk_size=256, ignore_index=self.model.config.pad_token_id)

        self.log("val/loss", loss, prog_bar=True)


    def configure_optimizers(self):
        # warmup_steps = int(0.15 * MAX_STEPS)
        optimizer = torch.optim.AdamW(self.model.parameters(), lr=4e-4, weight_decay=0.1, betas=(0.9, 0.95))
        scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr=1e-4, total_steps=MAX_STEPS, pct_start=0.1)
        return {"optimizer": optimizer, "lr_scheduler": {"scheduler": scheduler, "interval": "step"}}


In [2]:
def count_trainable_parameters(model):
    model_parameters = filter(lambda p: p.requires_grad, model.parameters())
    params = sum([torch.prod(torch.tensor(p.size())) for p in model_parameters])
    print(f"Actual size of model: {params / 1024 ** 2:.0f}M")
count_trainable_parameters(model)

Actual size of model: 64M


In [3]:
# lit_llm = LitLLM.load_from_checkpoint('base_microllama_150m/v_300Mtok_418steps_1024gbs/checkpoints/step=410-val_loss=5.63.ckpt', strict=True, model=model)

In [9]:
checkpoint = torch.load('base_microllama_64m/v_1000Mtok_1417steps_1008gbs/checkpoints/step=1403-val_loss=3.97.ckpt')
print(checkpoint.keys())


dict_keys(['epoch', 'global_step', 'pytorch-lightning_version', 'state_dict', 'loops', 'callbacks', 'optimizer_states', 'lr_schedulers'])


In [10]:
model.load_state_dict({k[6:]: v for k, v in checkpoint['state_dict'].items()})

<All keys matched successfully>

In [11]:
model.eval()
model.cpu()

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 512, padding_idx=2)
    (layers): ModuleList(
      (0-11): 12 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=512, out_features=512, bias=False)
          (k_proj): Linear(in_features=512, out_features=512, bias=False)
          (v_proj): Linear(in_features=512, out_features=512, bias=False)
          (o_proj): Linear(in_features=512, out_features=512, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=512, out_features=2048, bias=False)
          (up_proj): Linear(in_features=512, out_features=2048, bias=False)
          (down_proj): Linear(in_features=2048, out_features=512, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((512,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((512,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((512,), eps=1e-05)
    (rotary_emb)

In [39]:
from transformers import GenerationConfig

inputs = tokenizer("London is ", return_tensors="pt")

outputs = model.generate(**inputs, generation_config=GenerationConfig(do_sample=True, max_new_tokens=50))
print(tokenizer.batch_decode(outputs))


['<s> London is 4-year, 24/7, starting next week in a large city, with a 8.7% to 5.4% drive from San Francisco for rent. The town has a 4.7% interest as a']


In [53]:
gpt2 = AutoModelForCausalLM.from_pretrained("gpt2")


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


In [54]:
gpt2.config

GPT2Config {
  "_attn_implementation_autoset": true,
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "torch_dtype": "float32",
  "transformers_version": "4.51.2",
  "use_cache": true,
  "vocab_size": 50257
}

In [31]:
outputs

tensor([[    1, 29871, 29906,   718, 29871, 29906,   353,  1678, 29871, 29906,
         29900, 29896, 29929, 29871, 29906, 29900, 29896, 29929, 29871, 29906,
         29900, 29896, 29929, 29871, 29906, 29900, 29896, 29929]])

In [10]:

lit_llm.model.generate(torch.tensor([1, 2]))

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


IndexError: tuple index out of range

In [4]:
train_dataset = load_dataset('cerebras/SlimPajama-627B', split=f"train", streaming=True)
train_dataset = (train_dataset
    .shuffle(seed=42, buffer_size=50_000)
    .map(
        lambda x: tokenizer(x['text']), 
        batched=True, batch_size=1000
    )
    # .batch(MICRO_BATCH)
    # .map(lambda batch: batch_padding(batch, tokenizer.pad_token_id, 2048))
)
val_dataset = load_dataset('cerebras/SlimPajama-627B', split=f"validation", streaming=True)
val_dataset = (val_dataset
    .shuffle(seed=42, buffer_size=10_000)
    .map(
        lambda x: tokenizer(x['text']), 
        batched=True, batch_size=100
    )
    # .batch(VAL_BATCH)
    # .map(lambda batch: batch_padding(batch, tokenizer.pad_token_id, 2048))
)


In [5]:
1_000_000_000 // 700

1428571

In [8]:
from tqdm import tqdm

res = []
skipped = 0
it = iter(train_dataset)
for _ in tqdm(range(1_500_000)):
    try:    
        res.append(next(it))
    except:
        skipped += 1

100%|██████████| 1500000/1500000 [00:23<00:00, 62598.67it/s]  


In [9]:
from tqdm import tqdm

res_val = []
for i, batch in enumerate(tqdm(val_dataset)):
    if i >= 150_000:
        break

    res_val.append(batch)

7301it [07:38, 20.89it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (136122 > 131072). Running this sequence through the model will result in indexing errors
30400it [24:50, 20.40it/s]


KeyboardInterrupt: 

In [10]:
len(res_val)

30400

In [14]:
from datasets import SplitInfo, DatasetInfo, NamedSplit
import sys

inmemory_300k_val = Dataset.from_list(
    res_val, 
    info=DatasetInfo(
        dataset_name='SlimPajama-627B',
        description='first 30400 with shuffle(seed=42, buffer_size=10_000) and llama3 tokenizer',
        dataset_size=len(res_val),
        size_in_bytes=sys.getsizeof(res_val)
        ),
    split=NamedSplit('validation')
)

In [16]:
num_shards = 3
output_path_template = "slim_pajama_300k/validation/{index:05d}.parquet"
for index in range(num_shards):
    shard = inmemory_300k_val.shard(index=index, num_shards=num_shards, contiguous=True)
    shard.to_parquet(output_path_template.format(index=index))

Creating parquet from Arrow format: 100%|██████████| 11/11 [00:00<00:00, 27.44ba/s]
Creating parquet from Arrow format: 100%|██████████| 11/11 [00:00<00:00, 28.68ba/s]
Creating parquet from Arrow format: 100%|██████████| 11/11 [00:00<00:00, 28.13ba/s]


In [13]:
data_files = {"train": "train/*.parquet", "validation": "validation/*.parquet"}
test_ds: Dataset = load_dataset("slim_pajama_300k", data_files=data_files, num_proc=8)
test_ds = test_ds.filter(
 lambda b: b['meta']['redpajama_set_name'] not in {'RedPajamaStackExchange', 'RedPajamaGithub', 'RedPajamaArXiv'} 
)

# 'RedPajamaArXiv',
#  'RedPajamaBook',
#  'RedPajamaC4',
#  'RedPajamaCommonCrawl',
#  'RedPajamaGithub',
#  'RedPajamaStackExchange',
#  'RedPajamaWikipedia'
# test_ds = test_ds.remove_columns(['text', 'meta'])

Filter: 100%|██████████| 300000/300000 [02:09<00:00, 2320.51 examples/s]
Filter: 100%|██████████| 30400/30400 [00:13<00:00, 2298.85 examples/s]


In [15]:
test_ds['validation'].__len__()

27651

In [26]:
set([x['meta']['redpajama_set_name'] for x in test_ds])

{'RedPajamaArXiv',
 'RedPajamaBook',
 'RedPajamaC4',
 'RedPajamaCommonCrawl',
 'RedPajamaGithub',
 'RedPajamaStackExchange',
 'RedPajamaWikipedia'}

In [12]:
set([tuple(sorted(x['meta'].keys())) for x in test_ds['train']])

{('redpajama_set_name',)}

In [34]:
from torch.utils.data import DataLoader

def collate_fn(batch):
    pad_idx, max_len = 133333, 2048
    batch_max = min(max_len, max(map(lambda x: len(x['input_ids']), batch)))

    ret = dict()
    ret['input_ids'] = torch.tensor([
        x['input_ids'][:batch_max] + [pad_idx] * max(0, batch_max - len(x['input_ids']))
        for x in batch
    ])
    ret['attention_mask'] = torch.tensor([
        x['attention_mask'][:batch_max] + [0] * max(0, batch_max - len(x['attention_mask']))
        for x in batch
    ])
    
    return ret

test_dl = DataLoader(test_ds['train'], batch_size=4, shuffle=False, collate_fn=collate_fn, num_workers=4)

next(iter(test_dl))

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

{'input_ids': tensor([[128000,  20204,    922,  ...,    889,    374,  13176],
         [128000,  75341,  42691,  ..., 133333, 133333, 133333],
         [128000,   1757,    865,  ..., 133333, 133333, 133333],
         [128000,  27798,  46733,  ..., 133333, 133333, 133333]]),
 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]])}

In [33]:
test_ds[0]

{'text': 'Posts about 5 non negotiables dating non-negotiables. I love more than a dating after 50: 12 non-negotiables. Posted by sandy weiner in love is important. Relationship. Welcome to make a sunday afternoon of non-negotiables or external qualities that every. What Are. Your 5 Non-Negotiables. Posted by Rosie Qualtere-Burcher on October 13, 2016. A little under two years ago . Its funny to look at the list I have made for myself now, and realize that some of the men. I have dated in my past hardly met any of the criteria. Your 5 non-negotiable\'s are things which when it comes to relationships you can\'t stand, and in order to set an outline for what you are looking for, you need to decide what your 5 non-negotiables are. They could be anything, but make them realistic and be honest. The reason is simple, Non-negotiables are key. Core values that you must have aligned with a partner andor a relationship in order for that . When Is Tom Leake On A Dating Sites are dating DDating ho

In [24]:
sys.getsizeof(inmemory_300k) 

56

In [24]:
def batch_padding(batch, pad_idx, max_len=2048):
    batch_max = min(max_len, max(map(len, batch['input_ids'])))

    batch['input_ids'] = torch.tensor([
        x[:batch_max] + [pad_idx] * max(0, batch_max - len(x))
        for x in batch['input_ids']
    ])
    batch['attention_mask'] = torch.tensor([
        x[:batch_max] + [0] * max(0, batch_max - len(x))
        for x in batch['attention_mask']
    ])
    
    return batch


dataset = load_dataset('cerebras/SlimPajama-627B', split=f"train", streaming=True)
dataset = (dataset
    .shuffle(seed=42, buffer_size=10_000)
    .map(lambda x: tokenizer(x['text']), batched=True, batch_size=100, remove_columns=['text', 'meta'])
        # .with_format("torch")

    .batch(2)
    .map(lambda x: batch_padding(x, tokenizer.pad_token_id, 2048))
)

# dataset = dataset.


In [57]:
one_batch['input_ids'][0].__len__()

413

In [25]:
one_batch = next(iter(dataset))
one_batch 

{'input_ids': tensor([[128000,   4599,   1053,    499,   1093,    311,   4822,    520,    350,
           45036,  18072,  59359,    527,  19167,    449,    264,   6007,    323,
             665,  16578,  13077,    627,   2028,   3130,   6209,    264,   1684,
             315,    279,  24269,   7463,    323,   5764,    264,    879,  15197,
             449,    264,  81188,    627,   2028,   2033,   3130,    706,    264,
           10228,   3262,    277,    901,   6558,    323,   3805,  35121,  82927,
            5296,    430,  14177,    374,   5343,    389,    279,   1938,    315,
           19163,     11,    323,  17954,    374,   5343,    279,   1828,   1938,
              13,  49191,  17610,    315,    264,  36433,     11,    743,   5130,
            1701,   7878,     11,   2254,   8356,    320,   3696,  15872,    527,
             539,   5343,   4390,   2028,  28497,   3130,    706,    264,  10228,
            3262,    277,    901,   6558,    323,   3805,  35121,  82927,   5296,
   

In [29]:
[1, 2][:100]

[1, 2]

In [53]:
batch_padding(
    {
        "input_ids": [[1, 2, 3], [1, 2, 3, 4, 5, 6]],
        "attention_mask": [[1, 1, 1], [1,1,1,1,1,1]]
    },
    111,
    5
)

{'input_ids': tensor([[  1,   2,   3, 111, 111],
         [  1,   2,   3,   4,   5]]),
 'attention_mask': tensor([[1, 1, 1, 0, 0],
         [1, 1, 1, 1, 1]])}

In [22]:
lens = []

for i, x in enumerate(dataset):
    lens.append(len(x['input_ids']))
    
    if i > 200: break 

In [28]:
import numpy as np

np.mean(lens), np.min(lens), np.max(lens), np.median(lens), np.percentile(lens, 75), np.percentile(lens, 95), np.percentile(lens, 99)

(707.0297029702971,
 27,
 9435,
 423.0,
 757.75,
 2355.2499999999995,
 4685.130000000007)

In [None]:
one_batch = next(iter(dataset))
one_batch 

{'input_ids': [128000,
  4599,
  1053,
  499,
  1093,
  311,
  4822,
  520,
  350,
  45036,
  18072,
  59359,
  527,
  19167,
  449,
  264,
  6007,
  323,
  665,
  16578,
  13077,
  627,
  2028,
  3130,
  6209,
  264,
  1684,
  315,
  279,
  24269,
  7463,
  323,
  5764,
  264,
  879,
  15197,
  449,
  264,
  81188,
  627,
  2028,
  2033,
  3130,
  706,
  264,
  10228,
  3262,
  277,
  901,
  6558,
  323,
  3805,
  35121,
  82927,
  5296,
  430,
  14177,
  374,
  5343,
  389,
  279,
  1938,
  315,
  19163,
  11,
  323,
  17954,
  374,
  5343,
  279,
  1828,
  1938,
  13,
  49191,
  17610,
  315,
  264,
  36433,
  11,
  743,
  5130,
  1701,
  7878,
  11,
  2254,
  8356,
  320,
  3696,
  15872,
  527,
  539,
  5343,
  4390,
  2028,
  28497,
  3130,
  706,
  264,
  10228,
  3262,
  277,
  901,
  6558,
  323,
  3805,
  35121,
  82927,
  5296,
  430,
  14177,
  374,
  5343,
  389,
  279,
  1938,
  315,
  19163,
  11,
  323,
  17954,
  374,
  5343,
  279,
  1828,
  1938,
  13,
  49191,
  176

In [38]:
model.config._attn_implementation

'sdpa'

In [37]:
model(input_ids = one_batch['input_ids'].cuda(), attention_mask = one_batch['attention_mask'].cuda())

OutOfMemoryError: CUDA out of memory. Tried to allocate 64.00 GiB. GPU 0 has a total capacity of 23.69 GiB of which 15.91 GiB is free. Process 858019 has 934.00 MiB memory in use. Process 859749 has 934.00 MiB memory in use. Including non-PyTorch memory, this process has 5.92 GiB memory in use. Of the allocated memory 5.63 GiB is allocated by PyTorch, and 1003.50 KiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [7]:
class LitLLM(L.LightningModule):
    def __init__(self, model):
        super().__init__()
        self.model = model

    # def on_train_start(self):
    #     initialize_weights(self.trainer, self.model, n_layer=self.model.config.n_layer, n_embd=self.model.config.n_embd)

    def training_step(self, batch):
        logits = self.model(**batch)
        loss = torch.nn.functional.cross_entropy(logits[..., :-1, :], batch["input_ids"][..., 1:])
        self.log("train_loss", loss, prog_bar=True)
        return loss

    def configure_optimizers(self):
        warmup_steps = 500
        optimizer = torch.optim.AdamW(self.model.parameters(), lr=4e-4, weight_decay=0.1, betas=(0.9, 0.95))
        scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lambda step: min(step / warmup_steps, 1.0))
        return {"optimizer": optimizer, "lr_scheduler": {"scheduler": scheduler, "interval": "step"}}


# if __name__ == "__main__":
#     train_dataset = ld.StreamingDataset(
#         input_dir="s3://tinyllama-template/slimpajama/train",
#         item_loader=ld.TokensLoader(block_size=128),
#     )
#     train_dataloader = ld.StreamingDataLoader(train_dataset, shuffle=True, batch_size=12, num_workers=4)

lit_llm = LitLLM(model=model)
trainer = L.Trainer(max_steps=50000, accelerator='cuda')
trainer.fit(lit_llm, dataset)


You are using the plain ModelCheckpoint callback. Consider using LitModelCheckpoint which with seamless uploading to Model registry.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type             | Params | Mode 
---------------------------------------------------
0 | model | LlamaForCausalLM | 162 M  | train
---------------------------------------------------
162 M     Trainable params
0         Non-trainable params
162 M     Total params
650.138   Total estimated model params size (MB)
33        Modules in train mode
0         Modules in eval mode


RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
list(filter(lambda p: p.requires_grad, ))


[Parameter containing:
 tensor([[ 0.0215, -0.0155, -0.0413,  ..., -0.0011,  0.0492, -0.0121],
         [-0.0091, -0.0120,  0.0028,  ..., -0.0049, -0.0189, -0.0090],
         [ 0.0268,  0.0064,  0.0055,  ...,  0.0113, -0.0038,  0.0080],
         ...,
         [ 0.0262,  0.0124,  0.0422,  ..., -0.0004,  0.0228, -0.0264],
         [ 0.0004,  0.0083,  0.0308,  ..., -0.0025,  0.0214,  0.0233],
         [-0.0161, -0.0297,  0.0026,  ...,  0.0260, -0.0261, -0.0002]],
        requires_grad=True),
 Parameter containing:
 tensor([[ 0.0132,  0.0042, -0.0003,  ..., -0.0299,  0.0231,  0.0126],
         [-0.0053,  0.0099,  0.0160,  ..., -0.0094, -0.0092, -0.0075],
         [-0.0013,  0.0065, -0.0130,  ..., -0.0031,  0.0073, -0.0014],
         ...,
         [ 0.0146, -0.0123,  0.0206,  ...,  0.0016,  0.0041,  0.0114],
         [ 0.0030,  0.0406,  0.0192,  ...,  0.0028, -0.0166,  0.0336],
         [-0.0128, -0.0135, -0.0073,  ..., -0.0029, -0.0171, -0.0381]],
        requires_grad=True),
 Parameter con

In [31]:
tokenizer.vocab_size, tokenizer.vocab.__len__()

(128000, 128256)

In [6]:
model.config

LlamaConfig {
  "_attn_implementation_autoset": true,
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 128000,
  "eos_token_id": 128001,
  "head_dim": 64,
  "hidden_act": "silu",
  "hidden_size": 2048,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 131072,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 16,
  "num_key_value_heads": 8,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": {
    "factor": 32.0,
    "high_freq_factor": 4.0,
    "low_freq_factor": 1.0,
    "original_max_position_embeddings": 8192,
    "rope_type": "llama3"
  },
  "rope_theta": 500000.0,
  "tie_word_embeddings": true,
  "torch_dtype": "float32",
  "transformers_version": "4.51.0",
  "use_cache": true,
  "vocab_size": 128256
}

In [10]:
8192 * 32

262144

In [4]:
tokenizer.vocab.__len__()

128256

In [1]:
import torch
x = torch.randn(2, 5, 6, 6)
torch.diagonal(x, offset=0, dim1=-2, dim2=-1).shape


torch.Size([2, 5, 6])

In [12]:
a = torch.tensor([2, 1])
b = torch.tensor([[1, 0], [3, 1]])

torch.einsum('i,ij->ij', a, b)


tensor([[2, 0],
        [3, 1]])

In [None]:
a = torch.tensor([[2, 0], [0, 3]])
b = torch.tensor([[1, 0], [0, 1]])

torch.einsum('ij,ij->i', a, b)

diag_a = torch.diagonal(a, dim1=-1, dim2=-2) 
diag_a = torch.tensor([-1, -1])

In [7]:
torch.eye(*a.shape[-2:], dtype=torch.float32)[..., None].shape

torch.Size([2, 2, 1])

In [16]:
torch.concat([a, b], dim=-1).shape

torch.Size([2, 4])

In [7]:
type(tokenizer)

transformers.tokenization_utils_fast.PreTrainedTokenizerFast

In [17]:
from aux.arch_mod import Mods

SyntaxError: invalid syntax. Perhaps you forgot a comma? (arch_mod.py, line 151)