Model: small-llama2. https://huggingface.co/TinyPixel/small-llama2



# This is the developing file, verifying the functionality of different modules. Please ignore this file.

# Preparing Data
## Loading data

In [1]:
MY_TRAINING = False

In [2]:
from datasets import load_dataset, DatasetDict
# dataset = load_dataset("bookcorpus", download_mode='force_redownload')
dataset = load_dataset("bookcorpus")
# dataset.cleanup_cache_files()
if MY_TRAINING:
    dataset = dataset['train']
else:
    dataset = dataset['train'].select(range(74_00_000))
    # 74_00_000 pass
    # 74_00_000 failed
'''
Dataset({
    features: ['text'],
    num_rows: 74 004 228
})
'''
dataset
# OSError: Invalid flatbuffers message. -- Download data again, OR Only load a small amount of data, OR reboot
# ArrowInvalid: Old metadata version not supported
# DatasetGenerationError: An error occurred while generating the dataset 
# OSError: Corrupt snappy compressed data. -- Download data again


Dataset({
    features: ['text'],
    num_rows: 7400000
})

In [3]:
splitted_datasets = dataset.train_test_split(train_size=0.8, seed=42)
# Rename the default "test" split to "validation"
splitted_datasets["validation"] = splitted_datasets.pop("test")
'''
DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 59203382
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 14800846
    })
})
'''
splitted_datasets
# OSError: Invalid flatbuffers message.


DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 5920000
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 1480000
    })
})

In [4]:
from transformers import AutoTokenizer, AutoModelForCausalLM, LlamaForCausalLM
from transformers import AutoConfig

tokenizer = AutoTokenizer.from_pretrained("TinyPixel/small-llama2")
# tokenizer.pad_token_id=tokenizer.eos_token_id
config = AutoConfig.from_pretrained("TinyPixel/small-llama2")
config.num_hidden_layers = 6 # originally, 12
# model = LlamaForCausalLM.from_pretrained("TinyPixel/small-llama2")
'''
这里遇到个问题，我想要让模型返回字典，而不是元组。于是想到在模型的配置文件和配置文件类里面加 
use_return_dict = True
但是尝试很久才发现。配置文件类是LlamaConfig， 并且类中并没有 use_return_dict 这个属性。
所以直接修改配置文件的方法肯定是不行的。得直接修改类。或者查找其提供的方法。
但是注意到一个事实，那就是 use_return_dict 实际上是Huggingface引入的。不是LlaMa自带的。

进一步深入了解到，在Evaluation的时候，其实模型返回的是字典。只是在调用自定义的 compute_matric的时候，
输入参数被包装成 EvalPredictionl了。 并且这个类型就两个或者三个参数，
predictions、label_ids和/或inputs
'''
model = LlamaForCausalLM(config)
model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 1024)
    (layers): ModuleList(
      (0-5): 6 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (k_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (v_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (o_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=1024, out_features=1376, bias=False)
          (up_proj): Linear(in_features=1024, out_features=1376, bias=False)
          (down_proj): Linear(in_features=1376, out_features=1024, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaRMSNorm()
  )
  (lm_head): Linear(i

In [5]:
# tokenization 
def tokenize(element):
    long_text = "".join(element['text'])
    # print(config.max_position_embeddings)
    outputs = tokenizer(config
        [long_text],
        truncation=True,
        return_overflowing_tokens=True,
        return_length=True,
        max_length=config.max_position_embeddings,
    )
    # print(outputs.keys())
    # print(len(outputs['input_ids']))
    return {"input_ids": outputs['input_ids']}


tokenized_datasets = splitted_datasets.map(
    tokenize, batched=True, remove_columns=splitted_datasets["train"].column_names, 
    batch_size=200# , num_proc=10
)
# batch_size=1000: index out of bounds: the len is 31172 but the index is 8589960764 -- decrease batch
# batch_size=500:  index out of bounds: the len is 30153 but the index is 283467863127
# ArrowInvalid: Column 1 named input_ids expected length 500 but got length 8
# IndentationError: unindent does not match any outer indentation level -- Restart kernel
# RuntimeError: One of the subprocesses has abruptly died during map operation.To debug the error, disable multiprocessing.
# If the map() function crushed, there is a high possibility to fail in later data-loading processes.

'''
Full data:
DatasetDict({
    train: Dataset({
        features: ['input_ids'],
        num_rows: 1 115 386
    })
    validation: Dataset({
        features: ['input_ids'],
        num_rows: 278 684
    })
})
'''
tokenized_datasets


Map:   0%|          | 0/5920000 [00:00<?, ? examples/s]

TypeError: 'LlamaConfig' object is not subscriptable

In [None]:
# tokenized_datasets.save_to_disk('./bookcorpus-splitted')
for line in tokenized_datasets['validation']['input_ids']:
    print(line[0:4])
    break


[1, 4319, 3307, 278]


In [None]:
from transformers import DataCollatorForLanguageModeling

tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)
# look at five examples
out = data_collator([tokenized_datasets["train"][i] for i in range(5)])
for key in out:
    print(f"{key} shape: {out[key].shape}")

input_ids shape: torch.Size([5, 1024])
attention_mask shape: torch.Size([5, 1024])
labels shape: torch.Size([5, 1024])


# Training

In [None]:
# The Trainer will put in predictions everything your model returns (apart from the loss). 
# So if you get multiple arrays, it’s likely because your model returns multiple things. 
# No one can help you determine what they are without seeing your model 
# (which is why you should always post the code you’re using when asking for help :wink: )
from torch.nn import CrossEntropyLoss
from torch import tensor, exp
def compute_metrics(eval_pred):
    print('Inside compute_metrics', eval_pred.predictions.shape, eval_pred.label_ids.shape)
    # Inside compute_metrics (11, 1024, 32000) (11, 1024)  numpy.ndarray
    # 这里的格式是 (batch, sequence_length, vocabulary) 和 (batch, sequence_length)
    # 至于这里的 11，实际上是因为validation 中只包含了11行。所以这里就是11
    loss_fct = CrossEntropyLoss()
    prediction = tensor(eval_pred.predictions).view(-1, 32000)
    labels = tensor(eval_pred.label_ids).view(-1)
    masked_lm_loss = exp(loss_fct(prediction, labels)) 
    return {'ppl': masked_lm_loss}

In [None]:
from transformers import Trainer, TrainingArguments
import os
# os.environ['WANDB_DISABLED'] = 'true' # turning off reporting to WanDB. It requires API key
args = TrainingArguments(
    output_dir="llama2-small-bigram-guided",
    per_device_train_batch_size=4, # 32
    per_device_eval_batch_size=4, # 32
    evaluation_strategy="steps",
    eval_steps=1, # 5_000, Evaluation 很耗时间
    logging_steps=1, # 5_000
    gradient_accumulation_steps=2, # 8. 注意这个参数，它会影响BP、eval、save、log的频率
    num_train_epochs=1,
    weight_decay=0.1,
    warmup_steps=1, # 1_000
    lr_scheduler_type="cosine",
    learning_rate=5e-4,
    save_steps=3, # 5_000 一个checkpoint 2GB 这个参数得好好调一下
    fp16=True,
    push_to_hub=False, # 如果只是存储在本地，这个参数要设置为False。默认为False
    # report_to='none', # turning off reporting to WanDB. It requires API key
    report_to='tensorboard',
)
if MY_TRAINING:
    args = TrainingArguments(
        output_dir="llama2-small-bigram-guided",
        per_device_train_batch_size=4, # 32
        per_device_eval_batch_size=4, # 32
        evaluation_strategy="steps",
        eval_steps=6_000, # 5_000, Evaluation 很耗时间
        logging_steps=5_000, # 5_000
        gradient_accumulation_steps=8, # 8. 注意这个参数，它会影响BP、eval、save、log的频率
        num_train_epochs=1,
        weight_decay=0.1,
        warmup_steps=1_000, # 1_000
        lr_scheduler_type="cosine",
        learning_rate=5e-4,
        save_steps=6_000, # 5_000 一个checkpoint 2GB 这个参数得好好调一下
        fp16=True,
        push_to_hub=False, # 如果只是存储在本地，这个参数要设置为False。默认为False
        # report_to='none', # turning off reporting to WanDB. It requires API key
        report_to='tensorboard',
    )

trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    # compute_metrics=compute_metrics
) 
# If use compute_metrics(), the GPU memory occupancy seems to increase high

NameError: name 'TrainingArgumen4ts' is not defined

In [None]:
trainer.train()

  0%|          | 0/13848 [00:00<?, ?it/s]

{'loss': 10.527, 'grad_norm': 6.798038959503174, 'learning_rate': 0.0005, 'epoch': 0.0}


  0%|          | 0/6924 [00:00<?, ?it/s]

{'eval_loss': 10.555447578430176, 'eval_runtime': 604.3921, 'eval_samples_per_second': 45.821, 'eval_steps_per_second': 11.456, 'epoch': 0.0}
{'loss': 10.5523, 'grad_norm': 6.968952178955078, 'learning_rate': 0.0004999999935657435, 'epoch': 0.0}


  0%|          | 0/6924 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [3]:
from torch.nn import CrossEntropyLoss
import torch
import numpy as np
loss_func = CrossEntropyLoss()
# Example of target with class probabilities
input = torch.randn(3, 5, requires_grad=False)
target = torch.tensor([2, 1, 3]) 
bi_gram = torch.zeros(3, 5)
bi_gram[range(3), target] = 1.0
print(input)
print(target)
print(bi_gram)

print(loss_func(input, target))
print(loss_func(input, bi_gram))
# Conclusion: CrossEntropyLoss is fine.

tensor([[-1.7780, -3.0860, -1.4639,  2.2809, -0.8748],
        [ 0.0563,  0.0335, -1.1167, -0.8577,  0.0291],
        [-0.5735, -2.1371,  0.5098, -0.0514,  0.7015]])
tensor([2, 1, 3])
tensor([[0., 0., 1., 0., 0.],
        [0., 1., 0., 0., 0.],
        [0., 0., 0., 1., 0.]])
tensor(2.2905)
tensor(2.2905)


In [None]:
# x = np.random.random((2, 3))
# x
# np.concatenate((np.zeros((1, 3)), x), 0)
# # torch.cat((x, x, x), 1)

In [None]:
# t = torch.tensor([[[1., 0], [2., 3.]], [[4., 0], [5., 6.]]])
# t.dim()
# t_sp = t.to_sparse_csr()
# t_sp