# Training a causal language model from scratch (PyTorch)

Install the Transformers, Datasets, and Evaluate libraries to run this notebook.

In [1]:
# !pip install datasets evaluate transformers[sentencepiece]
# !pip install accelerate
# # !pip install mingpt
# # To run the training on TPU, you will need to uncomment the following line:
# # !pip install cloud-tpu-client==0.10 torch==1.9.0 https://storage.googleapis.com/tpu-pytorch/wheels/torch_xla-1.9-cp37-cp37m-linux_x86_64.whl
# !apt install git-lfs

In [1]:
import sys
print(sys.prefix)


/opt/conda/envs/env1


In [2]:
import torch
print(torch.version.cuda)


11.7


In [3]:
import torch
print("CUDA Available: ", torch.cuda.is_available())

CUDA Available:  True


#### Draw a plot through weights & biases

In [4]:
# !pip install wandb
# !conda activate test

In [6]:
import wandb
wandb.login(relogin=True)

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

#### Set the project environment

In [7]:
# WANDB_PROJECT = huggingface

You will need to setup git, adapt your email and name in the following cell.

In [8]:
!git config --global user.email "toflamus12138@gmail.com"
!git config --global user.name "Toflamus"

You will also need to be logged in to the Hugging Face Hub. Execute the following and enter your credentials.

In [9]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

#### Gathering the data

In [10]:
from datasets import load_dataset, DatasetDict,load_from_disk
import os 
os.chdir('/workspace/lost+found/Final_Project/Basic_Docs')
raw_datasets = load_from_disk("Tiny_Stories")
# 
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 2119719
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 21990
    })
})

#### Prepare the dataset

Take a look first

In [None]:
for key in raw_datasets["train"][0]:
    print(f"{key.upper()}: {raw_datasets['train'][0][key][:512]}")

What length is the longest in training set

In [None]:
# max_length = 0
# print(len(raw_datasets["train"]))
# for i in range(len(raw_datasets["train"])):
#   len_i = 0
#   input_string = raw_datasets["train"][i]['text']
#   # print(input_string)
#   # for char in input_string:
#   #   # print(char)
#   #   # 使用isalpha()方法检查字符是否是字母
#   #   if char.isalpha():
#   #       len_i += 1
#   len_i = len(input_string)
#   if len_i >= max_length :
#     max_length = len_i

# print(max_length)

In [11]:
from transformers import AutoTokenizer

context_length = 128
# tokenizer = AutoTokenizer.from_pretrained("roneneldan/TinyStories")
tokenizer_gpt2 = AutoTokenizer.from_pretrained("/workspace/lost+found/Final_Project/Basic_Docs/Original_GPT2_Tokenizer")

# outputs = tokenizer(
#     raw_datasets["train"][:1]["text"],
#     truncation=True,
#     max_length=context_length,
#     return_overflowing_tokens=True,
#     return_length=True,
# )

# outputs = tokenizer_gpt2(
#     raw_datasets["train"][:2]["text"],
#     truncation=True,
#     max_length=context_length,
#     return_overflowing_tokens=True,
#     return_length=True,
# )

# print(outputs)
# print(f"Input IDs length: {len(outputs['input_ids'])}")
# print(f"Input chunk lengths: {(outputs['length'])}")
# print(f"Chunk mapping: {outputs['overflow_to_sample_mapping']}")

Text truncation

In [12]:
# def tokenize(element):
#     outputs = tokenizer_gpt2(
#         element["text"],
#         truncation=True,
#         max_length=context_length,
#         return_overflowing_tokens=True,
#         return_length=True,
#     )
#     input_batch = []
#     for length, input_ids in zip(outputs["length"], outputs["input_ids"]):
#         if length == context_length:
#             input_batch.append(input_ids)
#     return {"input_ids": input_batch}


# tokenized_datasets = raw_datasets.map(
#     tokenize, batched=True, remove_columns=raw_datasets["train"].column_names
# )

from datasets import load_from_disk
tokenized_datasets = load_from_disk("Tiny_Stories_input_id")

In [13]:
import torch

# os.environ["CUDA_VISIBLE_DEVICES"] = "0"
device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)

cuda


Save the dataset

In [14]:
# tokenized_datasets.save_to_disk("Tiny_Stories_input_id")

In [15]:
# !zip -r Tiny_Stories_input_id.zip Tiny_Stories_input_id

#### Initializing a new model

Certainly! While the specific parameters available in a configuration object may vary depending on the model architecture, here's a general list of some common parameters you can access in a Hugging Face Transformers `PretrainedConfig` object:

1. **Model Architecture Parameters:**
   - `model_type`: The type of the model architecture (e.g., `"bert"`, `"gpt2"`, `"roberta"`, etc.).
   - `vocab_size`: The size of the model's vocabulary.
   - `hidden_size`: The dimension of the model's hidden states.
   - `num_hidden_layers`: The number of hidden layers in the model.
   - `num_attention_heads`: The number of attention heads in the model's multi-head attention mechanisms.
   - `intermediate_size`: The size of the feed-forward intermediate layer.
   - `hidden_act`: The activation function used in the model's hidden layers.
   - `hidden_dropout_prob`: The dropout probability applied to hidden states.

2. **Sequence Parameters:**
   - `max_position_embeddings`: The maximum sequence length the model can handle.
   - `type_vocab_size`: The size of the token type vocabulary.
   - `initializer_range`: The range for weight initialization.
   - `layer_norm_eps`: The epsilon value for layer normalization.

3. **Attention Mechanism Parameters:**
   - `attention_probs_dropout_prob`: The dropout probability applied to attention probabilities.
   - `attention_window`: The size of the attention window for models like Longformer.

4. **Positional Embedding Parameters:**
   - `use_position_embeddings`: Whether positional embeddings are used.
   - `position_embedding_type`: The type of positional embeddings (e.g., `"absolute"`, `"relative"`, etc.).

5. **Other Parameters:**
   - `pad_token_id`: The ID of the padding token.
   - `eos_token_id`: The ID of the end-of-sequence token.
   - `bos_token_id`: The ID of the beginning-of-sequence token.
   - `is_encoder_decoder`: Whether the model is an encoder-decoder model.

You can access these parameters in a configuration object like this:

```python
# Assuming you already loaded a configuration object as 'config'
print("Model Type:", config.model_type)
print("Hidden Size:", config.hidden_size)
print("Num Hidden Layers:", config.num_hidden_layers)
# ...and so on for other parameters
```

Keep in mind that the exact set of parameters can vary depending on the specific model architecture you are working with. You can always refer to the Hugging Face Transformers documentation or inspect the configuration object to see the available parameters for a particular model.

#### Initialization code

In [16]:
from transformers import AutoTokenizer, GPT2LMHeadModel, AutoConfig

config = AutoConfig.from_pretrained(
    "gpt2",
    vocab_size=len(tokenizer_gpt2),
    n_ctx=context_length,
    n_head = 8, # Change the multi head attention to 8
    n_layer = 6,
    hidden_size = 64,
    bos_token_id=tokenizer_gpt2.bos_token_id,
    eos_token_id=tokenizer_gpt2.eos_token_id,
)


print(config.hidden_size)
print(config.num_hidden_layers)

# # Get all the configuration parameters and their values
# config_dict = config.to_dict()

# # Print each parameter and its value
# for param, value in config_dict.items():
#     print(f"{param}: {value}")


64
6


Calculate the parameters

In [17]:
model = GPT2LMHeadModel(config)
# model = load_pretrained('/workspace/lost+found/Final_Project/Basic_Docs/Gpt2')
model.to(device)
model_size = sum(t.numel() for t in model.parameters())
print(f"GPT-2 size: {model_size/1000**2:.1f}M parameters")

GPT-2 size: 3.6M parameters


Our model has 124M parameters that we’ll have to tune. Before we can start training, we need to set up a data collator that will take care of creating the batches. We can use the DataCollatorForLanguageModeling collator, which is designed specifically for language modeling (as the name subtly suggests). Besides stacking and padding batches, it also takes care of creating the language model labels — in causal language modeling the inputs serve as labels too (just shifted by one element), and this data collator creates them on the fly during training so we don’t need to duplicate the input_ids.  

Note that DataCollatorForLanguageModeling supports both masked language modeling (MLM) and causal language modeling (CLM). By default it prepares data for MLM, but we can switch to CLM by setting the argument mlm=False:

In [18]:
from transformers import DataCollatorForLanguageModeling

tokenizer_gpt2.pad_token = tokenizer_gpt2.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer_gpt2, mlm=False)

In [19]:
out = data_collator([tokenized_datasets["train"][i] for i in range(5)])
for key in out:
    print(f"{key} shape: {out[key].shape}")

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


input_ids shape: torch.Size([5, 128])
attention_mask shape: torch.Size([5, 128])
labels shape: torch.Size([5, 128])


In [55]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [23]:
from transformers import Trainer, TrainingArguments

args = TrainingArguments(
    output_dir=f"GPT-2_para{int(model_size/1000**2)}M",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    evaluation_strategy="steps",
    eval_steps=100,
    logging_steps=100,
    gradient_accumulation_steps=8,
    num_train_epochs=1,
    weight_decay=0.1,
    warmup_steps=100,
    lr_scheduler_type="cosine",
    learning_rate=5e-4,
    save_steps=100,
    fp16=True,
#     push_to_hub=True,
    report_to="wandb",
)

trainer = Trainer(
    model=model,
    tokenizer=tokenizer_gpt2,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
)



In [24]:
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mtoflamus12138[0m ([33mtoflamusteam[0m). Use [1m`wandb login --relogin`[0m to force relogin


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016670618702967963, max=1.0…

Step,Training Loss,Validation Loss
100,9.6976,7.775386
200,6.488,5.779546
300,5.3705,4.860854
400,4.5632,4.254442
500,4.141,3.942503
600,3.902,3.718949
700,3.7074,3.551402
800,3.5716,3.429122
900,3.4695,3.325316
1000,3.3847,3.231108


TrainOutput(global_step=10605, training_loss=2.8285423428986443, metrics={'train_runtime': 6430.6976, 'train_samples_per_second': 422.175, 'train_steps_per_second': 1.649, 'total_flos': 625575072890880.0, 'train_loss': 2.8285423428986443, 'epoch': 1.0})

In [73]:

# !huggingface_hub.login
import os
os.environ["TOKENIZERS_PARALLELISM"] = "true"
!huggingface-cli login
# from huggingface_hub import notebook_login
# notebook_login()

from huggingface_hub import login

# Your login logic here
login(token = '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')


Traceback (most recent call last):
  File "/opt/conda/bin/huggingface-cli", line 5, in <module>
    from huggingface_hub.commands.huggingface_cli import main
  File "/opt/conda/lib/python3.8/site-packages/huggingface_hub/commands/huggingface_cli.py", line 18, in <module>
    from huggingface_hub.commands.delete_cache import DeleteCacheCommand
  File "/opt/conda/lib/python3.8/site-packages/huggingface_hub/commands/delete_cache.py", line 64, in <module>
    from ..utils import CachedRepoInfo, CachedRevisionInfo, HFCacheInfo, scan_cache_dir
  File "/opt/conda/lib/python3.8/site-packages/huggingface_hub/utils/__init__.py", line 44, in <module>
    from ._headers import build_hf_headers, get_token_to_send, LocalTokenNotFoundError
  File "/opt/conda/lib/python3.8/site-packages/huggingface_hub/utils/_headers.py", line 20, in <module>
    from ._runtime import (
  File "/opt/conda/lib/python3.8/site-packages/huggingface_hub/utils/_runtime.py", line 20, in <module>
    import packaging.version


In [None]:
# trainer.save_model(f"Toflamus/GPT-2_para{int(model_size/1000**2)}M")
# model.save_pretrained()

In [None]:
# !zip -r GPT-2_para3M.zip GPT-2_para3M

In [78]:
# from huggingface_hub import notebook_login,login
trainer.push_to_hub(f"Toflamus/GPT-2_para{int(model_size/1000**2)}M")

'https://huggingface.co/Toflamus/GPT-2_para3M/tree/main/'

In [79]:
import torch
from transformers import pipeline

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
pipe = pipeline(
    "text-generation", model=f"Toflamus/GPT-2_para{int(model_size/1000**2)}M", device=device
)

In [80]:
txt = "My name is Clara and I am"
print(pipe(txt, num_return_sequences=1)[0]["generated_text"])

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


My name is Clara and I am so pretty when we are going to the festival. They are very busy enough for being very much. You are very bossy and caring." her teacher smiled and wiped her hand. She hugged her a nice song and


In [81]:
txt = "One day, Anna and her mom came to the park."
print(pipe(txt, num_return_sequences=1)[0]["generated_text"])

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


One day, Anna and her mom came to the park. It was a big, round ball and it was a red ball. Anna wanted to play with it, but her mom said she would find a race, but it was too high. Her


In [None]:
keytoken_ids = []
for keyword in [
    "plt",
    "pd",
    "sk",
    "fit",
    "predict",
    " plt",
    " pd",
    " sk",
    " fit",
    " predict",
    "testtest",
]:
    ids = tokenizer([keyword]).input_ids[0]
    if len(ids) == 1:
        keytoken_ids.append(ids[0])
    else:
        print(f"Keyword has not single token: {keyword}")

In [None]:
from torch.nn import CrossEntropyLoss
import torch


def keytoken_weighted_loss(inputs, logits, keytoken_ids, alpha=1.0):
    # Shift so that tokens < n predict n
    shift_labels = inputs[..., 1:].contiguous()
    shift_logits = logits[..., :-1, :].contiguous()
    # Calculate per-token loss
    loss_fct = CrossEntropyLoss(reduce=False)
    loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
    # Resize and average loss per sample
    loss_per_sample = loss.view(shift_logits.size(0), shift_logits.size(1)).mean(axis=1)
    # Calculate and scale weighting
    weights = torch.stack([(inputs == kt).float() for kt in keytoken_ids]).sum(
        axis=[0, 2]
    )
    weights = alpha * (1.0 + weights)
    # Calculate weighted average
    weighted_loss = (loss_per_sample * weights).mean()
    return weighted_loss

In [None]:
from torch.utils.data.dataloader import DataLoader

tokenized_dataset.set_format("torch")
train_dataloader = DataLoader(tokenized_dataset["train"], batch_size=32, shuffle=True)
eval_dataloader = DataLoader(tokenized_dataset["valid"], batch_size=32)

In [None]:
weight_decay = 0.1


def get_grouped_params(model, no_decay=["bias", "LayerNorm.weight"]):
    params_with_wd, params_without_wd = [], []
    for n, p in model.named_parameters():
        if any(nd in n for nd in no_decay):
            params_without_wd.append(p)
        else:
            params_with_wd.append(p)
    return [
        {"params": params_with_wd, "weight_decay": weight_decay},
        {"params": params_without_wd, "weight_decay": 0.0},
    ]

In [None]:
def evaluate():
    model.eval()
    losses = []
    for step, batch in enumerate(eval_dataloader):
        with torch.no_grad():
            outputs = model(batch["input_ids"], labels=batch["input_ids"])

        losses.append(accelerator.gather(outputs.loss))
    loss = torch.mean(torch.cat(losses))
    try:
        perplexity = torch.exp(loss)
    except OverflowError:
        perplexity = float("inf")
    return loss.item(), perplexity.item()

In [None]:
model = GPT2LMHeadModel(config)

In [None]:
from torch.optim import AdamW

optimizer = AdamW(get_grouped_params(model), lr=5e-4)

In [None]:
from accelerate import Accelerator

accelerator = Accelerator(fp16=True)

model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader
)

In [None]:
from transformers import get_scheduler

num_train_epochs = 1
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch

lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=1_000,
    num_training_steps=num_training_steps,
)

In [None]:
from huggingface_hub import Repository, get_full_repo_name

model_name = "codeparrot-ds-accelerate"
repo_name = get_full_repo_name(model_name)
repo_name

In [None]:
output_dir = "codeparrot-ds-accelerate"
repo = Repository(output_dir, clone_from=repo_name)

In [None]:
evaluate()

In [None]:
from tqdm.notebook import tqdm

gradient_accumulation_steps = 8
eval_steps = 5_000

model.train()
completed_steps = 0
for epoch in range(num_train_epochs):
    for step, batch in tqdm(
        enumerate(train_dataloader, start=1), total=num_training_steps
    ):
        logits = model(batch["input_ids"]).logits
        loss = keytoken_weighted_loss(batch["input_ids"], logits, keytoken_ids)
        if step % 100 == 0:
            accelerator.print(
                {
                    "lr": get_lr(),
                    "samples": step * samples_per_step,
                    "steps": completed_steps,
                    "loss/train": loss.item() * gradient_accumulation_steps,
                }
            )
        loss = loss / gradient_accumulation_steps
        accelerator.backward(loss)
        if step % gradient_accumulation_steps == 0:
            accelerator.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            completed_steps += 1
        if (step % (eval_steps * gradient_accumulation_steps)) == 0:
            eval_loss, perplexity = evaluate()
            accelerator.print({"loss/eval": eval_loss, "perplexity": perplexity})
            model.train()
            accelerator.wait_for_everyone()
            unwrapped_model = accelerator.unwrap_model(model)
            unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
            if accelerator.is_main_process:
                tokenizer.save_pretrained(output_dir)
                repo.push_to_hub(
                    commit_message=f"Training in progress step {step}", blocking=False
                )