In [None]:

import os
import sys
dirof = os.path.dirname
sys.path.append(dirof(dirof(os.path.abspath(__file__))))

from utils_common.utils import jpath, ls
from datasets import load_dataset
from tqdm import tqdm

# Load the dataset
data_root = '/data2/longshen/Datasets/LAMD_v4/LAMD/REMI'
all_remi_fps = []
sub_dirs = ls(data_root)
for sub_dir in tqdm(sub_dirs):
    sub_fp = jpath(data_root, sub_dir)
    remi_fns = ls(sub_fp)
    all_remi_fps.extend([jpath(sub_fp, fn) for fn in remi_fns][:20])
    break
dataset = load_dataset("text", data_files={"train": all_remi_fps})


  0%|          | 0/16 [00:00<?, ?it/s]
Downloading data: 100%|██████████| 20/20 [00:00<00:00, 130257.89files/s]
Generating train split: 20 examples [00:00, 1863.43 examples/s]


In [28]:
# Split the dataset
import datasets
dataset_splitted = dataset['train'].train_test_split(test_size=0.002)
dataset_splitted

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 19
    })
    test: Dataset({
        features: ['text'],
        num_rows: 1
    })
})

In [29]:
# Tokenize the dataset

from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("/home/longshen/work/MuseCoco/musecoco/dataset_preparation/test_tokenizer2")
context_length = 2048 #2048
outputs = tokenizer(
    dataset["train"][:2]["text"],
    truncation=True,
    max_length=context_length,
    return_overflowing_tokens=True,
    return_length=True,
)

def tokenize(element):
    outputs = tokenizer(
        element["text"],
        truncation=True,
        max_length=context_length,
        padding="max_length",
        return_overflowing_tokens=True,
        return_length=True,
    )
    input_batch = []
    for length, input_ids in zip(outputs["length"], outputs["input_ids"]):
        if length == context_length:
            input_batch.append(input_ids)
    return {"input_ids": input_batch}

tokenized_datasets = dataset_splitted.map(
    tokenize, batched=True, remove_columns=dataset_splitted["train"].column_names
)



Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Map: 100%|██████████| 19/19 [00:00<00:00, 47.85 examples/s]
Map: 100%|██████████| 1/1 [00:00<00:00, 97.04 examples/s]


In [30]:
tokenize(dataset_splitted['train'])['input_ids'][1]

[2,
 673,
 193,
 335,
 535,
 675,
 214,
 335,
 536,
 672,
 29,
 141,
 335,
 535,
 674,
 311,
 538,
 662,
 310,
 527,
 667,
 33,
 141,
 318,
 531,
 672,
 37,
 141,
 337,
 531,
 676,
 193,
 337,
 530,
 673,
 38,
 141,
 323,
 530,
 673,
 189,
 337,
 530,
 672,
 214,
 337,
 530,
 670,
 41,
 193,
 335,
 531,
 672,
 214,
 335,
 531,
 670,
 42,
 141,
 335,
 530,
 675,
 318,
 527,
 672,
 189,
 335,
 530,
 672,
 45,
 141,
 330,
 528,
 670,
 46,
 189,
 334,
 536,
 673,
 193,
 334,
 535,
 673,
 214,
 334,
 536,
 672,
 47,
 141,
 334,
 535,
 673,
 318,
 539,
 671,
 51,
 141,
 325,
 530,
 670,
 55,
 141,
 335,
 531,
 676,
 330,
 530,
 673,
 193,
 335,
 531,
 676,
 214,
 335,
 530,
 672,
 56,
 189,
 335,
 529,
 673,
 59,
 193,
 334,
 531,
 673,
 60,
 141,
 334,
 530,
 673,
 325,
 527,
 672,
 189,
 334,
 530,
 674,
 214,
 334,
 530,
 673,
 12,
 695,
 967,
 16,
 141,
 332,
 541,
 674,
 329,
 535,
 671,
 313,
 536,
 671,
 189,
 332,
 541,
 672,
 193,
 332,
 541,
 673,
 214,
 332,
 541,
 672,
 21,
 141,

In [31]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids'],
        num_rows: 133
    })
    test: Dataset({
        features: ['input_ids'],
        num_rows: 3
    })
})

In [32]:
from transformers import AutoTokenizer, GPT2LMHeadModel, AutoConfig, AutoModelForCausalLM
import torch

config = AutoConfig.from_pretrained(
    "gpt2",
    vocab_size=len(tokenizer),
    n_ctx=context_length,
    n_positions=context_length,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
    n_embd=768,
    n_head=16,
    n_layer=12, #24
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
)
# model = GPT2LMHeadModel(config).half()
model = AutoModelForCausalLM.from_config(config)
model.save_pretrained("test_model")
model = AutoModelForCausalLM.from_pretrained("test_model", torch_dtype=torch.bfloat16)
# model = GPT2LMHeadModel(config)
model_size = sum(t.numel() for t in model.parameters())
print(f"GPT-2 size: {model_size/1000**2:.1f}M parameters")

from transformers import DataCollatorForLanguageModeling

tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

a = 2

from transformers import Trainer, TrainingArguments



GPT-2 size: 87.4M parameters


In [20]:
config

GPT2Config {
  "_name_or_path": "gpt2",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 2,
  "embd_pdrop": 0.1,
  "eos_token_id": 1,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 2048,
  "n_embd": 768,
  "n_head": 16,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 2048,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "torch_dtype": "bfloat16",
  "transformers_version": "4.40.0.dev0",
  "use_cache": true,
  "vocab_size": 989
}

In [21]:
model.dtype

torch.bfloat16

In [35]:
args = TrainingArguments(
    output_dir="m2m_pt",
    per_device_train_batch_size=3,
    per_device_eval_batch_size=3,
    evaluation_strategy="steps",
    eval_steps=1,
    logging_steps=5,
    gradient_accumulation_steps=8,
    num_train_epochs=1,
    weight_decay=0.1,
    warmup_steps=1_000,
    lr_scheduler_type="cosine",
    learning_rate=5e-4,
    save_steps=5_000,
    push_to_hub=True,
    bf16=True,
    # fp16=True,
    seed=42,
)


trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"]
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [36]:
trainer.train()



Step,Training Loss,Validation Loss
1,No log,6.6875


TrainOutput(global_step=1, training_loss=6.98828125, metrics={'train_runtime': 4.5235, 'train_samples_per_second': 29.402, 'train_steps_per_second': 0.221, 'total_flos': 100336140288000.0, 'train_loss': 6.98828125, 'epoch': 0.67})

In [24]:
trainer.push_to_hub()

events.out.tfevents.1718859678.smc-gpu3.126680.0:   0%|          | 0.00/4.86k [00:00<?, ?B/s]
[A


[A[A[A

[A[A



events.out.tfevents.1718859732.smc-gpu3.126680.1: 100%|██████████| 5.07k/5.07k [00:00<00:00, 18.4kB/s]
events.out.tfevents.1718860236.smc-gpu3.126680.3: 100%|██████████| 5.49k/5.49k [00:00<00:00, 18.7kB/s]
events.out.tfevents.1718860116.smc-gpu3.126680.2: 100%|██████████| 5.07k/5.07k [00:00<00:00, 16.6kB/s]
events.out.tfevents.1718859678.smc-gpu3.126680.0: 100%|██████████| 4.86k/4.86k [00:00<00:00, 14.5kB/s]
events.out.tfevents.1718860332.smc-gpu3.126680.4:   0%|          | 0.00/4.87k [00:00<?, ?B/s]
[A

[A[A



events.out.tfevents.1718860332.smc-gpu3.126680.4: 100%|██████████| 4.87k/4.87k [00:00<00:00, 17.5kB/s]
events.out.tfevents.1718860528.smc-gpu3.132607.0: 100%|██████████| 4.87k/4.87k [00:00<00:00, 18.2kB/s]
events.out.tfevents.1718860560.smc-gpu3.132607.1: 100%|██████████| 4.87k/4.87k [00:00<00:00, 18.1kB/s]
events.out.tfevents.1718860613.smc-gpu3.133223.0:

CommitInfo(commit_url='https://huggingface.co/LongshenOu/m2m_pt/commit/149c9bae577603b8d33903e4337f816b71950ab6', commit_message='End of training', commit_description='', oid='149c9bae577603b8d33903e4337f816b71950ab6', pr_url=None, pr_revision=None, pr_num=None)

In [28]:
len(tokenized_datasets['train'][0]['input_ids'])
from transformers import AutoModelForCausalLM, AutoTokenizer
model1 = AutoModelForCausalLM.from_pretrained("gpt2", torch_dtype=torch.float16, attn_implementation="flash_attention_2")

You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.


In [28]:
torch.cuda.empty_cache()
del model
del trainer

NameError: name 'model' is not defined