### Load dataset: [kaggle](https://www.kaggle.com/datasets/nadezhdaigolkina/bach-tokens-dataset)

In [1]:
%cd ..
%cd ..
%cd ..
%cd ..

/Users/nad/hse/2023-24/spring_proj2/res/RESO/generators/neural/transformer
/Users/nad/hse/2023-24/spring_proj2/res/RESO/generators/neural
/Users/nad/hse/2023-24/spring_proj2/res/RESO/generators
/Users/nad/hse/2023-24/spring_proj2/res/RESO


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


In [2]:
from datasets import load_dataset

# Load dataset
dataset_path = "data/Bach/BACH.csv"

ds = load_dataset('csv', data_files=dataset_path, split='train')
raw_datasets = ds.train_test_split(test_size=0.1, shuffle=True)
raw_datasets


DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 3916
    })
    test: Dataset({
        features: ['text'],
        num_rows: 436
    })
})

### Train tokenizer

In [3]:
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.pre_tokenizers import WhitespaceSplit
 
# Initialize tokenizer
tokenizer = Tokenizer(model=WordLevel(unk_token="[UNK]"))
tokenizer.pre_tokenizer = WhitespaceSplit()

In [4]:
# Function that yields text data in chunks from the training dataset
def get_training_corpus():
  dataset = raw_datasets["train"]
  for i in range(0, len(dataset), 1000):
    yield dataset[i : i + 1000]["text"]


In [5]:
from tokenizers.trainers import WordLevelTrainer

trainer = WordLevelTrainer(
    special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]
)

In [6]:
tokenizer.train_from_iterator(get_training_corpus(), trainer=trainer)

In [7]:
from transformers import PreTrainedTokenizerFast

tokenizer.save("generators/neural/transformer/gpt2/tokenizer.json")

In [8]:
tokenizer = PreTrainedTokenizerFast(tokenizer_file="generators/neural/transformer/gpt2/tokenizer.json")
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

0

In [9]:
vocab = tokenizer.get_vocab()

# Train Model



In [10]:
# Max len in dataset
context_length = 1256 

# Function for tokenizing
def tokenize(element):
  outputs = tokenizer(
      list(element["text"]),
      truncation=True, 
      max_length=context_length,
      padding=False
  )
  return {"input_ids": outputs["input_ids"]}


In [11]:
# Create tokenized dataset
tokenized_datasets = raw_datasets.map(
    tokenize, batched=True, remove_columns=raw_datasets["train"].column_names
)

tokenized_datasets

Map:   0%|          | 0/3916 [00:00<?, ? examples/s]

Map:   0%|          | 0/436 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids'],
        num_rows: 3916
    })
    test: Dataset({
        features: ['input_ids'],
        num_rows: 436
    })
})

In [12]:
n_layer=6
n_head=8
n_emb=512

In [13]:
from transformers import AutoConfig, GPT2LMHeadModel

# Initialize model
config = AutoConfig.from_pretrained(
    "gpt2",
    vocab_size=len(tokenizer),
    n_positions=context_length,
    n_layer=n_layer,
    n_head=n_head,
    pad_token_id=tokenizer.pad_token_id,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
    n_embd=n_emb
)

model = GPT2LMHeadModel(config)
model



GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(4832, 512)
    (wpe): Embedding(1256, 512)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-5): 6 x GPT2Block(
        (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=512, out_features=4832, bias=False)
)

In [14]:
model_size = sum(t.numel() for t in model.parameters())
print(f"GPT-2 size: {model_size/1000**2:.1f}M parameters")

GPT-2 size: 22.0M parameters


In [15]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

2024-05-20 00:09:49.568659: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [16]:
# Login into wandb
import wandb
wandb.login()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingfac

True

In [17]:
%env WANDB_LOG_MODEL='checkpoint'

env: WANDB_LOG_MODEL='checkpoint'


In [18]:
# Create the args for out trainer
from argparse import Namespace

output_path = "checkpoints"
# Every # steps calculates eval-loss
steps = 500

config = {"output_dir": output_path,
          "num_train_epochs": 500,
          "per_device_train_batch_size": 8,
          "per_device_eval_batch_size": 4,
          "evaluation_strategy": "steps",
          "save_strategy": "steps",
          "eval_steps": steps,
          "logging_steps":steps,
          "logging_first_step": True,
          "save_total_limit": 5,
          "save_steps": steps,
          "lr_scheduler_type": "cosine",
          "learning_rate":5e-4,
          "warmup_ratio": 0.01,
          "weight_decay": 0.01,
          "seed": 1,
          "load_best_model_at_end": True,
          "report_to": "wandb"}

args = Namespace(**config)

In [19]:
from transformers import set_seed
# Set seed for random operations
set_seed(args.seed)

In [20]:
WANDB_PROJECT = "Bach-GPT2-01"
run = wandb.init(project=WANDB_PROJECT, job_type="training", config=args)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [21]:
# !pip install note_seq

In [22]:
from transformers import Trainer, TrainingArguments

# Trainer
class MyTrainer(Trainer):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def evaluation_loop(
        self,
        dataloader,
        description,
        prediction_loss_only=None,
        ignore_keys=None,
        metric_key_prefix="eval",
    ):
        eval_output = super().evaluation_loop(
            dataloader,
            description,
            prediction_loss_only,
            ignore_keys,
            metric_key_prefix,
        )
        return eval_output

In [23]:
train_args = TrainingArguments(**config)



In [24]:
trainer = MyTrainer(
    model=model,
    tokenizer=tokenizer,
    args=train_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
)

Installing dependencies

In [25]:
# !apt-get update -qq && apt-get install -qq libfluidsynth2 build-essential libasound2-dev libjack-dev fluidsynth
# !pip install -qU pyfluidsynth

In [26]:
trainer.train()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  0%|          | 0/245000 [00:00<?, ?it/s]

{'loss': 8.5408, 'grad_norm': 5.486949920654297, 'learning_rate': 2.0408163265306124e-07, 'epoch': 0.0}


KeyboardInterrupt: 