In [None]:
# Environment report: OS release, CPU details, memory usage and GPU status.
!lsb_release -a
# !cat /etc/shells
# !echo $SHELL
!cat /proc/cpuinfo 
!free -m
!nvidia-smi

# Install the third-party packages used by the notebook.
# %pip install --upgrade pip
%pip install mir_eval librosa h5py
# %pip install torch==1.10.2 torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu113
# NOTE(review): only note_seq is version-pinned; the other packages resolve to
# whatever is current at install time, so runs may not be reproducible.
%pip install note_seq==0.0.3 transformers  scikit-learn  pandas

In [None]:
import os
import shutil
import sys

# Copy the project code and the previously saved checkpoints from the input
# datasets into the working directory. The cell is safe to re-run.
CODE_SRC = r'../input/yui-code/yui_py37'
CODE_DST = r'/kaggle/working/yui'
CHECKPOINTS_SRC = r'../input/yui-checkpoints'
CHECKPOINTS_DST = r'/kaggle/working/checkpoints'

# Always refresh the code copy. rmtree(ignore_errors=True) stays silent when
# the directory does not exist yet (fresh kernel), unlike a shell `rm -r`.
shutil.rmtree(CODE_DST, ignore_errors=True)
shutil.copytree(CODE_SRC, CODE_DST)
# Make the copied modules importable; avoid stacking duplicates on re-runs.
if CODE_DST not in sys.path:
  sys.path.insert(0, CODE_DST)

# copytree raises FileExistsError when the destination already exists, so only
# seed the checkpoints once. An existing directory is left untouched because
# the training loop writes its newer checkpoints into this same directory.
if not os.path.isdir(CHECKPOINTS_DST):
  shutil.copytree(CHECKPOINTS_SRC, CHECKPOINTS_DST)

In [None]:
import os
import time
import logging
import math

import torch
from torch.utils.data import DataLoader
from transformers import T5ForConditionalGeneration, T5Config
from transformers.optimization import Adafactor, AdafactorSchedule

from datasets import MaestroDataset3, MaestroSampler2, collate_fn
import vocabularies
import config
from config.data import YuiConfigPro
import utils
from train import train, evaluate

# When True, training continues from the saved checkpoint if one is found.
resume = True


# Run configuration: dataset location, workspace and training hyper-parameters.
cf = YuiConfigPro(
  DATASET_DIR=r'/kaggle/input/maestrov300-hdf5/',
  DATAMETA_NAME=r'maestro-v3.0.0.csv',
  WORKSPACE=r'/kaggle/working/',
  CUDA=True,
  NUM_EPOCHS=30,
  NUM_WORKERS=2,
  BATCH_SIZE=12,
  TRAIN_ITERATION=600,
)
# Author's notes:
# - The GPU is usually a P100 with 16 GB of memory.
# - A smaller batch_size seems to work somewhat better, since batches reach
#   the GPU sooner.
# - With batch_size=26, 50 iterations take about 2.5 minutes, so a checkpoint
#   is saved roughly every 15 minutes.

# Arguments & parameters derived from the configuration.
workspace = cf.WORKSPACE
batch_size = cf.BATCH_SIZE
# Use the GPU only when it is both requested and actually available.
device = torch.device('cuda' if cf.CUDA and torch.cuda.is_available() else 'cpu')
num_workers = cf.NUM_WORKERS

class Adafactor2(Adafactor):
  """Adafactor with a custom step-size rule.

  The constructor forwards every hyper-parameter to `Adafactor` unchanged;
  only `_get_lr` is overridden. When `relative_step` is enabled the step size
  is `min(cap, exp(-(6.45 + step / 3e4)))`, where `cap` is the linear warm-up
  `1e-6 * step` if `warmup_init` is set and the constant `1e-3` otherwise.
  """

  def __init__(
    self,
    params,
    lr=None,
    eps=(1e-30, 1e-3),
    clip_threshold=1.0,
    decay_rate=-0.8,
    beta1=None,
    weight_decay=0.0,
    scale_parameter=True,
    relative_step=True,
    warmup_init=False,
  ):
    """Pass all arguments positionally, in the same order, to `Adafactor.__init__`."""
    super().__init__(
      params,
      lr,
      eps,
      clip_threshold,
      decay_rate,
      beta1,
      weight_decay,
      scale_parameter,
      relative_step,
      warmup_init,
    )

  @staticmethod
  def _get_lr(param_group, param_state):
    """Compute the step size for one parameter group.

    Args:
      param_group: optimizer group options; reads "lr", "relative_step",
        "warmup_init", "scale_parameter" and "eps".
      param_state: per-parameter state; reads "step" and "RMS".

    Returns:
      The group's "lr" value, replaced by the relative step when
      `relative_step` is set and multiplied by `max(eps[1], RMS)` when
      `scale_parameter` is set.
    """
    step_size = param_group["lr"]
    if param_group["relative_step"]:
      step = param_state["step"]
      if param_group["warmup_init"]:
        cap = 1e-6 * step
      else:
        cap = 1e-3
      # Exponential decay: about 1.5e-3 at step 1500 (where it meets the
      # warm-up ramp), about 9.6e-4 at step 15000 and about 5.8e-4 at step 30000.
      decayed = math.exp(-(6.45 + step / 3e4))
      step_size = min(cap, decayed)
    if param_group["scale_parameter"]:
      step_size *= max(param_group["eps"][1], param_state["RMS"])
    return step_size


In [None]:
# Checkpoint & log setup.
# Kept in its own cell: creating the logger several times would produce
# duplicated log output.

logs_dir = os.path.join(workspace, 'logs')
checkpoints_dir = os.path.join(workspace, 'checkpoints')
# All three artifacts live side by side in the checkpoints directory.
resume_checkpoint_path, best_checkpoint_path, statistics_path = (
  os.path.join(checkpoints_dir, file_name)
  for file_name in ('model_resume.pt', 'model_best.pt', 'statistics.pt')
)

utils.create_folder(checkpoints_dir)
utils.create_logging(logs_dir, 'train', filemode='w', with_time=True)

In [None]:
# Codec & vocabulary
codec = vocabularies.build_codec(cf)
vocabulary = vocabularies.Vocabulary(cf, codec.num_classes, extra_ids=cf.EXTRA_IDS)

# T5 settings as a plain mapping, sized from the run configuration and the
# vocabulary. Author's note: the model is simplified, otherwise it cannot be
# trained at all.
t5_config_map = config.build_t5_config(
  d_model=cf.NUM_MEL_BINS,
  vocab_size=vocabulary.vocab_size,
  max_length=cf.MAX_TARGETS_LENGTH,
)
utils.show_gpu_info()

# Record the configuration and the device in use.
logging.info(cf)
if device.type != 'cuda':
  logging.info('Using CPU.')
else:
  logging.info('Using GPU.')
  logging.info(f'GPU number: {torch.cuda.device_count()}')

In [None]:
# Dataset
meta_path = os.path.join(cf.DATASET_DIR, cf.DATAMETA_NAME)

# Options shared by both loaders.
# pin_memory (author's note): page-locked memory is not swapped to virtual
# memory, which speeds up transfers to the GPU, but it can easily exceed the
# GPU memory.
loader_options = dict(collate_fn=collate_fn, num_workers=num_workers, pin_memory=True)

# Training: the sampler is built for the 'train' split and capped at
# TRAIN_ITERATION via max_iter_num.
train_sampler = MaestroSampler2(
  meta_path, 'train', batch_size=batch_size, config=cf, max_iter_num=cf.TRAIN_ITERATION,
)
train_dataset = MaestroDataset3(cf.DATASET_DIR, cf, codec, vocabulary, meta_file=cf.DATAMETA_NAME)
train_loader = DataLoader(dataset=train_dataset, batch_sampler=train_sampler, **loader_options)

# Validation: same dataset object, driven by a sampler built for the
# 'validation' split with max_iter_num=-1.
validate_sampler = MaestroSampler2(
  meta_path, 'validation', batch_size=batch_size, config=cf, max_iter_num=-1,
)
validate_loader = DataLoader(dataset=train_dataset, batch_sampler=validate_sampler, **loader_options)

# Model: a T5 built from the config mapping (no pretrained weights are loaded here).
t5_config = T5Config.from_dict(t5_config_map)
logging.info(t5_config)
model = T5ForConditionalGeneration(config=t5_config)
logging.info(f'The model has {model.num_parameters():,} trainable parameters')
# Author's reference figures: 17,896 for dev; 48,626,048 for pro; while
# T5-Small has 60 million parameters.

# Early stopping, configured with the best/resume checkpoint paths.
early_stopping = utils.EarlyStopping(
  best_path=best_checkpoint_path,
  resume_path=resume_checkpoint_path,
  patience=cf.OVERFIT_PATIENCE,
  verbose=True,
)

# Fresh-start values; the resume branch overwrites them when a checkpoint is
# restored.
resume_epoch = 0
learning_rate = cf.LEARNING_RATE
statistics = dict(epoch=0, train_loss=[], eval_loss=[])

# Loss function: targets equal to PAD_ID do not contribute to the loss.
criterion = torch.nn.CrossEntropyLoss(ignore_index=cf.PAD_ID)

# Optimizer: relative-step Adafactor2 with warm-up, so no explicit lr is given.
# Fixed-lr alternative kept for reference:
# optimizer = Adafactor(model.parameters(), lr=learning_rate, scale_parameter=False, relative_step=False, warmup_init=False)
optimizer = Adafactor2(
  model.parameters(),
  scale_parameter=True,
  relative_step=True,
  warmup_init=True,
  lr=None,
)
scheduler = AdafactorSchedule(optimizer, learning_rate)

# Resume training: restore state only when resuming is requested and both the
# checkpoint and the statistics file exist; otherwise keep the fresh-start
# values defined above.
if not resume:
  # Train the model from scratch.
  pass
elif not os.path.isfile(resume_checkpoint_path):
  logging.info(f'resume_checkpoint_path={resume_checkpoint_path} does not exist, train from scratch')
elif not os.path.isfile(statistics_path):
  logging.info(f'statistics_path={statistics_path} does not exist, train from scratch')
else:
  # map_location='cpu' lets a checkpoint saved from a CUDA run be loaded when
  # this session fell back to CPU; without it torch.load fails to deserialize
  # CUDA tensors. The model is still on CPU at this point and is moved to
  # `device` afterwards, so loading onto CPU matches the existing flow.
  # NOTE(review): torch.load unpickles the file; only load trusted checkpoints.
  # Statistics are stored separately so later data analysis can read them easily.
  statistics = torch.load(statistics_path, map_location='cpu')
  # A checkpoint is saved once per TRAIN_ITERATION.
  checkpoint = torch.load(resume_checkpoint_path, map_location='cpu')
  early_stopping.load_state_dict(checkpoint['early_stopping'])

  model.load_state_dict(checkpoint['model'])
  train_sampler.load_state_dict(checkpoint['sampler'])
  # Both samplers share the same epoch.
  validate_sampler.epoch = train_sampler.epoch
  resume_epoch = checkpoint['epoch']
  # The learning rate is not restored explicitly (scheduler.get_lr returns a
  # list of rates); the optimizer state is restored as a whole instead.
  optimizer.load_state_dict(checkpoint['optimizer'])
  logging.info(f'resume training with epoch={resume_epoch}')
  logging.info(f'statistics = {statistics}')

# Move the model to the selected device and continue from the restored epoch
# (0 when nothing was restored).
model.to(device)
epoch = resume_epoch
loop_start_time = time.time()  # reference point for the whole loop
start_time = time.time()  # reference point for the current epoch
logging.info(f'-------train loop starts, start_time={start_time:.3f}s-------')

# Each pass runs one train() call and then saves a checkpoint; validation and
# early stopping happen only once the train sampler reports a finished epoch.
while epoch < cf.NUM_EPOCHS:
  # NOTE(review): `accumulation_steps` is lowercase unlike the other cf
  # fields used in this notebook - confirm the attribute exists.
  train_loss = train(
    model,
    device,
    train_loader,
    criterion,
    optimizer,
    scheduler,
    accumulation_steps=cf.accumulation_steps,
  )
  statistics['train_loss'].append(train_loss)
  current_lr = scheduler.get_lr()

  # The sampler's epoch counter running ahead of ours means the training data
  # has been sampled through one full round; only then evaluate.
  if train_sampler.epoch > epoch:
    validate_sampler.reset_state()
    validate_loss = evaluate(model, device, validate_loader, criterion)
    statistics['eval_loss'].append(validate_loss)
    logging.info(
      f'epoch={epoch} finish, time={time.time()-start_time:.3f}s, train_loss={train_loss}, validate_loss={validate_loss}'
      f', with lr={current_lr}'
    )

    early_stopping(validate_loss)
    if early_stopping.stop:
      # NOTE(review): leaving here skips the save step below, so the losses
      # appended in this pass are not written to statistics_path.
      logging.info(f'early stoping')
      break

    # Start the next epoch with a fresh timer and sampler state.
    epoch += 1
    start_time = time.time()
    train_sampler.reset_state()

  # Save model, sampler, early-stopping and optimizer state, plus statistics.
  statistics['epoch'] = epoch
  checkpoint = dict(
    epoch=epoch,
    model=model.state_dict(),
    sampler=train_sampler.state_dict(),
    early_stopping=early_stopping.state_dict(),
    optimizer=optimizer.state_dict(),
  )
  torch.save(checkpoint, resume_checkpoint_path)
  torch.save(statistics, statistics_path)
  logging.info(f'save model and statistics to {checkpoints_dir}')
logging.info(f'-------train loop ends, time={time.time()-loop_start_time:.3f}s-------')


In [None]:
# Remove the logs directory recursively.
# NOTE(review): this targets /content/logs, while the rest of the notebook
# writes its logs under /kaggle/working/logs - confirm the intended path.
!rm /content/logs -r