In [1]:
import os
# Expose only physical GPU index 1 to this process. The assignment sits above
# the transformers import on purpose-looking grounds.
# NOTE(review): presumably it must run before any CUDA-aware library initialises
# the driver - keep it ahead of the torch/transformers imports. The memory
# summary printed further down reports "device ID 0", which is consistent with
# the single visible device being renumbered.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor, Wav2Vec2CTCTokenizer, Wav2Vec2FeatureExtractor
from transformers import Trainer, TrainingArguments
from datasets import load_dataset, load_metric
from pathlib import Path
from tqdm.auto import tqdm
import numpy as np
import json
import sys
# Locate the repository-level ``data`` directory two levels above the first
# sys.path entry and make it importable, so the project-local ``csd`` and
# ``CMUdict`` packages imported just below can be resolved.
# NOTE(review): this assumes sys.path[0] is the notebook's own directory, which
# depends on how the kernel was launched - confirm for other front-ends.
parent_dir = Path(sys.path[0])
data_dir = parent_dir.parent.parent.joinpath('data')
if sys.path.count(str(data_dir)) == 0:
    # Appended (not inserted) so existing entries keep their priority.
    sys.path.append(str(data_dir))
from csd.csd import create_csd_dataset
from CMUdict.utils import CMUDict, VOCAB
# Inspired by https://www.kaggle.com/code/vitouphy/phoneme-recognition-with-wav2vec2

  from .autonotebook import tqdm as notebook_tqdm


DATA_DIR already in sys.path ['/mnt/storage/projects/programming/song-transcription/code/transcription', '/home/victor/miniconda3/envs/song-transcription/lib/python311.zip', '/home/victor/miniconda3/envs/song-transcription/lib/python3.11', '/home/victor/miniconda3/envs/song-transcription/lib/python3.11/lib-dynload', '', '/home/victor/miniconda3/envs/song-transcription/lib/python3.11/site-packages', '/home/victor/miniconda3/envs/song-transcription/lib/python3.11/site-packages/huggingface_hub-0.19.4-py3.8.egg', '/tmp/tmpdt537dnt', '/mnt/storage/projects/programming/song-transcription/data']


In [2]:
# Build the CSD dataset through the project helper imported from csd.csd.
# The repr displayed below shows a DatasetDict with train / test / validation
# splits, each carrying 'audio', 'phonemes' and 'lyrics' columns among others.
csd_dataset = create_csd_dataset()
csd_dataset

100%|██████████| 100/100 [00:58<00:00,  1.71it/s]
Downloading data files: 100%|██████████| 1/1 [00:00<00:00, 4766.25it/s]
Extracting data files: 100%|██████████| 1/1 [00:00<00:00, 456.45it/s]
Generating train split: 1728 examples [00:00, 155248.09 examples/s]


DatasetDict({
    train: Dataset({
        features: ['id', 'song_id', 'song_name', 'start', 'end', 'audio', 'phonemes', 'lyrics'],
        num_rows: 1382
    })
    test: Dataset({
        features: ['id', 'song_id', 'song_name', 'start', 'end', 'audio', 'phonemes', 'lyrics'],
        num_rows: 173
    })
    validation: Dataset({
        features: ['id', 'song_id', 'song_name', 'start', 'end', 'audio', 'phonemes', 'lyrics'],
        num_rows: 173
    })
})

In [3]:
# Display the first training row as a sanity check of the schema: the output
# shows the decoded audio array with its sampling rate plus the space-separated
# phoneme string that is later used as the CTC target.
csd_dataset['train'][0]

{'id': 710,
 'song_id': 'en022a',
 'song_name': 'Joy to the world',
 'start': 10.8662,
 'end': 15.205,
 'audio': {'path': '/mnt/storage/projects/programming/song-transcription/data/csd/segments/en022a_10.8662_15.205.wav',
  'array': array([-0.00221137, -0.0009494 ,  0.00193736, ...,  0.00469174,
          0.00313565,  0.        ]),
  'sampling_rate': 16000},
 'phonemes': 'L EH T EH EH V R IY HH AA AA R T P R IY P EH EH R HH IH M R UW UW M',
 'lyrics': 'Let every heart prepare him room'}

In [11]:
# Load the CTC tokenizer from the local ``working/`` directory, overriding the
# unknown / padding / word-delimiter tokens explicitly.
tokenizer = Wav2Vec2CTCTokenizer.from_pretrained(
    'working/',
    unk_token='[UNK]',
    pad_token='[PAD]',
    word_delimiter_token='|',
)
# Displayed so the vocabulary size can be checked against the model's CTC head.
tokenizer.vocab_size

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


42

In [12]:
# Raw-waveform feature extractor: single-channel 16 kHz input, zero padding,
# per-utterance normalisation, and an attention mask returned with each batch.
feature_extractor = Wav2Vec2FeatureExtractor(
    feature_size=1,
    sampling_rate=16_000,
    padding_value=0.0,
    do_normalize=True,
    return_attention_mask=True,
)
feature_extractor

Wav2Vec2FeatureExtractor {
  "do_normalize": true,
  "feature_extractor_type": "Wav2Vec2FeatureExtractor",
  "feature_size": 1,
  "padding_side": "right",
  "padding_value": 0.0,
  "return_attention_mask": true,
  "sampling_rate": 16000
}

In [13]:
# Bundle the audio feature extractor and the label tokenizer into one processor
# so the rest of the notebook can reach both through a single object.
processor = Wav2Vec2Processor(
    feature_extractor=feature_extractor,
    tokenizer=tokenizer,
)
processor

Wav2Vec2Processor:
- feature_extractor: Wav2Vec2FeatureExtractor {
  "do_normalize": true,
  "feature_extractor_type": "Wav2Vec2FeatureExtractor",
  "feature_size": 1,
  "padding_side": "right",
  "padding_value": 0.0,
  "return_attention_mask": true,
  "sampling_rate": 16000
}

- tokenizer: Wav2Vec2CTCTokenizer(name_or_path='working/', vocab_size=42, model_max_length=1000000000000000019884624838656, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '[UNK]', 'pad_token': '[PAD]', 'additional_special_tokens': [AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=True), AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True)]}, clean_up_tokenization_spaces=True)

In [14]:
def prepare_dataset(batch):
    """Convert one dataset example into model inputs and CTC labels.

    Args:
        batch: A single example (mapping) with an ``"audio"`` entry holding
            ``"array"`` and ``"sampling_rate"``, and a ``"phonemes"`` string.

    Returns:
        The same mapping, updated in place with three extra keys:
        ``"input_values"`` (processed waveform), ``"input_length"`` (its
        length) and ``"labels"`` (token ids of the phoneme string).
    """
    audio = batch["audio"]

    # The processor returns a batch of size one; [0] unwraps it.
    batch["input_values"] = processor(audio["array"], sampling_rate=audio["sampling_rate"]).input_values[0]
    batch["input_length"] = len(batch["input_values"])
    # Tokenise the target text with the tokenizer directly. This replaces the
    # deprecated ``processor.as_target_processor()`` context manager, which did
    # nothing more than route the call to this same tokenizer.
    batch["labels"] = processor.tokenizer(batch["phonemes"]).input_ids
    return batch
# Run prepare_dataset over every split of the DatasetDict, one example per call
# (batched=True is not passed). No remove_columns is given, so the original
# columns stay next to the new input_values / input_length / labels columns.
# The name csd_dataset is rebound to the mapped result.
csd_dataset = csd_dataset.map(prepare_dataset)

Map: 100%|██████████| 1382/1382 [00:59<00:00, 23.16 examples/s]
Map: 100%|██████████| 173/173 [00:09<00:00, 18.85 examples/s]
Map: 100%|██████████| 173/173 [00:08<00:00, 19.52 examples/s]


In [15]:
import torch

from dataclasses import dataclass
from typing import Dict, List, Union

@dataclass
class DataCollatorCTCWithPadding:
    """
    Adapted from: https://www.kaggle.com/code/vitouphy/phoneme-recognition-with-wav2vec2
    Data collator that dynamically pads the inputs and the labels it receives.

    Inputs and labels are padded separately because they have different lengths
    and use different padding values; padded label positions are then set to
    -100 so they are ignored by the loss.

    Args:
        processor (:class:`~transformers.Wav2Vec2Processor`)
            The processor used for processing the data. Its ``feature_extractor``
            pads the audio inputs and its ``tokenizer`` pads the label ids.
        padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
            Padding strategy forwarded unchanged to both ``pad`` calls:
            * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a
              single sequence is provided).
            * :obj:`'max_length'`: Pad to a maximum length. This collator does not expose a ``max_length``
              argument, so the underlying ``pad`` default applies.
            * :obj:`False` or :obj:`'do_not_pad'`: No padding (i.e., can output a batch with sequences of
              different lengths).
    """

    # Quoted forward reference: the class can be created even when the
    # transformers name is not resolvable at class-definition time.
    processor: "Wav2Vec2Processor"
    padding: Union[bool, str] = True

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Collate a list of examples into one padded batch.

        Args:
            features: Examples, each with ``"input_values"`` and ``"labels"``.

        Returns:
            The padded input batch with an added ``"labels"`` tensor in which
            padding positions are -100.
        """
        # Split inputs and labels since they have different lengths and need
        # different padding methods.
        input_features = [{"input_values": feature["input_values"]} for feature in features]
        label_features = [{"input_ids": feature["labels"]} for feature in features]

        # Call the two sub-processors directly instead of going through the
        # deprecated ``processor.as_target_processor()`` context manager.
        batch = self.processor.feature_extractor.pad(
            input_features,
            padding=self.padding,
            return_tensors="pt",
        )
        labels_batch = self.processor.tokenizer.pad(
            label_features,
            padding=self.padding,
            return_tensors="pt",
        )

        # Replace padding with -100 so those positions are ignored by the loss.
        labels = labels_batch["input_ids"].masked_fill(labels_batch["attention_mask"].ne(1), -100)
        batch["labels"] = labels

        return batch

# Collator instance later handed to the Trainer; padding=True pads each batch
# to its longest member.
data_collator = DataCollatorCTCWithPadding(padding=True, processor=processor)
# Word-error-rate and character-error-rate scorers read by compute_metrics.
# NOTE(review): the cell output shows a warning attributed to the
# load_metric("wer") line - presumably the datasets deprecation notice for
# load_metric; confirm before upgrading the datasets package.
wer_metric, cer_metric = load_metric("wer"), load_metric("cer")
def compute_metrics(pred):
    """Compute word and character error rate for one evaluation pass.

    Args:
        pred: Object exposing ``predictions`` (logits; the arg-max over the
            last axis is taken as the predicted token id) and ``label_ids``
            (reference ids, where -100 marks positions to ignore).

    Returns:
        A dict with keys ``"wer"`` and ``"cer"``.
    """
    pred_ids = np.argmax(pred.predictions, axis=-1)

    # Swap the -100 sentinels for the pad id so the labels can be decoded.
    # np.where builds a new array, so the caller's ``pred.label_ids`` is left
    # untouched (the previous in-place assignment overwrote the sentinels).
    label_ids = np.where(pred.label_ids == -100, tokenizer.pad_token_id, pred.label_ids)

    pred_str = tokenizer.batch_decode(pred_ids)
    # Labels are not CTC output, so repeated tokens must not be collapsed.
    label_str = tokenizer.batch_decode(label_ids, group_tokens=False)

    wer = wer_metric.compute(predictions=pred_str, references=label_str)
    cer = cer_metric.compute(predictions=pred_str, references=label_str)

    return {
        "wer": wer,
        "cer": cer
    }

  wer_metric = load_metric("wer")


In [16]:
# Load the locally saved checkpoint and override its dropout / masking settings
# for this fine-tuning run. The CTC head is sized from the processor's
# tokenizer, and padding uses that tokenizer's pad id.
model = Wav2Vec2ForCTC.from_pretrained(
    "working/base2/final",
    attention_dropout=0.1,
    layerdrop=0.0,
    feat_proj_dropout=0.0,
    mask_time_prob=0.75,
    mask_time_length=10,
    mask_feature_prob=0.25,
    mask_feature_length=64,
    ctc_loss_reduction="mean",
    pad_token_id=processor.tokenizer.pad_token_id,
    vocab_size=len(processor.tokenizer),
    # Tolerate checkpoint tensors whose shape differs from what is requested
    # here. NOTE(review): presumably this covers the CTC head when the
    # vocabulary size differs from the checkpoint's - confirm.
    ignore_mismatched_sizes=True,
)
# Keep the convolutional feature encoder fixed during fine-tuning.
# freeze_feature_encoder() is the current API name; the previously used
# freeze_feature_extractor() is a deprecated alias for the same operation.
model.freeze_feature_encoder()
model



Wav2Vec2ForCTC(
  (wav2vec2): Wav2Vec2Model(
    (feature_extractor): Wav2Vec2FeatureEncoder(
      (conv_layers): ModuleList(
        (0): Wav2Vec2GroupNormConvLayer(
          (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,), bias=False)
          (activation): GELUActivation()
          (layer_norm): GroupNorm(512, 512, eps=1e-05, affine=True)
        )
        (1-4): 4 x Wav2Vec2NoLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
          (activation): GELUActivation()
        )
        (5-6): 2 x Wav2Vec2NoLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,), bias=False)
          (activation): GELUActivation()
        )
      )
    )
    (feature_projection): Wav2Vec2FeatureProjection(
      (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (projection): Linear(in_features=512, out_features=768, bias=True)
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder)

In [17]:
# Training configuration, grouped by concern. Values are unchanged.
training_args = TrainingArguments(
    output_dir='working/csd-model',
    # --- optimisation schedule ---
    max_steps=10000,
    warmup_steps=2000,
    learning_rate=3e-5,
    # --- batching and memory ---
    per_device_train_batch_size=8,
    # gradient_accumulation_steps=4,
    group_by_length=True,
    gradient_checkpointing=True,
    fp16=True,
    # --- logging, evaluation and checkpoints ---
    logging_steps=100,
    evaluation_strategy="steps",
    eval_steps=500,
    save_steps=500,  # same cadence as eval_steps
    save_total_limit=3,
    # No metric_for_best_model is given, so the Trainer's default selection
    # metric applies (eval loss per the TrainingArguments docs - confirm).
    load_best_model_at_end=True,
)

# The feature extractor is passed in the Trainer's ``tokenizer`` slot; the
# validation split (not the test split) is used for periodic evaluation.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=csd_dataset['train'],
    eval_dataset=csd_dataset['validation'],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=processor.feature_extractor,
)

# Release cached CUDA blocks, then print the allocator state before training
# starts so the baseline memory footprint is visible in the cell output.
torch.cuda.empty_cache()
print(torch.cuda.memory_summary(device=None, abbreviated=False))

|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      | 369028 KiB | 369028 KiB | 369028 KiB |      0 B   |
|       from large pool | 368384 KiB | 368384 KiB | 368384 KiB |      0 B   |
|       from small pool |    644 KiB |    644 KiB |    644 KiB |      0 B   |
|---------------------------------------------------------------------------|
| Active memory         | 369028 KiB | 369028 KiB | 369028 KiB |      0 B   |
|       from large pool | 368384 KiB | 368384 KiB | 368384 KiB |      0 B   |
|       from small pool |    644 KiB |    644 KiB |    644 KiB |      0 B   |
|---------------------------------------------------------------

In [18]:
# Start fine-tuning with the Trainer built in the previous cell. Per
# training_args, the loss is logged every 100 steps and evaluation (eval loss,
# WER, CER) runs every 500 steps, which is what the output below shows.
trainer.train()

  1%|          | 100/10000 [02:13<3:59:31,  1.45s/it]

{'loss': 5.4498, 'learning_rate': 1.4250000000000001e-06, 'epoch': 0.58}


  2%|▏         | 200/10000 [04:15<2:39:05,  1.03it/s]

{'loss': 3.2715, 'learning_rate': 2.9250000000000004e-06, 'epoch': 1.16}


  3%|▎         | 300/10000 [06:16<1:45:16,  1.54it/s]

{'loss': 2.9348, 'learning_rate': 4.425e-06, 'epoch': 1.73}


  4%|▍         | 400/10000 [08:30<4:38:06,  1.74s/it]

{'loss': 2.8174, 'learning_rate': 5.925e-06, 'epoch': 2.31}


  5%|▌         | 500/10000 [10:34<2:41:27,  1.02s/it]

{'loss': 2.6717, 'learning_rate': 7.425e-06, 'epoch': 2.89}


                                                     
  5%|▌         | 500/10000 [10:50<2:41:27,  1.02s/it]

{'eval_loss': 2.549992084503174, 'eval_wer': 0.9665071770334929, 'eval_cer': 0.6207503724773128, 'eval_runtime': 16.3021, 'eval_samples_per_second': 10.612, 'eval_steps_per_second': 1.35, 'epoch': 2.89}


  6%|▌         | 600/10000 [12:59<1:50:14,  1.42it/s] 

{'loss': 2.5332, 'learning_rate': 8.925e-06, 'epoch': 3.47}


  7%|▋         | 700/10000 [15:10<4:55:10,  1.90s/it]

{'loss': 2.4577, 'learning_rate': 1.0425e-05, 'epoch': 4.05}


  8%|▊         | 800/10000 [17:16<2:54:09,  1.14s/it]

{'loss': 2.315, 'learning_rate': 1.1910000000000001e-05, 'epoch': 4.62}


  9%|▉         | 900/10000 [19:18<2:04:15,  1.22it/s]

{'loss': 2.2234, 'learning_rate': 1.341e-05, 'epoch': 5.2}


 10%|█         | 1000/10000 [21:30<5:55:23,  2.37s/it]

{'loss': 2.1741, 'learning_rate': 1.491e-05, 'epoch': 5.78}


                                                      
 10%|█         | 1000/10000 [21:46<5:55:23,  2.37s/it]

{'eval_loss': 2.0739035606384277, 'eval_wer': 0.9320574162679426, 'eval_cer': 0.5508600839767033, 'eval_runtime': 16.518, 'eval_samples_per_second': 10.473, 'eval_steps_per_second': 1.332, 'epoch': 5.78}


 11%|█         | 1100/10000 [24:06<2:59:47,  1.21s/it] 

{'loss': 2.0782, 'learning_rate': 1.641e-05, 'epoch': 6.36}


 12%|█▏        | 1200/10000 [26:09<2:06:03,  1.16it/s]

{'loss': 2.0286, 'learning_rate': 1.791e-05, 'epoch': 6.94}


 13%|█▎        | 1300/10000 [28:15<5:20:28,  2.21s/it]

{'loss': 1.9959, 'learning_rate': 1.9410000000000002e-05, 'epoch': 7.51}


 14%|█▍        | 1400/10000 [30:30<3:20:31,  1.40s/it]

{'loss': 1.9462, 'learning_rate': 2.0909999999999998e-05, 'epoch': 8.09}


 15%|█▌        | 1500/10000 [32:30<2:09:29,  1.09it/s]

{'loss': 1.8397, 'learning_rate': 2.241e-05, 'epoch': 8.67}


                                                      
 15%|█▌        | 1500/10000 [32:46<2:09:29,  1.09it/s]

{'eval_loss': 1.6535133123397827, 'eval_wer': 0.8899521531100478, 'eval_cer': 0.4743329269944467, 'eval_runtime': 16.3501, 'eval_samples_per_second': 10.581, 'eval_steps_per_second': 1.346, 'epoch': 8.67}


 16%|█▌        | 1600/10000 [34:51<1:37:17,  1.44it/s] 

{'loss': 1.8, 'learning_rate': 2.3910000000000003e-05, 'epoch': 9.25}


 17%|█▋        | 1700/10000 [37:13<3:19:00,  1.44s/it]

{'loss': 1.7639, 'learning_rate': 2.541e-05, 'epoch': 9.83}


 18%|█▊        | 1800/10000 [39:18<2:16:49,  1.00s/it]

{'loss': 1.7135, 'learning_rate': 2.691e-05, 'epoch': 10.4}


 19%|█▉        | 1900/10000 [41:20<1:30:10,  1.50it/s]

{'loss': 1.6137, 'learning_rate': 2.841e-05, 'epoch': 10.98}


 20%|██        | 2000/10000 [43:37<3:41:49,  1.66s/it]

{'loss': 1.6522, 'learning_rate': 2.991e-05, 'epoch': 11.56}


                                                      
 20%|██        | 2000/10000 [43:54<3:41:49,  1.66s/it]

{'eval_loss': 1.4068852663040161, 'eval_wer': 0.8009569377990431, 'eval_cer': 0.388324529324123, 'eval_runtime': 16.6418, 'eval_samples_per_second': 10.395, 'eval_steps_per_second': 1.322, 'epoch': 11.56}


 21%|██        | 2100/10000 [46:13<2:26:13,  1.11s/it] 

{'loss': 1.5617, 'learning_rate': 2.96475e-05, 'epoch': 12.14}


 22%|██▏       | 2200/10000 [48:13<1:33:46,  1.39it/s]

{'loss': 1.519, 'learning_rate': 2.92725e-05, 'epoch': 12.72}


 23%|██▎       | 2300/10000 [50:27<4:18:37,  2.02s/it]

{'loss': 1.4904, 'learning_rate': 2.8897500000000002e-05, 'epoch': 13.29}


 24%|██▍       | 2400/10000 [52:36<2:30:14,  1.19s/it]

{'loss': 1.4342, 'learning_rate': 2.85225e-05, 'epoch': 13.87}


 25%|██▌       | 2500/10000 [54:38<1:40:36,  1.24it/s]

{'loss': 1.335, 'learning_rate': 2.81475e-05, 'epoch': 14.45}


                                                      
 25%|██▌       | 2500/10000 [54:55<1:40:36,  1.24it/s]

{'eval_loss': 0.9048542976379395, 'eval_wer': 0.6803827751196172, 'eval_cer': 0.2944602465122579, 'eval_runtime': 16.681, 'eval_samples_per_second': 10.371, 'eval_steps_per_second': 1.319, 'epoch': 14.45}


 25%|██▌       | 2509/10000 [55:19<5:08:58,  2.47s/it] 