In [1]:
# !pip install transformers
# !pip install datasets
import os
import re
import soundfile as sf
import torch
import numpy as np
from datasets import load_dataset, load_metric
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
import IPython.display as ipd
import pandas as pd
import string

In [2]:
# load pretrained model
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h").to("cuda")

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
def train_test_split(ds, test_size = 0.3):
    n = len(ds)
    idx = np.random.permutation(n)
    train = ds.select(idx[round(n*test_size):])
    test = ds.select(idx[:round(n*test_size)])
    return train, test

In [4]:
DATA_PATH = "K:\\AIPI540\\Individual Project\\cv-corpus-8.0-2022-01-19-en\\cv-corpus-8.0-2022-01-19\\en\\clips"
LABEL_PATH = "K:\\AIPI540\\Individual Project\\cv-corpus-8.0-2022-01-19-en\\cv-corpus-8.0-2022-01-19\\en"
SIZE = 100000

In [18]:
labels = pd.read_csv(os.path.join(LABEL_PATH, "train.tsv"), sep = "\t").loc[:SIZE, ['path', 'sentence']]

In [34]:
def preprocess(line):
    line['path'] = os.path.join(DATA_PATH, line['path'])
    line['sentence'] = re.sub('[{}]'.format(string.punctuation),"",line['sentence']).upper()
    return line

In [35]:
labels.apply(preprocess, axis = 1)

Unnamed: 0,path,sentence
0,K:\AIPI540\Individual Project\cv-corpus-8.0-20...,THEREAFTER THE CLASS WAS HIGHLY RESPECTED
1,K:\AIPI540\Individual Project\cv-corpus-8.0-20...,BANARAS HINDU UNIVERSITY IS A CENTRAL UNIVERSI...
2,K:\AIPI540\Individual Project\cv-corpus-8.0-20...,ON DISPLAY ARE HOME FURNISHINGS PIONEER TOOLS ...
3,K:\AIPI540\Individual Project\cv-corpus-8.0-20...,ELEVA AND STRUM EACH HOUSE AN ELEMENTARY SCHOOL
4,K:\AIPI540\Individual Project\cv-corpus-8.0-20...,THE EASTERN PORTION OF THE COUNTY LIES WITHIN ...
...,...,...
99996,K:\AIPI540\Individual Project\cv-corpus-8.0-20...,IN THIS PERIOD MEMBERS OF THE CHRISTIAN CLERGY...
99997,K:\AIPI540\Individual Project\cv-corpus-8.0-20...,CHRIST IT WAS DANGEROUS
99998,K:\AIPI540\Individual Project\cv-corpus-8.0-20...,FOLLOWING SEVERAL ATTEMPTS AT RELAUNCHING THE ...
99999,K:\AIPI540\Individual Project\cv-corpus-8.0-20...,MANY DIFFERENT COMPANIES MARKETED THESE DEVICE...


In [5]:
librispeech_samples_ds = load_dataset("patrickvonplaten/librispeech_asr_dummy", "clean", split="validation")

librispeech_train, librispeech_test = train_test_split(librispeech_samples_ds)
# load audio
audio_input, sample_rate = sf.read(librispeech_samples_ds[0]["file"])

# pad input values and return pt tensor
input_values = processor(audio_input, sampling_rate=sample_rate, return_tensors="pt").input_values

Reusing dataset librispeech_asr (C:\Users\14183\.cache\huggingface\datasets\patrickvonplaten___librispeech_asr\clean\2.1.0\f2c70a4d03ab4410954901bde48c54b85ca1b7f9bf7d616e7e2a72b5ee6ddbfc)


In [25]:
dataset = load_dataset("mozilla-foundation/common_voice_2_0", "en", use_auth_token=True, split = "train", cache_dir="K:\\AIPI540\\Individual Project")

Using the latest cached version of the module from C:\Users\14183\.cache\huggingface\modules\datasets_modules\datasets\mozilla-foundation--common_voice_2_0\b2ed1f9312e25872d6fa83ca8452089742c22baa3984ca5fb973aa6b26ce7c45 (last modified on Tue Apr 26 11:30:49 2022) since it couldn't be found locally at mozilla-foundation/common_voice_2_0., or remotely on the Hugging Face Hub.


Downloading and preparing dataset common_voice/en to K:\AIPI540\Individual Project\mozilla-foundation___common_voice\en\2.0.0\b2ed1f9312e25872d6fa83ca8452089742c22baa3984ca5fb973aa6b26ce7c45...


ConnectionError: Please set use_auth_token=True or use_auth_token='<TOKEN>' to download this dataset

In [24]:
dataset

DatasetDict({
    train: Dataset({
        features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment'],
        num_rows: 0
    })
    test: Dataset({
        features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment'],
        num_rows: 0
    })
    validation: Dataset({
        features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment'],
        num_rows: 0
    })
    other: Dataset({
        features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment'],
        num_rows: 0
    })
    invalidated: Dataset({
        features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment'],
        num_rows: 0
    })
})

In [36]:
type(librispeech_samples_ds)

datasets.arrow_dataset.Dataset

In [7]:
ipd.Audio(data=librispeech_samples_ds[1]["audio"]["array"], autoplay=True, rate=16000)

In [8]:
def prepare_dataset(batch):
    # audio = batch["audio"]
    audio_input, sample_rate = sf.read(batch["file"])
    # batched output is "un-batched"
    batch["input_values"] = processor(audio_input, sampling_rate=sample_rate).input_values[0]
    
    with processor.as_target_processor():
        batch["labels"] = processor(batch["text"]).input_ids
    return batch

In [9]:
librispeech_train_pro = librispeech_train.map(prepare_dataset, remove_columns=librispeech_train.column_names)
librispeech_test_pro = librispeech_test.map(prepare_dataset, remove_columns=librispeech_test.column_names)

100%|██████████| 51/51 [00:00<00:00, 216.97ex/s]
100%|██████████| 22/22 [00:00<00:00, 217.77ex/s]


In [10]:
model.freeze_feature_encoder()

In [11]:
repo_name = "E:\Graduate\\2021-2022 Term 2\\AIPI540\\Individual Project\\wav2vec2-base-960h-finetune"

In [12]:
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union

@dataclass
class DataCollatorCTCWithPadding:
    """
    Data collator that will dynamically pad the inputs received.
    Args:
        processor (:class:`~transformers.Wav2Vec2Processor`)
            The processor used for proccessing the data.
        padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
            Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
            among:
            * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
              sequence if provided).
            * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
              maximum acceptable input length for the model if that argument is not provided.
            * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
              different lengths).
    """

    processor: Wav2Vec2Processor
    padding: Union[bool, str] = True

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # split inputs and labels since they have to be of different lenghts and need
        # different padding methods
        input_features = [{"input_values": feature["input_values"]} for feature in features]
        label_features = [{"input_ids": feature["labels"]} for feature in features]

        batch = self.processor.pad(
            input_features,
            padding=self.padding,
            return_tensors="pt",
        )
        with self.processor.as_target_processor():
            labels_batch = self.processor.pad(
                label_features,
                padding=self.padding,
                return_tensors="pt",
            )

        # replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        batch["labels"] = labels

        return batch

In [13]:
from transformers import TrainingArguments

training_args = TrainingArguments(
  output_dir=repo_name,
  group_by_length=True,
  per_device_train_batch_size=4,
  gradient_accumulation_steps=2,
  evaluation_strategy="steps",
  num_train_epochs=150,
  gradient_checkpointing=True,
  fp16=True,
  save_steps=100,
  eval_steps=100,
  logging_steps=400,
  learning_rate=3e-4,
  warmup_steps=500,
  save_total_limit=5,
  push_to_hub=False,
)

In [14]:
data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)

In [15]:
wer_metric = load_metric("wer")

In [16]:
def compute_metrics(pred):
    pred_logits = pred.predictions
    pred_ids = np.argmax(pred_logits, axis=-1)

    pred.label_ids[pred.label_ids == -100] = processor.tokenizer.pad_token_id

    pred_str = processor.batch_decode(pred_ids)
    # we do not want to group tokens when computing the metrics
    label_str = processor.batch_decode(pred.label_ids, group_tokens=False)

    wer = wer_metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}

In [17]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    data_collator=data_collator,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=librispeech_train_pro,
    eval_dataset=librispeech_test_pro,
    tokenizer=processor.feature_extractor,
)

Using amp half precision backend


In [18]:
trainer.train()

***** Running training *****
  Num examples = 51
  Num Epochs = 150
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 2
  Total optimization steps = 900
  0%|          | 0/900 [00:00<?, ?it/s]

Error: Canceled future for execute_request message before replies were done

In [None]:
librispeech_test_pro[0]

{'input_values': [0.0008748362306505442,
  -0.0010453627910465002,
  -0.0010453627910465002,
  0.00010675661178538576,
  -0.00027728319400921464,
  0.0012588760582730174,
  0.0004907964030280709,
  -0.00027728319400921464,
  0.0004907964030280709,
  0.0004907964030280709,
  -0.002197482157498598,
  -0.0006613230216316879,
  -0.0033496017567813396,
  -0.0014294026186689734,
  -0.0010453627910465002,
  -0.002581522101536393,
  -0.005653840489685535,
  -0.005653840489685535,
  -0.005269800778478384,
  -0.0064219203777611256,
  -0.0064219203777611256,
  -0.007958079688251019,
  -0.008342118933796883,
  -0.0064219203777611256,
  -0.008342118933796883,
  -0.010262317955493927,
  -0.010646358132362366,
  -0.011030398309230804,
  -0.013718675822019577,
  -0.011798477731645107,
  -0.011030398309230804,
  -0.008726159110665321,
  -0.008726159110665321,
  -0.009878278709948063,
  -0.010262317955493927,
  -0.009494238533079624,
  -0.011030398309230804,
  -0.008726159110665321,
  -0.011414437554776

In [None]:
import torch.nn.functional as F
def demo(idx, model, device):
    # load audio
    audio_input, sample_rate = sf.read(librispeech_samples_ds[idx]["file"])
    # pad input values and return pt tensor
    input_values = processor(audio_input, sampling_rate=sample_rate, return_tensors="pt").input_values.to(device)
    # retrieve logits & take argmax
    pred = model(input_values).to(device)
    logits = pred.logits
    predicted_ids = torch.argmax(logits, dim=-1)[0]

    target_transcription = librispeech_samples_ds[idx]["text"]

    # transcribe
    transcription = processor.decode(predicted_ids)

    
    wer = wer_metric.compute(predictions=[transcription], references=[target_transcription])
    print(target_transcription)
    print(transcription)
    print(wer)

In [None]:
model_before = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
idx = 4
demo(idx, model, 'cpu')

print()
demo(idx, model_before, 'cpu')

NameError: name 'demo' is not defined