In [2]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Wed Dec 14 19:18:11 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   44C    P0    27W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [3]:
!add-apt-repository -y ppa:jonathonf/ffmpeg-4 -q
!apt update -q
!apt install -y ffmpeg -q

Usage: add-apt-repository <sourceline>

add-apt-repository is a script for adding apt sources.list entries.
It can be used to add any repository and also provides a shorthand
syntax for adding a Launchpad PPA (Personal Package Archive)
repository.

<sourceline> - The apt repository source line to add. This is one of:
  a complete apt line in quotes,
  a repo url and areas in quotes (areas defaults to 'main')
  a PPA shortcut.
  a distro component

  Examples:
    apt-add-repository 'deb http://myserver/path/to/repo stable myrepo'
    apt-add-repository 'http://myserver/path/to/repo myrepo'
    apt-add-repository 'https://packages.medibuntu.org free non-free'
    apt-add-repository http://extras.ubuntu.com/ubuntu
    apt-add-repository ppa:user/repository
    apt-add-repository ppa:user/distro/repository
    apt-add-repository multiverse

If --remove is given the tool will remove the given sourceline from your
sources.list


add-apt-repository: error: no such option: -q
Hit:1 https://cl

In [4]:
!pip install datasets>=2.6.1 -q
!pip install git+https://github.com/huggingface/transformers -q
!pip install librosa -q
!pip install evaluate>=0.30 -q
!pip install jiwer -q
!pip install gradio -q
!pip install more-itertools -q

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone


In [5]:
from datasets import interleave_datasets, load_dataset

def load_streaming_dataset(dataset_name, dataset_config_name, split, **kwargs):
    if "+" in split:
        # load multiple splits separated by the `+` symbol *with* streaming mode
        dataset_splits = [load_dataset(dataset_name, dataset_config_name, split=split_name, streaming=True, **kwargs) for split_name in split.split("+")]
        # interleave multiple splits to form one dataset
        interleaved_dataset = interleave_datasets(dataset_splits)
        return interleaved_dataset
    else:
        # load a single split *with* streaming mode
        dataset = load_dataset(dataset_name, dataset_config_name, split=split, streaming=True, **kwargs)
        return dataset

In [6]:
from datasets import IterableDatasetDict

raw_datasets = IterableDatasetDict()

raw_datasets["train"] = load_streaming_dataset("mozilla-foundation/common_voice_11_0", "bn", split="train", use_auth_token=False).take(100)
raw_datasets["test"] = load_streaming_dataset("mozilla-foundation/common_voice_11_0", "bn", split="test", use_auth_token=False).take(2)

In [7]:
raw_datasets["test"]

<datasets.iterable_dataset.IterableDataset at 0x7f42c0464bb0>

In [8]:
from transformers import WhisperProcessor

processor = WhisperProcessor.from_pretrained("openai/whisper-small", language="Bengali", task="transcribe")

In [9]:
raw_datasets["train"].features

{'client_id': Value(dtype='string', id=None),
 'path': Value(dtype='string', id=None),
 'audio': Audio(sampling_rate=48000, mono=True, decode=True, id=None),
 'sentence': Value(dtype='string', id=None),
 'up_votes': Value(dtype='int64', id=None),
 'down_votes': Value(dtype='int64', id=None),
 'age': Value(dtype='string', id=None),
 'gender': Value(dtype='string', id=None),
 'accent': Value(dtype='string', id=None),
 'locale': Value(dtype='string', id=None),
 'segment': Value(dtype='string', id=None)}

In [10]:
from datasets import Audio

raw_datasets = raw_datasets.cast_column("audio", Audio(sampling_rate=16000))

In [11]:
import string
import re

do_lower_case = False
do_remove_punctuation = False

punctuation_to_remove = string.punctuation.replace("'", "")  # don't remove apostrophes
punctuation_to_remove_regex = f"[{''.join(punctuation_to_remove)}]"

if do_remove_punctuation:
    print("Removing punctuation: ", punctuation_to_remove)

In [12]:
def prepare_dataset(batch):
    # load and (possibly) resample audio datato 16kHz
    audio = batch["audio"]

    # compute log-Mel input features from input audio array 
    batch["input_features"] = processor.feature_extractor(audio["array"], sampling_rate=audio["sampling_rate"]).input_features[0]
    # compute input length of audio sample in seconds
    batch["input_length"] = len(audio["array"]) / audio["sampling_rate"]
    
    # optional pre-processing steps
    transcription = batch["sentence"]
    if do_lower_case:
        transcription = transcription.lower()
    if do_remove_punctuation:
        transcription = re.sub(punctuation_to_remove_regex, " ", transcription).strip()
    
    # encode target text to label ids
    batch["labels"] = processor.tokenizer(transcription).input_ids
    return batch

In [13]:
vectorized_datasets = raw_datasets.map(prepare_dataset, remove_columns=list(next(iter(raw_datasets.values())).features)).with_format("torch")

In [14]:
vectorized_datasets["train"] = vectorized_datasets["train"].shuffle(
    buffer_size=500,
    seed=0,
)

In [15]:
max_input_length = 30.0

def is_audio_in_length_range(length):
    return length < max_input_length

In [16]:
vectorized_datasets["train"] = vectorized_datasets["train"].filter(
    is_audio_in_length_range,
    input_columns=["input_length"],
)

In [17]:
import torch

from dataclasses import dataclass
from typing import Any, Dict, List, Union

@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    processor: Any

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # split inputs and labels since they have to be of different lengths and need different padding methods
        # first treat the audio inputs by simply returning torch tensors
        input_features = [{"input_features": feature["input_features"]} for feature in features]
        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")

        # get the tokenized label sequences
        label_features = [{"input_ids": feature["labels"]} for feature in features]
        # pad the labels to max length
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")

        # replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        # if bos token is appended in previous tokenization step,
        # cut bos token here as it's append later anyways
        if (labels[:, 0] == self.processor.tokenizer.bos_token_id).all().cpu().item():
            labels = labels[:, 1:]

        batch["labels"] = labels

        return batch

In [18]:
data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)

In [19]:
import evaluate

metric = evaluate.load("wer")

In [20]:
# evaluate with the 'normalised' WER
do_normalize_eval = True

def compute_metrics(pred):
    pred_ids = pred.predictions
    label_ids = pred.label_ids

    # replace -100 with the pad_token_id
    label_ids[label_ids == -100] = processor.tokenizer.pad_token_id

    # we do not want to group tokens when computing the metrics
    pred_str = processor.tokenizer.batch_decode(pred_ids, skip_special_tokens=True, normalize=do_normalize_eval)
    label_str = processor.tokenizer.batch_decode(label_ids, skip_special_tokens=True, normalize=do_normalize_eval)

    wer = 100 * metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}

In [21]:
from transformers import WhisperForConditionalGeneration

model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")

In [22]:
model.config.forced_decoder_ids = None
model.config.suppress_tokens = []
model.config.use_cache = False

In [23]:
from transformers import Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    output_dir="./whisper-small-bn",  # your repo name
    per_device_train_batch_size=8,
    gradient_accumulation_steps=1,  # increase by 2x for every 2x decrease in batch size
    learning_rate=1e-5,
    warmup_steps=500,
    max_steps=10,
    gradient_checkpointing=True,
    fp16=True,
    evaluation_strategy="steps",
    per_device_eval_batch_size=32,
    predict_with_generate=True,
    generation_max_length=30,
    save_steps=10,
    eval_steps=10,
    logging_steps=25,
    report_to=["tensorboard"],
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,
    push_to_hub=False,
)

In [24]:
from transformers import TrainerCallback
from transformers.trainer_pt_utils import IterableDatasetShard
from torch.utils.data import IterableDataset

# trainer callback to reinitialise and reshuffle the streamable datasets at the beginning of each epoch
class ShuffleCallback(TrainerCallback):
    def on_epoch_begin(self, args, state, control, train_dataloader, **kwargs):
        if isinstance(train_dataloader.dataset, IterableDatasetShard):
            pass  # set_epoch() is handled by the Trainer
        elif isinstance(train_dataloader.dataset, IterableDataset):
            train_dataloader.dataset.set_epoch(train_dataloader.dataset._epoch + 1)

In [25]:
from transformers import Seq2SeqTrainer

trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=vectorized_datasets["train"],
    eval_dataset=vectorized_datasets["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=processor,
    callbacks=[ShuffleCallback()],
)

max_steps is given, it will override any value given in num_train_epochs
Using cuda_amp half precision backend


In [26]:
model.save_pretrained(training_args.output_dir)
processor.save_pretrained(training_args.output_dir)

Configuration saved in ./whisper-small-bn/config.json
Model weights saved in ./whisper-small-bn/pytorch_model.bin
Feature extractor saved in ./whisper-small-bn/preprocessor_config.json
tokenizer config file saved in ./whisper-small-bn/tokenizer_config.json
Special tokens file saved in ./whisper-small-bn/special_tokens_map.json
added tokens file saved in ./whisper-small-bn/added_tokens.json


In [27]:
trainer.train()

***** Running training *****
  Num examples = 80
  Num Epochs = 9223372036854775807
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 10
  Number of trainable parameters = 241734912
Reading metadata...: 16777it [00:00, 25058.44it/s]
The following columns in the training set don't have a corresponding argument in `WhisperForConditionalGeneration.forward` and have been ignored: input_length. If input_length are not expected by `WhisperForConditionalGeneration.forward`,  you can safely ignore this message.


Step,Training Loss,Validation Loss,Wer
10,No log,2.180656,100.0


***** Running Evaluation *****
  Num examples: Unknown
  Batch size = 32
Reading metadata...: 8353it [00:00, 22321.10it/s]
The following columns in the evaluation set don't have a corresponding argument in `WhisperForConditionalGeneration.forward` and have been ignored: input_length. If input_length are not expected by `WhisperForConditionalGeneration.forward`,  you can safely ignore this message.
Saving model checkpoint to ./whisper-small-bn/checkpoint-10
Configuration saved in ./whisper-small-bn/checkpoint-10/config.json
Model weights saved in ./whisper-small-bn/checkpoint-10/pytorch_model.bin
Feature extractor saved in ./whisper-small-bn/checkpoint-10/preprocessor_config.json
tokenizer config file saved in ./whisper-small-bn/checkpoint-10/tokenizer_config.json
Special tokens file saved in ./whisper-small-bn/checkpoint-10/special_tokens_map.json
added tokens file saved in ./whisper-small-bn/checkpoint-10/added_tokens.json


Training completed. Do not forget to share your model on hug

TrainOutput(global_step=10, training_loss=2.2869487762451173, metrics={'train_runtime': 59.8975, 'train_samples_per_second': 1.336, 'train_steps_per_second': 0.167, 'total_flos': 2.30868320256e+16, 'train_loss': 2.2869487762451173, 'epoch': 1.0})

In [28]:
preds = trainer.predict(test_dataset=vectorized_datasets["test"])

***** Running Prediction *****
  Num examples: Unknown
  Batch size = 32
Reading metadata...: 8353it [00:00, 21623.82it/s]
The following columns in the test set don't have a corresponding argument in `WhisperForConditionalGeneration.forward` and have been ignored: input_length. If input_length are not expected by `WhisperForConditionalGeneration.forward`,  you can safely ignore this message.


In [29]:
p = processor.tokenizer.convert_tokens_to_string([x for x in processor.tokenizer.convert_ids_to_tokens(preds[0][0]) if not x.startswith('<')])

In [30]:
processor.tokenizer.convert_tokens_to_string([x for x in processor.tokenizer.convert_ids_to_tokens(preds[0][1]) if not x.startswith('<')])

" H. Ronny's land a cricket dollar who you can't see me."

# Original Data

In [31]:
o = []
for i in raw_datasets["test"]:
  o.append((i['sentence']))
  break

Reading metadata...: 8353it [00:00, 16835.25it/s]


# Compare Original and Predicted data

In [32]:
print("Original : ", o[0])
print("Predicted : ", p)

Original :  গভীর জলের বার্থ ও বহুমুখী টার্মিনাল সহ, বন্দরটি দক্ষতার সাথে বিশ্বের বৃহত্তম বাল্ক ক্যারিয়ার পরিচালনা করতে সক্ষম।
Predicted :   Gavir Jaller Barth and Bohumukhi Terminal Shah are especially interested in the construction of the bulk carrier.
