In [None]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

In [None]:
# Version specifiers must be quoted: unquoted, the shell reads `>=2.6.1` as an
# output redirection into a file named `=2.6.1`, so pip never sees the constraint.
! pip install "datasets>=2.6.1"
! pip install git+https://github.com/huggingface/transformers
! pip install librosa
# NOTE(review): the original spec was `evaluate>=0.30`. Once quoted, pip would read
# that as "at least 0.30.0" (minor version 30); presumably 0.3.0 was intended - confirm.
! pip install "evaluate>=0.3.0"
! pip install jiwer
! pip install gradio
# ! pip  install datasets
# ! pip install torch
! pip install accelerate -U
# ! pip install evaluate

In [None]:
# Hub client straight from the main branch.
!pip install git+https://github.com/huggingface/huggingface_hub
# Experiment tracking: quiet install, upgrading any existing copy.
!pip install --quiet --upgrade wandb
# Widget support for notebook front-ends.
!pip install ipywidgets

## Load Dataset

Using 🤗 Datasets, downloading and preparing data is extremely simple.
We can download and prepare the Common Voice splits in just one line of code.

First, ensure you have accepted the terms of use on the Hugging Face Hub: [mozilla-foundation/common_voice_11_0](https://huggingface.co/datasets/mozilla-foundation/common_voice_11_0). Once you have accepted the terms, you will have full access to the dataset and be able to download the data locally.

Note that the cells below do not load the Common Voice splits: they read Tibetan
transcripts from TSV manifests instead. Rows from `tsv/train.tsv` become the `train`
split, and rows from `tsv/validation.tsv` are saved as our held-out `test` set:

In [None]:
# -p keeps the cell re-runnable: plain `mkdir` errors once tsv/ already exists.
! mkdir -p tsv
# --fail makes curl exit with an error on an HTTP failure instead of saving the
# server's error page under a .tsv name.
! curl --fail https://d38pmlk0v88drf.cloudfront.net/tsv/06_train.tsv --output tsv/train.tsv
! curl --fail https://d38pmlk0v88drf.cloudfront.net/tsv/06_val.tsv --output tsv/validation.tsv
! curl --fail https://d38pmlk0v88drf.cloudfront.net/tsv/05_benchmarks.tsv --output tsv/test.tsv

In [1]:
import pandas as pd

# Read the tab-separated train and validation manifests into DataFrames.
train_df, validation_df = (
    pd.read_csv(tsv_path, sep='\t')
    for tsv_path in ('tsv/train.tsv', 'tsv/validation.tsv')
)
# test_df = pd.read_csv('tsv/test.tsv', sep='\t')

In [None]:
# Turn off pandas' chained-assignment warning for the rest of the session.
pd.options.mode.chained_assignment = None


def _segment_wav_path(file_name):
    """Build the segment path for ``file_name``.

    Every ".wav" and ".mp3" substring is removed from the name, then the result is
    placed under the segments directory with a ".wav" suffix.
    """
    stem = file_name.replace(".wav","").replace(".mp3","")
    return f'/media/monlamai/SSD/wav2vec2/segments/{stem}.wav'


# test_df['path'] = test_df['file_name'].apply(_segment_wav_path)
validation_df['path'] = validation_df['file_name'].apply(_segment_wav_path)
train_df['path'] = train_df['file_name'].apply(_segment_wav_path)

In [2]:
import math
from transformers import WhisperTokenizer
from transformers import WhisperFeatureExtractor
from datasets import DatasetDict, Dataset
from transformers import WhisperProcessor
from datasets import Audio 


# The training frame is processed in slices of roughly 5% of its rows.
total = len(train_df)
# max(1, ...) keeps batch_size non-zero for frames with fewer than 20 rows;
# without it the division below raises ZeroDivisionError.
batch_size = max(1, math.floor(total * 5/100))

# Index of the last slice (slices are numbered from 0).
max_batch_i = math.floor(total/batch_size) - 1

print(f'total: {total}, batch_size: {batch_size}, max_batch_i: {max_batch_i}')


# All three components are loaded from the same pretrained checkpoint.
whisper_checkpoint = "openai/whisper-small"
# The tokenizer and the processor share the same language/task settings.
transcribe_kwargs = dict(language="Tibetan", task="transcribe")

feature_extractor = WhisperFeatureExtractor.from_pretrained(whisper_checkpoint)
tokenizer = WhisperTokenizer.from_pretrained(whisper_checkpoint, **transcribe_kwargs)
processor = WhisperProcessor.from_pretrained(whisper_checkpoint, **transcribe_kwargs)


def prepare_dataset(batch):
    """Add model inputs and label ids to a record and return it.

    Reads ``batch["path"]`` (a mapping with ``"array"`` and ``"sampling_rate"``)
    and ``batch["wylie"]``; writes ``batch["input_features"]`` from the
    module-level ``feature_extractor`` and ``batch["labels"]`` from the
    module-level ``tokenizer``. The same mapping is mutated and returned.
    """
    decoded = batch["path"]
    extracted = feature_extractor(decoded["array"], sampling_rate=decoded["sampling_rate"])
    # The extractor returns a list of feature arrays; only the first is kept.
    batch["input_features"] = extracted.input_features[0]
    batch["labels"] = tokenizer(batch["wylie"]).input_ids
    return batch

    
# First batch to (re)build. Keep 0 for a fresh run so that every
# prepare_dataset_train_batch_{i} directory loaded later in the notebook exists;
# raise it only to resume an interrupted run.
start_batch_i = 0

for batch_i in range(start_batch_i, max_batch_i+1):
    print(f'batch_i: {batch_i}')
    if batch_i == max_batch_i:
        # Last batch: take every remaining row, including the remainder.
        batch_df = train_df[batch_i*batch_size:]
    else:
        batch_df = train_df[batch_i*batch_size:(batch_i+1)*batch_size]

    common_voice_train = Dataset.from_pandas(batch_df, split='train')
    common_voice = DatasetDict()
    common_voice["train"] = common_voice_train
    # Drop the manifest columns that prepare_dataset does not read.
    common_voice = common_voice.remove_columns(["file_name", "uni", "dept", 'url', 'grade', 'char_len', 'audio_len', 'non_word_count', 'non_bo_word_count', 'total_tokens'])
    # Casting "path" to Audio makes each record expose the audio at 16 kHz.
    common_voice = common_voice.cast_column("path", Audio(sampling_rate=16000))
    # After the map, only the columns written by prepare_dataset remain.
    common_voice['train'] = common_voice['train'].map(prepare_dataset, remove_columns=common_voice.column_names["train"], num_proc=1)
    common_voice['train'].save_to_disk(f'/media/monlamai/SSD/whisper/prepare_dataset_train_batch_{batch_i}')


total: 553494, batch_size: 27674, max_batch_i: 19


### Run once: prepare the held-out test split (built from the validation TSV)

In [None]:
# Manifest columns that prepare_dataset does not read.
unused_columns = ["file_name", "uni", "dept", 'url', 'grade', 'char_len', 'audio_len', 'non_word_count', 'non_bo_word_count', 'total_tokens']

# The validation frame is stored under the 'test' key.
common_voice = DatasetDict({'test': Dataset.from_pandas(validation_df, split='validation')})
common_voice = common_voice.remove_columns(unused_columns)
common_voice = common_voice.cast_column("path", Audio(sampling_rate=16000))

# Keep only what prepare_dataset writes, then save the result to disk.
raw_columns = common_voice.column_names["test"]
common_voice['test'] = common_voice['test'].map(prepare_dataset, remove_columns=raw_columns, num_proc=1)
common_voice['test'].save_to_disk('/media/monlamai/SSD/whisper/prepare_dataset_test')

In [3]:
from datasets import load_from_disk, concatenate_datasets, DatasetDict
# NOTE(review): the last batch index is hard-coded here rather than derived from
# the training frame - presumably so this cell can run on a fresh kernel; confirm
# it still matches the number of batch directories on disk.
max_batch_i = 19

# Load every saved training batch, in order.
temp = []
for batch_i in range(max_batch_i+1):
    print(f'batch_i: {batch_i}')
    temp.append(load_from_disk(f'/media/monlamai/SSD/whisper/prepare_dataset_train_batch_{batch_i}'))

# Join the batches into one train split and pair it with the saved test split.
common_voice = DatasetDict({
    'train': concatenate_datasets(temp),
    'test': load_from_disk('/media/monlamai/SSD/whisper/prepare_dataset_test'),
})

common_voice.save_to_disk('/media/monlamai/Monlam AI/spsither/whisper/prepare_dataset')

batch_i: 0
batch_i: 1
batch_i: 2
batch_i: 3
batch_i: 4
batch_i: 5
batch_i: 6
batch_i: 7
batch_i: 8
batch_i: 9
batch_i: 10
batch_i: 11
batch_i: 12
batch_i: 13
batch_i: 14
batch_i: 15
batch_i: 16
batch_i: 17
batch_i: 18
batch_i: 19


Saving the dataset (0/975 shards):   0%|          | 0/507324 [00:00<?, ? examples/s]

Saving the dataset (0/22 shards):   0%|          | 0/11296 [00:00<?, ? examples/s]