In [None]:
# ! aws s3 sync s3://monlam.ai.stt/wav16k wav16k

In [None]:
from datasets import load_from_disk
# Load the already-preprocessed test split from disk.
# NOTE(review): this path is the one a later cell writes with save_to_disk, so
# on a fresh machine this cell fails until that cell has been run once. The
# name `common_voice_test` is also rebound later from the test CSV.
common_voice_test =  load_from_disk( '/media/monlamai/SSD/wav2vec2/test_prepare_dataset.arrow')

In [None]:
# Echo the dataset object so the cell output shows its repr.
common_voice_test

In [None]:
! mkdir tsv
! curl https://d38pmlk0v88drf.cloudfront.net/tsv/06_train.csv --output tsv/train.csv
! curl https://d38pmlk0v88drf.cloudfront.net/tsv/06_val.csv --output tsv/validation.csv
! curl https://d38pmlk0v88drf.cloudfront.net/tsv/05_benchmark.csv --output tsv/test.csv
! curl https://d38pmlk0v88drf.cloudfront.net/tsv/vocab.json --output vocab.json

In [None]:
import pandas as pd
# Read the three split manifests (train, validation, test) into DataFrames.
dataTrain, dataValid, dataTest = (
    pd.read_csv(csv_path)
    for csv_path in ("tsv/train.csv", "tsv/validation.csv", "tsv/test.csv")
)

In [None]:
# Exclude one specific recording from the train split and one from the test
# split, matched by file_name (the reason for the exclusion is not recorded here).
train_keep = dataTrain['file_name'].ne('STT_AB00321_1248_4868796_to_4870964')
test_keep = dataTest['file_name'].ne('STT_MV0246_0343_2208363_to_2216623')
dataTrain = dataTrain[train_keep]
dataTest = dataTest[test_keep]

In [None]:
# Attach the absolute wav location of every utterance to each split.
# `assign` returns a new frame rather than writing into a filtered slice, so
# the chained-assignment warning never fires and no longer needs to be
# switched off globally.
WAV_DIR = '/media/monlamai/SSD/data/wav16k'


def _with_wav_path(frame):
    """Return a copy of `frame` with a `path` column derived from `file_name`."""
    return frame.assign(path=frame['file_name'].apply(lambda x: f'{WAV_DIR}/{x}.wav'))


dataTest = _with_wav_path(dataTest)
dataValid = _with_wav_path(dataValid)
dataTrain = _with_wav_path(dataTrain)

In [None]:
from transformers import Wav2Vec2CTCTokenizer

# CTC tokenizer built from the downloaded vocabulary file, with explicit
# unknown, padding and word-delimiter tokens.
tokenizer = Wav2Vec2CTCTokenizer(
    "./vocab.json",
    unk_token="[UNK]",
    pad_token="[PAD]",
    word_delimiter_token="|",
)

In [None]:
from transformers import Wav2Vec2FeatureExtractor
# Feature extractor for single-feature 16 kHz input; it normalizes the input,
# pads with 0.0 and returns an attention mask.
feature_extractor = Wav2Vec2FeatureExtractor(
    feature_size=1,
    sampling_rate=16000,
    padding_value=0.0,
    do_normalize=True,
    return_attention_mask=True,
)

In [None]:
from transformers import Wav2Vec2Processor
# Bundle the feature extractor and tokenizer into one processor object; the
# preprocessing function reads this module-level name.
processor = Wav2Vec2Processor(
    feature_extractor=feature_extractor,
    tokenizer=tokenizer,
)

In [None]:
from torchaudio.transforms import Resample
import torchaudio

def speech_file_to_array_fn(batch, target_sampling_rate=16000):
    """Load one row's audio file and attach the waveform and transcript to it.

    Args:
        batch: A single dataset row (mapping). Reads ``batch["path"]`` (audio
            file to load) and ``batch["uni"]`` (text copied to the target).
        target_sampling_rate: Rate in Hz the audio is resampled to when the
            file is stored at a different rate. Defaults to 16000, the rate
            the feature extractor in this notebook is configured with.

    Returns:
        The same ``batch`` with three keys added: ``"speech"`` (first channel
        of the waveform as a numpy array), ``"sampling_rate"`` and
        ``"target_text"``.
    """
    speech_array, sampling_rate = torchaudio.load(batch["path"])
    if sampling_rate != target_sampling_rate:
        print("resampling")
        resampler = Resample(orig_freq=sampling_rate, new_freq=target_sampling_rate)
        speech_array = resampler(speech_array)
        sampling_rate = target_sampling_rate

    # Only the first channel is kept; any further channels are discarded.
    batch["speech"] = speech_array[0].numpy()
    batch["sampling_rate"] = sampling_rate
    batch["target_text"] = batch["uni"]
    return batch

In [None]:
import numpy as np
def prepare_dataset(batch):
    """Convert one row's waveform and transcript into model inputs and labels.

    Args:
        batch: A single dataset row (mapping) with ``"speech"`` (waveform),
            ``"sampling_rate"`` and ``"target_text"`` keys.

    Returns:
        The same ``batch`` with ``"input_values"`` (processed waveform for
        this one example) and ``"labels"`` (token ids of the target text)
        added.
    """
    # The processor returns a batch holding exactly one example. Take that
    # example by index rather than np.squeeze: squeeze removes every
    # singleton axis, so a one-sample clip would collapse to a 0-d scalar.
    batch["input_values"] = processor(
        batch["speech"], sampling_rate=batch["sampling_rate"]
    ).input_values[0]

    batch["labels"] = processor(text=batch["target_text"]).input_ids
    return batch

In [None]:
import math
# Split the training frame into slices of about 5% of the rows each.
total = len(dataTrain)
# Keep at least one row per batch: with fewer than 20 rows, 5% rounds down to
# 0 and the division below used to raise ZeroDivisionError.
batch_size = max(1, total * 5 // 100)

# Index of the last batch (never negative); the processing loop lets that
# batch run to the end of the frame so it also takes the remainder rows.
max_batch_i = max(0, total // batch_size - 1)

print(f'total: {total}, batch_size: {batch_size}, max_batch_i: {max_batch_i}')

In [None]:
from datasets import Dataset

# Wrap the validation and test DataFrames as datasets.Dataset objects.
common_voice_valid, common_voice_test = (
    Dataset.from_pandas(frame) for frame in (dataValid, dataTest)
)

In [None]:
# Preprocess the training split one slice at a time and save each slice to
# its own on-disk dataset.
for batch_i in range(max_batch_i + 1):
    print('batch_i', batch_i)

    start = batch_i * batch_size
    # The last batch is open-ended so it also picks up the remainder rows.
    stop = None if batch_i == max_batch_i else start + batch_size
    batch_df = dataTrain[start:stop]

    raw_batch = Dataset.from_pandas(batch_df)
    # Pass 1: load audio. Pass 2: turn audio and text into model inputs.
    # Each pass drops all of its input columns.
    audio_batch = raw_batch.map(speech_file_to_array_fn, remove_columns=raw_batch.column_names)
    common_voice_train = audio_batch.map(prepare_dataset, remove_columns=audio_batch.column_names)
    common_voice_train.save_to_disk(f"/media/monlamai/SSD/wav2vec2/train_prepare_dataset_batch_{batch_i}.arrow")

In [None]:
# Validation split: load audio, convert to model inputs, then save to disk.
valid_audio = common_voice_valid.map(
    speech_file_to_array_fn,
    remove_columns=common_voice_valid.column_names,
)
common_voice_valid = valid_audio.map(
    prepare_dataset,
    remove_columns=valid_audio.column_names,
)
common_voice_valid.save_to_disk("/media/monlamai/SSD/wav2vec2/valid_prepare_dataset.arrow")

In [None]:
# Test split: load audio, convert to model inputs, then save to disk.
test_audio = common_voice_test.map(
    speech_file_to_array_fn,
    remove_columns=common_voice_test.column_names,
)
common_voice_test = test_audio.map(
    prepare_dataset,
    remove_columns=test_audio.column_names,
)
common_voice_test.save_to_disk("/media/monlamai/SSD/wav2vec2/test_prepare_dataset.arrow")

In [None]:
from datasets import load_from_disk, concatenate_datasets
# Reassemble the per-batch training datasets into one dataset and save it.
# The batch count comes from max_batch_i (the same bound the batch-writing
# loop uses) instead of a hard-coded 20, so no written batch is silently left
# out when the row count does not produce exactly 20 batches. After a kernel
# restart, re-run the batch-sizing cell first.
num_train_batches = max_batch_i + 1
train_arr = []

for i in range(num_train_batches):
    print(i)
    train_batch_i = load_from_disk(f'/media/monlamai/SSD/wav2vec2/train_prepare_dataset_batch_{i}.arrow')
    train_arr.append(train_batch_i)

common_voice_train = concatenate_datasets(train_arr)

common_voice_train.save_to_disk("/media/monlamai/SSD/wav2vec2/train_prepare_dataset.arrow")

In [None]:
# mv HD_volume_1/wav2vec2/train_prepare_dataset.arrow SSD/wav2vec2
# aws s3 cp valid_prepare_dataset.arrow s3://monlam.ai.stt/dataset/wav2vec2/valid_prepare_dataset.arrow --recursive
# aws s3 cp test_prepare_dataset.arrow s3://monlam.ai.stt/dataset/wav2vec2/test_prepare_dataset.arrow --recursive
# aws s3 cp train_prepare_dataset.arrow s3://monlam.ai.stt/dataset/wav2vec2/train_prepare_dataset.arrow --recursive


### Push best model to Hub

In [None]:

from transformers import Wav2Vec2Processor
from transformers import Wav2Vec2ForCTC
# Reload the checkpoint to publish, together with a processor.
# NOTE(review): the model comes from an absolute checkpoint path while the
# processor comes from the bare name "mms_300_v1" - confirm both refer to the
# same training run. This also rebinds the module-level `processor` that
# prepare_dataset reads.
model = Wav2Vec2ForCTC.from_pretrained(
    "/media/monlamai/SSD/mms_300/mms_300_v1/checkpoint-1350000"
)
processor = Wav2Vec2Processor.from_pretrained(
    "mms_300_v1"
)

In [None]:
from huggingface_hub import notebook_login

# Interactive Hugging Face Hub login for this notebook session; presumably
# required before the push_to_hub calls that follow - TODO confirm.
notebook_login()

In [None]:
# Publish the model first, then the processor, to the same Hub repository.
for artifact in (model, processor):
    artifact.push_to_hub("mms_300_v1.1350")