In [None]:
# List the top level of the project's S3 bucket.
! aws s3 ls s3://monlam.ai.stt/

In [None]:
# Mirror the bucket's wav16k/ prefix into a local wav16k/ directory.
# Later cells build audio paths of the form wav16k/<file_name>.wav.
! aws s3 sync s3://monlam.ai.stt/wav16k wav16k

In [None]:
! mkdir tsv
! curl https://d38pmlk0v88drf.cloudfront.net/tsv/06_training.csv --output tsv/training.csv
! curl https://d38pmlk0v88drf.cloudfront.net/tsv/06_validation.csv --output tsv/validation.csv
! curl https://d38pmlk0v88drf.cloudfront.net/tsv/05_benchmarkings.csv --output tsv/test.csv


In [3]:
import pandas as pd
# Load the three manifests from tsv/. Later cells read the `file_name` column
# (to build audio paths) and the `uni` column (used as the target text).
dataTrain = pd.read_csv("tsv/training.csv")
dataValid = pd.read_csv("tsv/validation.csv")
dataTest = pd.read_csv("tsv/test.csv")

In [2]:
# Row counts of the training, validation and test manifests.
len(dataTrain), len(dataValid), len(dataTest)

(1046950, 15944, 8367)

In [3]:
# Exclude one specific clip from the training set and one from the test set,
# matched by exact file name.
# NOTE(review): the reason for excluding these two clips is not recorded in this
# notebook — presumably unreadable audio; confirm and document.
dataTrain = dataTrain[dataTrain['file_name'] != 'STT_AB00321_1248_4868796_to_4870964']
dataTest = dataTest[dataTest['file_name'] != 'STT_MV0246_0343_2208363_to_2216623']

In [4]:
# Add the local audio path of every clip as a `path` column.
# `assign` returns new frames instead of writing into the filtered slices
# produced earlier, so pandas' chained-assignment warning never fires and no
# longer has to be switched off globally.
def _wav_path(file_name):
    """Return the path of a clip inside the local wav16k/ directory."""
    return f'wav16k/{file_name}.wav'


dataTest = dataTest.assign(path=dataTest['file_name'].apply(_wav_path))
dataValid = dataValid.assign(path=dataValid['file_name'].apply(_wav_path))
dataTrain = dataTrain.assign(path=dataTrain['file_name'].apply(_wav_path))

In [6]:
from transformers import Wav2Vec2CTCTokenizer

# Build the CTC tokenizer from ./vocab.json, which must already exist in the
# working directory (this notebook does not create it). "|" is the word delimiter.
tokenizer = Wav2Vec2CTCTokenizer("./vocab.json", unk_token="[UNK]", pad_token="[PAD]", word_delimiter_token="|")

In [7]:
from transformers import Wav2Vec2FeatureExtractor
# Feature extractor configured for 16 kHz input, with normalization enabled and
# attention masks returned. 16000 matches the rate enforced in speech_file_to_array_fn.
feature_extractor = Wav2Vec2FeatureExtractor(feature_size=1, sampling_rate=16000, padding_value=0.0, do_normalize=True, return_attention_mask=True)

In [8]:
from transformers import Wav2Vec2Processor
# Bundle the feature extractor and the tokenizer. prepare_dataset reads this
# `processor` as a notebook global for both the audio and the text side.
processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)

In [None]:
# Write the processor (feature extractor + tokenizer) to ./mms_300_v2.
processor.save_pretrained("mms_300_v2")

In [9]:
import numpy as np
import pandas as pd
from datasets import Dataset
import torchaudio
from torchaudio.transforms import Resample
import os
import multiprocessing as mp
import logging

# Configure logging: messages of level ERROR and above go to error_log.log in
# the working directory, each prefixed with a timestamp and the level name.
# The processing functions below both print their errors and log them here.
logging.basicConfig(
    filename='error_log.log',
    level=logging.ERROR,
    format='%(asctime)s:%(levelname)s:%(message)s'
)

# Assuming 'processor' is predefined, e.g., from Hugging Face's transformers library
def _mark_invalid(batch):
    """Fill the output columns with empty placeholders and flag the row invalid.

    The placeholders keep the column types consistent across rows, so the row
    can be dropped afterwards by filtering on ``valid``.
    """
    batch["input_values"] = np.array([], dtype=np.float32)
    batch["labels"] = []
    batch["valid"] = False
    return batch


def _report_error(error_message):
    """Echo an error to stdout and record it through the logging module."""
    print(error_message)
    logging.error(error_message)


def prepare_dataset(batch):
    """Turn one row's waveform and transcript into model inputs and label ids.

    Relies on the notebook-global ``processor`` for both the audio and the
    text side.

    Args:
        batch: mapping for a single example with the keys ``speech`` (sequence
            of samples), ``sampling_rate`` and ``target_text``.

    Returns:
        The same mapping with ``input_values``, ``labels`` and ``valid`` set.
        On any failure (missing/empty speech, missing/empty transcript, or an
        exception raised while processing) ``valid`` is False and the other two
        hold empty placeholders; the error is printed and logged, never raised.
    """
    try:
        if "speech" not in batch or len(batch["speech"]) == 0:
            _report_error(f"Empty speech data in batch: {batch}")
            return _mark_invalid(batch)

        # Ensure speech is a NumPy array with a consistent dtype.
        batch["speech"] = np.array(batch["speech"], dtype=np.float32)

        # Check the transcript before running the feature extractor, so a row
        # that will be discarded anyway does not pay for audio processing.
        # A None transcript is treated the same as an empty string.
        if "target_text" not in batch or not batch["target_text"]:
            _report_error(f"Empty target text in batch: {batch}")
            return _mark_invalid(batch)

        features = processor(batch["speech"], sampling_rate=batch["sampling_rate"]).input_values
        # The processor output carries a leading batch axis; reshape to (n,).
        batch["input_values"] = np.squeeze(features)

        batch["labels"] = processor(text=batch["target_text"]).input_ids
        batch["valid"] = True
        return batch
    except Exception as e:
        _report_error(f"Error in prepare_dataset: {e}")
        return _mark_invalid(batch)

def speech_file_to_array_fn(batch):
    """Load one clip as a 16 kHz float32 waveform and attach its transcript.

    Args:
        batch: mapping for a single example with the keys ``path`` (audio file
            location) and ``uni`` (copied into ``target_text``).

    Returns:
        The same mapping with ``speech``, ``sampling_rate``, ``target_text``
        and ``valid`` set. If the file is missing, holds no samples, or cannot
        be loaded, ``valid`` is False, ``speech`` is an empty array and
        ``target_text`` is ""; the error is printed and logged, never raised.
    """
    try:
        if not os.path.exists(batch["path"]):
            raise OSError(f"File not found: {batch['path']}")

        speech_array, sampling_rate = torchaudio.load(batch["path"])

        if sampling_rate != 16000:
            print("Resampling")
            resampler = Resample(orig_freq=sampling_rate, new_freq=16000)
            speech_array = resampler(speech_array)
            sampling_rate = 16000

        # len() of the loaded tensor only measures its first dimension, which
        # stays non-zero for a file that holds no samples. numel() counts every
        # element, so a zero-length waveform is actually detected.
        if speech_array.numel() == 0:
            raise ValueError(f"Empty speech data in file: {batch['path']}")

        # Only the first row of the loaded tensor is kept as the waveform.
        batch["speech"] = speech_array[0].numpy().astype(np.float32)
        batch["sampling_rate"] = sampling_rate
        batch["target_text"] = batch["uni"]
        batch["valid"] = True
        return batch
    except Exception as e:
        # Best effort: flag the row as invalid instead of aborting the whole map.
        error_message = f"Error processing audio file {batch['path']}: {e}"
        print(error_message)
        logging.error(error_message)
        batch["speech"] = np.array([], dtype=np.float32)  # placeholder with a consistent dtype
        batch["sampling_rate"] = 16000
        batch["target_text"] = ""
        batch["valid"] = False
        return batch

def process_batch(batch_i, batch_df):
    """Preprocess one slice of the training manifest and save it to disk.

    The slice is turned into a Dataset, run through speech_file_to_array_fn and
    then prepare_dataset; after each stage the rows flagged ``valid == False``
    are dropped and the ``valid`` column is removed. The result is written to
    data/train_prepare_dataset_batch_<batch_i>.arrow.

    Any exception is printed and logged rather than raised, so the caller gets
    no error signal from a failed batch.

    Args:
        batch_i: index of the batch, used in progress messages and the output path.
        batch_df: pandas DataFrame holding the rows of this batch.
    """
    def _drop_invalid(dataset, filter_message):
        # Announce, keep only rows flagged valid, then discard the flag column.
        print(filter_message)
        kept = dataset.filter(lambda row: row['valid'])
        print(f"Removing valid column {batch_i}")
        return kept.remove_columns(['valid'])

    try:
        print(f"Processing speech batch {batch_i}")
        dataset = Dataset.from_pandas(batch_df)

        # Stage 1: load the audio for every row.
        dataset = dataset.map(speech_file_to_array_fn, batched=False)
        dataset = _drop_invalid(dataset, f"Filtering invalid speech rows {batch_i}")

        # Stage 2: compute model inputs and label ids.
        print(f"Processing prepare_dataset batch {batch_i}")
        dataset = dataset.map(prepare_dataset, batched=False)
        dataset = _drop_invalid(dataset, f"Filtering invalid dataset rows {batch_i}")

        # Persist the finished batch.
        dataset.save_to_disk(f"data/train_prepare_dataset_batch_{batch_i}.arrow")
        print(f"Saved processed batch {batch_i} to disk.")
    except Exception as e:
        error_message = f"Error processing batch {batch_i}: {e}"
        print(error_message)
        logging.error(error_message)


In [4]:
import math
total = len(dataTrain)
# Each batch holds 2% of the training rows, rounded down.
# NOTE: with fewer than 50 rows batch_size becomes 0 and the division below fails.
batch_size = math.floor(total * 2/100)

# Index of the last batch (batches are numbered from 0). The export loop gives
# the last batch every remaining row, so the rounding leftovers are not lost.
max_batch_i = math.floor(total/batch_size) - 1

print(f'total: {total}, batch_size: {batch_size}, max_batch_i: {max_batch_i}')

total: 1046950, batch_size: 20939, max_batch_i: 49


In [11]:
# CPU count reported by multiprocessing; the worker pool below is sized with it.
mp.cpu_count()

48

In [11]:
# Write each training batch to batch_data/batch_<i>.csv so that the
# preprocessing step can load one slice at a time.
# to_csv does not create missing directories, so make sure the target exists.
os.makedirs("batch_data", exist_ok=True)

for batch_i in range(0, max_batch_i + 1):
    start = batch_i * batch_size
    # The last batch has no upper bound: it also takes the rows left over by
    # the rounding in batch_size.
    stop = None if batch_i == max_batch_i else start + batch_size
    batch_df = dataTrain.iloc[start:stop]

    # Save the batch DataFrame to a CSV file
    batch_df.to_csv(f"batch_data/batch_{batch_i}.csv", index=False)
    print("saved", batch_i)

saved 0
saved 1
saved 2
saved 3
saved 4
saved 5
saved 6
saved 7
saved 8
saved 9
saved 10
saved 11
saved 12
saved 13
saved 14
saved 15
saved 16
saved 17
saved 18
saved 19
saved 20
saved 21
saved 22
saved 23
saved 24
saved 25
saved 26
saved 27
saved 28
saved 29
saved 30
saved 31
saved 32
saved 33
saved 34
saved 35
saved 36
saved 37
saved 38
saved 39
saved 40
saved 41
saved 42
saved 43
saved 44
saved 45
saved 46
saved 47
saved 48
saved 49


In [14]:
# Multiprocessing: preprocess every unfinished training batch, one task per batch.
results = []

# The with-block tears the workers down even if queuing a task raises
# (for example a missing batch CSV), so no pool is left running.
with mp.Pool(mp.cpu_count()) as pool:
    for batch_i in range(0, max_batch_i + 1):
        # Skip batches whose output already exists, so the cell can be re-run
        # to resume an interrupted job.
        if os.path.exists(f"data/train_prepare_dataset_batch_{batch_i}.arrow"):
            continue
        batch_df = pd.read_csv(f"batch_data/batch_{batch_i}.csv")
        # Leave out every clip whose name starts with STT_MV0833.
        # na=False makes a missing file_name count as "no match"; without it
        # the NaN result cannot be inverted with ~.
        # NOTE(review): the reason for excluding STT_MV0833 is not recorded here.
        is_excluded = batch_df['file_name'].str.startswith('STT_MV0833', na=False)
        batch_df = batch_df[~is_excluded]
        results.append(pool.apply_async(process_batch, args=(batch_i, batch_df)))

    # Leaving the with-block terminates the pool, so wait for the queued work
    # while still inside it.
    pool.close()
    pool.join()

# Ensure all processes are completed. process_batch catches its own exceptions,
# so failures show up in the printed output and error_log.log rather than here.
for result in results:
    result.get()


In [15]:
from datasets import Dataset

# Wrap the validation and test manifests as Dataset objects so they can go
# through the same two map stages as the training batches.
common_voice_valid = Dataset.from_pandas(dataValid)
common_voice_test = Dataset.from_pandas(dataTest)

In [16]:
# Validation set: load the audio, compute model inputs and labels, then save.
# NOTE(review): unlike process_batch, this cell never filters on the `valid`
# flag, so a clip that failed to load or process would stay in the saved
# dataset with empty placeholders — confirm whether that is intended.
common_voice_valid = common_voice_valid.map(speech_file_to_array_fn, remove_columns=common_voice_valid.column_names)
common_voice_valid = common_voice_valid.map(prepare_dataset, remove_columns=common_voice_valid.column_names)
common_voice_valid.save_to_disk("data/valid_prepare_dataset.arrow")

Map:   0%|          | 0/15944 [00:00<?, ? examples/s]

Map:   0%|          | 0/15944 [00:00<?, ? examples/s]

Saving the dataset (0/9 shards):   0%|          | 0/15944 [00:00<?, ? examples/s]

In [17]:
# Test set: load the audio, compute model inputs and labels, then save.
# NOTE(review): unlike process_batch, this cell never filters on the `valid`
# flag, so a clip that failed to load or process would stay in the saved
# dataset with empty placeholders — confirm whether that is intended.
common_voice_test = common_voice_test.map(speech_file_to_array_fn, remove_columns=common_voice_test.column_names)
common_voice_test = common_voice_test.map(prepare_dataset, remove_columns=common_voice_test.column_names)
common_voice_test.save_to_disk("data/test_prepare_dataset.arrow")

Map:   0%|          | 0/8367 [00:00<?, ? examples/s]

Map:   0%|          | 0/8367 [00:00<?, ? examples/s]

Saving the dataset (0/4 shards):   0%|          | 0/8367 [00:00<?, ? examples/s]

In [None]:
from datasets import load_from_disk, concatenate_datasets
# Collect the per-batch datasets written by process_batch, in batch order.
train_arr = []

for i in range(max_batch_i + 1):
    print(i)
    # Every batch index is expected on disk: a batch whose processing failed
    # was only logged, so its missing directory surfaces here as a load error.
    train_batch_i = load_from_disk(f'data/train_prepare_dataset_batch_{i}.arrow')
    train_arr.append(train_batch_i)

# Join all batches into the full training set.
common_voice_train = concatenate_datasets(train_arr)

# common_voice_train.save_to_disk("/media/monlamai/HD_volume_1/wav2vec2/train_prepare_dataset.arrow")
common_voice_train.save_to_disk("data/train_prepare_dataset.arrow")

0


Loading dataset from disk:   0%|          | 0/22 [00:00<?, ?it/s]

1


Loading dataset from disk:   0%|          | 0/22 [00:00<?, ?it/s]

2


Loading dataset from disk:   0%|          | 0/22 [00:00<?, ?it/s]

3


Loading dataset from disk:   0%|          | 0/22 [00:00<?, ?it/s]

4


Loading dataset from disk:   0%|          | 0/22 [00:00<?, ?it/s]

5


Loading dataset from disk:   0%|          | 0/22 [00:00<?, ?it/s]

6


Loading dataset from disk:   0%|          | 0/22 [00:00<?, ?it/s]

7


Loading dataset from disk:   0%|          | 0/22 [00:00<?, ?it/s]

8


Loading dataset from disk:   0%|          | 0/22 [00:00<?, ?it/s]

9


Loading dataset from disk:   0%|          | 0/22 [00:00<?, ?it/s]

10


Loading dataset from disk:   0%|          | 0/22 [00:00<?, ?it/s]

11


Loading dataset from disk:   0%|          | 0/22 [00:00<?, ?it/s]

12


Loading dataset from disk:   0%|          | 0/22 [00:00<?, ?it/s]

13


Loading dataset from disk:   0%|          | 0/22 [00:00<?, ?it/s]

14


Loading dataset from disk:   0%|          | 0/22 [00:00<?, ?it/s]

15


Loading dataset from disk:   0%|          | 0/22 [00:00<?, ?it/s]

16


Loading dataset from disk:   0%|          | 0/22 [00:00<?, ?it/s]

17


Loading dataset from disk:   0%|          | 0/22 [00:00<?, ?it/s]

18


Loading dataset from disk:   0%|          | 0/22 [00:00<?, ?it/s]

19


Loading dataset from disk:   0%|          | 0/22 [00:00<?, ?it/s]

20


Loading dataset from disk:   0%|          | 0/22 [00:00<?, ?it/s]

21


Loading dataset from disk:   0%|          | 0/22 [00:00<?, ?it/s]

22


Loading dataset from disk:   0%|          | 0/22 [00:00<?, ?it/s]

23


Loading dataset from disk:   0%|          | 0/22 [00:00<?, ?it/s]

24


Loading dataset from disk:   0%|          | 0/22 [00:00<?, ?it/s]

25


Loading dataset from disk:   0%|          | 0/22 [00:00<?, ?it/s]

26


Loading dataset from disk:   0%|          | 0/22 [00:00<?, ?it/s]

27


Loading dataset from disk:   0%|          | 0/22 [00:00<?, ?it/s]

28


Loading dataset from disk:   0%|          | 0/22 [00:00<?, ?it/s]

29


Loading dataset from disk:   0%|          | 0/22 [00:00<?, ?it/s]

30


Loading dataset from disk:   0%|          | 0/22 [00:00<?, ?it/s]

31


Loading dataset from disk:   0%|          | 0/22 [00:00<?, ?it/s]

32


Loading dataset from disk:   0%|          | 0/22 [00:00<?, ?it/s]

33


Loading dataset from disk:   0%|          | 0/22 [00:00<?, ?it/s]

34


Loading dataset from disk:   0%|          | 0/22 [00:00<?, ?it/s]

35


Loading dataset from disk:   0%|          | 0/22 [00:00<?, ?it/s]

36


Loading dataset from disk:   0%|          | 0/22 [00:00<?, ?it/s]

37


Loading dataset from disk:   0%|          | 0/22 [00:00<?, ?it/s]

38


Loading dataset from disk:   0%|          | 0/22 [00:00<?, ?it/s]

39


Loading dataset from disk:   0%|          | 0/22 [00:00<?, ?it/s]

40


Loading dataset from disk:   0%|          | 0/22 [00:00<?, ?it/s]

41


Loading dataset from disk:   0%|          | 0/22 [00:00<?, ?it/s]

42


Loading dataset from disk:   0%|          | 0/22 [00:00<?, ?it/s]

43


Loading dataset from disk:   0%|          | 0/22 [00:00<?, ?it/s]

44


Loading dataset from disk:   0%|          | 0/22 [00:00<?, ?it/s]

45


Loading dataset from disk:   0%|          | 0/22 [00:00<?, ?it/s]

46


Loading dataset from disk:   0%|          | 0/22 [00:00<?, ?it/s]

47


Loading dataset from disk:   0%|          | 0/22 [00:00<?, ?it/s]

48


Loading dataset from disk:   0%|          | 0/22 [00:00<?, ?it/s]

49


Loading dataset from disk:   0%|          | 0/22 [00:00<?, ?it/s]

Saving the dataset (0/1071 shards):   0%|          | 0/1046802 [00:00<?, ? examples/s]

In [None]:
!aws s3 cp data/valid_prepare_dataset.arrow s3://monlam.ai.stt/dataset/wav2vec2/valid_prepare_dataset.arrow --recursive
!aws s3 cp data/test_prepare_dataset.arrow s3://monlam.ai.stt/dataset/wav2vec2/test_prepare_dataset.arrow --recursive
!aws s3 cp data/train_prepare_dataset.arrow s3://monlam.ai.stt/dataset/wav2vec2/train_prepare_dataset.arrow--recursive

In [10]:
# Sync the three prepared dataset directories to the same S3 destinations
# used by the `aws s3 cp` cell.
!aws s3 sync data/valid_prepare_dataset.arrow s3://monlam.ai.stt/dataset/wav2vec2/valid_prepare_dataset.arrow 
!aws s3 sync data/test_prepare_dataset.arrow s3://monlam.ai.stt/dataset/wav2vec2/test_prepare_dataset.arrow 
!aws s3 sync data/train_prepare_dataset.arrow s3://monlam.ai.stt/dataset/wav2vec2/train_prepare_dataset.arrow

In [None]:
# mv HD_volume_1/wav2vec2/train_prepare_dataset.arrow SSD/wav2vec2
# aws s3 cp valid_prepare_dataset.arrow s3://monlam.ai.stt/dataset/wav2vec2/valid_prepare_dataset.arrow --recursive
# aws s3 cp test_prepare_dataset.arrow s3://monlam.ai.stt/dataset/wav2vec2/test_prepare_dataset.arrow --recursive
# aws s3 cp train_prepare_dataset.arrow s3://monlam.ai.stt/dataset/wav2vec2/train_prepare_dataset.arrow --recursive


In [None]:
# Write the processor to ./mms_300_v2 (same call as the earlier save cell).
processor.save_pretrained("mms_300_v2")

### Push best model to Hub

In [1]:
# Delete the per-batch intermediate datasets under data/.
# Only run this after the concatenated data/train_prepare_dataset.arrow has
# been saved: the concatenation cell reads these directories.
!rm data/train_prepare_dataset_batch_*.arrow --recursive 

In [None]:
from transformers import Wav2Vec2Processor
from transformers import Wav2Vec2ForCTC
# Load the trained checkpoint from an absolute path on the training machine;
# this cell only works where that directory exists.
model = Wav2Vec2ForCTC.from_pretrained("/home/ec2-user/Sagemaker/stt-wav2vec2/mms_300/mms_300_v1/checkpoint-1190000")
# NOTE(review): the processor is loaded from "mms_300_v1", while this notebook
# saves its processor to "mms_300_v2" and pushes under "mms_300_v2.1190" —
# confirm that v1 is the processor that belongs to this checkpoint.
processor = Wav2Vec2Processor.from_pretrained("mms_300_v1")

In [None]:
from huggingface_hub import notebook_login

# Interactive Hugging Face Hub login; needed before the push_to_hub calls below.
notebook_login()

In [None]:
# Publish the model and its processor under the same Hub repository name.
model_name = "mms_300_v2.1190"
model.push_to_hub(model_name)
processor.push_to_hub(model_name)