In [1]:
import torch
import pandas as pd
import os
import librosa
import contextlib
from mutagen.mp3 import MP3
from datasets import load_dataset, Value, Features, Audio, Dataset, concatenate_datasets
from sklearn.preprocessing import minmax_scale
import numpy as np
import evaluate
import texthero as hero

os.cpu_count()

  from .autonotebook import tqdm as notebook_tqdm


20

In [2]:
print(torch.cuda.is_available())
print(torch.cuda.device_count())
# print(torch.cuda.current_device())
# print(torch.cuda.device(0))
print(torch.cuda.get_device_name(0))
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('NOT connected to a GPU!. Will use CPU!')
else:
  print('Connected to a GPU!')

True
1
NVIDIA GeForce RTX 3070 Ti
Connected to a GPU!


In [3]:
df = pd.read_excel(".\\200_selected.xlsx")
df['file_name'][1]

'.\\LARGE DATA\\larger sample\\QF18\\GK_swa_u0725_QF18.mp3'

In [4]:
df.drop(columns = ['Unnamed: 0', 'Uliza\'s Translations'], inplace = True)

In [5]:
audio_dir = "/Swahili_STT"
csv_dir = "Swahili_dir.xlsx"
audio_t = ".mp3"
base_dir = ".\\caches"

In [6]:
import re
def st_pre(df):
    df['transcription'] = hero.remove_diacritics(df['transcription'])
    df['transcription'] = hero.remove_whitespace(df['transcription'])
    for i in range(len(df)):
        df['transcription'][i] = re.sub("[\(\[].*?[\)\]]", "", df['transcription'][i])
        df['transcription'][i] = re.sub(r"(?<=\w)([A-Z])()", r" \1", df['transcription'][i])
    df['transcription'] = hero.remove_whitespace(df['transcription'])
    return df

In [7]:
df = st_pre(df)

In [8]:
df['transcription'][40]

'Ikiwa kulikuwa na kipengele cha kuzuia mtu kuiba pesa zako kwenye simu yako ningeinunua kwa sababu ingenisaidia.'

In [9]:
os.environ['HF_DATASETS_CACHE'] = os.path.join(base_dir, 'data_rash')
os.environ['HF_DATASETS_OFFLINE'] = '0'


features = Features(
    {   
#         "file_name": Value('string'),
        "file_name": Audio(sampling_rate=16000),
        "transcription": Value("string")
    }
)

swahili = Dataset.from_pandas(df, features = features, split = ['train', 'test'],
)
swahili

Dataset({
    features: ['file_name', 'transcription'],
    num_rows: 200
})

In [10]:
swahili[49]["transcription"]

'Ningewanunua ili kuwashinda walaghai hao kwa sababu.'

In [11]:
swahili[49]["file_name"]

{'path': '.\\LARGE DATA\\larger sample\\QF14\\GK_swa_u0644_QF14.mp3',
 'array': array([-1.13686838e-13,  1.25055521e-12,  3.41060513e-12, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00]),
 'sampling_rate': 16000}

In [12]:
print("Path: ", swahili[49]["file_name"]['path'])
print("Sampling rate: ", swahili[49]["file_name"]['sampling_rate'])

Path:  .\LARGE DATA\larger sample\QF14\GK_swa_u0644_QF14.mp3
Sampling rate:  16000


In [13]:
def getWavDuration(fname):
    audio = MP3(fname)
    duration =  audio.info.length

    return duration

def duration_infor(dataset):
    list_1 = []
    list_2 = []
    text_1 = []
    for i in range(dataset.num_rows):
        duration = getWavDuration(dataset[i]['file_name']['path'])
        list_1.append(duration)
        text_1.append(len(dataset[i]['transcription']))
        if duration > 30:
            list_2.append(duration)
    print("Total Files: ", len(list_1))
    print("Minimum length of audio in dataset: ", min(list_1))
    print("Average length of audio in dataset: ", sum(list_1)/len(list_1))
    print("Maximum lenth of audio in dataset: ", max(list_1))
    print("Audio files with more than 30 sec: ", len(list_2))
    print("Max size of string: ", max(text_1))
    print("Min size of string: ", min(text_1))
#     print("Unique characters: ", )
        
duration_infor(swahili)

Total Files:  200
Minimum length of audio in dataset:  2.664
Average length of audio in dataset:  15.518160000000007
Maximum lenth of audio in dataset:  29.376
Audio files with more than 30 sec:  0
Max size of string:  364
Min size of string:  14


In [14]:
import IPython.display as ipd
%matplotlib inline
import matplotlib.pyplot as plt
import librosa.display

In [15]:
which_file = 29
filename = swahili[which_file]["file_name"]['path']
y, sr = librosa.load(filename, sr = 16000)
print(y)
print(sr)
tempo, beat_frames = librosa.beat.beat_track(y=y, sr=sr)
print('Estimated tempo: {:.2f} beats per minute'.format(tempo))
beat_times = librosa.frames_to_time(beat_frames, sr=sr)
print(beat_times)

[0.0000000e+00 5.2386895e-10 7.8580342e-10 ... 0.0000000e+00 0.0000000e+00
 0.0000000e+00]
16000
Estimated tempo: 125.00 beats per minute
[ 0.288  0.768  1.248  1.728  2.176  2.624  3.072  3.52   3.968  4.416
  4.864  5.312  5.76   6.24   6.688  7.104  7.552  8.032  8.512  8.992
  9.472  9.952 10.432 10.944 11.36  11.84  12.352 12.8   13.312 13.792
 14.24  14.688 15.136 15.616 16.096 16.608 17.12  17.664 18.112 18.592
 19.04  19.52  20.032 20.544 21.088]


In [16]:
# Regex for matching zero witdh joiner variations.
STANDARDIZE_ZW = re.compile(r'(?<=\u09b0)[\u200c\u200d]+(?=\u09cd\u09af)')

# Regex for removing standardized zero width joiner, except in edge cases.
DELETE_ZW = re.compile(r'(?<!\u09b0)[\u200c\u200d](?!\u09cd\u09af)')

## Regex matching punctuations to remove.
# PUNC = re.compile(r'([\?\.।;:,!"\'])')
## Keeps fullstop(.), dari(|), comma(,), exclamaition(!) and question mark(?) and removes all other punctuations (semicolon (;), colon (:), double quote (") and single quote (')).
PUNC = re.compile(r'([;:"\'])')

def removeOptionalZW(text):
    """
    Removes all optional occurrences of ZWNJ or ZWJ from Bangla text.
    """
    text = STANDARDIZE_ZW.sub('\u200D', text)
    text = DELETE_ZW.sub('', text)
    return text

def removePunc(text):
    """
    Remove for punctuations from text.
    """
    text = PUNC.sub(r"", text)
    return text

In [17]:
swahili_split = swahili.train_test_split(test_size=0.15)
swahili_split

DatasetDict({
    train: Dataset({
        features: ['file_name', 'transcription'],
        num_rows: 170
    })
    test: Dataset({
        features: ['file_name', 'transcription'],
        num_rows: 30
    })
})

In [18]:
swahili_split['train'][0]['file_name']

{'path': '.\\LARGE DATA\\larger sample\\QH12\\GK_swa_u0672_QH12.mp3',
 'array': array([-1.13686838e-13,  1.25055521e-12,  3.41060513e-12, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00]),
 'sampling_rate': 16000}

In [None]:
import re
chars_to_remove_regex = '[\,\?\.\!\-\;\:\"\“\%\‘\”\�\']'

def remove_special_characters(batch):
    batch["sentence"] = re.sub(chars_to_remove_regex, '', batch["sentence"]).lower()
    return batch

In [None]:
common_voice_train = common_voice_train.map(remove_special_characters)
common_voice_test = common_voice_test.map(remove_special_characters)

In [19]:
from audiomentations import (
    AddBackgroundNoise, #Mixes in another sound to add background noise
    AddGaussianNoise, #Adds gaussian noise to the audio samples
    Compose,
    Gain, #Multiplies the audio by a random gain factor
    OneOf,
    PitchShift, #Shifts the pitch up or down without changing the tempo
    PolarityInversion, #Flips the audio samples upside down, reversing their polarity
    Limiter, #Applies dynamic range compression limiting the audio signal
    Mp3Compression, #Compresses the audio to lower the quality
    PitchShift, # Shifts the pitch up or down without changing the tempo
    RoomSimulator,# Simulates the effect of a room on an audio source
    SpecFrequencyMask, # Applies a frequency mask to the spectrogram
    TimeMask, #Makes a random part of the audio silent
    TimeStretch, #Changes the speed without changing the pitch
    Trim, #Trims leading and trailing silence from the audio
)

# define augmentation
augmentation = Compose(
    [
        TimeStretch(min_rate=0.9, max_rate=1.1, p=0.2, leave_length_unchanged=True),
#         Gain(min_gain_in_db=-6, max_gain_in_db=6, p=0.1),
        PitchShift(min_semitones=-4, max_semitones=4, p=0.2),
        AddGaussianNoise(min_amplitude=0.005, 
                         max_amplitude=0.015, 
                         p=1.0),
        Limiter(min_threshold_db=-16.0,
                max_threshold_db=-6.0,
                threshold_mode="relative_to_signal_peak",
                p=0.2,),
#         RoomSimulator(min_size_x = 2, max_size_x = 10,
#                      min_size_y = 2, min_size_y = 15,
#                      min_size_y = 7, min_size_y = 12,
#                      min_absorption_value = 0.10,
#                      max_absorption_value = 0.50, p =0.2),
        Trim(top_db=20.0,p=1.0),
        Mp3Compression(),
    ]
)

In [20]:
def augment_dataset(batch):
    # load and (possibly) resample audio data to 16kHz
    sample = batch['file_name']

    # apply augmentation
    augmented_waveform = augmentation(sample["array"], sample_rate=sample["sampling_rate"])
    batch["file_name"]["array"] = augmented_waveform
    return batch


In [21]:
augmented_raw_training_dataset = swahili_split["train"].map(
    augment_dataset, 
    num_proc=1, 
    desc="augment train dataset",
    load_from_cache_file=True, 
    # cache_file_name=os.path.join(base_dir, "another_try.arrow")
)

augment train dataset: 100%|██████████████████████████████████████████████████| 170/170 [01:31<00:00,  1.85 examples/s]


In [22]:
print("\n COMBINING Augmented Dataset with Normal Dataset..... \n")
# combine
swahili_split["train"] = concatenate_datasets([swahili_split["train"], augmented_raw_training_dataset])
swahili_split["train"] = swahili_split["train"].shuffle(seed=42)


 COMBINING Augmented Dataset with Normal Dataset..... 



In [23]:
print("\n\n AFTER AUGMENTATION, FINAL train and validation sets are: ")
print("\n FINAL DATASET: \n")
print(swahili_split)



 AFTER AUGMENTATION, FINAL train and validation sets are: 

 FINAL DATASET: 

DatasetDict({
    train: Dataset({
        features: ['file_name', 'transcription'],
        num_rows: 340
    })
    test: Dataset({
        features: ['file_name', 'transcription'],
        num_rows: 30
    })
})


In [24]:
swahili_split['train'].features

{'file_name': Audio(sampling_rate=16000, mono=True, decode=True, id=None),
 'transcription': Value(dtype='string', id=None)}

In [25]:
def print_model_size(mdl):
    torch.save(mdl.state_dict(), "tmp.pt")
    print("%.2f MB" %(os.path.getsize("tmp.pt")/1e6))
    os.remove('tmp.pt')

In [26]:
swahili_split['train'][0]

{'file_name': {'path': '.\\LARGE DATA\\larger sample\\QH11\\GK_swa_u0501_QH11.mp3',
  'array': array([-4.54747351e-12, -2.91038305e-11, -2.91038305e-11, ...,
          0.00000000e+00,  0.00000000e+00,  0.00000000e+00]),
  'sampling_rate': 16000},
 'transcription': 'Kuuza kwenye Mtandao kuna faida kwamba unakutana na wateja wengi ambao wanatafuta bidhaa zako lakini tena Tatizo lake ni moja. Uaminifu ni mdogo sana.'}

In [27]:
def extract_all_chars(batch):
    all_text = " ".join(batch["transcription"])
    vocab = list(set(all_text))
    return {"vocab": [vocab], "all_text": [all_text]}

vocabs = swahili_split.map(extract_all_chars, batched=True, batch_size=-1, keep_in_memory=True, remove_columns=swahili_split.column_names["train"])

Map: 100%|█████████████████████████████████████████████████████████████████| 340/340 [00:00<00:00, 56664.02 examples/s]
Map: 100%|████████████████████████████████████████████████████████████████████| 30/30 [00:00<00:00, 9999.14 examples/s]


In [28]:
vocab_list = list(set(vocabs["train"]["vocab"][0]) | set(vocabs["test"]["vocab"][0]))

vocab_dict = {v: k for k, v in enumerate(vocab_list)}
vocab_dict


{'H': 0,
 '.': 1,
 'W': 2,
 'C': 3,
 'B': 4,
 'g': 5,
 'y': 6,
 'v': 7,
 't': 8,
 "'": 9,
 'a': 10,
 'o': 11,
 'J': 12,
 'z': 13,
 'k': 14,
 'M': 15,
 'm': 16,
 'd': 17,
 'N': 18,
 'f': 19,
 'U': 20,
 'P': 21,
 'w': 22,
 'h': 23,
 'j': 24,
 'V': 25,
 'p': 26,
 'Y': 27,
 'l': 28,
 'r': 29,
 ',': 30,
 'K': 31,
 'e': 32,
 'u': 33,
 'T': 34,
 'S': 35,
 'L': 36,
 'b': 37,
 'c': 38,
 's': 39,
 'i': 40,
 'I': 41,
 'F': 42,
 '?': 43,
 ' ': 44,
 'n': 45,
 'A': 46,
 'R': 47}

In [29]:
vocab_dict["[UNK]"] = len(vocab_dict)
vocab_dict["[PAD]"] = len(vocab_dict)
len(vocab_dict)

50

In [30]:
import json
with open('vocab_W2V_tran.json', 'w') as vocab_file:
    json.dump(vocab_dict, vocab_file)

In [31]:
from transformers import Wav2Vec2CTCTokenizer

tokenizer = Wav2Vec2CTCTokenizer("./vocab_W2V_tran.json", unk_token="[UNK]", pad_token="[PAD]", word_delimiter_token=" ")

In [32]:
from transformers import Wav2Vec2ForCTC
import transformers
from transformers import Wav2Vec2CTCTokenizer, Wav2Vec2FeatureExtractor, Wav2Vec2Processor, Wav2Vec2ForCTC
from transformers import TrainingArguments, Trainer

feature_extractor = Wav2Vec2FeatureExtractor(
    feature_size=1,
    sampling_rate=16000,
    padding_value=0.0,
    padding_side="right",
    do_normalize=True,
    return_attention_mask=True,
)


Welcome to bitsandbytes. For bug reports, please submit your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
binary_path: D:\Downloads\Swahili-translation-English-STT--main\ASR\lib\site-packages\bitsandbytes\cuda_setup\libbitsandbytes_cuda116.dll
CUDA SETUP: Loading binary D:\Downloads\Swahili-translation-English-STT--main\ASR\lib\site-packages\bitsandbytes\cuda_setup\libbitsandbytes_cuda116.dll...


In [33]:
from transformers import Wav2Vec2Processor

processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)

In [34]:
swahili_split["train"][0]["file_name"]['path']

'.\\LARGE DATA\\larger sample\\QH11\\GK_swa_u0501_QH11.mp3'

In [35]:
swahili_split["train"][0]["file_name"]['array']

array([-4.54747351e-12, -2.91038305e-11, -2.91038305e-11, ...,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00])

In [36]:
import random
rand_int = random.randint(0, len(swahili_split["train"]))

print("Target text:", swahili_split["train"][rand_int]["transcription"])
print("Input array shape:", np.asarray(swahili_split["train"][rand_int]["file_name"]["array"]).shape)
print("Sampling rate:", swahili_split["train"][rand_int]["file_name"]["sampling_rate"])

Target text: Inaongeza idadi ya wateja Na wanawashiriki na marafiki, kwa hivyo huongeza idadi ya.
Input array shape: (244736,)
Sampling rate: 16000


In [37]:
def prepare_dataset(batch):
    audio = batch["file_name"]

    # batched output is "un-batched" to ensure mapping is correct
    batch["input_values"] = processor.feature_extractor(audio["array"], sampling_rate=audio["sampling_rate"]).input_values[0]
    batch["input_length"] = len(batch["input_values"])
    
    with processor.as_target_processor():
        batch["labels"] = processor(batch["transcription"]).input_ids
        
    return batch

In [38]:
swahili_split = swahili_split.map(prepare_dataset, remove_columns=swahili_split.column_names["train"], num_proc=1)

Map: 100%|███████████████████████████████████████████████████████████████████| 340/340 [00:02<00:00, 136.28 examples/s]
Map: 100%|██████████████████████████████████████████████████████████████████████| 30/30 [00:00<00:00, 93.39 examples/s]


In [39]:
from transformers.models.whisper.english_normalizer import BasicTextNormalizer
normalizer = BasicTextNormalizer()
wer_metric = evaluate.load("wer")
cer_metric = evaluate.load("cer")
bleu = evaluate.load("bleu")
do_normalize_eval = True

def compute_metrics(pred):
    pred_logits = pred.predictions
    pred_ids = np.argmax(pred_logits, axis=-1)

    pred.label_ids[pred.label_ids == -100] = processor.tokenizer.pad_token_id

    pred_str = processor.batch_decode(pred_ids)
    # we do not want to group tokens when computing the metrics
    label_str = processor.batch_decode(pred.label_ids, group_tokens=False)

    wer = wer_metric.compute(predictions=pred_str, references=label_str)
    if do_normalize_eval:
        pred_str = [normalizer(pred) for pred in pred_str]
        label_str = [normalizer(label) for label in label_str]

    print(pred_str)
    print(label_str)
    wer = 100*wer_metric.compute(predictions=pred_str, references=label_str)
    cer = 100*cer_metric.compute(predictions=pred_str, references=label_str)
    bleu_m = bleu.compute(predictions=pred_str, references=label_str)['bleu']
    
    return {"cer": cer, "wer": wer, "bleu": bleu_m}

In [40]:
# wer_metric = evaluate.load("wer")
# cer_metric = evaluate.load("cer")
# bleu = evaluate.load("bleu")
# do_normalize_eval = True

# def compute_metrics(pred):
#     pred_ids = pred.predictions
#     label_ids = pred.label_ids

#     label_ids[label_ids == -100] = processor.tokenizer.pad_token_id

#     pred_str = processor.tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
#     label_str = processor.tokenizer.batch_decode(label_ids, skip_special_tokens=True)

#     if do_normalize_eval:
#         pred_str = [normalizer(pred) for pred in pred_str]
#         label_str = [normalizer(label) for label in label_str]

#     print(pred_str)
#     print(label_str)
#     wer = 100*wer_metric.compute(predictions=pred_str, references=label_str)
#     cer = 100*cer_metric.compute(predictions=pred_str, references=label_str)
#     bleu_m = bleu.compute(predictions=pred_str, references=label_str)['bleu']
#     return {"cer": cer, "wer": wer, "bleu": bleu_m}

In [41]:
from dataclasses import dataclass
from typing import Any, Dict, List, Union

@dataclass
class DataCollatorCTCWithPadding:
    """
    Data collator that will dynamically pad the inputs received.
    Args:
        processor (:class:`~transformers.Wav2Vec2Processor`)
            The processor used for proccessing the data.
        padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
            Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
            among:
            * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
              sequence if provided).
            * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
              maximum acceptable input length for the model if that argument is not provided.
            * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
              different lengths).
    """

    processor: Wav2Vec2Processor
    padding: Union[bool, str] = True

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # split inputs and labels since they have to be of different lenghts and need
        # different padding methods
        input_features = [{"input_values": feature["input_values"]} for feature in features]
        label_features = [{"input_ids": feature["labels"]} for feature in features]

        batch = self.processor.pad(
            input_features,
            padding=self.padding,
            return_tensors="pt",
        )
        with self.processor.as_target_processor():
            labels_batch = self.processor.pad(
                label_features,
                padding=self.padding,
                return_tensors="pt",
            )

        # replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        batch["labels"] = labels

        return batch

In [42]:
data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)


In [43]:
model = Wav2Vec2ForCTC.from_pretrained(
    "PaschalK/wav2vec-XLSR-swahili",
    ctc_loss_reduction="mean", 
    pad_token_id=processor.tokenizer.pad_token_id,
)

Some weights of the model checkpoint at PaschalK/wav2vec-XLSR-swahili were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_v', 'wav2vec2.encoder.pos_conv_embed.conv.weight_g']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at PaschalK/wav2vec-XLSR-swahili and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0']
You should probably TRAIN this mo

In [44]:
# model.freeze_feature_encoder()

In [45]:
from transformers import TrainingArguments

training_args = TrainingArguments(
  output_dir=".\Wav2Vec",
  group_by_length=True,
  per_device_train_batch_size=1,
  per_device_eval_batch_size=1,

  evaluation_strategy="epoch",
  num_train_epochs=10,
  # fp16=True,
  # gradient_checkpointing=True,
  # save_steps=100,
  # eval_steps=100,
  logging_steps=500,
  learning_rate=1e-5,
  weight_decay=0.01,
  warmup_steps=100,
  save_total_limit=2,
)

In [46]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    data_collator=data_collator,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=swahili_split["train"],
    eval_dataset=swahili_split["test"],
    tokenizer=processor.feature_extractor,
)
trainer.train()



OutOfMemoryError: CUDA out of memory. Tried to allocate 132.00 MiB. GPU 0 has a total capacty of 8.00 GiB of which 0 bytes is free. Of the allocated memory 7.15 GiB is allocated by PyTorch, and 107.05 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF