# Setup

In [1]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Thu Jun  6 22:04:29 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 536.23                 Driver Version: 536.23       CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                     TCC/WDDM  | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce GTX 1080 Ti   WDDM  | 00000000:01:00.0  On |                  N/A |
| 23%   38C    P8              15W / 250W |   3840MiB / 11264MiB |     23%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
|   1  NVIDIA GeForce GTX 1080 Ti   WDDM  | 00000000:03:00.0 Off |  

In [None]:
# !pip install datasets
# !pip install git+https://github.com/huggingface/transformers
# !pip install librosa
# !pip install evaluate
# !pip install jiwer
# !pip install gradio

In [2]:
from torch.utils.data import random_split
import numpy as np
import os
import pickle

In [3]:
os.chdir(r"D:\MARONE\WOLOF\SPEECH_TO_TEXT")

# Create my data exctractor

In [4]:
from torch.utils.data import Dataset

class MyDataset(Dataset):
    def __init__(self, feature_extractor, directory):
        self.directory = directory
        self.file_list = os.listdir(directory)
        self.feature_extractor = feature_extractor

    def __len__(self):
        return len(self.file_list)

    def __getitem__(self, idx):
        with open(os.path.join(self.directory, self.file_list[idx]), 'rb') as f:
            batch = pickle.load(f)
        
        audio = batch['audio'].iloc[0]
        input_values = self.feature_extractor(audio, sampling_rate=16000).input_features[0]
        labels = self.feature_extractor(text=batch['transcription'].iloc[0]).input_ids

        return {'input_values': input_values, 'labels': labels}

In [None]:
data = "alffa"
# Assuming 'dataset' is an instance of your 'MyDataset' class
dataset = MyDataset(r"DATA\PREPROCESSED\\" + data)

In [5]:
from transformers import WhisperFeatureExtractor

feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-small")

In [69]:
from transformers import WhisperTokenizer
tokenizer_path = r'D:\MARONE\WOLOF\SPEECH_TO_TEXT\MODELS\WHISPER\tokenizer'
new_tokenizer =  WhisperTokenizer(vocab_file=os.path.join(tokenizer_path,  "vocab.json"), merges_file=os.path.join(tokenizer_path,  "merges.txt"), normalizer_file = None, errors = 'replace', nunk_token = '<|endoftext|>', bos_token = '<|endoftext|>', eos_token = '<|endoftext|>', pad_token = None, add_prefix_space = False, add_bos_token = False)
tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-small", task="transcribe")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [182]:
import json
import os 
from transformers import WhisperFeatureExtractor, WhisperTokenizer

save_dir =  r"D:\MARONE\WOLOF\SPEECH_TO_TEXT\MODELS\WHISPER"
custom_tokenizer_path = os.path.join(save_dir, 'custom_tokenizer')
tokenizer_path = os.path.join(save_dir,  "tokenizer")
pretrained_tokenizer_path = os.path.join(save_dir,  "pretrained_tokenizer")
pretrained_path = "openai/whisper-small"
os.makedirs(tokenizer_path, exist_ok=True)

# Initialize tokenizers
custom_tokenizer =  WhisperTokenizer(vocab_file=os.path.join(custom_tokenizer_path,  "vocab.json"), merges_file=os.path.join(custom_tokenizer_path,  "merges.txt"))
pretrained_tokenizer = WhisperTokenizer.from_pretrained(pretrained_path, task="transcribe")

os.makedirs(pretrained_tokenizer_path, exist_ok=True)
pretrained_tokenizer.save_pretrained(pretrained_tokenizer_path)

# Load custom merges directly from the file
custom_merges_path = os.path.join(custom_tokenizer_path, "merges.txt")
with open(custom_merges_path, 'r', encoding='utf-8') as merges_file:
    custom_merges = [tuple(line.split()) for line in merges_file.read().splitlines()]

# Load pretrained merges directly from the file
pretrained_merges_path = os.path.join(pretrained_tokenizer_path, "merges.txt")
with open(pretrained_merges_path, 'r', encoding='utf-8') as merges_file:
    pretrained_merges = [tuple(line.split()) for line in merges_file.read().splitlines()]

# Extract vocabularies
custom_vocab = custom_tokenizer.get_vocab()
pretrained_vocab = pretrained_tokenizer.get_vocab()

# Combine vocabularies
combined_vocab = {**pretrained_vocab, **custom_vocab}

# Combine merges and remove duplicates while maintaining order
combined_merges = pretrained_merges + [merge for merge in custom_merges if merge not in pretrained_merges]

# Save the combined vocab and merges to temporary files
combined_vocab_path = os.path.join(tokenizer_path, "combined_vocab.json")
combined_merges_path = os.path.join(tokenizer_path, "combined_merges.txt")

with open(combined_vocab_path, 'w', encoding='utf-8') as vocab_file:
    json.dump(combined_vocab, vocab_file, ensure_ascii=False)

with open(combined_merges_path, 'w', encoding='utf-8') as merges_file:
    merges_file.write('\n'.join([' '.join(merge) for merge in combined_merges]))

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [172]:

tokenizer = WhisperTokenizer(vocab_file=combined_vocab_path, merges_file=combined_merges_path)

In [162]:

# tokenizer = WhisperTokenizer(vocab_file=combined_vocab_path, merges_file=combined_merges_path)

# Extend the byte decoder and encoder with new characters
byte_decoder = pretrained_tokenizer.byte_decoder.copy()
byte_encoder = pretrained_tokenizer.byte_encoder.copy()

# Identify keys to modify
keys_to_modify = []
replacable_letter = ['Ę', 'ę', 'Ĝ', 'ĝ', 'Ğ', 'ğ', 'Ġ', 'ġ', 'Ć', 'ć', 'Ĉ', 'ĉ', 'Ċ', 'ċ', 'Č', 'č']
already_used_character = []

for char in custom_vocab.keys():
    if len(char) == 1 and char not in byte_decoder:
        for ch in replacable_letter:
            if ch not in custom_vocab and ch.isalpha() and ch not in already_used_character:
                byte_value = byte_decoder[ch]
                already_used_character.append(ch)
                keys_to_modify.append((ch, char, byte_value))
                break
            
print(keys_to_modify)
# Apply the changes
for old_key, new_key, byte_value in keys_to_modify:
    byte_decoder.pop(old_key)
    byte_decoder[new_key] = byte_value
    byte_encoder[byte_value] = new_key

class CustomWhisperTokenizer(WhisperTokenizer):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # self.byte_decoder.update(byte_decoder)
        # self.byte_encoder.update(byte_encoder)
    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (string) in a single string."""
        text = "".join(tokens)
        # print(text)
        # text = bytearray([self.byte_decoder[c] for c in text]).decode("utf-8", errors=self.errors)
        return text
    
# Initialize the new tokenizer with the combined vocab and merges
tokenizer = CustomWhisperTokenizer(vocab_file=combined_vocab_path, merges_file=combined_merges_path)
tokenizer.byte_decoder = byte_decoder
tokenizer.byte_encoder = byte_encoder
# tokenizer = WhisperTokenizer.from_pretrained(pretrained_path, task="transcribe")

tokenizer.add_special_tokens({'pad_token': '[PAD]'})
tokenizer.save_pretrained(tokenizer_path)

[('Ę', 'ŋ', 24), ('ę', 'ɓ', 25), ('Ĝ', 'ẽ', 28)]


('D:\\MARONE\\WOLOF\\SPEECH_TO_TEXT\\MODELS\\WHISPER\\tokenizer\\tokenizer_config.json',
 'D:\\MARONE\\WOLOF\\SPEECH_TO_TEXT\\MODELS\\WHISPER\\tokenizer\\special_tokens_map.json',
 'D:\\MARONE\\WOLOF\\SPEECH_TO_TEXT\\MODELS\\WHISPER\\tokenizer\\vocab.json',
 'D:\\MARONE\\WOLOF\\SPEECH_TO_TEXT\\MODELS\\WHISPER\\tokenizer\\merges.txt',
 'D:\\MARONE\\WOLOF\\SPEECH_TO_TEXT\\MODELS\\WHISPER\\tokenizer\\normalizer.json',
 'D:\\MARONE\\WOLOF\\SPEECH_TO_TEXT\\MODELS\\WHISPER\\tokenizer\\added_tokens.json')

In [None]:
print(tokenizer.byte_decoder)

In [186]:
pred_ids = [pretrained_tokenizer.encode('dam')]
print(pred_ids)
pred_strs = pretrained_tokenizer.batch_decode(pred_ids, skip_special_tokens = True)
print(pred_strs)

[[50258, 50359, 50363, 10170, 50257]]
['dam']


In [None]:
tokenizer

In [184]:
# check if the new tokens are already in the vocabulary
new_tokens = set(custom_tokenizer.get_vocab().keys()) - set(tokenizer.get_vocab().keys())

# add the tokens to the tokenizer vocabulary
pretrained_tokenizer.add_tokens(list(new_tokens))

23257

In [185]:
# Verify the updated tokenizer
for char in custom_vocab.keys():
    encoded = pretrained_tokenizer.encode(char)
    decoded = pretrained_tokenizer.decode(encoded)
    print(f"Original: {char}, Encoded: {encoded}, Decoded: {decoded}")

Original: a, Encoded: [50258, 50359, 50363, 64, 50257], Decoded: <|startoftranscript|><|transcribe|><|notimestamps|>a<|endoftext|>
Original: b, Encoded: [50258, 50359, 50363, 65, 50257], Decoded: <|startoftranscript|><|transcribe|><|notimestamps|>b<|endoftext|>
Original: c, Encoded: [50258, 50359, 50363, 66, 50257], Decoded: <|startoftranscript|><|transcribe|><|notimestamps|>c<|endoftext|>
Original: d, Encoded: [50258, 50359, 50363, 67, 50257], Decoded: <|startoftranscript|><|transcribe|><|notimestamps|>d<|endoftext|>
Original: e, Encoded: [50258, 50359, 50363, 68, 50257], Decoded: <|startoftranscript|><|transcribe|><|notimestamps|>e<|endoftext|>
Original: f, Encoded: [50258, 50359, 50363, 69, 50257], Decoded: <|startoftranscript|><|transcribe|><|notimestamps|>f<|endoftext|>
Original: g, Encoded: [50258, 50359, 50363, 70, 50257], Decoded: <|startoftranscript|><|transcribe|><|notimestamps|>g<|endoftext|>
Original: h, Encoded: [50258, 50359, 50363, 71, 50257], Decoded: <|startoftranscrip

KeyboardInterrupt: 

In [57]:
tokenizer.save_pretrained(r"D:\MARONE\WOLOF\SPEECH_TO_TEXT\MODELS\WHISPER\ASR_2\tokenizer")

('D:\\MARONE\\WOLOF\\SPEECH_TO_TEXT\\MODELS\\WHISPER\\ASR_2\\tokenizer\\tokenizer_config.json',
 'D:\\MARONE\\WOLOF\\SPEECH_TO_TEXT\\MODELS\\WHISPER\\ASR_2\\tokenizer\\special_tokens_map.json',
 'D:\\MARONE\\WOLOF\\SPEECH_TO_TEXT\\MODELS\\WHISPER\\ASR_2\\tokenizer\\vocab.json',
 'D:\\MARONE\\WOLOF\\SPEECH_TO_TEXT\\MODELS\\WHISPER\\ASR_2\\tokenizer\\merges.txt',
 'D:\\MARONE\\WOLOF\\SPEECH_TO_TEXT\\MODELS\\WHISPER\\ASR_2\\tokenizer\\normalizer.json',
 'D:\\MARONE\\WOLOF\\SPEECH_TO_TEXT\\MODELS\\WHISPER\\ASR_2\\tokenizer\\added_tokens.json')

In [66]:
old_vocab = tokenizer.get_vocab()
new_vocab = new_tokenizer.get_vocab()
new_tokens = list(set(new_vocab.keys()) - set(old_vocab.keys()))
tokenizer.add_tokens(new_tokens)
# print(new_tokens)




['aljihad', 'duggoon', 'boppum', 'dimblee', 'kuut', 'soxlay', 'tegaloon', 'wayba', 'daarur', 'mellen', 'ofisiye', 'asal', 'yeemu', 'jébbaluleen', 'nikus', 'waun', 'lejep', 'leeru', 'yelen', 'jennd', 'wéttal', 'julli', 'fawwmel', 'téyeleen', 'bayer', 'diwone', 'méngook', 'digaaleb', 'saagu', 'nangooti', 'xaritook', 'kumandan', 'miirub', 'deewug', 'méñ', 'wolaŋ', 'naataange', 'néewdidoole', 'mannda', 'dedd', 'yéréy', 'dajewukaayu', 'neeti', 'gannar', 'nantee', 'gadi', 'tegleen', 'xalaatleen', 'toloftolof', 'naples', 'lati', 'weel', 'tëd', 'guddaayu', 'juróomñettfukk', 'faadal', 'albaatar', 'larry', 'lonku', 'soppul', 'toppnjiitu', 'yilif', 'silu', 'tebbi', 'juróombennfukkeel', 'njekkee', 'degar', 'afandii', 'gaññ', 'olympi', 'jamñaajo', 'kordo', 'shab', 'mannaan', 'nanguwutóona', 'joantekat', 'niseeriyaa', 'sai', 'tëddleen', 'nguu', 'ibraayima', 'ceex', 'manatu', 'riime', 'njëlbéen', 'oktobar', 'ruslu', 'yoqqat', 'xéewloo', 'gallu', 'gué', 'yooy', 'féetewul', 'mamoor', 'golo', 'daalu', '

In [70]:
s = new_tokenizer("lu muy neú ko ko foox")
print(s)

ValueError: type of None unknown: <class 'NoneType'>. Should be one of a python, numpy, pytorch or tensorflow object.

In [11]:
from transformers import WhisperProcessor

processor = WhisperProcessor.from_pretrained("openai/whisper-small", task="transcribe")
processor.tokenizer = tokenizer
processor.feature_extractor = feature_extractor

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
# # Define the size of the train and test sets
# train_size = int(0.8 * len(dataset))  # 80% for training
# test_size = len(dataset) - train_size  # 20% for testing

# # Split the dataset
# train_dataset, test_dataset = random_split(dataset, [train_size, test_size])

In [23]:
from base.base_dataset import BaseDataset
special_tokens = {"bos_token" : "<bos>",
"eos_token" : "<eos>",
"unk_token" : "<unk>",
"pad_token" : "<pad>"}
sr = 16000
delimiter = "|"
path_train = r"D:\\MARONE\\WOLOF\\SPEECH_TO_TEXT\\DATA\\CLEANED\\WOLOF_AUDIO_TRANS\\validation_data.csv"
path_validation = r"D:\\MARONE\\WOLOF\\SPEECH_TO_TEXT\\DATA\\CLEANED\\WOLOF_AUDIO_TRANS\\validation_data.csv"

In [25]:
import pandas as pd
train_data = pd.read_csv(path_train, delimiter=delimiter)
eval_data = pd.read_csv(path_validation, delimiter=delimiter)
train_data.head(2).T

Unnamed: 0,0,1
Unnamed: 0,15886,15003
transcription,Lu muy mébét ne ko ko foox,sa bésu ngénte laay xaar ma jox la tur
filename,ba524146-26b0-4d13-81fd-5a16547d9ce3/3dae16688...,WOL_03_lect_0321
path,DATA\CLEANED\WOLOF_AUDIO_TRANS\zenodo\audio\ba...,DATA\CLEANED\WOLOF_AUDIO_TRANS\alffa_git\audio...


In [34]:
import regex

chars_to_keep = r'[^a-zA-Z\sёñïóŋöäàéîā́сđớ\'ˈоɗɲtx їüúaëçèĩã̈ûjämсукéеɓìs️öŋïõăаrýànóvñlò̃qẽyfƭhgziâwíńồpêáôùёībkр]'
chars_to_ignore = r'[кɲớˈ\'\xa0\r\n]'
replace_dict = {'ï': 'a', 'î': 'i', 'ā': 'a', 'ƭ': 'c', 'ī': 'i', 'ä': 'a', 'ɗ': 'nd', 'ń': 'ñ', 'ồ': 'o',
                    'ї': 'i', 'ü': 'u', 'ù': 'u', 'ú': 'u', 'ă': 'ã', '̃': '', 'â': 'a', '́': '', 'û': 'u',
                    '̈': '', 'è': 'e', 'ç': 's', 'ö': 'o', 'ý': 'y', 'ì': 'i', 'í': 'i', '̀': '', 'ɓ':'b', 'ô':'o',
                    'ê':'e', 'à':'a'}
    
def clean_transcript(transcript) -> str:
    cleaned_transcript = ""
    for word in transcript.split(','):
        if cleaned_transcript != "":
            cleaned_transcript = cleaned_transcript + ", " + word.rstrip().lstrip()
        else:
            cleaned_transcript = word
    cleaned_transcript = regex.sub(chars_to_keep, '', cleaned_transcript).lower() + " "
    cleaned_transcript = regex.sub(chars_to_ignore, '', cleaned_transcript) + " "

    for key, value in replace_dict.items():
        cleaned_transcript = regex.sub(key, value, cleaned_transcript) + " "
    return cleaned_transcript

In [47]:
from utils.feature import load_wav
import numpy as np

def prepare_dataset(batch):
    # load and resample audio data from 48 to 16kHz
    audio = load_wav(batch['path'], sr = sr)

    # compute log-Mel input features from input audio array 
    batch["input_features"] = feature_extractor(audio, sampling_rate = sr).input_features[0]

    # encode target text to label ids 
    cleaned_trans =  clean_transcript(batch["transcription"])
    print(cleaned_trans)
    batch["labels"] = tokenizer(cleaned_trans).input_ids
    return batch

ValueError: type of None unknown: <class 'NoneType'>. Should be one of a python, numpy, pytorch or tensorflow object.

In [48]:
train_dataset = train_data[["transcription", "path"]].apply(lambda row: prepare_dataset(row), axis=1)

lu muy mébét ne ko ko foox                                


ValueError: type of None unknown: <class 'NoneType'>. Should be one of a python, numpy, pytorch or tensorflow object.

# Combine To Create A WhisperProcessor

# Training and Evaluation

## Define a Data Collator

In [12]:
import torch

from dataclasses import dataclass
from typing import Any, Dict, List, Union

@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    processor: Any

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # split inputs and labels since they have to be of different lengths and need different padding methods
        # first treat the audio inputs by simply returning torch tensors
        input_features = [{"input_features": feature["input_features"]} for feature in features]
        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")

        # get the tokenized label sequences
        label_features = [{"input_ids": feature["labels"]} for feature in features]
        # pad the labels to max length
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")

        # replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        # if bos token is appended in previous tokenization step,
        # cut bos token here as it's append later anyways
        if (labels[:, 0] == self.processor.tokenizer.bos_token_id).all().cpu().item():
            labels = labels[:, 1:]

        batch["labels"] = labels

        return batch

In [13]:
data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)

## Evaluation Metrics

In [15]:
# !pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.2-py3-none-any.whl.metadata (9.3 kB)
Downloading evaluate-0.4.2-py3-none-any.whl (84 kB)
   ---------------------------------------- 84.1/84.1 kB 2.3 MB/s eta 0:00:00
Installing collected packages: evaluate
Successfully installed evaluate-0.4.2




In [16]:
import evaluate

metric = evaluate.load("wer")

In [17]:
def compute_metrics(pred):
    pred_ids = pred.predictions
    label_ids = pred.label_ids

    # replace -100 with the pad_token_id
    label_ids[label_ids == -100] = tokenizer.pad_token_id

    # we do not want to group tokens when computing the metrics
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    wer = 100 * metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}

## Load a PreTrained Checkpoint

In [18]:
from transformers import WhisperForConditionalGeneration

model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")

In [19]:
model.config.forced_decoder_ids = None
model.config.suppress_tokens = []

## Define the Training Configuration

In [20]:
from transformers import Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    output_dir=r"D:\MARONE\WOLOF\SPEECH_TO_TEXT\MODELS\WHISPER\ASR_3",  # change to a repo name of your choice
    per_device_train_batch_size=16,
    gradient_accumulation_steps=1,  # increase by 2x for every 2x decrease in batch size
    learning_rate=1e-5,
    warmup_steps=500,
    max_steps=4000,
    gradient_checkpointing=True,
    fp16=True,
    evaluation_strategy="steps",
    per_device_eval_batch_size=8,
    predict_with_generate=True,
    generation_max_length=225,
    save_steps=1000,
    eval_steps=1000,
    logging_steps=25,
    report_to=["tensorboard"],
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,
    push_to_hub=True,
)

In [None]:
from transformers import Seq2SeqTrainer

trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=common_voice["train"],
    eval_dataset=common_voice["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=processor.feature_extractor,
)

In [None]:
processor.save_pretrained(training_args.output_dir)

## Training

In [None]:
trainer.train()

In [None]:
kwargs = {
    "dataset_tags": "mozilla-foundation/common_voice_11_0",
    "dataset": "Common Voice 11.0",  # a 'pretty' name for the training dataset
    "dataset_args": "config: hi, split: test",
    "language": "hi",
    "model_name": "Whisper Small Hi - Sanchit Gandhi",  # a 'pretty' name for our model
    "finetuned_from": "openai/whisper-small",
    "tasks": "automatic-speech-recognition",
    "tags": "hf-asr-leaderboard",
}

In [None]:
trainer.push_to_hub(**kwargs)

# Building a Demo

In [None]:
from transformers import pipeline
import gradio as gr

pipe = pipeline(model="sanchit-gandhi/whisper-small-hi")  # change to "your-username/the-name-you-picked"

def transcribe(audio):
    text = pipe(audio)["text"]
    return text

iface = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(source="microphone", type="filepath"),
    outputs="text",
    title="Whisper Small Hindi",
    description="Realtime demo for Hindi speech recognition using a fine-tuned Whisper small model.",
)

iface.launch()