# Upgrading necessary libraries

In [None]:
!pip install --upgrade pip
!pip install --upgrade datasets[audio] transformers accelerate evaluate jiwer tensorboard gradio Unidecode
!pip install tensorflow[and-cuda]

In [None]:
# Show the NVIDIA GPU(s) visible to this runtime before starting any GPU work.
!nvidia-smi

In [None]:
import os
from getpass import getpass

from huggingface_hub import notebook_login, login

# Hugging Face token with write privilege.
# Never paste the token into the notebook: it is read from the HF_TOKEN
# environment variable, or requested interactively (input hidden) when unset.
HUGGINGFACE_TOKEN = os.environ.get("HF_TOKEN") or getpass("Hugging Face token (write access): ")


login(HUGGINGFACE_TOKEN)

In [None]:
# Target language: full name (used for the Whisper tokenizer/processor) and
# its abbreviation (used as the dataset configuration name).
language, language_abbr = "Uzbek", "uz"

# Pre-trained checkpoint to fine-tune and the dataset to fine-tune it on.
dataset_name = "mozilla-foundation/common_voice_11_0"
model_name = "openai/whisper-small"

# Load dataset

In [None]:
from datasets import load_dataset, DatasetDict

# Split name -> slice expression: only part of each split is loaded
# (50% of train, 40% of validation and test).
split_slices = {
    "train": "train[:50%]",
    "validation": "validation[:40%]",
    "test": "test[:40%]",
}

common_voice = DatasetDict()
for split_name, split_slice in split_slices.items():
    # NOTE(review): trust_remote_code relies on a `datasets` release that
    # still supports script-based datasets - confirm against the installed version.
    common_voice[split_name] = load_dataset(
        dataset_name,
        language_abbr,
        split=split_slice,
        token=HUGGINGFACE_TOKEN,
        trust_remote_code=True,
    )


print(common_voice)

In [None]:
import os

# Report the on-disk size of the audio files referenced by each split.
for split in ["train", "validation", "test"]:
    total_size = 0
    for file_path in common_voice[split]["path"]:
        total_size += os.path.getsize(file_path)
    print(f"Total {split} dataset size: {total_size / (1024 ** 3):.2f} GB")

# Define function to convert numbers to Uzbek words

In [None]:
# Lookup tables for spelling numbers in Uzbek; index 0 is an empty placeholder.
# NOTE: the names `tens` and `teens` are swapped relative to their contents:
# `tens` holds the words for 11-19 and `teens` holds the round tens (10-90).
# The names are kept because convert_three_digits indexes them accordingly.
ones = [
    "", "bir", "ikki", "uch", "to'rt", "besh",
    "olti", "yetti", "sakkiz", "to'qqiz",
]
tens = [
    "", "o'n bir", "o'n ikki", "o'n uch", "o'n to'rt", "o'n besh",
    "o'n olti", "o'n yetti", "o'n sakkiz", "o'n to'qqiz",
]
teens = [
    "", "o'n", "yigirma", "o'ttiz", "qirq", "ellik",
    "oltmish", "yetmish", "sakson", "to'qson",
]


def convert_three_digits(num):
    """Spell out an integer in the range 0-999 in Uzbek words.

    Args:
        num: Integer between 0 and 999 inclusive.

    Returns:
        The spelled-out number, e.g. ``42 -> "qirq ikki"``. ``0`` yields an
        empty string so callers can skip empty three-digit groups.
    """
    parts = []

    hundreds = num // 100
    if hundreds > 0:
        parts.append(ones[hundreds] + " yuz")
        remainder = num % 100
    else:
        remainder = num

    if 11 <= remainder <= 19:
        # 11-19 have dedicated two-word forms.
        parts.append(tens[remainder - 10])
    elif remainder // 10 > 0:
        parts.append(teens[remainder // 10])
        unit = remainder % 10
        if unit > 0:
            parts.append(ones[unit])
    else:
        # Single digit (or 0, which contributes an empty string).
        parts.append(ones[remainder])

    return " ".join(parts).strip()


def convert_number(num):
    """Spell out an integer in Uzbek words.

    The number is split into groups of three digits; each non-zero group is
    spelled with ``convert_three_digits`` and followed by its scale word.

    Args:
        num: Integer to convert. Negative values are spelled as ``"minus"``
            followed by the absolute value.

    Returns:
        The spelled-out number, e.g. ``0 -> "nol"`` and
        ``1234 -> "bir ming ikki yuz o'ttiz to'rt"``.

    Raises:
        ValueError: If ``abs(num)`` is too large for the known scale words
            (10**15 or more).
    """
    scales = ["", "ming", "million", "milliard", "trillion"]

    if num == 0:
        return "nol"
    if num < 0:
        # Negatives used to skip the loop below and silently return "".
        return "minus " + convert_number(-num)
    if num >= 1000 ** len(scales):
        # Previously surfaced as an opaque IndexError on the scales lookup.
        raise ValueError(f"number too large to convert to words: {num}")

    words = []
    scale_idx = 0
    while num > 0:
        num, group = divmod(num, 1000)
        if group > 0:
            text = convert_three_digits(group)
            if scale_idx > 0:
                text += " " + scales[scale_idx]
            words.append(text)
        scale_idx += 1

    # Groups were collected least-significant first.
    return " ".join(reversed(words))

In [None]:
import re
import string
import unidecode

# Character class matching any single ASCII punctuation character.
chars_to_ignore_regex = "[" + re.escape(string.punctuation) + "]"


def number2word(sentence):
    """Replace whitespace-separated numeric tokens with Uzbek number words.

    A token is converted when, after stripping leading and trailing ASCII
    punctuation, it consists only of decimal digits (so ``"5,"`` and
    ``"(2020)"`` are converted too); the surrounding punctuation is kept.
    Runs of whitespace are collapsed to single spaces.

    Args:
        sentence: Input text.

    Returns:
        The text with numeric tokens spelled out via ``convert_number``.
    """
    words = sentence.split()
    for i, word in enumerate(words):
        core = word.strip(string.punctuation)
        # isdecimal() rather than isdigit(): isdigit() also accepts characters
        # such as superscript digits, which int() rejects with ValueError.
        if core.isdecimal():
            lead_len = len(word) - len(word.lstrip(string.punctuation))
            prefix = word[:lead_len]
            suffix = word[lead_len + len(core):]
            words[i] = prefix + convert_number(int(core)) + suffix
    return " ".join(words)


def remove_special_characters(batch):
    """Normalize the ``sentence`` field of one example.

    Steps, in order:
      1. Replace hyphens with spaces.
      2. Spell out numeric tokens (``number2word``).
      3. Transliterate with ``unidecode`` and lowercase.
      4. Rewrite an apostrophe-like mark after ``o``/``g`` as ``‘`` and any
         remaining apostrophe-like mark as ``’``.
      5. Remove every character matched by ``chars_to_ignore_regex``.
      6. Append a single trailing space.

    Args:
        batch: Mapping with a ``"sentence"`` string.

    Returns:
        The same mapping with ``"sentence"`` replaced by the normalized text.
    """
    text = batch["sentence"].replace("-", " ")
    text = unidecode.unidecode(number2word(text)).lower()

    # Raw string: "\g<1>" in a normal string literal is an invalid escape
    # sequence and triggers a warning on recent Python versions.
    text = re.sub("([og])['`´]", r"\g<1>‘", text)
    text = re.sub("['`´]", "’", text)

    text = re.sub(chars_to_ignore_regex, "", text)

    batch["sentence"] = text + " "
    return batch

In [None]:
# Drop the metadata columns that are not used later in the notebook, then
# normalize every transcript with remove_special_characters.
common_voice = common_voice.remove_columns(
    [
        "accent",
        "age",
        "client_id",
        "down_votes",
        "gender",
        "locale",
        "path",
        "segment",
        "up_votes",
    ]
).map(remove_special_characters)

print(common_voice.column_names)

# Prepare Feature Extractor, Tokenizer and Data

The speech-recognition pipeline used here consists of three components:

1. A feature extractor which pre-processes the raw audio-inputs
2. The model which performs the sequence-to-sequence mapping
3. A tokenizer which post-processes the model outputs to text format

In [None]:
from transformers import WhisperFeatureExtractor
from transformers import WhisperTokenizer
from transformers import WhisperProcessor


# All three components are loaded from the same pre-trained checkpoint; the
# tokenizer and the processor are configured for transcription in `language`.
feature_extractor = WhisperFeatureExtractor.from_pretrained(model_name)

tokenizer = WhisperTokenizer.from_pretrained(
    model_name,
    language=language,
    task="transcribe",
)

processor = WhisperProcessor.from_pretrained(
    model_name,
    language=language,
    task="transcribe",
)

## Downsampling the audio from 48 kHz to the 16 kHz sampling rate expected by the Whisper model

In [None]:
# Inspect the first training example before the audio column is resampled.
print(common_voice["train"][0])

In [None]:
from datasets import Audio

# Re-declare the "audio" column with a 16 kHz sampling rate.
common_voice = common_voice.cast_column("audio", Audio(sampling_rate=16000))

In [None]:
# Inspect the first training example again, now that the audio column is cast to 16 kHz.
print(common_voice["train"][0])

In [None]:
def prepare_dataset(batch):
    """Turn one example into model inputs and target label ids.

    Args:
        batch: Example with an ``"audio"`` entry (providing ``"array"`` and
            ``"sampling_rate"``) and a ``"sentence"`` string.

    Returns:
        The same example with two added keys: ``"input_features"`` (first
        item of the feature extractor output) and ``"labels"`` (token ids of
        the sentence).
    """
    sample = batch["audio"]

    # Log-Mel input features computed from the raw waveform.
    extracted = feature_extractor(sample["array"], sampling_rate=sample["sampling_rate"])
    batch["input_features"] = extracted.input_features[0]

    # Target transcript encoded as label ids.
    encoded = tokenizer(batch["sentence"])
    batch["labels"] = encoded.input_ids

    return batch

In [None]:
import multiprocessing

# Number of CPUs reported by the system; used as the worker count for the
# parallel dataset preprocessing below.
cpu_count = multiprocessing.cpu_count()
print("Number of CPU cores:", cpu_count)

## Apply the preparation function above to our data

In [None]:
# Run prepare_dataset over every split with cpu_count worker processes and drop
# the original columns (taken from the train split's column names).
common_voice = common_voice.map(prepare_dataset, remove_columns=common_voice.column_names["train"], num_proc=cpu_count)

The **num_proc** parameter controls parallel processing: it sets how many worker processes `map` runs simultaneously. Here it is set to `cpu_count`, the number of CPU cores reported by `multiprocessing.cpu_count()` (e.g., 4, 8, 16). The `map` function splits the dataset processing across that many parallel processes, thereby increasing speed.

## Load a Pre-Trained Checkpoint

In [None]:
from transformers import WhisperForConditionalGeneration

# Load the pre-trained checkpoint named by model_name as the model to fine-tune.
model = WhisperForConditionalGeneration.from_pretrained(model_name)

In [None]:
# Reuse the notebook-level `language` setting (lower-cased, i.e. "uzbek")
# instead of a second hard-coded literal, so generation stays in sync with
# the tokenizer/processor configuration.
model.generation_config.language = language.lower()
model.generation_config.task = "transcribe"

# Clear forced decoder ids; language and task are set explicitly above.
model.generation_config.forced_decoder_ids = None

In [None]:
import torch

from dataclasses import dataclass
from typing import Any, Dict, List, Union

@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    """Collate examples into a padded batch of input features and labels.

    Input features and labels are padded separately: the former with
    ``processor.feature_extractor.pad`` and the latter with
    ``processor.tokenizer.pad``.
    """

    # Object exposing `.feature_extractor.pad(...)` and `.tokenizer.pad(...)`.
    processor: Any
    # Token id that, when it starts every label sequence, is stripped again.
    decoder_start_token_id: int

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Build one batch from a list of examples.

        Args:
            features: Examples, each with ``"input_features"`` and ``"labels"``.

        Returns:
            The padded input-feature batch with an added ``"labels"`` tensor
            in which padded positions are set to -100.
        """
        audio_inputs = []
        token_inputs = []
        for example in features:
            audio_inputs.append({"input_features": example["input_features"]})
            token_inputs.append({"input_ids": example["labels"]})

        # Audio side: the feature extractor returns torch tensors.
        batch = self.processor.feature_extractor.pad(audio_inputs, return_tensors="pt")

        # Label side: pad the token id sequences to a common length.
        padded_labels = self.processor.tokenizer.pad(token_inputs, return_tensors="pt")

        # Replace padding with -100 so those positions are ignored by the loss.
        is_padding = padded_labels.attention_mask.ne(1)
        labels = padded_labels["input_ids"].masked_fill(is_padding, -100)

        # If every sequence already starts with the decoder start token, drop
        # it here because it is added again later.
        all_start_with_bos = (labels[:, 0] == self.decoder_start_token_id).all().cpu().item()
        if all_start_with_bos:
            labels = labels[:, 1:]

        batch["labels"] = labels
        return batch

In [None]:
# Collator instance for training; the decoder start token id is taken from the
# model config so the collator can strip it from the labels.
data_collator = DataCollatorSpeechSeq2SeqWithPadding(
    processor=processor,
    decoder_start_token_id=model.config.decoder_start_token_id,
)

## Evaluation Metrics

In [None]:
import evaluate

# Load the "wer" metric; compute_metrics reports it as a percentage.
metric = evaluate.load("wer")

In [None]:
def compute_metrics(pred):
    """Compute the word error rate (as a percentage) for one evaluation run.

    Args:
        pred: Object exposing ``predictions`` (predicted token ids) and
            ``label_ids`` (reference token ids, with -100 marking ignored
            positions). ``label_ids`` is modified in place.

    Returns:
        ``{"wer": <value>}`` where the value is 100 times the metric result.
    """
    references = pred.label_ids

    # -100 is not a real token id; swap it for the pad token before decoding.
    references[references == -100] = tokenizer.pad_token_id

    # Decode both sides to text, dropping special tokens.
    decoded_predictions = tokenizer.batch_decode(pred.predictions, skip_special_tokens=True)
    decoded_references = tokenizer.batch_decode(references, skip_special_tokens=True)

    score = metric.compute(predictions=decoded_predictions, references=decoded_references)
    return {"wer": 100 * score}

## Define the Training Arguments

In [None]:
from transformers import Seq2SeqTrainingArguments

# Training configuration. Evaluation and checkpointing share the same 1000-step
# interval, which load_best_model_at_end relies on to pick the checkpoint with
# the lowest WER.
training_args = Seq2SeqTrainingArguments(
    output_dir="./whisper-small-uz",
    per_device_train_batch_size=16,
    gradient_accumulation_steps=1,
    learning_rate=1e-5,
    warmup_steps=500,
    max_steps=4000,
    gradient_checkpointing=True,
    fp16=True,
    # `evaluation_strategy` is deprecated; `eval_strategy` is the current name
    # in the transformers releases this notebook upgrades to.
    eval_strategy="steps",
    per_device_eval_batch_size=8,
    predict_with_generate=True,
    generation_max_length=225,
    save_steps=1000,
    eval_steps=1000,
    logging_steps=25,
    report_to=["tensorboard"],
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,  # lower WER is better
    push_to_hub=True,
)

In [None]:
from transformers import Seq2SeqTrainer

# Trainer wiring the model, datasets, collator and metric together.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=common_voice["train"],
    eval_dataset=common_voice["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    # `tokenizer=` is deprecated in recent transformers releases in favour of
    # `processing_class=`; the feature extractor is passed as before.
    processing_class=processor.feature_extractor,
)

In [None]:
# Save the processor into the training output directory.
processor.save_pretrained(training_args.output_dir)

## Training

In [None]:
# Start fine-tuning with the configuration defined in training_args.
trainer.train()

## Pushing the model to the Hugging Face Hub

In [None]:
# Metadata passed as keyword arguments to trainer.push_to_hub.
kwargs = dict(
    dataset_tags=dataset_name,
    dataset="Common Voice 11.0",
    dataset_args="config: uz, split: test",
    language="uz",
    model_name="Whisper-small-uz-V2 ",  # a 'pretty' name for our model
    finetuned_from="openai/whisper-small",
    tasks="automatic-speech-recognition",
)

In [None]:
# Upload the trained model to the Hub, passing the metadata defined in kwargs.
trainer.push_to_hub(**kwargs)

## Load the uploaded model

In [13]:
# Use a pipeline as a high-level helper
from transformers import pipeline

# Build a speech-recognition pipeline from the model published on the Hub.
pipe = pipeline("automatic-speech-recognition", model="tukhtashevshohruh/whisper-small-uz")

Device set to use cuda:0


In [12]:
from pydub import AudioSegment

# Read the M4A test recording and export it as "audio1.mp3" (a relative path,
# so it is written to the current working directory).
m4a_audio = AudioSegment.from_file("/kaggle/input/uz-audio/test_1.m4a", format="m4a")
m4a_audio.export("audio1.mp3", format="mp3")

<_io.BufferedRandom name='audio1.mp3'>

## Play audio

In [14]:
import numpy as np
# Imported under an alias so it does not shadow `datasets.Audio`, which the
# resampling cell earlier in the notebook relies on when it is re-run.
from IPython.display import Audio as AudioPlayer
audio_path = "/kaggle/working/audio1.mp3"

# `rate` only applies when raw sample arrays are passed; for an audio file the
# player reads the file itself, so the former `rate=250` argument is dropped.
AudioPlayer(filename=audio_path)

### [audio link](https://drive.google.com/file/d/15RiuVGWfLOPWpG56w4HbtRxHO46kOG7-/view?usp=drive_link)

## Test the fine-tuned model

In [15]:
# Run the pipeline on the audio file; the next cell reads the "text" entry of the result.
text = pipe(audio_path)





In [16]:
# Print the result
print("Matn:", text['text'])

Matn: propaganda uch turt besh yoxud miyaning chirishi bu kontentdan olgan xulasalarim go‘yo ichimdagi ovozga o‘xshaydi ustozlar jamoasiga hurmat ham ilm fan xantibi va diniy tomondan hozirgi kun vabosini yoritishibdi baraka topkorlar 
