INSTALLING REQUIRED LIBRARIES AND MOUNTING DRIVE TO STORE CHECKPOINTS

In [None]:
!pip install --upgrade --quiet pip
!pip install --upgrade --quiet datasets[audio] transformers accelerate evaluate jiwer tensorboard gradio

from google.colab import drive
drive.mount('/content/drive')


HUGGING FACE LOGIN FOR MODEL, DATA DOWNLOADS AND FINAL UPLOAD

In [None]:

from huggingface_hub import notebook_login
notebook_login()


LOADING DATASETS AS REQUIRED

In [None]:
from datasets import load_dataset, DatasetDict

# Load your datasets
common_voice = DatasetDict()
common_voice["train"] = load_dataset("PradyumSomebody/Audio_Medical4", "default", split="train", use_auth_token=True)
common_voice["test"] = load_dataset("PradyumSomebody/Audio_Medical4", "default", split="train", use_auth_token=True)

common_voice



REMOVING REDUNDANT DATA COLUMNS

In [None]:

# Load the second dataset
additional_data = load_dataset("PradyumSomebody/Audio_Medical3", "default", split="train", use_auth_token=True)

# Remove the redundant columns
if 'Unnamed: 2' in additional_data.column_names:
    additional_data = additional_data.remove_columns(['Unnamed: 2'])
if 'Audio' in additional_data.column_names:
    additional_data = additional_data.remove_columns(['Audio'])

additional_data



In [None]:
# Concatenate the datasets
from datasets import load_dataset, DatasetDict, concatenate_datasets
common_voice["train"] = concatenate_datasets([common_voice["train"], additional_data])
common_voice

DOWNLOADING IMPORTANT FEATURES-

FEATURE EXTRACTOR FOR CONVERTING AUDIO FILES TO ACCEPTABLE INPUT FORMAT

TOKENIZER TO CONVERT SENTENCES TO ACCEPTABLE INPUT FORMAT

In [None]:
from transformers import WhisperFeatureExtractor, WhisperTokenizer, WhisperProcessor

feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-small")
tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-small", language="English", task="transcribe")
processor = WhisperProcessor.from_pretrained("openai/whisper-small", language="English", task="transcribe")


In [8]:
from datasets import Audio

# Cast the 'audio' column to Audio and prepare the dataset
common_voice = common_voice.cast_column("audio", Audio(sampling_rate=16000))

def prepare_dataset(batch):
    # load and resample audio data from 48 to 16kHz
    audio = batch["audio"]

    # compute log-Mel input features from input audio array
    batch["input_features"] = feature_extractor(audio["array"], sampling_rate=audio["sampling_rate"]).input_features[0]

    # encode target text to label ids
    batch["labels"] = tokenizer(batch["Sentence"]).input_ids
    return batch

common_voice = common_voice.map(prepare_dataset, remove_columns=common_voice["train"].column_names, num_proc=1)


Map:   0%|          | 0/600 [00:00<?, ? examples/s]

LOADING THE MODEL FROM HF

In [10]:
from transformers import WhisperForConditionalGeneration

model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")
model.generation_config.language = "english"
model.generation_config.task = "transcribe"
model.generation_config.forced_decoder_ids = None


In [11]:
import torch
from dataclasses import dataclass
from typing import Any, Dict, List, Union

@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    processor: Any
    decoder_start_token_id: int

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        input_features = [{"input_features": feature["input_features"]} for feature in features]
        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")

        label_features = [{"input_ids": feature["labels"]} for feature in features]
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        if (labels[:, 0] == self.decoder_start_token_id).all().cpu().item():
            labels = labels[:, 1:]

        batch["labels"] = labels

        return batch

data_collator = DataCollatorSpeechSeq2SeqWithPadding(
    processor=processor,
    decoder_start_token_id=model.config.decoder_start_token_id,
)


In [12]:
import evaluate
metric = evaluate.load("wer")

def compute_metrics(pred):
    pred_ids = pred.predictions
    label_ids = pred.label_ids

    label_ids[label_ids == -100] = tokenizer.pad_token_id
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    wer = 100 * metric.compute(predictions=pred_str, references=label_str)
    return {"wer": wer}


SETTING TRAINING ARGUMENTS

BATCH SIZE = 10 AUDIO FILES PER BATCH

MAX STEPS = 300 BATCHES

CHECKPOINTING EVERY 20 BATCHES(15 CHECKPOINTS TOTAL)

EVALUATING EVERY 10 BATCHES

LOGGING STATE EVERY 5 BATCHES


In [13]:
from transformers import Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    output_dir="/content/drive/My Drive/whisper-small-hi-custom4",  # change to a repo name of your choice
    per_device_train_batch_size=10,  # Batch size per device (10 audio files per batch)
    gradient_accumulation_steps=1,   # No gradient accumulation
    learning_rate=1e-5,
    warmup_steps=500,
    max_steps=300,  # Total training steps
    gradient_checkpointing=True,
    fp16=True,
    evaluation_strategy="steps",
    per_device_eval_batch_size=8,
    predict_with_generate=True,
    generation_max_length=225,
    save_steps=20,  # Save checkpoint every 2 batches
    eval_steps=10,
    logging_steps=5,
    report_to=["tensorboard"],
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,
    push_to_hub=True,
    save_total_limit=3,  # Keep only the last 3 checkpoints
)




In [14]:
from transformers import Seq2SeqTrainer

trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=common_voice["train"],
    eval_dataset=common_voice["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=processor.feature_extractor,
)


max_steps is given, it will override any value given in num_train_epochs


FINDING SECOND LATEST CHECKPOINT IF IT EXISTS(NOT LATEST BECAUSE MIGHT HAVE INCONSISTENCIES)

STARTING TRAINING IF NO CHECKPOINTS OTHERWISE SDTARTING FROM SECOND LAST CHECKPOINT

In [16]:
import os

if training_args.max_steps > 0:
    max_steps = training_args.max_steps
    steps_trained_in_current_epoch = 0
else:
    max_steps = len(common_voice["train"]) // training_args.gradient_accumulation_steps * training_args.num_train_epochs
    steps_trained_in_current_epoch = 0

# Check if there is an existing checkpoint to resume from
checkpoint_dir = training_args.output_dir
last_checkpoint = None
if os.path.exists(checkpoint_dir):
    checkpoints = [d for d in os.listdir(checkpoint_dir) if d.startswith("checkpoint")]
    if checkpoints:
        last_checkpoint = os.path.join(checkpoint_dir, sorted(checkpoints, key=lambda x: int(x.split('-')[1]))[-2])

# Resume from the last checkpoint if available
if last_checkpoint:
    print(f"Resuming from checkpoint: {last_checkpoint}")
    trainer.train(resume_from_checkpoint=last_checkpoint)
else:
    print("Starting training from scratch")
    trainer.train()

processor.save_pretrained(training_args.output_dir)


Starting training from scratch


`use_cache = True` is incompatible with gradient checkpointing. Setting `use_cache = False`...


Step,Training Loss,Validation Loss,Wer
10,0.7221,0.839529,19.19571
20,0.7458,0.829741,18.016086
30,0.7464,0.794459,17.158177
40,0.5503,0.732778,16.407507
50,0.5756,0.67106,14.477212
60,0.4484,0.580784,13.243968
70,0.2956,0.405876,12.600536
80,0.1247,0.269624,12.439678
90,0.2073,0.232277,10.831099
100,0.0944,0.203067,9.329759


You have passed task=transcribe, but also have set `forced_decoder_ids` to [[1, 50259], [2, 50359], [3, 50363]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of task=transcribe.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token.As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50360, 50361, 50362], 'b

[]

In [17]:
kwargs = {
    "dataset_tags": "PradyumSomebody/Audio_Medical4",
    "dataset": "Audio Medical Combined Dataset",  # a 'pretty' name for the training dataset
    "dataset_args": "config: combined, split: train",
    "language": "en",
    "model_name": "Whisper Small EN - Pradyum Agarwal 4",  # a 'pretty' name for our model
    "finetuned_from": "openai/whisper-small",
    "tasks": "automatic-speech-recognition",
}

trainer.push_to_hub(**kwargs)


Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}


CommitInfo(commit_url='https://huggingface.co/PradyumSomebody/whisper-small-hi-custom4/commit/34dee97f58042b5cdff1d219857a60ac155ac866', commit_message='End of training', commit_description='', oid='34dee97f58042b5cdff1d219857a60ac155ac866', pr_url=None, pr_revision=None, pr_num=None)

In [18]:
from transformers import pipeline
import gradio as gr

pipe = pipeline(model="PradyumSomebody/whisper-small-hi-custom4")  # change to "your-username/the-name-you-picked"

def transcribe(audio):
    text = pipe(audio)["text"]
    return text

iface = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(type="filepath"),  # Removed the 'source' argument
    outputs="text",
    title="Whisper Small English",
    description="Realtime demo for medical speech recognition using a fine-tuned Whisper small model.",
)

iface.launch()

config.json:   0%|          | 0.00/2.25k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/967M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/3.81k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/283k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/494k [00:00<?, ?B/s]

normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/34.6k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.19k [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


preprocessor_config.json:   0%|          | 0.00/339 [00:00<?, ?B/s]

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://1d9f47285406138cf9.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)




EVALUATION

In [None]:
from transformers import pipeline
import os

# Initialize the fine-tuned Whisper model pipeline
fine_tuned_pipe = pipeline(model="PradyumSomebody/whisper-small-hi2")  # change to "your-username/the-name-you-picked"

# Initialize the base Whisper model pipeline
base_pipe = pipeline(model="openai/whisper-small")

# Define the directory where the audio files are stored
audio_dir = "audio_files"
audio_files = sorted([os.path.join(audio_dir, f) for f in os.listdir(audio_dir) if f.endswith('.mp3')], key=lambda x: int(os.path.basename(x).split('.')[0]))

# Expected sentences (in the same order as they were generated)
expected_sentences = [
    "The patient reports severe headaches and blurred vision.",
    "Dr. Smith prescribed amoxicillin for the bacterial infection.",
    "She has a history of diabetes and hypertension.",
    "The MRI scan revealed a benign tumor in the brain.",
    "Please take two tablets of ibuprofen every six hours for pain relief.",
    "The child was diagnosed with chickenpox last week.",
    "He has been experiencing shortness of breath and chest pain.",
    "The lab results indicate an elevated white blood cell count.",
    "Administer 5 milligrams of prednisone daily for asthma management.",
    "The patient should follow a low-sodium diet to manage their blood pressure.",
    "Her symptoms include fever, cough, and difficulty breathing.",
    "The biopsy confirmed the presence of malignant cells.",
    "He needs to schedule a follow-up appointment in two weeks.",
    "The nurse will administer the influenza vaccine today.",
    "Please avoid taking aspirin without consulting your doctor."
]

# Function to transcribe audio using the fine-tuned model
def transcribe_audio_fine_tuned(audio_path):
    result = fine_tuned_pipe(audio_path)
    return result["text"]

# Function to transcribe audio using the base model
def transcribe_audio_base(audio_path):
    result = base_pipe(audio_path)
    return result["text"]

# Transcribe each audio file and print the expected, base, and fine-tuned texts
for i, audio_file in enumerate(audio_files):
    transcribed_text_fine_tuned = transcribe_audio_fine_tuned(audio_file)
    transcribed_text_base = transcribe_audio_base(audio_file)
    print(f"Expected:    {expected_sentences[i]}")
    print(f"Base:        {transcribed_text_base}")
    print(f"Fine-tuned:  {transcribed_text_fine_tuned}\n")
