<a href="https://colab.research.google.com/github/Vedalaxman/Audio_to_Text/blob/main/Wishper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import torch

import warnings
warnings.filterwarnings('ignore')

In [None]:
!pip install -U bitsandbytes

In [None]:
import transformers

In [None]:
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq, BitsAndBytesConfig

# Checkpoint identifier handed to both from_pretrained() calls below.
model_id = "Na0s/Medical-Whisper-Large-v3"
# Prefer the GPU when one is visible to torch; otherwise fall back to CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

print(f"Loading model on device: {device}")
processor = AutoProcessor.from_pretrained(model_id)
print("Processor Loaded")
if device == "cuda":
    # BitsAndBytesConfig has no `bnb_8bit_compute_dtype` option (a compute
    # dtype setting exists only for the 4-bit path), so passing it had no
    # effect. The dtype of the non-quantized modules is controlled through
    # `torch_dtype` on from_pretrained() instead; float16 matches the cast
    # applied to the input features before generation on CUDA.
    bnb_config = BitsAndBytesConfig(load_in_8bit=True)
    model = AutoModelForSpeechSeq2Seq.from_pretrained(
        model_id,
        quantization_config=bnb_config,
        torch_dtype=torch.float16,
        # Let the loader place the weights; no explicit .to(device) afterwards.
        device_map="auto",
        use_safetensors=True
    )
    print("Model Loaded in 8-bit precision with float16 compute dtype.")
else:
    # CPU path: full float32 weights, no quantization.
    model = AutoModelForSpeechSeq2Seq.from_pretrained(
        model_id,
        torch_dtype=torch.float32,
        low_cpu_mem_usage=True,
        use_safetensors=True
    ).to(device)
    print("Model Loaded on CPU in float32 precision.")

print("Model Loaded")

In [None]:
# Parquet file to load into the DataFrame `df`.
file_path = '/content/eval-00000-of-00001.parquet'

try:
    df = pd.read_parquet(file_path)
except FileNotFoundError:
    # Missing file gets its own message; `df` stays undefined in that case.
    print(f"File not found: {file_path}")
except Exception as read_error:
    # Any other read failure is reported rather than raised.
    print(f"Error reading Parquet: {read_error}")
else:
    print('Done')

In [None]:
df

In [None]:
from datasets import Dataset, Audio
import librosa

In [None]:
# Wrap the DataFrame in a Dataset and cast the "audio" column to the Audio
# feature at a 16 kHz sampling rate (the rate the transcription code passes
# to the feature extractor).
dataset = Dataset.from_pandas(df)
dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))

# Drop the metadata columns if they are present; absent columns are skipped
# so remove_columns() is never asked for a column that does not exist.
columns_to_remove = [
    column
    for column in ("audio_id", "duration")
    if column in dataset.features
]

if columns_to_remove:
    dataset = dataset.remove_columns(columns_to_remove)

In [None]:
dataset

In [None]:
# Keep only rows 0-4 as a small subset for a trial transcription run.
# NOTE: despite the name, this is the result of dataset.select(), not a
# pandas DataFrame.
test_df = dataset.select(range(5))

In [None]:
test_df

In [None]:
def transcribe_audio_batch(batch):
    """Transcribe every audio entry of a batch.

    Args:
        batch: Mapping with an "audio" entry that iterates over items
            exposing their samples under the "array" key.

    Returns:
        A dict with one key, "transcribed_text", holding the list of decoded
        strings produced for the batch.

    Relies on the module-level ``processor``, ``model`` and ``device``.
    """
    waveforms = [sample["array"] for sample in batch["audio"]]

    encoded = processor.feature_extractor(
        waveforms,
        sampling_rate=16000,
        return_tensors="pt",
    )
    features = encoded.input_features.to(device)

    # On CUDA the features are cast to half precision before generation.
    if device == "cuda":
        features = features.to(torch.float16)

    token_ids = model.generate(features, language="en", task="transcribe")
    decoded = processor.tokenizer.batch_decode(
        token_ids, skip_special_tokens=True
    )
    return {"transcribed_text": decoded}

In [None]:
# Run the batch transcription over the subset one example at a time and drop
# the "audio" column from the result.
transcription_results_dataset = test_df.map(
    transcribe_audio_batch,
    batched=True,
    batch_size=1,
    remove_columns=["audio"]
)

# to_pandas must be *called*; without the parentheses df_final would hold the
# bound method object instead of a DataFrame.
df_final = transcription_results_dataset.to_pandas()

In [None]:
df_final

In [None]:
import soundfile as sf
import librosa

In [None]:
def transcribe_single_audio(audio_path):
    """Transcribe one audio file and return the decoded text.

    Args:
        audio_path: Path handed to ``librosa.load``; the audio is loaded at
            a 16 kHz sampling rate.

    Returns:
        The first (and only) decoded string for the file.

    Relies on the module-level ``processor``, ``model`` and ``device``.
    """
    waveform, sample_rate = librosa.load(audio_path, sr=16000)

    encoded = processor.feature_extractor(
        waveform,
        sampling_rate=sample_rate,
        return_tensors="pt",
    )
    features = encoded.input_features.to(device)

    # On CUDA the features are cast to half precision before generation.
    if device == "cuda":
        features = features.to(torch.float16)

    token_ids = model.generate(features, language="en", task="transcribe")
    decoded = processor.tokenizer.batch_decode(
        token_ids, skip_special_tokens=True
    )
    return decoded[0]

In [None]:
# WAV file passed to transcribe_single_audio(), which loads it at 16 kHz.
audio_path = '/content/OSR_us_000_0011_8k.wav'

In [None]:
# Transcribe the single file at audio_path and keep the decoded string.
text = transcribe_single_audio(audio_path)

In [None]:
text