In [1]:
# pip install transformers datasets torch librosa

In [2]:
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
import torch
import librosa
import requests
import os

# pip install paramiko --upgrade
# pip install cryptography --upgrade

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [3]:
# Checkpoint identifier shared by the processor and the model.
model_name = "openai/whisper-small"

# The processor prepares raw audio for the model and decodes generated ids.
processor = AutoProcessor.from_pretrained(model_name)

# Load the weights first, then move them to the GPU when one is available.
model = AutoModelForSpeechSeq2Seq.from_pretrained(model_name)
model = model.to("cuda" if torch.cuda.is_available() else "cpu")

In [8]:
def transcribe_audio(audio_path, chunk_duration=30):
    """Transcribe an audio file to English text, one fixed-length chunk at a time.

    The file is loaded at 16 kHz, cut into consecutive windows of
    ``chunk_duration`` seconds, and every window is transcribed independently
    with the module-level ``processor`` and ``model``. The per-chunk texts are
    joined with single spaces.

    Args:
        audio_path: Path of the audio file, passed to ``librosa.load``.
        chunk_duration: Window length in seconds (int or float). Must be
            large enough to cover at least one audio sample.

    Returns:
        The combined transcription (an empty string for empty audio), or a
        string starting with ``"Error processing audio: "`` when anything
        fails. Exceptions are reported through that string, never raised.
    """
    sample_rate = 16000  # used for both loading and feature extraction
    try:
        # Validate up front: a negative duration would otherwise produce an
        # empty loop (and an empty "transcription"), and a float would break
        # range() below.
        chunk_size = int(chunk_duration * sample_rate)
        if chunk_size <= 0:
            raise ValueError(
                f"chunk_duration must be positive, got {chunk_duration!r}"
            )

        audio, _ = librosa.load(audio_path, sr=sample_rate)

        # Send inputs wherever the model actually lives instead of
        # re-deriving the device for every chunk.
        device = model.device

        transcriptions = []
        # Stepping by chunk_size covers the trailing partial chunk as well.
        for start in range(0, len(audio), chunk_size):
            chunk = audio[start:start + chunk_size]

            inputs = processor(
                chunk,
                sampling_rate=sample_rate,
                return_tensors="pt",
                return_attention_mask=True,
            )
            input_features = inputs.input_features.to(device)
            attention_mask = inputs.attention_mask.to(device)

            # language/task replace the deprecated forced_decoder_ids prompt.
            with torch.no_grad():
                generated_ids = model.generate(
                    input_features,
                    attention_mask=attention_mask,
                    language="en",
                    task="transcribe",
                )

            text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
            transcriptions.append(text)

        return " ".join(transcriptions)

    except Exception as e:
        # Best-effort contract: callers receive a message instead of a traceback.
        return f"Error processing audio: {str(e)}"


def download_audio(url, save_path="downloaded_audio.mp3", timeout=30):
    """Download an audio file from a URL and save it to disk.

    Args:
        url: Address of the file to fetch.
        save_path: Destination path for the downloaded bytes.
        timeout: Seconds to wait for the server before giving up; without
            it a stalled connection would block indefinitely.

    Returns:
        ``save_path`` on success. ``None`` when the request fails, the
        server answers with a status other than 200, or the file cannot be
        written; the reason is printed in each case.
    """
    try:
        response = requests.get(url, timeout=timeout)
        if response.status_code != 200:
            print(f"Download failed: HTTP {response.status_code} for {url}")
            return None

        # The body is fully received before the file is opened, so a failed
        # transfer does not leave a truncated file behind.
        with open(save_path, "wb") as f:
            f.write(response.content)
        return save_path
    except Exception as e:
        # Best-effort contract: report the cause and signal failure with None.
        print(f"Download failed: {e}")
        return None

# Local file to transcribe. Only used when audio_url is empty, e.g.
# audio_file_path = "path/to/audio.mp3"
audio_file_path = None

# Remote audio to download; when set it takes precedence over the local path.
audio_url = "https://www.nch.com.au/scribe/practice/audio-sample-1.mp3"

# Fetch the remote file if a URL is provided.
if audio_url:
    audio_file_path = download_audio(audio_url)

# Report problems with plain branches rather than exit(), which would shut
# down the notebook kernel.
if audio_url and not audio_file_path:
    print("Error: Failed to download audio from URL.")
elif not audio_file_path or not os.path.exists(audio_file_path):
    print("Error: Audio file not found.")
else:
    # Transcribing audio
    result = transcribe_audio(audio_file_path)
    print("Transcription:\n", result)



Transcription:
  Please type your statement for a winner statement. Address is Sample Solicitors. Address 1, Address 2, Address 3 and then Post-Grid Underneath. The date is the 19th of February 2010. Your reference is A for Alpha, B for Bravo, C for Charlie, Slash, P for Papa, Q for Quebec, R for Romeo, Slash 1, 2, Slash 3, 4, 5. Our reference is A for Alpha, C for Charlie, C for Charlie, 9, 8, 7, 7, 8, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9  Next line  Please type Client underlined and bold as the head in.  We would refer to the attached sta