In [1]:
pip install torch transformers soundfile sounddevice numpy

Collecting sounddevice
  Downloading sounddevice-0.5.1-py3-none-any.whl.metadata (1.4 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Coll

In [2]:
!apt-get update && apt-get install -y portaudio19-dev python3-pyaudio
!pip install sounddevice soundfile transformers torch numpy pyaudio

Get:1 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Get:2 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]
Hit:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Get:4 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]
Hit:5 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:6 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Hit:7 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:8 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Get:9 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [127 kB]
Hit:10 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Get:11 http://security.ubuntu.com/ubuntu jammy-security/main amd64 Packages [2,773 kB]
Get:12 http://security.ubuntu.com/ubuntu jammy-security/universe amd64 Packages [1,241 kB]
Get:13 http://security.ubuntu.com/ubuntu jammy-security/restrict

In [3]:
from IPython.display import Audio, display
import torch
from transformers import WhisperProcessor, WhisperForConditionalGeneration
from transformers import pipeline
import numpy as np
from datetime import datetime
import IPython.display as ipd
from google.colab import files

class WhisperTranscriberColab:
    def __init__(self, model_size="base"):
        """
        Initialize the Whisper model
        model_size options: "tiny", "base", "small", "medium", "large"
        """
        print(f"Loading Whisper {model_size} model...")
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model_size = model_size
        self.sample_rate = 16000

        # Load model and processor
        model_name = f"openai/whisper-{model_size}"
        self.processor = WhisperProcessor.from_pretrained(model_name)
        self.model = WhisperForConditionalGeneration.from_pretrained(model_name).to(self.device)

        # Alternative: Use pipeline for simpler usage
        self.pipe = pipeline(
            "automatic-speech-recognition",
            model=model_name,
            device=self.device
        )

        print(f"Model loaded successfully! Using device: {self.device}")

    def transcribe_uploaded_file(self, audio_file):
        """
        Transcribe uploaded audio file
        """
        print("Processing audio...")

        # Use pipeline for file transcription
        result = self.pipe(audio_file)
        return result["text"]

def save_transcription(text, filename=None):
    """Save transcription to file"""
    if filename is None:
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"transcription_{timestamp}.txt"

    with open(filename, 'w', encoding='utf-8') as f:
        f.write(text)
    return filename

def main():
    print("Whisper Speech Recognition System for Google Colab")
    print("===============================================")

    model_sizes = ["tiny", "base", "small", "medium", "large"]
    print("\nAvailable model sizes:", ", ".join(model_sizes))
    print("Note: Larger models are more accurate but require more memory and processing power")

    model_size = input(f"\nChoose model size (default: base): ").strip().lower()
    if model_size not in model_sizes:
        model_size = "base"

    transcriber = WhisperTranscriberColab(model_size)

    print("\nPlease upload an audio file (supported formats: mp3, wav, m4a, etc.)")
    uploaded = files.upload()

    for filename in uploaded.keys():
        print(f"\nProcessing file: {filename}")
        try:
            # Transcribe
            transcription = transcriber.transcribe_uploaded_file(filename)

            print("\nTranscription:")
            print("--------------")
            print(transcription)

            transcript_file = save_transcription(transcription)
            print(f"\nTranscription saved to: {transcript_file}")

            files.download(transcript_file)

        except Exception as e:
            print(f"Error processing {filename}: {str(e)}")

if __name__ == "__main__":
    main()

Whisper Speech Recognition System for Google Colab

Available model sizes: tiny, base, small, medium, large
Note: Larger models are more accurate but require more memory and processing power

Choose model size (default: base): medium
Loading Whisper medium model...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/185k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/283k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/836k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.48M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/494k [00:00<?, ?B/s]

normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/34.6k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.19k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.99k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.06G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/3.75k [00:00<?, ?B/s]

Device set to use cpu


Model loaded successfully! Using device: cpu

Please upload an audio file (supported formats: mp3, wav, m4a, etc.)


Saving test1.mp3 to test1 (1).mp3

Processing file: test1 (1).mp3
Processing audio...


Due to a bug fix in https://github.com/huggingface/transformers/pull/28687 transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English.This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`.



Transcription:
--------------
 All right, now we will be running the pre-trained model from the OpenAI which is something called the WISPAR model.

Transcription saved to: transcription_20250402_061229.txt


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>