In [2]:
pip install whisper

Collecting whisper
  Downloading whisper-1.1.10.tar.gz (42 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: whisper
  Building wheel for whisper (setup.py) ... [?25l[?25hdone
  Created wheel for whisper: filename=whisper-1.1.10-py3-none-any.whl size=41120 sha256=bffa77e9f3e572ff9eb18fb0fa32ae1642541449aaa2658ac7084c64a899123b
  Stored in directory: /root/.cache/pip/wheels/21/65/ee/4e6672aabfa486d3341a39a04f8f87c77e5156149299b5a7d0
Successfully built whisper
Installing collected packages: whisper
Successfully installed whisper-1.1.10


In [3]:
import whisper
import torch
from transformers import pipeline, AutoModelForSeq2SeqLM, AutoTokenizer

def transcribe_audio(audio_path, model_size="base"):
    """
    Transcribe audio to text using OpenAI's Whisper model.

    Args:
        audio_path: Path of the audio file passed to ``model.transcribe``.
        model_size: Name of the Whisper checkpoint to load. Defaults to
            "base", the value that was previously hard-coded.

    Supported models:
    - tiny: Fastest, lowest quality
    - base: Small size, moderate quality
    - small: Balanced performance
    - medium: High quality
    - large: Best accuracy, most compute-intensive

    Returns:
        The value stored under the "text" key of the transcription result.

    Raises:
        AttributeError: If the imported ``whisper`` module does not provide
            ``load_model`` (i.e. the wrong PyPI package is installed).
    """
    # `pip install whisper` installs an unrelated package that shadows OpenAI
    # Whisper under the same import name. Keep the exception type Python would
    # raise anyway, but tell the reader how to fix the environment.
    if not hasattr(whisper, "load_model"):
        raise AttributeError(
            "module 'whisper' has no attribute 'load_model'. The installed "
            "'whisper' package is not OpenAI Whisper: run "
            "`pip uninstall -y whisper`, then `pip install -U openai-whisper`, "
            "and restart the kernel."
        )

    # The checkpoint is (re)loaded on every call.
    model = whisper.load_model(model_size)
    result = model.transcribe(audio_path)
    return result["text"]

def generate_summaries(text):
    """
    Summarize ``text`` with two pretrained models and return both results.

    Summarization Models:
    1. BART (facebook/bart-large-cnn) through the transformers
       "summarization" pipeline
    2. T5 (t5-small) through an explicit tokenize / generate / decode round trip

    PEGASUS is not used by this function.

    Args:
        text: The text to summarize.

    Returns:
        A dict with the keys "BART Summary" and "T5 Summary", each mapping to
        the summary string produced by that model.
    """
    # BART Summarization
    bart_summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
    bart_summary = bart_summarizer(
        text,
        max_length=130,
        min_length=30,
        do_sample=False,
        # Without this, a transcript longer than the model's input limit is
        # passed through unshortened and the model call fails. The T5 branch
        # below already truncates its input.
        truncation=True,
    )[0]['summary_text']

    # T5 Summarization
    t5_model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")
    t5_tokenizer = AutoTokenizer.from_pretrained("t5-small")

    # The "summarize: " prefix is prepended to the input so T5 is prompted for
    # a summary; the input is cut at 512 tokens.
    t5_inputs = t5_tokenizer("summarize: " + text, return_tensors="pt", max_length=512, truncation=True)
    t5_summary_ids = t5_model.generate(t5_inputs["input_ids"], num_beams=4, max_length=100, early_stopping=True)
    t5_summary = t5_tokenizer.decode(t5_summary_ids[0], skip_special_tokens=True)

    return {
        "BART Summary": bart_summary,
        "T5 Summary": t5_summary
    }

def main(audio_path):
    """Run the pipeline end to end: transcribe the audio, then print every summary."""
    # Step 1: speech -> text (printed before summarization starts)
    transcript = transcribe_audio(audio_path)
    print("Transcribed Text:", transcript, sep="\n")

    # Step 2: text -> one summary per model
    summary_by_model = generate_summaries(transcript)

    print("\nSummaries:")
    for label, body in summary_by_model.items():
        # Heading and summary go out as two lines, preceded by a blank line.
        print(f"\n{label}:", body, sep="\n")


In [4]:
# Example usage: run the whole pipeline on the sample MP3 recording.
# The `__name__` guard keeps this call from firing if the code is imported
# as a module instead of being executed directly.
if __name__ == "__main__":
    main("/content/sample.mp3")

AttributeError: module 'whisper' has no attribute 'load_model'

In [8]:
import whisper
import torch
from transformers import pipeline, AutoModelForSeq2SeqLM, AutoTokenizer

def transcribe_audio(audio_path, model_size='base'):
    """
    Transcribe audio to text using OpenAI's Whisper model.

    Args:
        audio_path: Path of the audio file passed to ``model.transcribe``.
        model_size: Which Whisper checkpoint to load; defaults to 'base',
            the size this function used before it became configurable.

    Supported models:
    - tiny: Fastest, lowest quality
    - base: Small size, moderate quality
    - small: Balanced performance
    - medium: High quality
    - large: Best accuracy, most compute-intensive

    Returns:
        The "text" entry of the dictionary returned by ``model.transcribe``.

    Raises:
        AttributeError: If the ``whisper`` module in scope lacks
            ``load_model``, which means a different package named
            ``whisper`` is installed instead of ``openai-whisper``.
    """
    # `whisper.load_model` only exists in the `openai-whisper` distribution.
    # The similarly named `whisper` distribution imports under the same name
    # but lacks it, so surface the remedy instead of a bare AttributeError.
    if not hasattr(whisper, "load_model"):
        raise AttributeError(
            "module 'whisper' has no attribute 'load_model'. The installed "
            "'whisper' package is not OpenAI Whisper: run "
            "`pip uninstall -y whisper`, then `pip install -U openai-whisper`, "
            "and restart the kernel."
        )

    model = whisper.load_model(model_size)  # loaded anew on each call
    result = model.transcribe(audio_path)
    return result["text"]

def generate_summaries(text):
    """
    Generate summaries of ``text`` using two different models.

    BART (facebook/bart-large-cnn) is run through the transformers
    "summarization" pipeline; T5 (t5-small) is run by tokenizing the input,
    calling ``generate`` and decoding the first returned sequence.

    Args:
        text: The text to summarize.

    Returns:
        A dict mapping "BART Summary" and "T5 Summary" to the respective
        summary strings.
    """
    # BART Summarization
    bart_summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
    bart_output = bart_summarizer(
        text,
        max_length=130,
        min_length=30,
        do_sample=False,
        # Over-long transcripts must be cut to the model's input limit, as the
        # T5 branch already does; otherwise the BART call fails on them.
        truncation=True,
    )
    bart_summary = bart_output[0]['summary_text']

    # T5 Summarization
    t5_model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")
    t5_tokenizer = AutoTokenizer.from_pretrained("t5-small")

    # Input is prefixed with "summarize: " and limited to 512 tokens.
    t5_inputs = t5_tokenizer("summarize: " + text, return_tensors="pt", max_length=512, truncation=True)
    t5_summary_ids = t5_model.generate(t5_inputs["input_ids"], num_beams=4, max_length=100, early_stopping=True)
    t5_summary = t5_tokenizer.decode(t5_summary_ids[0], skip_special_tokens=True)

    return {
        "BART Summary": bart_summary,
        "T5 Summary": t5_summary
    }

def main(audio_path):
    """Transcribe ``audio_path``, print the transcript, then print each model's summary."""
    # Transcript first, so it is visible before the summarizers start working.
    spoken_text = transcribe_audio(audio_path)
    print("Transcribed Text:", spoken_text, sep="\n")

    results = generate_summaries(spoken_text)

    print("\nSummaries:")
    for heading, summary_text in results.items():
        # Blank line, the heading with a trailing colon, then the summary.
        print(f"\n{heading}:", summary_text, sep="\n")

# Example usage: transcribe and summarize the sample MP3 recording.
# Guarded so the call only runs when this code is executed directly,
# not when it is imported.
if __name__ == "__main__":
    main("/content/sample.mp3")

AttributeError: module 'whisper' has no attribute 'load_model'

In [6]:
!pip install --upgrade whisper # update whisper to ensure you have the most recent version with the load_model() function

# Restart your runtime/kernel at this point



Approach 2: Using Google Speech Recognition Model

In [11]:
!pip install SpeechRecognition

Collecting SpeechRecognition
  Downloading SpeechRecognition-3.14.1-py3-none-any.whl.metadata (31 kB)
Downloading SpeechRecognition-3.14.1-py3-none-any.whl (32.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m32.9/32.9 MB[0m [31m21.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: SpeechRecognition
Successfully installed SpeechRecognition-3.14.1


In [22]:
import speech_recognition as sr
from transformers import pipeline, AutoModelForSeq2SeqLM, AutoTokenizer

def transcribe_audio(audio_path, language="en-US"):
    """
    Transcribe audio to text using Google Speech Recognition.

    The file is opened with ``sr.AudioFile``, which per the SpeechRecognition
    documentation accepts WAV, AIFF and FLAC input (not MP3).

    Args:
        audio_path: Path of the audio file to transcribe.
        language: Language tag forwarded to ``recognize_google``. Defaults to
            "en-US", which is the library's own default.

    Returns:
        The recognized text on success. On failure this function does NOT
        raise: it returns a human-readable error message instead, so callers
        cannot tell a failure apart from a transcript by type alone.
    """
    recognizer = sr.Recognizer()

    # Read the entire file into a single audio object before recognition.
    with sr.AudioFile(audio_path) as source:
        audio_data = recognizer.record(source)

    try:
        # Use Google Speech Recognition (requires internet)
        return recognizer.recognize_google(audio_data, language=language)
    except sr.UnknownValueError:
        # The service responded but could not make out any speech.
        return "Google Speech Recognition could not understand audio"
    except sr.RequestError as e:
        # The request itself failed (e.g. no connectivity or service error).
        return f"Could not request results from Google Speech Recognition service; {e}"

def generate_summaries(text, max_length=150):
    """
    Generate summaries of ``text`` using multiple models.

    Args:
        text: The text to summarize.
        max_length: Upper bound on the generated summary length, passed to
            both the BART pipeline and T5's ``generate``.

    Returns:
        A dict with the keys "BART" and "T5", each mapping to that model's
        summary string.
    """
    summaries = {}

    # BART Summarization
    bart_summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
    summaries["BART"] = bart_summarizer(
        text,
        max_length=max_length,
        # Keep the lower bound feasible when the caller asks for a summary
        # shorter than the usual 30-token minimum.
        min_length=min(30, max_length),
        do_sample=False,
        # Cut transcripts that exceed the model's input limit instead of
        # letting the model call fail; the T5 branch already truncates.
        truncation=True
    )[0]['summary_text']

    # T5 Summarization
    t5_model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")
    t5_tokenizer = AutoTokenizer.from_pretrained("t5-small")

    # The input is prefixed with "summarize: " and limited to 512 tokens.
    t5_inputs = t5_tokenizer(
        "summarize: " + text,
        return_tensors="pt",
        max_length=512,
        truncation=True
    )
    t5_summary_ids = t5_model.generate(
        t5_inputs["input_ids"],
        num_beams=4,
        max_length=max_length,
        early_stopping=True
    )
    summaries["T5"] = t5_tokenizer.decode(t5_summary_ids[0], skip_special_tokens=True)

    return summaries

def main(audio_path):
    """Transcribe the given audio file, echo the transcript, then print one summary per model."""
    # Speech -> text; the transcript is shown before any summarizer runs.
    transcript = transcribe_audio(audio_path)
    print("Transcribed Text:", transcript, sep="\n")

    # Text -> {model name: summary}
    per_model = generate_summaries(transcript)

    print("\nSummaries:")
    for model_name, model_summary in per_model.items():
        # Blank line, "<model> Summary:" heading, then the summary itself.
        print(f"\n{model_name} Summary:", model_summary, sep="\n")

# Example usage: transcribe and summarize the sample WAV recording.
# Guarded so the call only runs when this code is executed directly,
# not when it is imported.
if __name__ == "__main__":
    main("/content/sample.wav")

Transcribed Text:
break to end the long night of their captivity Bud 100 years later the Negro still is not free 100 years later and the chains of discrimination 100 years ago


Device set to use cpu
Your max_length is set to 150, but your input_length is only 32. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=16)



Summaries:

BART Summary:
The Negro still is not free 100 years later and the chains of discrimination 100 years ago. Break to end the long night of their captivity Bud.

T5 Summary:
the Negro still is not free 100 years later and the chains of discrimination 100 years ago.
