# Interactive Audio Transcription and Summarization

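This notebook lets you upload an audio file, transcribe it with a Whisper-family speech recognition model (via Hugging Face Transformers), and generate a summary of the transcript using Ollama.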

In [4]:
import json
from datetime import datetime
import subprocess
import ollama
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
import ipywidgets as widgets
from IPython.display import display

# Pick the compute target once: the first CUDA GPU with half precision when
# CUDA is available, otherwise the CPU with single precision.
if torch.cuda.is_available():
    device, torch_dtype = "cuda:0", torch.float16
else:
    device, torch_dtype = "cpu", torch.float32
print(f"Using device: {device}")

Using device: cpu


## Load the Speech Recognition Model

In [8]:
# Hugging Face model ids offered in the model picker, grouped by publisher.
available_models = [
    # OpenAI Whisper checkpoints
    "openai/whisper-tiny",
    "openai/whisper-base",
    "openai/whisper-small",
    "openai/whisper-medium",
    "openai/whisper-large-v3",
] + [
    # Distil-Whisper checkpoints
    "distil-whisper/distil-large-v2",
    "distil-whisper/distil-medium.en",
]

In [10]:
# Picker for the speech-recognition checkpoint; the model-loading cell reads
# its current value.
model_dropdown = widgets.Dropdown(
    options=available_models,
    value="distil-whisper/distil-large-v2",
    description="Model:",
    disabled=False,
)

# An assignment alone produces no cell output, so the picker must be displayed
# explicitly; otherwise it is never shown and the default is always used.
display(model_dropdown)

In [None]:
# Checkpoint currently selected in the model picker (read when this cell runs).
model_id = model_dropdown.value

# Load the weights in the dtype chosen for the active device, then move the
# model onto that device.
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
)
model.to(device)

# The processor bundles the tokenizer and feature extractor for the checkpoint.
processor = AutoProcessor.from_pretrained(model_id)

pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    # Whisper models consume at most 30 s of audio per forward pass. Chunking
    # lets the pipeline transcribe longer recordings in full instead of
    # truncating or rejecting them.
    chunk_length_s=30,
    torch_dtype=torch_dtype,
    device=device,
)

print("Speech recognition model loaded successfully.")

## File Selection and Transcription

In [None]:
def transcribe_audio(audio_file_path):
    """Transcribe an audio file with the loaded speech-recognition pipeline.

    Files whose extension is not ``.wav`` or ``.mp3`` (compared
    case-insensitively) are first converted to MP3 with ``ffmpeg``. The MP3 is
    written next to the input and shares its name up to the final extension.

    Args:
        audio_file_path: Path of the audio file on disk.

    Returns:
        The ``"text"`` entry of the pipeline result.

    Raises:
        subprocess.CalledProcessError: If the ffmpeg conversion exits with a
            non-zero status.
    """
    import os  # local import: the notebook's import cell does not provide os

    # splitext only strips the final extension, so names such as
    # "my.talk.m4a" or "./clip.m4a" keep their full stem.
    stem, extension = os.path.splitext(audio_file_path)
    if extension.lower() not in (".wav", ".mp3"):
        converted_path = f"{stem}.mp3"
        # -y overwrites a leftover MP3 from a previous run; check=True surfaces
        # conversion failures here rather than as a confusing pipeline error.
        subprocess.run(
            ["ffmpeg", "-y", "-i", audio_file_path, converted_path],
            check=True,
        )
        audio_file_path = converted_path

    # `pipe` is the pipeline instance built in the model-loading cell;
    # `pipeline` is the transformers factory function, not a transcriber.
    transcription = pipe(audio_file_path)
    return transcription["text"]

# Output area that the transcription click handler prints into.
output = widgets.Output()

# Button that starts transcription of the uploaded file.
transcribe_button = widgets.Button(description="Transcribe")

# Single-file upload control limited to the listed audio extensions.
file_uploader = widgets.FileUpload(
    multiple=False,
    accept='.wav,.mp3,.m4a',
)

def on_transcribe_button_clicked(b):
    """Handle a click on the Transcribe button.

    Saves the uploaded file into the current working directory, transcribes it
    with ``transcribe_audio``, prints the result into ``output`` and stores it
    in the global ``current_transcription`` for later cells.

    Args:
        b: The clicked button, as passed by ``on_click``; not used.
    """
    # Declared up front so the transcription is visible to later cells.
    global current_transcription

    import os  # local import: the notebook's import cell does not provide os

    with output:
        output.clear_output()
        uploads = file_uploader.value
        if not uploads:
            print("Please upload an audio file first.")
            return

        if isinstance(uploads, dict):
            # ipywidgets 7: dict keyed by file name, name under 'metadata'.
            uploaded_file = next(iter(uploads.values()))
            file_name = uploaded_file['metadata']['name']
        else:
            # ipywidgets 8: tuple of dicts with top-level 'name' / 'content'.
            uploaded_file = uploads[0]
            file_name = uploaded_file['name']

        # Keep only the final path component so the upload's name cannot
        # direct the write outside the working directory.
        file_name = os.path.basename(file_name)

        # Save the file temporarily
        with open(file_name, 'wb') as f:
            f.write(uploaded_file['content'])

        print(f"Transcribing {file_name}...")
        transcription = transcribe_audio(file_name)
        print("Transcription:")
        print(transcription)

        current_transcription = transcription

# Register the click handler, then render the uploader, the button and the
# output area stacked vertically.
transcribe_button.on_click(on_transcribe_button_clicked)

display(
    widgets.VBox(children=[file_uploader, transcribe_button, output])
)