# Audio to Transcript conversion

This notebook converts the raw audio of a meeting into a time-annotated, speaker-diarized transcript.

Import this notebook into Google Colab and run it on a T4 GPU runtime.

In [None]:
!pip install git+https://github.com/openai/whisper.git
!pip install pyannote.audio
!apt-get install -y ffmpeg

Collecting git+https://github.com/openai/whisper.git
  Cloning https://github.com/openai/whisper.git to /tmp/pip-req-build-and6gq9b
  Running command git clone --filter=blob:none --quiet https://github.com/openai/whisper.git /tmp/pip-req-build-and6gq9b
  Resolved https://github.com/openai/whisper.git to commit 90db0de1896c23cbfaf0c58bc2d30665f709f170
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
ffmpeg is already the newest version (7:4.4.2-0ubuntu0.22.04.1).
0 upgraded, 0 newly installed, 0 to remove and 49 not upgraded.


In [None]:
# Read the Hugging Face access token from the Colab secret named 'HF_TOKEN'.
# It is later passed to the pyannote pipeline loader as `use_auth_token`.
from google.colab import userdata
HF_TOKEN = userdata.get('HF_TOKEN')

In [None]:
import os
import json
import subprocess
from pyannote.audio import Pipeline
import whisper
import tempfile
import torch

# -----------------------------
# Configuration
# Tunable settings read by the diarization / transcription functions and main().
# -----------------------------
# Audio file that is passed to both the diarization pipeline and Whisper.
INPUT_AUDIO_FILE = "meeting_audio_5min.wav"  # Update with your input file name/path
# NOTE(review): the recorded run output warns that this checkpoint was trained
# with pyannote.audio 0.0.1 while 3.3.2 was installed -- consider confirming
# that the checkpoint matches the installed library version.
DIARIZATION_MODEL = "pyannote/speaker-diarization"  # Pretrained diarization model
# Name handed to whisper.load_model().
WHISPER_MODEL = "base"  # "tiny", "base", "small", "medium", "large", etc.

# -----------------------------
# Preprocessing (Optional)
# If your file is not in the correct format (mono, 16kHz), you can uncomment:
# -----------------------------
# def preprocess_audio(input_file, output_file):
#     # Convert to 16kHz, mono wav
#     cmd = [
#         "ffmpeg",
#         "-y",
#         "-i", input_file,
#         "-ar", "16000",
#         "-ac", "1",
#         output_file
#     ]
#     subprocess.run(cmd, check=True)
#
# # Example usage:
# preprocessed_file = "processed_audio.wav"
# preprocess_audio(INPUT_AUDIO_FILE, preprocessed_file)
# INPUT_AUDIO_FILE = preprocessed_file

# -----------------------------
# Speaker Diarization
# -----------------------------
def perform_diarization(audio_file):
    """Run speaker diarization on an audio file.

    Loads the pretrained pipeline named by ``DIARIZATION_MODEL`` (authenticated
    with ``HF_TOKEN``), moves it to the GPU when CUDA is available and to the
    CPU otherwise, and applies it to ``audio_file``.

    Args:
        audio_file: Audio input, forwarded unchanged to the pipeline call.

    Returns:
        A list of ``(start, end, speaker_label)`` tuples, one per track yielded
        by the diarization result (times are presumably in seconds -- confirm
        against the pyannote documentation).

    Raises:
        RuntimeError: If the pipeline loader returned ``None`` instead of a
            pipeline object.
    """
    pipeline = Pipeline.from_pretrained(DIARIZATION_MODEL,
                                        use_auth_token=HF_TOKEN)
    if pipeline is None:
        # Fail with an actionable message instead of an AttributeError on `.to`.
        raise RuntimeError(
            f"Could not load diarization pipeline '{DIARIZATION_MODEL}'. "
            "Check that HF_TOKEN is valid and that the model's user conditions "
            "have been accepted on Hugging Face."
        )

    # Prefer the GPU, but keep the notebook runnable on CPU-only runtimes.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    pipeline.to(device)
    diarization = pipeline(audio_file)

    # Flatten the annotation into plain (start, end, speaker_label) tuples.
    return [
        (turn.start, turn.end, speaker)
        for turn, _, speaker in diarization.itertracks(yield_label=True)
    ]

# -----------------------------
# Transcription with Whisper
# -----------------------------
def perform_transcription(audio_file, model_name=WHISPER_MODEL):
    """Transcribe an audio file with Whisper.

    Args:
        audio_file: Audio input handed to the model's ``transcribe`` method.
        model_name: Whisper model name passed to ``whisper.load_model``;
            defaults to ``WHISPER_MODEL``.

    Returns:
        The ``'segments'`` entry of the transcription result. ``main`` reads
        the ``'start'``, ``'end'`` and ``'text'`` keys of each segment.
    """
    asr_model = whisper.load_model(model_name)
    return asr_model.transcribe(audio_file)['segments']

# -----------------------------
# Helper: Find Speaker for a Given Segment
# We try to find which diarization segment overlaps most with the transcription segment.
# -----------------------------
def find_speaker_for_time(diarization_segments, seg_start, seg_end):
    """Pick the speaker whose diarization segment overlaps a time span the most.

    Args:
        diarization_segments: Iterable of ``(start, end, speaker)`` tuples.
        seg_start: Start of the span to label.
        seg_end: End of the span to label.

    Returns:
        The speaker of the segment with the largest strictly positive overlap;
        the earliest such segment wins ties. ``"Unknown"`` when no segment
        overlaps the span by more than zero.
    """
    winner, longest = "Unknown", 0.0
    for turn_start, turn_end, turn_speaker in diarization_segments:
        # Length of the intersection; zero or negative means no real overlap.
        shared = min(seg_end, turn_end) - max(seg_start, turn_start)
        if shared > longest:
            winner, longest = turn_speaker, shared
    return winner

# -----------------------------
# Main Integration
# -----------------------------
def main():
    """Diarize and transcribe ``INPUT_AUDIO_FILE``, then print the merged result.

    Each transcription segment is labelled with the speaker returned by
    ``find_speaker_for_time`` and the resulting list of
    ``{"start", "end", "speaker", "text"}`` records is printed as indented JSON.
    """
    # Stage 1: who spoke when.
    print("Performing diarization...")
    speaker_turns = perform_diarization(INPUT_AUDIO_FILE)
    print("...Diarization complete.")

    # Stage 2: what was said.
    print("Performing transcription...")
    whisper_segments = perform_transcription(INPUT_AUDIO_FILE, WHISPER_MODEL)
    print("... Transcription complete.")

    def attach_speaker(piece):
        # Build one output record: the segment's timing and stripped text plus
        # the speaker chosen for that time span.
        begin, finish, spoken = piece["start"], piece["end"], piece["text"].strip()
        return {
            "start": begin,
            "end": finish,
            "speaker": find_speaker_for_time(speaker_turns, begin, finish),
            "text": spoken,
        }

    # Stage 3: combine both views into one transcript.
    print("Merging results...")
    merged_transcript = [attach_speaker(piece) for piece in whisper_segments]
    print("... Merging results complete.")

    # Emit the transcript as JSON for demonstration purposes.
    print(json.dumps(merged_transcript, indent=2))

# Run the full pipeline when this cell (or the file as a script) is executed
# directly, rather than when the code is imported as a module.
if __name__ == "__main__":
    main()


Performing diarization...


INFO:pytorch_lightning.utilities.migration.utils:Lightning automatically upgraded your loaded checkpoint from v1.5.4 to v2.4.0. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../root/.cache/torch/pyannote/models--pyannote--segmentation/snapshots/c4c8ceafcbb3a7a280c2d357aee9fbc9b0be7f9b/pytorch_model.bin`


Model was trained with pyannote.audio 0.0.1, yours is 3.3.2. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.10.0+cu102, yours is 2.5.1+cu121. Bad things might happen unless you revert torch to 1.x.
...Diarization complete.
Performing transcription...


100%|███████████████████████████████████████| 139M/139M [00:01<00:00, 76.6MiB/s]
  checkpoint = torch.load(fp, map_location=device)


... Transcription complete.
Merging results...
... Merging results complete.
[
  {
    "start": 0.0,
    "end": 4.44,
    "speaker": "SPEAKER_01",
    "text": "So actually getting back to a kin as like a question"
  },
  {
    "start": 4.44,
    "end": 5.12,
    "speaker": "SPEAKER_01",
    "text": "at the beginning."
  },
  {
    "start": 5.12,
    "end": 10.36,
    "speaker": "SPEAKER_01",
    "text": "So for now, probably we just need to make a really quick"
  },
  {
    "start": 10.36,
    "end": 15.200000000000001,
    "speaker": "SPEAKER_01",
    "text": "decision probably within today, like if we want to have"
  },
  {
    "start": 15.200000000000001,
    "end": 17.400000000000002,
    "speaker": "SPEAKER_01",
    "text": "the ASR module, right?"
  },
  {
    "start": 17.400000000000002,
    "end": 22.400000000000002,
    "speaker": "SPEAKER_01",
    "text": "So if we want to have the ASR module,"
  },
  {
    "start": 22.400000000000002,
    "end": 25.76,
    "speaker": "SPEAKE