In [1]:
# Install dependencies into the running kernel's environment. `%pip` is used
# instead of `!pip` so the packages land in the interpreter this notebook runs on.
# The former third command (langdetect, pyannote.audio) was dropped because both
# packages are already covered by the second command.
%pip install -q tensorflow tensorflow-hub librosa pandas openai-whisper ffmpeg-python
%pip install -q torch librosa git+https://github.com/openai/whisper.git pyannote.audio transformers langdetect

Collecting openai-whisper
  Downloading openai_whisper-20250625.tar.gz (803 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m803.2/803.2 kB[0m [31m17.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting ffmpeg-python
  Downloading ffmpeg_python-0.2.0-py3-none-any.whl.metadata (1.7 kB)
Downloading ffmpeg_python-0.2.0-py3-none-any.whl (25 kB)
Building wheels for collected packages: openai-whisper
  Building wheel for openai-whisper (pyproject.toml) ... [?25l[?25hdone
  Created wheel for openai-whisper: filename=openai_whisper-20250625-py3-none-any.whl size=803979 sha256=1a95055fcbc30eec890789ee2f375d075ff027e8f1d8398a4886a82f13e7ea6c
  Stored in directory: /root/.cache/pip/wheels/61/d2/20/09ec9bef734d126cba375b15898010b6cc28578d8afdde5869
Successfully built openai-whisper
Installing collected packages: 

In [None]:
import os
import torch
import librosa
import whisper
from pyannote.audio import Pipeline
from transformers import pipeline as hf_pipeline
import subprocess, json
from pathlib import Path

# ===================== CONFIG =====================
# Input media file; the main block converts it to WAV with convert_to_wav first.
AUDIO_FILE = "/content/audio_001.mp3" 
# Path of the merged JSON report written by build_output.
OUTPUT_JSON = "final_output.json"
# Device string used by run_sed, run_stt and run_emotion_detection:
# "cuda" when torch reports a usable GPU, otherwise "cpu".
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# ===================== HELPER =====================
def convert_to_wav(input_path, output_path="temp.wav"):
    """Transcode a media file (mp4, mkv, mp3, ...) to 16 kHz mono audio via ffmpeg.

    Args:
        input_path: Source file handed to ffmpeg with ``-i``.
        output_path: Destination file; an existing file is overwritten (``-y``).

    Returns:
        ``output_path``, so the call can be used inline.

    Raises:
        subprocess.CalledProcessError: ffmpeg exited with a non-zero status.
    """
    ffmpeg_cmd = ["ffmpeg", "-y", "-i", input_path]
    ffmpeg_cmd += ["-ar", "16000", "-ac", "1"]  # 16 kHz sample rate, single channel
    ffmpeg_cmd.append(output_path)
    subprocess.run(ffmpeg_cmd, check=True)
    return output_path

# ===================== 1. SOUND EVENT DETECTION =====================
def run_sed(audio_path, threshold=0.01, top_k=None):
    """Detect non-speech sound events with the AudioSet-finetuned AST classifier.

    Args:
        audio_path: Audio file to analyse; loaded (and resampled) at 16 kHz.
        threshold: Minimum score a label needs to be reported.
        top_k: Maximum number of labels requested from the classifier before
            filtering. ``None`` (default) requests every label the model knows,
            so that ``threshold`` alone decides what is kept.

    Returns:
        A list of ``{"label": ..., "score": ...}`` dicts, in the order returned
        by the classifier, excluding any label containing "speech" and any
        score below ``threshold``. Empty audio yields ``[]``.
    """
    audio_data, sr = librosa.load(audio_path, sr=16000)
    if len(audio_data) == 0:
        # Nothing to classify; skip loading the model entirely.
        return []

    sed_pipeline = hf_pipeline(
        task="audio-classification",
        model="MIT/ast-finetuned-audioset-10-10-0.4593",
        device=0 if DEVICE == "cuda" else -1,
    )

    # The pipeline only returns its 5 best labels unless told otherwise, which
    # silently hid every lower-ranked event from the threshold filter below.
    if top_k is None:
        top_k = sed_pipeline.model.config.num_labels

    # NOTE(review): the AST feature extractor appears to pad/truncate its input
    # to a fixed window of roughly 10 s, so longer files may only be partially
    # classified -- confirm, and classify in chunks if full coverage is needed.
    results = sed_pipeline(audio_data, top_k=top_k)

    # keep only non-speech sounds above threshold
    return [
        event for event in results
        if "speech" not in event["label"].lower() and event["score"] >= threshold
    ]

# ===================== 2. SPEAKER DIARIZATION =====================
def run_diarization(audio_path, auth_token=None):
    """Run pyannote speaker diarization and return the speaker turns.

    Args:
        audio_path: Audio file passed to the diarization pipeline.
        auth_token: Hugging Face access token for the gated pyannote model.
            When omitted, the ``HF_TOKEN`` environment variable is used; if
            that is unset too, ``None`` is forwarded to pyannote.

    Returns:
        A list of ``{"start": float, "end": float, "speaker": str}`` dicts,
        one per track yielded by the pipeline, in the order yielded.

    Raises:
        RuntimeError: The pipeline could not be loaded (``from_pretrained``
            returned ``None``).
    """
    # Never hard-code credentials in the notebook: take the token from the
    # caller or from the environment instead.
    token = auth_token or os.environ.get("HF_TOKEN")
    pipeline = Pipeline.from_pretrained(
        "pyannote/speaker-diarization@2.1",
        use_auth_token=token,
    )
    if pipeline is None:
        # NOTE(review): pyannote appears to return None (rather than raise) when
        # the gated model cannot be downloaded; fail here with a usable message
        # instead of a "'NoneType' object is not callable" further down.
        raise RuntimeError(
            "Could not load 'pyannote/speaker-diarization@2.1'. Accept the model's "
            "user conditions on Hugging Face and provide a valid access token via "
            "the auth_token argument or the HF_TOKEN environment variable."
        )

    # Run on the same device as the other stages. Guarded because not every
    # pyannote release is known to expose Pipeline.to().
    move_to = getattr(pipeline, "to", None)
    if callable(move_to):
        move_to(torch.device(DEVICE))

    diarization = pipeline(audio_path)
    return [
        {
            "start": float(turn.start),
            "end": float(turn.end),
            "speaker": speaker,
        }
        for turn, _, speaker in diarization.itertracks(yield_label=True)
    ]

# ===================== 3. SPEECH TO TEXT =====================
def run_stt(audio_path):
    """Transcribe an audio file with Whisper's ``medium`` checkpoint.

    Args:
        audio_path: Audio file passed to ``model.transcribe``.

    Returns:
        A ``(detected_lang, segments)`` tuple. ``detected_lang`` is the
        ``"language"`` entry of the Whisper result; ``segments`` is a list of
        dicts with ``start_time``, ``end_time``, ``text`` and ``confidence``
        (the segment's ``avg_logprob``, or ``None`` when absent).
    """
    asr_model = whisper.load_model("medium", device=DEVICE)
    result = asr_model.transcribe(audio_path)
    detected_lang = result["language"]

    def _to_record(chunk):
        # Re-key one Whisper segment into the schema used by build_output.
        return {
            "start_time": chunk["start"],
            "end_time": chunk["end"],
            "text": chunk["text"],
            "confidence": chunk.get("avg_logprob", None),
        }

    return detected_lang, [_to_record(chunk) for chunk in result["segments"]]

# ===================== 4. EMOTION DETECTION =====================
def run_emotion_detection(texts):
    """Classify the dominant emotion of each text.

    Args:
        texts: Sequence of strings to classify.

    Returns:
        One ``{"label": str, "score": float}`` dict per pipeline result,
        holding the highest-scoring emotion. Empty input yields ``[]``
        without loading the model.
    """
    if not texts:
        # Nothing to classify: avoid downloading/loading the model for no work.
        return []

    emo_pipeline = hf_pipeline(
        "text-classification",
        model="j-hartmann/emotion-english-distilroberta-base",
        top_k=None,
        device=0 if DEVICE == "cuda" else -1,
    )
    # truncation=True keeps an unusually long text from exceeding the model's
    # maximum input length and aborting the whole batch.
    results = emo_pipeline(texts, truncation=True)

    # keep only top emotion
    emotions = []
    for res in results:
        if isinstance(res, list):
            # top_k=None yields the scores of every label; pick the best one.
            res = max(res, key=lambda candidate: candidate["score"])
        emotions.append({"label": res["label"], "score": float(res["score"])})
    return emotions

# ===================== 5. MERGE & SAVE JSON =====================
def _dominant_speaker(start, end, turns):
    """Return the speaker whose turns overlap ``[start, end]`` the most, else "Unknown"."""
    coverage = {}
    for turn in turns:
        shared = min(end, turn["end"]) - max(start, turn["start"])
        # shared >= 0 means the intervals intersect (possibly only at a boundary),
        # which also covers a transcript segment that fully contains the turn.
        if shared >= 0:
            coverage[turn["speaker"]] = coverage.get(turn["speaker"], 0.0) + shared
    if not coverage:
        return "Unknown"
    return max(coverage, key=coverage.get)


def build_output(file_id, audio_path, sed_events, diarization, transcript, detected_lang,
                 output_path=None):
    """Merge all stage results into one report, write it as JSON and return it.

    Args:
        file_id: Path-like identifier; only its stem is stored as ``file_id``.
        audio_path: Stored verbatim as ``file_path``.
        sed_events: ``{"label", "score"}`` dicts from sound event detection.
        diarization: ``{"start", "end", "speaker"}`` dicts (speaker turns).
        transcript: ``{"start_time", "end_time", "text", ...}`` dicts. Each dict
            is annotated in place with ``speaker``, ``emotion`` and
            ``emotion_score``.
        detected_lang: Stored verbatim as ``detected_language``.
        output_path: Destination JSON file; defaults to ``OUTPUT_JSON``.

    Returns:
        The report dict that was written to disk.
    """
    # ---- attach speakers to transcript ----
    # Pick the speaker with the largest total overlap instead of the first turn
    # that happens to contain one of the segment's end points.
    for t in transcript:
        t["speaker"] = _dominant_speaker(t["start_time"], t["end_time"], diarization)

    # ---- add emotion detection ----
    texts = [t["text"] for t in transcript]
    # Skip the classifier entirely when there is nothing to classify.
    emotions = run_emotion_detection(texts) if texts else []
    for t, emo in zip(transcript, emotions):
        t["emotion"] = emo["label"]
        t["emotion_score"] = emo["score"]

    merged_transcript = [
        {
            "speaker": t["speaker"],
            "start_time": t["start_time"],
            "end_time": t["end_time"],
            "text": t["text"].strip(),
            "emotion": t["emotion"],
            "emotion_score": round(t["emotion_score"], 3),
        }
        for t in transcript
    ]

    # ---- sort and clean sound effects ----
    sorted_sounds = sorted(sed_events, key=lambda x: x["score"], reverse=True)
    sound_effects = [
        {"label": s["label"], "score": round(s["score"], 3)}
        for s in sorted_sounds
    ]

    # ---- final JSON ----
    data = {
        "file_id": Path(file_id).stem,
        "file_path": audio_path,
        "detected_language": detected_lang,
        "transcript": merged_transcript,
        "sound_effects": sound_effects,
        "metadata": {
            "annotator": "system_auto"
        }
    }

    target = OUTPUT_JSON if output_path is None else output_path
    with open(target, "w", encoding="utf-8") as f:
        # The file is UTF-8, so keep non-ASCII transcript text readable rather
        # than escaping it to \uXXXX sequences.
        json.dump(data, f, indent=4, ensure_ascii=False)
    print(f"\n✅ Final JSON saved to {target}")
    return data

# ===================== MAIN =====================
if __name__ == "__main__":
    # Every stage reads the same converted audio, so convert once up front.
    wav_path = convert_to_wav(AUDIO_FILE)
    try:
        print("Running SED...")
        sed_events = run_sed(wav_path)

        print("Running diarization...")
        diarization = run_diarization(wav_path)

        print("Running STT...")
        detected_lang, transcript = run_stt(wav_path)

        print("Building JSON with emotions and timestamps...")
        # The report references the original media file, not the temporary WAV.
        build_output(AUDIO_FILE, AUDIO_FILE, sed_events, diarization, transcript, detected_lang)
    finally:
        # The converted WAV is only an intermediate artefact; remove it whether
        # the run succeeded or failed so it does not pile up in the working dir.
        Path(wav_path).unlink(missing_ok=True)


Running SED...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/346M [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/297 [00:00<?, ?B/s]

Device set to use cuda:0


Running diarization...


config.yaml:   0%|          | 0.00/500 [00:00<?, ?B/s]

  available_backends = torchaudio.list_audio_backends()
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint save hook for _speechbrain_save
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint load hook for _speechbrain_load
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint save hook for save
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint load hook for load
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint save hook for _save
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint load hook for _recover


pytorch_model.bin:   0%|          | 0.00/17.7M [00:00<?, ?B/s]

config.yaml:   0%|          | 0.00/318 [00:00<?, ?B/s]

INFO:pytorch_lightning.utilities.migration.utils:Lightning automatically upgraded your loaded checkpoint from v1.5.4 to v2.5.3. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../root/.cache/torch/pyannote/models--pyannote--segmentation/snapshots/c4c8ceafcbb3a7a280c2d357aee9fbc9b0be7f9b/pytorch_model.bin`
  torchaudio.list_audio_backends()
INFO:speechbrain.utils.fetching:Fetch hyperparams.yaml: Fetching from HuggingFace Hub 'speechbrain/spkrec-ecapa-voxceleb' if not cached


Model was trained with pyannote.audio 0.0.1, yours is 3.3.2. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.10.0+cu102, yours is 2.8.0+cu126. Bad things might happen unless you revert torch to 1.x.


hyperparams.yaml: 0.00B [00:00, ?B/s]

DEBUG:speechbrain.utils.fetching:Fetch: Local file found, creating symlink '/root/.cache/huggingface/hub/models--speechbrain--spkrec-ecapa-voxceleb/snapshots/0f99f2d0ebe89ac095bcc5903c4dd8f72b367286/hyperparams.yaml' -> '/root/.cache/torch/pyannote/speechbrain/hyperparams.yaml'
INFO:speechbrain.utils.fetching:Fetch custom.py: Fetching from HuggingFace Hub 'speechbrain/spkrec-ecapa-voxceleb' if not cached
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint save hook for _save
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint load hook for _load
DEBUG:speechbrain.utils.checkpoints:Registered parameter transfer hook for _load
  wrapped_fwd = torch.cuda.amp.custom_fwd(fwd, cast_inputs=cast_inputs)
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint save hook for save
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint load hook for load_if_possible
DEBUG:speechbrain.utils.parameter_transfer:Collecting files (or symlinks) for pretraining in /root/.cache/torch/pyann

embedding_model.ckpt:   0%|          | 0.00/83.3M [00:00<?, ?B/s]

DEBUG:speechbrain.utils.fetching:Fetch: Local file found, creating symlink '/root/.cache/huggingface/hub/models--speechbrain--spkrec-ecapa-voxceleb/snapshots/0f99f2d0ebe89ac095bcc5903c4dd8f72b367286/embedding_model.ckpt' -> '/root/.cache/torch/pyannote/speechbrain/embedding_model.ckpt'
DEBUG:speechbrain.utils.parameter_transfer:Set local path in self.paths["embedding_model"] = /root/.cache/torch/pyannote/speechbrain/embedding_model.ckpt
INFO:speechbrain.utils.fetching:Fetch mean_var_norm_emb.ckpt: Fetching from HuggingFace Hub 'speechbrain/spkrec-ecapa-voxceleb' if not cached


mean_var_norm_emb.ckpt:   0%|          | 0.00/1.92k [00:00<?, ?B/s]

DEBUG:speechbrain.utils.fetching:Fetch: Local file found, creating symlink '/root/.cache/huggingface/hub/models--speechbrain--spkrec-ecapa-voxceleb/snapshots/0f99f2d0ebe89ac095bcc5903c4dd8f72b367286/mean_var_norm_emb.ckpt' -> '/root/.cache/torch/pyannote/speechbrain/mean_var_norm_emb.ckpt'
DEBUG:speechbrain.utils.parameter_transfer:Set local path in self.paths["mean_var_norm_emb"] = /root/.cache/torch/pyannote/speechbrain/mean_var_norm_emb.ckpt
INFO:speechbrain.utils.fetching:Fetch classifier.ckpt: Fetching from HuggingFace Hub 'speechbrain/spkrec-ecapa-voxceleb' if not cached


classifier.ckpt:   0%|          | 0.00/5.53M [00:00<?, ?B/s]

DEBUG:speechbrain.utils.fetching:Fetch: Local file found, creating symlink '/root/.cache/huggingface/hub/models--speechbrain--spkrec-ecapa-voxceleb/snapshots/0f99f2d0ebe89ac095bcc5903c4dd8f72b367286/classifier.ckpt' -> '/root/.cache/torch/pyannote/speechbrain/classifier.ckpt'
DEBUG:speechbrain.utils.parameter_transfer:Set local path in self.paths["classifier"] = /root/.cache/torch/pyannote/speechbrain/classifier.ckpt
INFO:speechbrain.utils.fetching:Fetch label_encoder.txt: Fetching from HuggingFace Hub 'speechbrain/spkrec-ecapa-voxceleb' if not cached


label_encoder.txt: 0.00B [00:00, ?B/s]

DEBUG:speechbrain.utils.fetching:Fetch: Local file found, creating symlink '/root/.cache/huggingface/hub/models--speechbrain--spkrec-ecapa-voxceleb/snapshots/0f99f2d0ebe89ac095bcc5903c4dd8f72b367286/label_encoder.txt' -> '/root/.cache/torch/pyannote/speechbrain/label_encoder.ckpt'
DEBUG:speechbrain.utils.parameter_transfer:Set local path in self.paths["label_encoder"] = /root/.cache/torch/pyannote/speechbrain/label_encoder.ckpt
INFO:speechbrain.utils.parameter_transfer:Loading pretrained files for: embedding_model, mean_var_norm_emb, classifier, label_encoder
DEBUG:speechbrain.utils.parameter_transfer:Redirecting (loading from local path): embedding_model -> /root/.cache/torch/pyannote/speechbrain/embedding_model.ckpt
DEBUG:speechbrain.utils.parameter_transfer:Redirecting (loading from local path): mean_var_norm_emb -> /root/.cache/torch/pyannote/speechbrain/mean_var_norm_emb.ckpt
DEBUG:speechbrain.utils.parameter_transfer:Redirecting (loading from local path): classifier -> /root/.cac

Running STT...


100%|█████████████████████████████████████| 1.42G/1.42G [01:21<00:00, 18.8MiB/s]


Building JSON with emotions and timestamps...


config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/329M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/294 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/329M [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Device set to use cuda:0



✅ Final JSON saved to final_output.json
