Pipeline A: Whisper -> LLM-Diarisierung
Kein Chunking, kein JSON
Output: [Sprecher 1]: ... 

In [38]:
### Imports + Konfiguration
import os
from pathlib import Path
import requests
from dotenv import load_dotenv

# Load variables from a local .env file (if present) into the process environment
load_dotenv()

# OpenAI-compatible vLLM endpoint and model name; both can be overridden via environment / .env
VLLM_BASE_URL = os.getenv("VLLM_BASE_URL", "http://127.0.0.1:8005/v1")
VLLM_MODEL = os.getenv("VLLM_MODEL", "openai/gpt-oss-120b")

# Repository root: set DIARIZATION_REPO_ROOT (environment or .env) instead of editing
# the notebook; the default keeps the previously hard-coded location.
REPO_ROOT = Path(
    os.getenv("DIARIZATION_REPO_ROOT", "~/jupyter/diarization-benchmark")
).expanduser().resolve()

# Directory containing the stored input audio files
AUDIO_DIR = REPO_ROOT / "data" / "input_audio"

# Test audio file (expected to be named test1.mp3; adjust if needed)
TEST_AUDIO = REPO_ROOT / "data" / "test_audio" / "test1.mp3"


In [39]:
### Audio-Normalisierung via ffmpeg: alles nach WAV (16 kHz, Mono) konvertieren -> wichtig für Pyannote
import subprocess
from pathlib import Path

def ensure_wav_16k_mono(input_path: str | Path, out_dir: str | Path) -> Path:
    input_path = Path(input_path)
    out_dir = Path(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    out_path = out_dir / f"{input_path.stem}_16k_mono.wav"

    # Reuse, wenn schon vorhanden
    if out_path.exists() and out_path.stat().st_size > 0:
        return out_path

    cmd = [
        "ffmpeg", "-y",
        "-i", str(input_path),
        "-ac", "1",        # mono
        "-ar", "16000",    # 16kHz
        "-vn",             # no video
        str(out_path),
    ]
    res = subprocess.run(cmd, capture_output=True, text=True)
    if res.returncode != 0:
        raise RuntimeError(f"ffmpeg failed:\n{res.stderr}")
    return out_path

In [40]:
### Conversion test
# Replaces TEST_AUDIO with its normalised 16 kHz mono WAV version
conversion_test = True
if conversion_test:
    CONVERTED_DIR = REPO_ROOT / "data" / "normalised_audio"
    # Re-running this cell must be a no-op: once TEST_AUDIO already points into
    # CONVERTED_DIR, converting it again would create "<name>_16k_mono_16k_mono.wav".
    if TEST_AUDIO.parent != CONVERTED_DIR:
        TEST_AUDIO = ensure_wav_16k_mono(TEST_AUDIO, CONVERTED_DIR)

    print("WAV:", TEST_AUDIO)
    print("Exists:", TEST_AUDIO.exists(), "Size:", TEST_AUDIO.stat().st_size)

WAV: /home/liegepa/jupyter/diarization-benchmark/data/normalised_audio/test1_16k_mono.wav
Exists: True Size: 34685862


In [41]:
### vLLM-client, OpenAI-kompatibel
import requests
import json

def chat_vllm(messages, model=VLLM_MODEL, temperature=0.0, max_tokens=800, timeout=600):
    """Send a chat-completion request to the OpenAI-compatible vLLM server.

    Args:
        messages: Chat messages, sent as-is in the request payload (this notebook
            passes a list of ``{"role": ..., "content": ...}`` dicts).
        model: Model name for the request; defaults to ``VLLM_MODEL``.
        temperature: Sampling temperature placed in the payload.
        max_tokens: Token limit placed in the payload.
        timeout: Timeout handed to ``requests.post``.

    Returns:
        The ``content`` of the first choice's message.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        RuntimeError: If the response has no choices or the first choice has no content.
    """
    import warnings

    # Tolerate a trailing slash in VLLM_BASE_URL so the path never contains "//"
    url = f"{VLLM_BASE_URL.rstrip('/')}/chat/completions"
    payload = {
        "model": model,
        "messages": messages,
        "temperature": temperature,
        "max_tokens": max_tokens,
    }
    r = requests.post(url, json=payload, timeout=timeout)
    r.raise_for_status()
    data = r.json()

    # An empty or missing "choices" list gets the same descriptive error as missing content
    choices = data.get("choices") or []
    if not choices:
        raise RuntimeError(f"LLM returned no choices. Full response: {json.dumps(data)[:2000]}")
    first_choice = choices[0]

    content = first_choice["message"].get("content", None)
    if content is None:
        raise RuntimeError(f"LLM returned no content. Full response: {json.dumps(data)[:2000]}")

    # The text is still returned, but a generation stopped by the token limit is incomplete;
    # make that visible instead of handing back a silently truncated answer.
    if first_choice.get("finish_reason") == "length":
        warnings.warn(
            f"LLM output was cut off at max_tokens={max_tokens}; the returned text is incomplete.",
            stacklevel=2,
        )
    return content


In [42]:
# Quick LLM smoke test: set the flag to True when needed
llm_test = True

if llm_test:
    print(chat_vllm([
        {"role": "user", "content": "Antworte nur mit Ja."},
    ]))

Ja


Whisper-Transkription der Audiodatei via faster-whisper
- lokal reproduzierbar, keine Cloud
- Segment-Zeitstempel werden für den späteren Pyannote-Merge mit erfasst (für die LLM-Diarisierung nicht benötigt)

In [43]:
### Code für Transkription
from faster_whisper import WhisperModel

# Whisper settings; each value can be overridden through the environment variable of the same name.
WHISPER_MODEL_SIZE = os.getenv("WHISPER_MODEL_SIZE", "small") # small model for tests
# WHISPER_MODEL_SIZE = os.getenv("WHISPER_MODEL_SIZE", "large-v3") # large model for production
WHISPER_DEVICE = os.getenv("WHISPER_DEVICE", "cpu")  # run on CPU for our test -> slower
# WHISPER_DEVICE = os.getenv("WHISPER_DEVICE", "cuda") # if available: run on GPU -> faster, but configuration changes are required!
WHISPER_COMPUTE_TYPE = os.getenv("WHISPER_COMPUTE_TYPE", "int8")  # int8 works well on CPU

# Module-level cache for the model instance; filled on first use by get_whisper_model()
_whisper_model = None

def get_whisper_model():
    """Return the shared WhisperModel, constructing it on the first call only."""
    global _whisper_model
    if _whisper_model is not None:
        return _whisper_model

    # First call: build the model from the module-level settings and remember it
    settings = {"device": WHISPER_DEVICE, "compute_type": WHISPER_COMPUTE_TYPE}
    _whisper_model = WhisperModel(WHISPER_MODEL_SIZE, **settings)
    return _whisper_model

def transcribe_faster_whisper(audio_path: str, language: str | None = None):
    """
    Returns:
      transcript: str (full text)
      segments: list of dicts: [{"start": float, "end": float, "text": str}, ...]
    """
    model = get_whisper_model()
    segments_iter, info = model.transcribe(
        audio_path,
        language=language,
        vad_filter=True,
        word_timestamps=False,  # später ggf. True, falls wir word-level brauchen
    )
    segments = []
    texts = []
    for seg in segments_iter:
        txt = seg.text.strip()
        segments.append({"start": float(seg.start), "end": float(seg.end), "text": txt})
        texts.append(txt)
    transcript = "\n".join(texts).strip()
    return transcript, segments, info

In [44]:
# Whisper test on the test audio. To enable it, set the flag to True.
whisper_test = False
if whisper_test:
    t, segs, info = transcribe_faster_whisper(TEST_AUDIO, language="de")
    print("Language:", info.language, "Prob:", info.language_probability)
    # Preview only: first 1000 characters of the transcript and the first three segments
    print("Transcript:\n", t[:1000])
    print("\nFirst segments:", segs[:3])

### LLM-Diarisierung Prompt
System- und User-Prompts

In [45]:
def diarize_with_llm(transcript: str):
    """Ask the LLM to add speaker labels to a plain-text transcript.

    Builds one system message and one user message (the user message ends with
    the transcript itself) and returns whatever ``chat_vllm`` returns for them,
    requested with temperature 0.0 and max_tokens 2000.
    """
    # Role description for the model
    role_text = (
        "You are a professional conversation diarization engine. "
        "Assign speaker labels logically solely based on text. "
        "Do not add, remove, or summarize content. "
        "Make sure before assignment that the context fits (a doctor will not say things a patient would say for example). "
        "Output only the diarized transcript."
    )

    # Task instructions; the transcript follows after a blank line
    task_text = (
        "Add speaker labels to the following transcript using logic to determine which sentence belongs to which speaker. "
        "For assigning sentences: Make out the role of the speaker (for example doctor). Keep this in mind when assigning speaker names. "
        "Use '[Sprecher 1]', '[Sprecher 2]', ... for the different speakers. "
        "Check thoroughly for every sentence if the assignment to the speaker is contextually and logically plausible. "
        "For example, in a doctor-patient-dialogue, do not assign medical questions to the Speaker who was identified as a patient before. "
        "Produce a readable dialogue format between Speakers.\n\n"
    )

    conversation = [
        {"role": "system", "content": role_text},
        {"role": "user", "content": f"{task_text}{transcript}"},
    ]
    return chat_vllm(conversation, temperature=0.0, max_tokens=2000)

In [None]:
# Combined test: Whisper transcription -> LLM diarization on the test audio; set the flag to True to run it
whisper_llm_test = True
if whisper_llm_test:
    transcript, segs, info = transcribe_faster_whisper(TEST_AUDIO, language="de")
    # Size of the full transcript (characters / whitespace-separated words)
    print("Chars:", len(transcript))
    print("Words:", len(transcript.split()))
    # Only the first 3000 characters of the transcript are sent to the LLM
    diarized = diarize_with_llm(transcript[:3000])
    print(diarized)

KeyboardInterrupt: 