# VibeVoice 7B + LoRA (Private) — Kaggle Notebook

Før du kjører:
- Slå på GPU (T4/P100) og gjerne High-RAM
- Slå på Internet: On (for å hente base-modellen fra Hugging Face)
- Visibility: Private
- Last opp to private Kaggle Datasets:
  - LoRA-checkpoint: innholdet fra `checkpoint-3800/`
  - VibeVoice-lib: innholdet fra `vendor/vibevoice/` (så `pyproject.toml` ligger i rot)

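Tips: Etter “Add Data” i Kaggle, se mappene under `/kaggle/input/` for nøyaktige katalognavn. Notebooken forventer som standard `/kaggle/input/vibevoice-lora-checkpoint` og `/kaggle/input/vibevoice-lib`; endre `LORA_DIR` og `VIBEVOICE_LIB_DIR` hvis katalognavnene dine er annerledes.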



In [None]:
# Upgrade pip itself, then install the Python packages this notebook needs.
pip -q install -U pip
pip -q install \
  "transformers>=4.51.3" \
  accelerate \
  "peft>=0.11.0" \
  sentencepiece \
  librosa \
  soundfile \
  torchaudio \
  speechbrain \
  numpy \
  scipy \
  tqdm \
  pyyaml

# Install VibeVoice from your private dataset (adjust the directory name if
# yours differs). Example: /kaggle/input/vibevoice-lib
vibevoice_lib="/kaggle/input/vibevoice-lib"
if [ ! -d "$vibevoice_lib" ]; then
  echo "⚠️ Sett riktig katalognavn for VibeVoice-lib under /kaggle/input/"
else
  pip -q install "$vibevoice_lib"
fi


In [None]:
import os, glob, json, textwrap
from pprint import pprint

# List everything mounted under /kaggle/input so the exact dataset directory
# names are visible before the paths below are configured.
print("/kaggle/input contents:")
for p in sorted(glob.glob('/kaggle/input/*')):
    print("-", p)

# Dataset paths (CHANGE the names if your Kaggle directories differ).
LORA_DIR = "/kaggle/input/vibevoice-lora-checkpoint"  # <- change if the directory name differs
VIBEVOICE_LIB_DIR = "/kaggle/input/vibevoice-lib"      # <- change if the directory name differs

# Validate with explicit exceptions instead of `assert`: assert statements are
# removed when Python runs with -O, which would silently skip these checks.
if not os.path.isdir(VIBEVOICE_LIB_DIR):
    raise FileNotFoundError("VibeVoice-lib dataset ikke funnet. Endre VIBEVOICE_LIB_DIR.")
if not os.path.isdir(LORA_DIR):
    raise FileNotFoundError("LoRA-checkpoint dataset ikke funnet. Endre LORA_DIR.")

print("OK: Dataset-stier ser gyldige ut.")


In [None]:
import torch
from peft import PeftModel
from vibevoice.modular.modeling_vibevoice_inference import VibeVoiceForConditionalGenerationInference

BASE_MODEL_ID = "Jmica/VibeVoice7B"  # fetched from Hugging Face (needs Internet: On)

device_map = "auto"  # shard across GPU/CPU when needed (copes with a 16GB GPU)
# Half precision when a GPU is present, full precision on CPU.
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

print("Laster basemodell …")
model = VibeVoiceForConditionalGenerationInference.from_pretrained(
    BASE_MODEL_ID,
    torch_dtype=torch_dtype,
    device_map=device_map,
    low_cpu_mem_usage=True,
)
print("Påfører LoRA-adapter …")
model = PeftModel.from_pretrained(model, LORA_DIR)
try:
    # Fold the adapter weights into the base model so later cells work with a
    # plain model object instead of the PEFT wrapper.
    model = model.merge_and_unload()
except Exception as exc:
    # Merging stays best-effort: keep the PEFT-wrapped model, but report the
    # failure so it is clear which variant the following cells are using.
    print(f"⚠️ merge_and_unload() feilet ({exc!r}); fortsetter med umerget LoRA-adapter.")
model.eval()
print("Dry-run OK: base + LoRA lastet.")


In [None]:
# Valgfritt: kort generering (kan feile pga minne). Kjør bare hvis dry-run var OK.
import numpy as np, soundfile as sf
from vibevoice.processor.vibevoice_processor import VibeVoiceProcessor

TARGET_SR = 24000
VISION_START = "<|vision_start|>"
VISION_PAD = "<|vision_pad|>"
VISION_END = "<|vision_end|>"


def build_prompt(text: str, seconds: float) -> str:
    """Build the text prompt with a placeholder span for the speech output.

    The number of ``VISION_PAD`` placeholders is the requested duration in
    samples (``seconds * TARGET_SR`` truncated to an int) divided by 3200 and
    rounded up.

    Args:
        text: The sentence to be spoken; inserted after ``Speaker:``.
        seconds: Requested duration in seconds. Durations that yield a
            non-positive placeholder count produce no placeholders.

    Returns:
        The prompt string, ending with the ``VISION_START`` / ``VISION_END``
        delimited placeholder run and a trailing newline.
    """
    sample_count = int(seconds * TARGET_SR)
    pad_count = int(np.ceil(sample_count / 3200))
    control = "[voice: neutral, non-identifiable, Norwegian bokmål]"
    # Every placeholder carries its own trailing space; repeating a string a
    # non-positive number of times gives "", so no extra guard is needed.
    pad_run = f"{VISION_PAD} " * pad_count
    header = f"Text input:\nSpeaker: {text} {control}\nSpeech output:\n"
    return f"{header}{VISION_START} {pad_run}{VISION_END}\n"

seconds = 1.5
text = "Hei, dette er en veldig kort test."

processor = VibeVoiceProcessor.from_pretrained(BASE_MODEL_ID)
tok = processor.tokenizer
# Register the placeholder markers used by build_prompt as special tokens.
tok.add_special_tokens({"additional_special_tokens":[VISION_START, VISION_PAD, VISION_END]})
try:
    model.resize_token_embeddings(len(tok))
except Exception as exc:
    # Resizing stays best-effort, but report the failure instead of hiding it:
    # a vocabulary/embedding mismatch is a likely cause of later errors.
    print(f"⚠️ resize_token_embeddings() feilet ({exc!r}); fortsetter uten å endre embedding-størrelsen.")

prompt = build_prompt(text, seconds)
inputs = tok([prompt], return_tensors="pt")

# All-False masks plus a short all-zero waveform (at most 0.1 s of samples).
# NOTE(review): presumably this tells the model no reference voice is
# supplied — confirm against the VibeVoice generate() contract.
speech_input_mask = torch.zeros((1, inputs["input_ids"].shape[1]), dtype=torch.bool)
speech_masks = torch.zeros((1, 1), dtype=torch.bool)
speech_tensors = torch.zeros((1, max(1, int(TARGET_SR*min(seconds,0.1)))), dtype=torch.float32)

with torch.no_grad():
    # NOTE(review): only one new token is requested, presumably to keep this
    # smoke test cheap — confirm that this is enough to yield any audio.
    out = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs.get("attention_mask"),
        tokenizer=tok,
        max_new_tokens=1,
        show_progress_bar=False,
        return_speech=True,
        speech_tensors=speech_tensors,
        speech_masks=speech_masks,
        speech_input_mask=speech_input_mask,
    )

audio = None
speech_outputs = getattr(out, "speech_outputs", None)
if speech_outputs:
    audio = speech_outputs[0]
if audio is None:
    # Fall back to silence so a file is still written, but make it visible
    # that the model produced no speech.
    print("⚠️ Ingen tale returnert fra modellen; skriver stillhet i stedet.")
    audio = np.zeros(int(seconds*TARGET_SR), dtype=np.float32)

# NumPy cannot read a tensor that lives on the GPU, so bring tensors to CPU
# (as float32, detached from autograd) before converting.
if torch.is_tensor(audio):
    audio = audio.detach().to(device="cpu", dtype=torch.float32).numpy()

audio = np.squeeze(np.asarray(audio, dtype=np.float32))
sf.write("/kaggle/working/out.wav", audio, TARGET_SR)
print("Lagret /kaggle/working/out.wav")
