# 🚀 CapTTS Demo



## 🔥 Step 1: Install packages

In [None]:
# Python version: 3.10
# Uncomment the next line to install CapSpeech into this kernel's environment:
# %pip install git+https://github.com/WangHelin1997/CapSpeech.git

## 🔥 Step 2: Load packages

In [5]:
import torch
from capspeech.ar.parler_tts import ParlerTTSForConditionalGeneration
from transformers import AutoTokenizer, set_seed
import soundfile as sf
import time
import os
from huggingface_hub import snapshot_download
import argparse
from IPython.display import Audio, display

# Pick the compute device: the first CUDA GPU when PyTorch can see one,
# otherwise the CPU. Later cells move the model and input tensors to it.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"
print(f"Device: {device}")

Device: cuda:0


## 🔥 Step 3: Customize your input

In [2]:
# Text to synthesize -- replace with your own transcript.
# NOTE(review): the angle-bracket tags (<dog>, <I_start>, <I_end>) are passed through
# verbatim; presumably they are control tokens for this checkpoint -- confirm against
# the CapSpeech documentation before changing them.
transcript = (
    "<dog> at this moment miss brandon entered with her brilliant cousin rachel "
    "the blonde and the dark it was a dazzling contrast <I_start> <I_end>"
)
# Free-form description of the desired speaking style -- replace with your own caption.
caption = (
    "A young woman speaks at a moderate pace, her voice carrying a hint of monotone. "
    "Remarkably, she maintains a high pitch, giving her speech an air of focused determination."
)

## 🔥 Step 4: Generate

⏳ The first run may take a while because the pretrained checkpoints have to be downloaded first.

In [6]:
# Sample rate in Hz, shared by the RTF computation and the audio player so the
# two can never disagree.
# NOTE(review): 44100 is the value this cell has always hard-coded; confirm it
# matches the output rate of the checkpoint's audio codec.
SAMPLE_RATE = 44100

# Download the checkpoint repository snapshot and locate the CapTTS-SE sub-folder.
print("Downloading model from Huggingface...")
local_dir = snapshot_download(
    repo_id="OpenSound/CapSpeech-models"
)
model_path = os.path.join(local_dir, "ar_CapTTS-SE")

# Load the model weights onto the selected device, plus the matching tokenizer.
print("Loading model...")
model = ParlerTTSForConditionalGeneration.from_pretrained(model_path).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_path)

print("Start Generation...")
# perf_counter() is monotonic, so the measured interval cannot be skewed by
# system clock adjustments the way time.time() can. The timed span covers
# tokenization, generation, and the copy of the result back to host memory.
start_time = time.perf_counter()
# The style caption is the conditioning input; the transcript is the prompt.
input_ids = tokenizer(caption, return_tensors="pt").input_ids.to(device)
prompt_input_ids = tokenizer(transcript, return_tensors="pt").input_ids.to(device)
set_seed(42) # change to your favorite seed
generation = model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids, guidance_scale=1.5)
audio_arr = generation.cpu().numpy().squeeze()
end_time = time.perf_counter()

# Real-time factor = processing time / duration of the generated audio.
num_samples = audio_arr.shape[-1]
if num_samples == 0:
    # Fail with a clear message instead of a bare ZeroDivisionError below.
    raise RuntimeError("Generation produced no audio samples; cannot compute RTF.")
audio_len = num_samples / SAMPLE_RATE
rtf = (end_time-start_time)/audio_len
print(f"RTF: {rtf:.4f}")
# Last expression of the cell: renders an inline audio player.
Audio(audio_arr, rate=SAMPLE_RATE)

Downloading model from Huggingface...


Fetching 34 files:   0%|          | 0/34 [00:00<?, ?it/s]

Loading model...


Config of the text_encoder: <class 'transformers.models.t5.modeling_t5.T5EncoderModel'> is overwritten by shared text_encoder config: T5Config {
  "_name_or_path": "google/flan-t5-large",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "classifier_dropout": 0.0,
  "d_ff": 2816,
  "d_kv": 64,
  "d_model": 1024,
  "decoder_start_token_id": 0,
  "dense_act_fn": "gelu_new",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "gated-gelu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 24,
  "num_heads": 16,
  "num_layers": 24,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "tie_word_embeddings": false,
  "transformers_version": "4.46.1",
  "use_cache": true,
  "vocab_size": 32512
}

Config of the audio_encoder: <class 'capspeech.ar.parler_tts.dac_wra

Start Generation...
RTF: 1.2959
