<a href="https://colab.research.google.com/github/RKapadia01/orpheus-inference/blob/main/Trelis_Orpheus_Inference.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Trelis Orpheus Inference
Built by [Trelis Research](https://trelis.com) on top of an original notebook by [Canopy Labs](https://colab.research.google.com/drive/1KhXT56UePPUHhqitJNUxq63k-pQomz3N?usp=sharing).

<div style="border: 2px solid #ff9800; padding: 10px; border-radius: 8px; background-color: #fff3e0;">
  <strong> Want More AI Resources & Tutorials from Trelis?</strong>  
  👉 <a href="https://trelis.substack.com" style="font-size: 18px; font-weight: bold;">Subscribe HERE</a>
</div>

## Non-streaming

In [None]:
# Hugging Face repo id of the checkpoint that the cells below download and load.
model_name = "canopylabs/orpheus-3b-0.1-ft"
# model_name = "Trelis/orpheus-tts-0.1-pretrained-ft" # trelis fine-tune

device = "cuda" # "cuda" for an Nvidia GPU; swap to "cpu" otherwise

In [None]:
#@title Installation & Setup
!pip install snac ipywebrtc hf_transfer transformers soundfile librosa ipywidgets huggingface_hub -qU
import os
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"

In [None]:
# If you are not logged in yet, either log in here OR pass a token below
# from huggingface_hub import login

# login()

## If the above doesn't work
from getpass import getpass

# get a token from here https://huggingface.co/settings/tokens
# getpass hides the typed characters, so the token is not echoed into the cell output
token = getpass("Enter your token: ")

Enter your token: ··········


In [None]:
import os
# Expose the token entered in the previous cell through the HF_TOKEN environment variable.
os.environ["HF_TOKEN"] = token

In [None]:
from snac import SNAC
import torch
import torch
from transformers import AutoModelForCausalLM, Trainer, TrainingArguments, AutoTokenizer
import numpy as np
import soundfile as sf
import IPython.display as ipd
import librosa
from ipywebrtc import AudioRecorder, Audio
from IPython.display import display
import ipywidgets as widgets

# Load the pretrained SNAC model (used further down to decode audio codes)
# and move it to the configured device in one step.
snac_model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz").to(device)

In [None]:
# NOTE(review): `tokeniser_name` is not referenced anywhere in this cell; the
# tokenizer at the bottom is loaded from `model_name` instead.
tokeniser_name = "meta-llama/Llama-3.2-3B-Instruct"
from huggingface_hub import snapshot_download

# Download only model config and safetensors
# NOTE(review): the returned `model_path` is not used by the loading calls
# below, which pass the repo id `model_name` again. Presumably this call only
# pre-fetches the weight files - TODO confirm before removing it.
model_path = snapshot_download(
    repo_id=model_name,
    allow_patterns=[
        "config.json",
        "*.safetensors",
        "model.safetensors.index.json",
    ],
    ignore_patterns=[
        "optimizer.pt",
        "pytorch_model.bin",
        "training_args.bin",
        "scheduler.pt",
        "tokenizer.json",
        "tokenizer_config.json",
        "special_tokens_map.json",
        "vocab.json",
        "merges.txt",
        "tokenizer.*"
    ]
)

# Load the model in bfloat16 and move it to `device`; the tokenizer comes from the same repo.
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_name)

Fetching 6 files:   0%|          | 0/6 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/4.93G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/3.32G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.93G [00:00<?, ?B/s]

config.json:   0%|          | 0.00/898 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/21.0k [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/180 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/5.41M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/22.8M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

In [None]:
#### CHANGE THIS ####

# Texts to synthesise. Every active entry becomes one row of the batch built in
# the formatting cell; uncomment the extra lines to generate several at once.
prompts = [
    "Hey there my name is Tara, <chuckle> and I'm a speech generation model that can sound like a person.",
    # "I've also been taught to understand and produce paralinguistic things like sighing, or chuckling, or yawning!",
    # "I live in San Francisco, and have, uhm let's see, 3 billion 7 hundred ... well, lets just say a lot of parameters.",
]

# Voice name that the formatting cell prepends to every prompt as "<voice>: ".
chosen_voice = "tara" # see github for other voices

print("*** See our github for tips on prompting the model for cleaning, humanlike generations.")

*** See our github for tips on prompting the model for cleaning, humanlike generations.


In [None]:
#@title Format prompts into correct template

# Special token ids that frame and pad the tokenized text.
START_OF_HUMAN_ID = 128259
END_OF_TEXT_ID = 128009
END_OF_HUMAN_ID = 128260
PAD_TOKEN_ID = 128263  # fill value used to left-pad shorter prompts

if not prompts:
  raise ValueError("`prompts` is empty - add at least one prompt before formatting")

# Prefix every prompt with the voice name. Prompts that already carry the
# prefix are left untouched, so re-running this cell does not stack prefixes
# ("tara: tara: ...").
voice_prefix = f"{chosen_voice}: "
prompts = [p if p.startswith(voice_prefix) else voice_prefix + p for p in prompts]

# Tokenize each prompt separately; every entry has shape (1, n_tokens).
all_input_ids = [tokenizer(prompt, return_tensors="pt").input_ids for prompt in prompts]

start_token = torch.tensor([[START_OF_HUMAN_ID]], dtype=torch.int64) # Start of human
end_tokens = torch.tensor([[END_OF_TEXT_ID, END_OF_HUMAN_ID]], dtype=torch.int64) # End of text, End of human

# Wrap each prompt: SOH SOT Text EOT EOH
all_modified_input_ids = [
  torch.cat([start_token, ids, end_tokens], dim=1) for ids in all_input_ids
]

# Left-pad every sequence to the longest one and build the matching attention
# mask (0 over padding, 1 over real tokens).
max_length = max(ids.shape[1] for ids in all_modified_input_ids)
all_padded_tensors = []
all_attention_masks = []
for modified_input_ids in all_modified_input_ids:
  seq_len = modified_input_ids.shape[1]
  padding = max_length - seq_len
  padded_tensor = torch.cat(
      [torch.full((1, padding), PAD_TOKEN_ID, dtype=torch.int64), modified_input_ids], dim=1)
  attention_mask = torch.cat(
      [torch.zeros((1, padding), dtype=torch.int64), torch.ones((1, seq_len), dtype=torch.int64)], dim=1)
  all_padded_tensors.append(padded_tensor)
  all_attention_masks.append(attention_mask)

# Stack the per-prompt rows into one (batch, max_length) tensor each.
all_padded_tensors = torch.cat(all_padded_tensors, dim=0)
all_attention_masks = torch.cat(all_attention_masks, dim=0)

input_ids = all_padded_tensors.to(device)
attention_mask = all_attention_masks.to(device)

In [None]:
#@title Generate Output
print("*** Model.generate is slow - see vllm implementation on github for realtime streaming and inference")
print("*** Increase/decrease inference params for more expressive less stable generations")

# Token id on which generation stops. It is also passed as the pad id: that is
# the value generate() was already falling back to (with a warning), and the
# parsing cell strips every 128258 from the output, so rows that finish early
# keep being cleaned up exactly as before.
END_OF_SPEECH_ID = 128258

with torch.no_grad():
  generated_ids = model.generate(
      input_ids=input_ids,
      attention_mask=attention_mask,
      max_new_tokens=1200,
      do_sample=True,
      temperature=0.6,
      top_p=0.95,
      repetition_penalty=1.1,
      num_return_sequences=1,
      eos_token_id=END_OF_SPEECH_ID,
      pad_token_id=END_OF_SPEECH_ID,
  )

Setting `pad_token_id` to `eos_token_id`:128258 for open-end generation.


*** Model.generate is slow - see vllm implementation on github for realtime streaming and inference
*** Increase/decrease inference params for more expressive less stable generations


In [None]:
#@title Parse Output as speech
# Define special tokens used in the model's tokenization
token_to_find = 128257  # Likely a start-of-speech token
token_to_remove = 128258  # Likely an end-of-speech token
AUDIO_TOKEN_OFFSET = 128266  # base value subtracted from every remaining token
CODES_PER_FRAME = 7  # redistribute_codes consumes the codes in groups of 7

# Find all indices where the start-of-speech token appears
token_indices = (generated_ids == token_to_find).nonzero(as_tuple=True)

# Extract the last occurrence of the start-of-speech token
if len(token_indices[1]) > 0:
    # Column of the last match reported by nonzero(); the same crop point is
    # applied to every row of the batch.
    last_occurrence_idx = token_indices[1][-1].item()
    # Crop the tensor to start after this token
    cropped_tensor = generated_ids[:, last_occurrence_idx+1:]
else:
    # If no start-of-speech token is found, use the entire generated tensor
    cropped_tensor = generated_ids

# Boolean mask that is False wherever an end-of-speech token sits
mask = cropped_tensor != token_to_remove

# Drop the end-of-speech tokens row by row (rows may end up with different lengths)
processed_rows = [row[keep] for row, keep in zip(cropped_tensor, mask)]

# Prepare to convert tokens to audio codes
code_lists = []
for row in processed_rows:
    # Keep only whole groups of CODES_PER_FRAME tokens
    new_length = (row.size(0) // CODES_PER_FRAME) * CODES_PER_FRAME
    trimmed_row = row[:new_length]

    # Subtract the base value in one vectorised step and hand plain Python ints
    # downstream, instead of building a list of 0-dim tensors one element at a time.
    code_lists.append((trimmed_row - AUDIO_TOKEN_OFFSET).tolist())

# Function to redistribute audio codes into different layers
def redistribute_codes(code_list):
    """Regroup a flat sequence of audio codes into three layers and decode it.

    The codes are read in groups of 7. Within each group:
      * position 0 goes to layer 1 unchanged;
      * positions 1 and 4 go to layer 2 after subtracting 1*4096 and 4*4096;
      * positions 2, 3, 5 and 6 go to layer 3 after subtracting 2*4096,
        3*4096, 5*4096 and 6*4096 respectively.

    Args:
        code_list: indexable sequence of integer codes. Trailing codes that do
            not fill a whole group of 7 are ignored.

    Returns:
        Whatever ``snac_model.decode`` returns for the three layer tensors
        (each of shape ``(1, n)``), computed on CPU without autograd tracking.
    """
    layer_1 = []
    layer_2 = []
    layer_3 = []

    # Only complete groups are consumed. The previous bound, (len + 1) // 7,
    # produced one iteration too many for inputs with 6 leftover codes and
    # raised IndexError.
    num_frames = len(code_list) // 7
    for i in range(num_frames):
        base = 7 * i
        layer_1.append(code_list[base])

        layer_2.append(code_list[base + 1] - 4096)
        layer_2.append(code_list[base + 4] - (4 * 4096))

        layer_3.append(code_list[base + 2] - (2 * 4096))
        layer_3.append(code_list[base + 3] - (3 * 4096))
        layer_3.append(code_list[base + 5] - (5 * 4096))
        layer_3.append(code_list[base + 6] - (6 * 4096))

    # One (1, n) CPU tensor per layer
    codes = [torch.tensor(layer).unsqueeze(0).cpu()
             for layer in (layer_1, layer_2, layer_3)]

    # Decode on CPU.
    # NOTE(review): for a torch module `.to('cpu')` presumably moves the shared
    # `snac_model` itself, not a copy - confirm if it is needed on GPU afterwards.
    snac_model_cpu = snac_model.to('cpu')
    # Inference only: no gradient graph is needed for the decoded audio.
    with torch.no_grad():
        audio_hat = snac_model_cpu.decode(codes)
    return audio_hat

# Decode every code list into its audio sample, keeping the order of code_lists
my_samples = [redistribute_codes(code_list) for code_list in code_lists]

In [None]:
from IPython.display import display, Audio

# Each prompt must have exactly one decoded sample before anything is played.
if len(prompts) != len(my_samples):
  raise Exception("Number of prompts and samples do not match")

# Show each prompt followed by an audio player for its sample (24000 Hz).
for prompt_text, samples in zip(prompts, my_samples):
  print(prompt_text)
  waveform = samples.detach().squeeze().to("cpu").numpy()
  display(Audio(waveform, rate=24000))

tara: Hey there my name is Tara, <chuckle> and I'm a speech generation model that can sound like a person.


## Streaming

**WARNING:** This will only work on an Nvidia GPU.

**Also:** as of Mar 20, 2025, this is still not functioning owing to instabilities in vLLM.

In [None]:
# Fetch the Orpheus-TTS repository; the next cell changes into this directory.
!git clone https://github.com/canopyai/Orpheus-TTS.git

Cloning into 'Orpheus-TTS'...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


remote: Enumerating objects: 126, done.[K
remote: Counting objects: 100% (34/34), done.[K
remote: Compressing objects: 100% (12/12), done.[K
remote: Total 126 (delta 26), reused 22 (delta 22), pack-reused 92 (from 1)[K
Receiving objects: 100% (126/126), 3.12 MiB | 23.88 MiB/s, done.
Resolving deltas: 100% (61/61), done.


In [None]:
%cd Orpheus-TTS
!pip install orpheus-speech -q

  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


/Orpheus-TTS
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


In [None]:
# !pip install huggingface_hub -qU
# from huggingface_hub import login

# login()

In [None]:
from orpheus_tts import OrpheusModel
import wave
import time

# del model # if you need to delete the model previously loaded (you may need to restart the notebook if you face issues)
model = OrpheusModel(model_name="canopylabs/orpheus-tts-0.1-finetune-prod") # only run if you need to re-load
prompt = '''Man, the way social media has, um, completely changed how we interact is just wild, right? Like, we're all connected 24/7 but somehow people feel more alone than ever. And don't even get me started on how it's messing with kids' self-esteem and mental health and whatnot.'''

# The clock starts before the speech request is issued.
start_time = time.monotonic()
syn_tokens = model.generate_speech(prompt=prompt, voice="tara")

with wave.open("output.wav", "wb") as wf:
   # WAV header: 1 channel, 2 bytes per sample, 24000 frames per second
   wf.setnchannels(1)
   wf.setsampwidth(2)
   wf.setframerate(24000)

   # Bytes that make up one frame; fixed once the header values above are set
   bytes_per_frame = wf.getsampwidth() * wf.getnchannels()

   total_frames = 0
   chunk_counter = 0
   # Write each chunk as soon as it is yielded (output streaming)
   for chunk_counter, audio_chunk in enumerate(syn_tokens, start=1):
      total_frames += len(audio_chunk) // bytes_per_frame
      wf.writeframes(audio_chunk)

   duration = total_frames / wf.getframerate()
   end_time = time.monotonic()
   print(f"It took {end_time - start_time} seconds to generate {duration:.2f} seconds of audio")

  state_dict = torch.load(model_path, map_location="cpu")


INFO 03-20 11:43:50 [__init__.py:256] Automatically detected platform cuda.


ImportError: /usr/local/lib/python3.11/dist-packages/vllm/_C.abi3.so: undefined symbol: _ZNK3c1011StorageImpl27throw_data_ptr_access_errorEv