### **Step 1 – Load Required Libraries**
Import all Python packages needed for AI training, audio processing, and dataset handling.

In [1]:
%%capture
# Dependency installation cell; %%capture (which must stay on the first line) hides the installer output.
import os
# The runtime is treated as Colab when any environment variable name contains "COLAB_".
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    # --no-deps installs exactly the listed packages without pulling in their own dependencies.
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf "datasets>=3.4.1,<4.0.0" "huggingface_hub>=0.34.0" hf_transfer
    !pip install --no-deps unsloth
# SNAC is installed in both environments; it provides the audio codec imported later as `from snac import SNAC`.
!pip install snac

### **Step 2 – Load Pretrained AI Model**
Use FastLanguageModel.from_pretrained() to load an existing AI model and tokenizer into memory.

In [2]:
from unsloth import FastLanguageModel  # Get the special tool for loading the AI model
import torch  # A library that helps the AI run faster

# Options for loading the base checkpoint together with its tokenizer
# (tokenizer = tool that breaks text into pieces the AI understands)
load_options = dict(
    model_name="unsloth/orpheus-3b-0.1-ft",  # The checkpoint we want to use
    max_seq_length=2048,                     # Maximum sequence length handed to the loader
    dtype=None,                              # None lets the loader pick the dtype for this hardware
    load_in_4bit=False,                      # 4-bit loading (smaller memory footprint) is turned off here
    # token="hf_...",                        # Only needed if the checkpoint requires an access token
)

model, tokenizer = FastLanguageModel.from_pretrained(**load_options)

ðŸ¦¥ Unsloth: Will patch your computer to enable 2x faster free finetuning.
ðŸ¦¥ Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.8.4: Fast Llama patching. Transformers: 4.55.0.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

### Step 3 – Prepare Model for LoRA Training
Apply FastLanguageModel.get_peft_model() to make the model ready for LoRA (a faster, memory-efficient fine-tuning method).

In [3]:
# Layers that will be fine-tuned through LoRA adapters
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]

# Wrap the already-loaded model for LoRA training
# (LoRA = a way to train big AI models faster and with less memory)
model = FastLanguageModel.get_peft_model(
    model,
    target_modules=lora_target_modules,

    # Adapter size and strength
    r=64,            # Rank: bigger = more capacity to adapt, but more memory. Common values: 8, 16, 32, 64, 128
    lora_alpha=64,   # Controls how strongly the LoRA changes affect the model
    lora_dropout=0,  # 0 = no dropout, usually fastest
    bias="none",     # "none" = no extra bias parameters, most efficient

    # Unsloth's checkpointing mode: uses ~30% less GPU memory and allows bigger training batches
    use_gradient_checkpointing="unsloth",

    random_state=3407,  # Fixed seed so repeated runs behave the same

    # Optional LoRA variants, both switched off here
    use_rslora=False,   # Rank-stabilized LoRA
    loftq_config=None,  # LoftQ
)

Unsloth 2025.8.4 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.


### Step 4 – Load Dataset
Use load_dataset() to fetch the dataset (e.g., "MrDragonFox/Elise") from Hugging Face, selecting the "train" split.

In [4]:
from datasets import load_dataset  # Tool to get ready-made datasets for AI training

# Fetch the "train" split of the dataset from the Hugging Face Hub
dataset = load_dataset(path="MrDragonFox/Elise", split="train")


### Step 5 – Inspect Dataset
Check how many items are in the dataset and what columns/labels it contains.

In [5]:

# Gather the basic facts first, then report them
row_count = len(dataset)              # how many items the dataset holds
column_names = dataset.column_names   # which columns/labels each item has

print("Number of rows in dataset:", row_count)
print("Columns in dataset:", column_names)

Number of rows in dataset: 1195
Columns in dataset: ['audio', 'text']


### Step 6 – Load Audio Tokenizer (SNAC)
Load the SNAC audio model (hubertsiuzdak/snac_24khz) for converting raw audio into discrete tokens.



In [6]:
# Tokenization Function - turns audio + text into numbers the AI can understand

import locale
import torchaudio.transforms as T  # Audio tools (e.g., resampling audio)
import os
import torch
from snac import SNAC  # Special audio-to-token encoder

# Fix encoding problems on some systems by making every caller see UTF-8 as the preferred encoding.
# The replacement keeps the standard-library signature (optional `do_setlocale` argument), so calls
# such as locale.getpreferredencoding(False) return "UTF-8" instead of raising TypeError.
locale.getpreferredencoding = lambda do_setlocale=True: "UTF-8"

# The first dataset row supplies the sampling rate that is later used when resampling every clip
first_audio = dataset[0]["audio"]
ds_sample_rate = first_audio["sampling_rate"]

# Load the pre-trained SNAC model (turns audio into tokens) and place it on the GPU right away
snac_model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz").to("cuda")

# Function to turn an audio waveform into tokens
def tokenise_audio(waveform):
    """Encode one waveform into the flat list of audio token IDs used for training.

    The waveform is resampled from `ds_sample_rate` to 24 kHz, encoded with `snac_model`,
    and the three returned code layers are interleaved into groups of 7 tokens per frame.
    Each of the 7 slots gets its own ID range: 128266 + slot * 4096 + code.

    Args:
        waveform: NumPy array of audio samples (assumed 1-D mono — TODO confirm against the dataset).

    Returns:
        A list of Python ints whose length is 7 times the number of entries in the first code layer.
    """
    audio_token_offset = 128266  # first token ID used for audio codes
    codes_per_slot = 4096        # width of the ID range reserved for each of the 7 slots

    # NumPy array -> float32 tensor with a leading "batch" dimension
    audio = torch.from_numpy(waveform).unsqueeze(0).to(dtype=torch.float32)

    # Resample to 24 kHz (what the SNAC model expects)
    resample_transform = T.Resample(orig_freq=ds_sample_rate, new_freq=24000)
    audio = resample_transform(audio)

    # Add one more leading dimension and move to the GPU, where snac_model lives
    audio = audio.unsqueeze(0).to("cuda")

    # No gradients are needed for encoding
    with torch.inference_mode():
        codes = snac_model.encode(audio)

    # Copy each layer to the host once; reading elements one by one with .item()
    # would force a separate GPU -> CPU transfer for every single token.
    coarse, medium, fine = (layer[0].tolist() for layer in codes[:3])

    all_codes = []
    for i in range(len(coarse)):
        # Frame layout: 1 coarse code, 2 medium codes and 4 fine codes in this fixed order
        frame = (
            coarse[i],
            medium[2 * i],
            fine[4 * i],
            fine[4 * i + 1],
            medium[2 * i + 1],
            fine[4 * i + 2],
            fine[4 * i + 3],
        )
        all_codes.extend(
            code + audio_token_offset + slot * codes_per_slot
            for slot, code in enumerate(frame)
        )

    return all_codes

# Function to add audio codes to each example in the dataset
def add_codes(example):
    """Attach a "codes_list" entry (audio tokens, or None) to a dataset row.

    The row's "audio" entry is tokenised with `tokenise_audio` when it is present and
    contains an "array" key. Any exception raised along the way is printed and the row
    ends up with "codes_list" set to None. The same (mutated) row object is returned.
    """
    tokens = None  # stays None when there is no usable audio or tokenisation fails
    try:
        audio_entry = example.get("audio")
        if audio_entry:
            if "array" in audio_entry:
                tokens = tokenise_audio(audio_entry["array"])  # Turn audio into tokens
    except Exception as e:
        # Best effort: report the problem and carry on with the remaining rows
        print(f"Skipping row due to error: {e}")

    example["codes_list"] = tokens
    return example

# Apply the tokenisation to every row in the dataset.
# add_codes stores the result under "codes_list" (None when a row could not be tokenised);
# the raw "audio" column is dropped afterwards because only the tokens are used from here on.
dataset = dataset.map(add_codes, remove_columns=["audio"])

# Token ID constants for different parts of the input.
# The marker tokens are consecutive IDs counted from tokeniser_length (+1 ... +7).
tokeniser_length = 128256
start_of_text, end_of_text = 128000, 128009
(
    start_of_speech,
    end_of_speech,
    start_of_human,
    end_of_human,
    start_of_ai,
    end_of_ai,
    pad_token,
) = (tokeniser_length + step for step in range(1, 8))
audio_tokens_start = tokeniser_length + 10

# Keep only rows with valid audio codes: first drop rows where tokenisation
# produced None, then drop rows whose token list is empty
dataset = (
    dataset
    .filter(lambda row: row["codes_list"] is not None)
    .filter(lambda row: len(row["codes_list"]) > 0)
)

# Function to remove duplicate frames in audio tokens
def remove_duplicate_frames(example):
    """Drop 7-token frames whose first token repeats the first token of the last kept frame.

    Reads example["codes_list"], writes the reduced list back under the same key and
    returns the same example object.

    Raises:
        ValueError: if the list length is not a multiple of 7.
    """
    codes = example["codes_list"]

    # Audio tokens come in groups of 7, so anything else is malformed input
    if len(codes) % 7:
        raise ValueError("Input list length must be divisible by 7")

    kept = codes[:7]  # the first frame is always kept

    for start in range(7, len(codes), 7):
        frame = codes[start:start + 7]
        # kept[-7] is the first token of the most recently kept frame
        if frame[0] != kept[-7]:
            kept.extend(frame)

    example["codes_list"] = kept
    return example

# Apply duplicate removal to every row (rewrites the "codes_list" column in place)
dataset = dataset.map(remove_duplicate_frames)

# Info for the user: explains how the text prompt is formatted by create_input_ids below
# (the speaker name is prefixed only when a row has a "source" field)
tok_info = '''*** HERE you can modify the text prompt
If you are training a multi-speaker model (e.g., canopylabs/orpheus-3b-0.1-ft),
ensure that the dataset includes a "source" field and format the input accordingly:
- Single-speaker: f"{example['text']}"
- Multi-speaker: f"{example['source']}: {example['text']}"
'''
print(tok_info)

# Function to create the final input IDs for training
def create_input_ids(example):
    """Build the training sequence for one row and store it on the row.

    Layout of the sequence:
        start_of_human, text tokens (ending with end_of_text), end_of_human,
        start_of_ai, start_of_speech, audio codes, end_of_speech, end_of_ai

    Adds "text_tokens", "input_ids", "labels" (same sequence as "input_ids") and
    "attention_mask" (all ones) to the row, then returns the row.
    """
    # Prefix the speaker name when the row carries one
    if "source" in example:
        prompt = f"{example['source']}: {example['text']}"
    else:
        prompt = example["text"]

    # Text -> token IDs, closed with the end-of-text marker
    text_tokens = tokenizer.encode(prompt, add_special_tokens=True)
    text_tokens.append(end_of_text)
    example["text_tokens"] = text_tokens

    # Human turn (text) followed by the AI turn (speech codes)
    sequence = [
        start_of_human,
        *text_tokens,
        end_of_human,
        start_of_ai,
        start_of_speech,
        *example["codes_list"],
        end_of_speech,
        end_of_ai,
    ]

    # The model learns to predict the sequence itself, so labels mirror the inputs
    example["input_ids"] = sequence
    example["labels"] = sequence

    # Every position is a real token (no padding is added here)
    example["attention_mask"] = [1] * len(sequence)

    return example

# The only columns the trainer should receive
columns_to_keep = ["input_ids", "labels", "attention_mask"]

# Build the training sequences; the raw text and code columns are no longer needed afterwards
dataset = dataset.map(create_input_ids, remove_columns=["text", "codes_list"])

# Drop whatever else is still attached to the rows
columns_to_remove = list(
    filter(lambda name: name not in columns_to_keep, dataset.column_names)
)
dataset = dataset.remove_columns(columns_to_remove)



Map:   0%|          | 0/1195 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1195 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1195 [00:00<?, ? examples/s]

Map:   0%|          | 0/1195 [00:00<?, ? examples/s]

*** HERE you can modify the text prompt
If you are training a multi-speaker model (e.g., canopylabs/orpheus-3b-0.1-ft),
ensure that the dataset includes a "source" field and format the input accordingly:
- Single-speaker: f"{example['text']}"
- Multi-speaker: f"{example['source']}: {example['text']}"



Map:   0%|          | 0/1195 [00:00<?, ? examples/s]

<a name="Train"></a>
### 7: Train the model
Now let's use Hugging Face's `Trainer`! More docs here: [Transformers docs](https://huggingface.co/docs/transformers/main_classes/trainer). We run only 60 steps to keep this demo fast; for a full run, set `num_train_epochs=1` and remove the `max_steps=60` setting.

**Note:** Using `per_device_train_batch_size` > 1 may lead to errors in a multi-GPU setup. To avoid issues, ensure `CUDA_VISIBLE_DEVICES` is set to a single GPU (e.g., `CUDA_VISIBLE_DEVICES=0`).

In [7]:
from transformers import TrainingArguments, Trainer, DataCollatorForSeq2Seq

# Training settings for this short demonstration run
training_args = TrainingArguments(
    # Batching
    per_device_train_batch_size=1,  # Examples per device (GPU/CPU) at once
    gradient_accumulation_steps=4,  # Combine 4 mini-batches before each model update (saves memory)

    # Schedule
    warmup_steps=5,                 # Start slow for 5 steps before the full learning rate
    # num_train_epochs=1,           # (Optional) Train for 1 full pass over the dataset
    max_steps=60,                   # Stop after 60 total training steps
    learning_rate=2e-4,             # How fast the model learns (0.0002 here)
    lr_scheduler_type="linear",     # Slowly lower the learning rate over time

    # Optimizer
    optim="adamw_8bit",             # Memory-efficient Adam optimizer
    weight_decay=0.01,              # Small penalty to prevent overfitting

    # Bookkeeping
    logging_steps=1,                # Print progress every step
    seed=3407,                      # Random seed for reproducibility
    output_dir="outputs",           # Folder the trainer writes its output to
    report_to="none",               # No external logging service (e.g., WandB)
)

# The Trainer object drives the training process for our model and dataset
trainer = Trainer(
    model=model,
    train_dataset=dataset,
    args=training_args,
)

In [8]:
# @title Show current memory stats

bytes_per_gb = 1024 ** 3  # same scale as dividing by 1024 three times

# Properties (name, total memory, ...) of the first GPU (index 0)
gpu_stats = torch.cuda.get_device_properties(0)

# Peak amount of GPU memory reserved so far, in GB; used later as the pre-training baseline
start_gpu_memory = round(torch.cuda.max_memory_reserved() / bytes_per_gb, 3)

# Total memory of the GPU, in GB
max_memory = round(gpu_stats.total_memory / bytes_per_gb, 3)

# Report the GPU name with its capacity, then the baseline reservation
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")


GPU = Tesla T4. Max memory = 14.741 GB.
6.818 GB of memory reserved.


### Train the Model
Run trainer.train() to start training and store results in trainer_stats.



In [9]:
# Start the model training and keep the returned result object; its `metrics` dict
# (e.g. 'train_runtime') is read by the final memory/time stats cell.
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 1,195 | Num Epochs = 1 | Total steps = 60
O^O/ \_/ \    Batch size per device = 1 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (1 x 4 x 1) = 4
 "-____-"     Trainable parameters = 97,255,424 of 3,398,122,496 (2.86% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
1,5.1101
2,4.8689
3,4.9886
4,4.8521
5,5.0039
6,4.8409
7,4.8162
8,4.953
9,5.0103
10,4.6788



Invalid credentials in Authorization header - silently ignoring the lookup for the file config.json in unsloth/orpheus-3b-0.1-ft.


In [10]:
# @title Show final memory and time stats
def _share_of_max_memory(amount_gb):
    """Express an amount of GB as a percentage of the GPU's total memory (3 decimals)."""
    return round(amount_gb / max_memory * 100, 3)


# Peak reserved memory after training, and the part added since the pre-training baseline
peak_reserved_bytes = torch.cuda.max_memory_reserved()
used_memory = round(peak_reserved_bytes / 1024 ** 3, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)

used_percentage = _share_of_max_memory(used_memory)
lora_percentage = _share_of_max_memory(used_memory_for_lora)

# Training time, in seconds and in minutes
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.")

# Memory figures, absolute and relative to the GPU's capacity
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

211.3084 seconds used for training.
3.52 minutes used for training.
Peak reserved memory = 7.684 GB.
Peak reserved memory for training = 0.866 GB.
Peak reserved memory % of max memory = 52.127 %.
Peak reserved memory for training % of max memory = 5.875 %.


###  Define Text Prompts and Voice Settings for Speech Generation

In [18]:
# Texts to turn into speech; the inference cell produces one audio clip per entry.
# Each string is tokenised exactly as written (including tags such as <giggles>).
prompts = [
    "Hey there, my name is Suresh Beekhani how are MY DEAR FORENDS <giggles> and I'm a speech generation model that can sound like a person from Pakistan.",
]
# Define chosen_voice (can be None or a string).
# When set, the inference cell prefixes every prompt with "<chosen_voice>: ".
chosen_voice = None  # or e.g. "pakistani_male_voice"



In [19]:
# @title Run Inference - Generate speech audio from text prompts using the model

# Enable faster inference mode for the model (native 2x speedup)
FastLanguageModel.for_inference(model)

# Move the SNAC audio decoder model from GPU to CPU to free GPU memory
snac_model.to("cpu")

# Special token IDs used to frame the prompt and to post-process the generated sequence
START_OF_HUMAN_ID = 128259
END_OF_TEXT_ID = 128009
END_OF_HUMAN_ID = 128260
PAD_ID = 128263
START_OF_SPEECH_ID = 128257  # audio codes follow this token
END_OF_SPEECH_ID = 128258    # generation stops at this token
AUDIO_CODE_OFFSET = 128266   # subtracted to turn token IDs back into audio codes

# Prepare prompts, adding the chosen voice prefix if specified (for multi-speaker models)
prompts_ = [f"{chosen_voice}: {p}" if chosen_voice else p for p in prompts]

# Frame every tokenised prompt as: SOH <prompt tokens> EOT EOH
prompt_start = torch.tensor([[START_OF_HUMAN_ID]], dtype=torch.int64)
prompt_end = torch.tensor([[END_OF_TEXT_ID, END_OF_HUMAN_ID]], dtype=torch.int64)
all_modified_input_ids = [
    torch.cat(
        [prompt_start, tokenizer(prompt, return_tensors="pt").input_ids, prompt_end],
        dim=1,
    )
    for prompt in prompts_
]

# Left-pad every sequence to the longest one so they can be stacked into a batch;
# the attention mask marks real tokens with 1 and padding with 0
max_length = max(ids.shape[1] for ids in all_modified_input_ids)

padded_rows = []
mask_rows = []
for ids in all_modified_input_ids:
    pad_len = max_length - ids.shape[1]
    padded_rows.append(
        torch.cat([torch.full((1, pad_len), PAD_ID, dtype=torch.int64), ids], dim=1)
    )
    mask_rows.append(
        torch.cat(
            [
                torch.zeros((1, pad_len), dtype=torch.int64),
                torch.ones((1, ids.shape[1]), dtype=torch.int64),
            ],
            dim=1,
        )
    )

all_padded_tensors = torch.cat(padded_rows, dim=0)
all_attention_masks = torch.cat(mask_rows, dim=0)

# Move input tensors and masks to GPU for generation
input_ids = all_padded_tensors.to("cuda")
attention_mask = all_attention_masks.to("cuda")

# Generate new token sequences from the model using sampling settings
generated_ids = model.generate(
    input_ids=input_ids,
    attention_mask=attention_mask,
    max_new_tokens=1200,            # Generate up to 1200 new tokens
    do_sample=True,                 # Use sampling for variety
    temperature=0.6,                # Control randomness (lower = less random)
    top_p=0.95,                     # Nucleus sampling threshold
    repetition_penalty=1.1,         # Avoid repeating same tokens too much
    num_return_sequences=1,         # Generate 1 output per prompt
    eos_token_id=END_OF_SPEECH_ID,  # Token that marks end of generation
    use_cache=True,                 # Cache to speed up generation
)

# Keep only what follows the last start-of-speech token. The column index of the last
# match is applied to every row of the batch.
start_columns = (generated_ids == START_OF_SPEECH_ID).nonzero(as_tuple=True)[1]
if len(start_columns) > 0:
    cropped_tensor = generated_ids[:, start_columns[-1].item() + 1:]
else:
    cropped_tensor = generated_ids

code_lists = []

# Per row: drop end-of-speech tokens (other tokens are left untouched), trim to whole
# 7-token frames, and subtract the offset. The codes are stored as plain Python ints.
for row in cropped_tensor:
    audio_tokens = row[row != END_OF_SPEECH_ID]
    usable_length = (audio_tokens.size(0) // 7) * 7  # Ensure length divisible by 7
    code_lists.append((audio_tokens[:usable_length] - AUDIO_CODE_OFFSET).tolist())


# Function to separate audio codes into layers and decode them back to audio waveform
def redistribute_codes(code_list):
  """Split a flat list of audio codes into three layers and decode them with snac_model.

  The list is read in frames of 7 codes. Within a frame, slot k still carries an
  offset of k * 4096, which is removed here:
    slot 0          -> layer 1
    slots 1 and 4   -> layer 2
    slots 2,3,5,6   -> layer 3

  Args:
    code_list: indexable sequence of codes (token IDs with the audio offset already
      subtracted). Trailing codes that do not fill a whole frame are ignored.

  Returns:
    The output of snac_model.decode for the three layers (the caller plays it as
    24 kHz audio).
  """
  layer_1 = []
  layer_2 = []
  layer_3 = []
  # Count whole frames only: (len + 1) // 7 would count one frame too many for a
  # list of length 7k + 6 and then index past the end of the list.
  for i in range(len(code_list) // 7):
    layer_1.append(code_list[7*i])
    layer_2.append(code_list[7*i+1]-4096)
    layer_3.append(code_list[7*i+2]-(2*4096))
    layer_3.append(code_list[7*i+3]-(3*4096))
    layer_2.append(code_list[7*i+4]-(4*4096))
    layer_3.append(code_list[7*i+5]-(5*4096))
    layer_3.append(code_list[7*i+6]-(6*4096))

  # One tensor per layer, each with a leading batch dimension of 1
  codes = [torch.tensor(layer_1).unsqueeze(0),
           torch.tensor(layer_2).unsqueeze(0),
           torch.tensor(layer_3).unsqueeze(0)]

  # Decode audio codes into waveform audio using SNAC model
  audio_hat = snac_model.decode(codes)
  return audio_hat

from IPython.display import display, Audio

# Decode all generated code lists into audio samples
my_samples = [redistribute_codes(code_list) for code_list in code_lists]

# Every prompt must have exactly one generated sample
if len(prompts) != len(my_samples):
  raise Exception("Number of prompts and samples do not match")

# Print each prompt, then play the audio generated for it
for prompt_text, samples in zip(prompts, my_samples):
  print(prompt_text)
  display(Audio(samples.detach().squeeze().to("cpu").numpy(), rate=24000))

# Clean up variables to free RAM
del my_samples, samples


Hey there, my name is Suresh Beekhani how are MY DEAR FORENDS <giggles> and I'm a speech generation model that can sound like a person from Pakistan.
