# Tokenize Expresso

In [13]:
from datasets import load_from_disk

# NOTE: machine-specific absolute path; change it when running on another machine.
ENCODED_DATASET_DIR = "/home/shaan/Projects/Dataset/datasets/encoded_custom_data"

# Load the pre-encoded dataset from disk and switch its output format to 'torch'.
ds = load_from_disk(ENCODED_DATASET_DIR).with_format('torch')

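The style filter below is currently commented out, so no rows are dropped; re-enable it to keep only the supported styles and drop everything else.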

In [14]:
#supported_styles = ["confused", "enunciated", "happy", "laughing", "default", "sad", "whisper", "emphasis"]
#ds = ds.filter(lambda r: r["style"] in supported_styles, num_proc=12)

## Save the base tokenizer to the init folder (no control tokens are added)

In [15]:
import os
from transformers import AutoTokenizer

# Hub id of the tokenizer that is copied, unmodified, into the init folder.
BASE_TOKENIZER_ID = "unsloth/Llama-3.2-1B"
init_folder = "../inits/csm-1b-expresso"

os.makedirs(init_folder, exist_ok=True)
tokenizer = AutoTokenizer.from_pretrained(BASE_TOKENIZER_ID)

# Kept as the last expression so the written file paths show up as the cell output.
tokenizer.save_pretrained(init_folder)

('../inits/csm-1b-expresso/tokenizer_config.json',
 '../inits/csm-1b-expresso/special_tokens_map.json',
 '../inits/csm-1b-expresso/tokenizer.json')

In [16]:
import sys
import os
# Put the notebook's parent directory on sys.path (once) so that the
# `modeling` package imported in a later cell can be found.
module_path = os.path.abspath(os.pardir)
if module_path not in sys.path:
    sys.path.append(module_path)

## Tokenize to CSM format

Now let's load our new tokenizer back again:

In [17]:
from modeling.utils import PromptEncoder

# Reload the tokenizer from the folder it was saved to above, so the encoder
# uses exactly the files stored in `init_folder`.
tokenizer = AutoTokenizer.from_pretrained(init_folder)
# `prompt_encoder` is a notebook-level global that `tokenize_row` reads.
prompt_encoder = PromptEncoder(tokenizer=tokenizer)

Finally, we prepare the inputs:

In [18]:
import torch
import logging # Optional: for warnings

# Assuming prompt_encoder is correctly defined with the base tokenizer

def tokenize_row(row: dict):
    """Tokenize one dataset row into concatenated text + audio tensors.

    The text is tokenized without any style token, and the speaker id is
    handed to the prompt encoder as an integer. Relies on the notebook-level
    ``prompt_encoder``.

    Args:
        row: Mapping expected to provide ``"text"``, ``"speaker_id"`` (a value
            ``int()`` can parse, e.g. ``"0"``) and ``"codes"`` (a
            ``torch.Tensor``).

    Returns:
        A dict with ``"ground_truth"`` and ``"ground_truth_masks"``: the text
        tensors followed by the audio tensors, concatenated along dim 0.
        Rows that cannot be tokenized (missing or non-tensor ``"codes"``, or
        empty token output) yield two empty ``(0, 33)`` tensors so they can be
        filtered out afterwards with ``numel() > 0``.
    """

    def _empty_result() -> dict:
        # Sentinel for unusable rows. The width 33 must match what the prompt
        # encoder emits (presumably audio codebooks + one text column —
        # TODO confirm against PromptEncoder).
        return {"ground_truth": torch.empty(0, 33), "ground_truth_masks": torch.empty(0, 33)}

    # --- Text input: no style token is prepended ---
    text_input = row.get("text", "")
    if text_input is None:
        # A present-but-null text cell is treated like a missing one.
        text_input = ""

    # --- Speaker id: fall back to 0 whenever it cannot be parsed ---
    try:
        speaker_id_int = int(row["speaker_id"])
    except (ValueError, KeyError, TypeError) as e:
        # TypeError covers null / non-numeric-typed ids (e.g. int(None)), which
        # previously escaped this handler and aborted the whole map() call.
        logging.warning(f"Could not parse speaker_id '{row.get('speaker_id', 'N/A')}' as int. Using 0. Error: {e}")
        speaker_id_int = 0

    # --- Validate the audio codes before calling the encoder ---
    if 'codes' not in row:
        logging.warning("Warning: Missing 'codes' in row. Skipping.")
        return _empty_result()
    codes_tensor = row['codes']
    if not isinstance(codes_tensor, torch.Tensor):
        logging.warning(f"Warning: 'codes' is not a tensor (type: {type(codes_tensor)}). Skipping.")
        return _empty_result()

    # --- Tokenize both modalities ---
    text_tokens, text_masks = prompt_encoder._tokenize_text_segment(
        text_input, speaker_id_int
    )
    audio_tokens, audio_masks = prompt_encoder._tokenize_audio(codes_tensor)

    if text_tokens.numel() == 0 or audio_tokens.numel() == 0:
        logging.warning(f"Warning: Empty tokens generated for row text: '{text_input}'")
        return _empty_result()

    # Text frames come first, audio frames follow.
    return {
        "ground_truth": torch.cat([text_tokens, audio_tokens], dim=0),
        "ground_truth_masks": torch.cat([text_masks, audio_masks], dim=0),
    }

In [19]:
from datasets import DatasetDict
import logging # Optional for filtering log

# Inputs from earlier cells: `ds` (the loaded encoded dataset) and
# `tokenize_row` (no style token, integer speaker id).

# Tokenize every row; the original columns are removed so only the two
# tensors returned by `tokenize_row` remain.
orig_colnames = ds.column_names
ds_mapped = ds.map(
    tokenize_row,
    num_proc=12,
    remove_columns=orig_colnames,
)

# Rows that failed tokenization come back as empty tensors; drop them and
# report how many were removed.
initial_count = len(ds_mapped)
ds_filtered = ds_mapped.filter(lambda example: example['ground_truth'].numel() > 0)
filtered_count = len(ds_filtered)
if filtered_count != initial_count:
    logging.warning(f"Filtered out {initial_count - filtered_count} examples due to tokenization errors.")

# Wrap the filtered rows as the single "train" split.
ds_final = DatasetDict({"train": ds_filtered})

# Persist the tokenized dataset next to the other datasets.
output_path = "../datasets/tokenized_custom_data_no_style"
ds_final.save_to_disk(output_path)
print(f"Saved tokenized custom data (no style) to: {output_path}")

Saving the dataset (1/1 shards): 100%|██████████| 14584/14584 [00:02<00:00, 6056.60 examples/s] 


Saved tokenized custom data (no style) to: ../datasets/tokenized_custom_data_no_style
