# Sesame CSM Fine-Tuning

This notebook demonstrates how to fine-tune the Sesame CSM (Conversational Speech Model) using the Hugging Face `transformers` library.

## Setup

Log in to the Hugging Face Hub and download the dataset by cloning its repository. The dataset is stored in `parquet` format, a columnar file format that is efficient for large datasets.

In [None]:
from huggingface_hub import login

# Authenticate this session with the Hugging Face Hub.
# In a notebook this renders an interactive login widget (see the cell output).
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Clone the Elise dataset repository into ./Elise; the preprocessing cell reads
# the parquet file from Elise/data/.
!git clone https://huggingface.co/datasets/MrDragonFox/Elise

Cloning into 'Elise'...
remote: Enumerating objects: 17, done.[K
remote: Total 17 (delta 0), reused 0 (delta 0), pack-reused 17 (from 1)[K
Unpacking objects: 100% (17/17), 5.95 KiB | 1.98 MiB/s, done.


## Preprocessing

In [None]:
from transformers import CsmForConditionalGeneration, AutoProcessor, Trainer, TrainingArguments
from datasets import load_dataset, Audio
import torch
import numpy as np
from tqdm import tqdm
import os

model_id = "sesame/csm-1b"
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the model and processor
processor = AutoProcessor.from_pretrained(model_id)
model = CsmForConditionalGeneration.from_pretrained(model_id, device_map=device)
model.train()
model.codec_model.eval()  # Keep codec model in eval mode during training

# Locations where the parquet file may live after `git clone`, in priority order.
# If you're running this notebook elsewhere, add your own location to this list.
parquet_name = "train-00000-of-00001.parquet"
candidate_paths = [
    os.path.join("content", "Elise", "data", parquet_name),
    os.path.join("Elise", "data", parquet_name),
]
dataset_path = next((path for path in candidate_paths if os.path.exists(path)), None)
if dataset_path is None:
    # Fail early with an actionable message instead of three confusing loader errors
    raise FileNotFoundError(
        f"Dataset parquet file not found. Looked in: {candidate_paths}. "
        "Run the `git clone` cell first or adjust `candidate_paths`."
    )


def _load_with_pandas(path):
    """Read the parquet file with pandas and wrap it in a Hugging Face Dataset."""
    import pandas as pd
    from datasets import Dataset

    return {"train": Dataset.from_pandas(pd.read_parquet(path))}


def _load_with_datasets(path):
    """Load the parquet file through the `datasets` parquet builder."""
    return load_dataset("parquet", data_files={"train": path})


def _load_with_pyarrow(path):
    """Read the parquet file with pyarrow and wrap the table in a Dataset."""
    import pyarrow.parquet as pq
    from datasets import Dataset

    # `Dataset.from_arrow_table` is not part of the `datasets` API; the Dataset
    # constructor is used to wrap the pyarrow table instead.
    # NOTE(review): confirm the constructor accepts a raw pyarrow.Table in your `datasets` version.
    return {"train": Dataset(pq.read_table(path))}


# Try pandas first to avoid the LocalFileSystem error, then fall back to the
# other loaders. Each failure is reported; only if all fail do we raise.
dataset_loaders = [
    ("pandas", _load_with_pandas),
    ("datasets.load_dataset", _load_with_datasets),
    ("pyarrow", _load_with_pyarrow),
]
ds = None
last_load_error = None
for loader_name, loader in dataset_loaders:
    try:
        ds = loader(dataset_path)
    except Exception as load_error:
        print(f"Error loading with {loader_name}: {load_error}")
        last_load_error = load_error
        continue
    print(f"Dataset structure: {ds}")
    break

if ds is None:
    raise RuntimeError(f"All loading methods failed: {last_load_error}") from last_load_error

# Ensure the audio is 24kHz (CSM requirement)
# Check if 'audio' column exists, otherwise look for the correct audio column
audio_column = "audio"
if audio_column in ds["train"].column_names:
    ds["train"] = ds["train"].cast_column(audio_column, Audio(sampling_rate=24000))
else:
    print(f"Warning: Column '{audio_column}' not found. Available columns: {ds['train'].column_names}")
    # Try to identify the audio column if it has a different name
    # You might need to adjust this based on your dataset structure

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Dataset structure: {'train': Dataset({
    features: ['audio', 'text'],
    num_rows: 1195
})}


In [None]:
def prepare_conversation_batch(batch_size=4, offset=0, dataset=None):
    """Build chat-template messages for a contiguous slice of the dataset.

    Args:
        batch_size: Maximum number of examples to read, starting at ``offset``.
        offset: Index of the first example to read.
        dataset: Indexable collection of examples (each a mapping with a
            ``"text"`` and an ``"audio"`` entry). Defaults to the notebook's
            global ``ds["train"]`` when omitted.

    Returns:
        A flat list of message dicts, one per usable example, each with a
        ``"role"`` (always the speaker id ``"0"``) and a ``"content"`` list
        holding a text item and an audio item. The list is empty when the
        slice is past the end of the dataset or no example has audio.

    Note:
        The messages are returned as one flat list, not as a list of separate
        conversations. The notebook calls this with ``batch_size=1``, which
        yields one message per call.
    """
    if dataset is None:
        # Backward-compatible default: fall back to the notebook-level dataset
        dataset = ds["train"]

    # Use a stringified integer for the speaker ID (e.g., "0")
    speaker_id = "0"

    stop = min(offset + batch_size, len(dataset))
    conversation = []

    for idx in range(offset, stop):
        example = dataset[idx]

        # Extract text from the dataset
        text = example.get("text", "")

        # Handle audio data correctly based on dataset structure
        if "audio" not in example:
            print(f"Warning: Audio not found in example. Available keys: {example.keys()}")
            continue

        audio_entry = example["audio"]
        if isinstance(audio_entry, dict) and "array" in audio_entry:
            # Audio stored as a dict: pass only the raw sample array
            audio_data = audio_entry["array"]
        else:
            # Audio is directly accessible
            audio_data = audio_entry

        conversation.append({
            "role": speaker_id,
            "content": [
                {"type": "text", "text": text},
                {"type": "audio", "path": audio_data},
            ],
        })

    return conversation

## Modeling

In [None]:
# Training configurations
output_dir = "./csm_elise_model"
num_train_epochs = 3  # Only forwarded to TrainingArguments; the manual loop below makes a single pass
per_device_train_batch_size = 1  # CSM models can be memory intensive
gradient_accumulation_steps = 4
learning_rate = 5e-5
warmup_steps = 100  # Only forwarded to TrainingArguments; the manual loop applies no warmup
logging_steps = 10
save_steps = 500
max_steps = 2000  # Upper bound on the number of micro-batches processed by the manual loop

# Initialize training arguments
# NOTE: these are only needed if you switch to `Trainer`; the custom loop below does not read them.
training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    learning_rate=learning_rate,
    warmup_steps=warmup_steps,
    logging_steps=logging_steps,
    save_steps=save_steps,
    max_steps=max_steps,
    fp16=True,  # Use mixed precision training
    remove_unused_columns=False,
    report_to="tensorboard",
)

# Custom training loop (as alternative to using Trainer)
total_batches = min(max_steps, len(ds["train"]) // per_device_train_batch_size)

# The scheduler advances once per optimizer update, not once per micro-batch, so
# its horizon is the number of accumulation windows (rounded up so the trailing
# partial window is counted too).
total_updates = max(1, -(-total_batches // gradient_accumulation_steps))

optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
scheduler = torch.optim.lr_scheduler.LinearLR(optimizer,
                                             start_factor=1.0,
                                             end_factor=0.0,
                                             total_iters=total_updates)

# Training loop
model.train()
model.codec_model.eval()  # model.train() re-enables training mode on every submodule; keep the codec in eval mode
optimizer.zero_grad()

micro_batches_in_window = 0  # micro-batches that contributed gradients since the last update

for step in tqdm(range(total_batches)):
    batch_offset = step * per_device_train_batch_size
    conversation = prepare_conversation_batch(per_device_train_batch_size, batch_offset)

    # Empty conversations contribute no gradients but must not skip the
    # accumulation bookkeeping below.
    if conversation:
        try:
            # Process the conversation batch
            inputs = processor.apply_chat_template(
                conversation,
                tokenize=True,
                return_dict=True,
                output_labels=True,
            ).to(device)

            # Forward pass
            outputs = model(**inputs)
            loss = outputs.loss

            # Backward pass. Scale the loss so the accumulated gradient is the
            # mean over the window rather than the sum, which would multiply
            # the effective learning rate by gradient_accumulation_steps.
            (loss / gradient_accumulation_steps).backward()
            micro_batches_in_window += 1

            # Log progress (unscaled loss, comparable across settings)
            if step % logging_steps == 0:
                print(f"Step {step}, Loss: {loss.item()}")

        except Exception as e:
            # Best-effort: report the failing sample and keep training
            print(f"Error in batch at offset {batch_offset}: {e}")

    # Optimizer step with gradient accumulation. Also flush on the final
    # micro-batch so the gradients of a trailing partial window are applied.
    window_complete = (step + 1) % gradient_accumulation_steps == 0
    is_last_step = step == total_batches - 1
    if window_complete or is_last_step:
        if micro_batches_in_window > 0:
            optimizer.step()
            scheduler.step()
        optimizer.zero_grad()
        micro_batches_in_window = 0

    # Save checkpoint
    if step % save_steps == 0 and step > 0:
        model.save_pretrained(f"{output_dir}/checkpoint-{step}")
        processor.save_pretrained(f"{output_dir}/checkpoint-{step}")

  0%|          | 1/1195 [00:02<48:09,  2.42s/it]

Step 0, Loss: 5.28347635269165


  1%|          | 11/1195 [00:05<06:29,  3.04it/s]

Step 10, Loss: 6.679079055786133


  2%|▏         | 21/1195 [00:08<06:23,  3.06it/s]

Step 20, Loss: 5.763577938079834


  3%|▎         | 31/1195 [00:12<06:24,  3.03it/s]

Step 30, Loss: 4.629986763000488


  3%|▎         | 41/1195 [00:15<06:37,  2.90it/s]

Step 40, Loss: 5.395144462585449


  4%|▍         | 51/1195 [00:18<06:28,  2.95it/s]

Step 50, Loss: 6.2297563552856445


  5%|▌         | 61/1195 [00:21<05:58,  3.16it/s]

Step 60, Loss: 4.918163299560547


  6%|▌         | 71/1195 [00:25<05:57,  3.14it/s]

Step 70, Loss: 5.020974159240723


  7%|▋         | 81/1195 [00:28<06:08,  3.02it/s]

Step 80, Loss: 5.537406921386719


  8%|▊         | 91/1195 [00:31<06:20,  2.90it/s]

Step 90, Loss: 5.171121597290039


  8%|▊         | 101/1195 [00:34<06:06,  2.99it/s]

Step 100, Loss: 5.276912689208984


  9%|▉         | 111/1195 [00:37<05:38,  3.20it/s]

Step 110, Loss: 5.919561386108398


 10%|█         | 121/1195 [00:41<05:43,  3.13it/s]

Step 120, Loss: 6.238894939422607


 11%|█         | 131/1195 [00:44<05:15,  3.37it/s]

Step 130, Loss: 6.193996429443359


 12%|█▏        | 141/1195 [00:47<05:52,  2.99it/s]

Step 140, Loss: 4.5522356033325195


 13%|█▎        | 151/1195 [00:50<05:10,  3.37it/s]

Step 150, Loss: 7.082564353942871


 13%|█▎        | 161/1195 [00:53<05:16,  3.27it/s]

Step 160, Loss: 5.653117656707764


 14%|█▍        | 171/1195 [00:56<05:36,  3.05it/s]

Step 170, Loss: 4.872664451599121


 15%|█▌        | 181/1195 [01:00<05:40,  2.98it/s]

Step 180, Loss: 6.2809600830078125


 16%|█▌        | 191/1195 [01:03<05:25,  3.09it/s]

Step 190, Loss: 5.962337970733643


 17%|█▋        | 201/1195 [01:06<05:18,  3.12it/s]

Step 200, Loss: 5.3110504150390625


 18%|█▊        | 211/1195 [01:09<04:41,  3.50it/s]

Step 210, Loss: 5.920234680175781


 18%|█▊        | 221/1195 [01:12<04:52,  3.33it/s]

Step 220, Loss: 5.854984760284424


 19%|█▉        | 231/1195 [01:15<04:43,  3.40it/s]

Step 230, Loss: 5.118786334991455


 20%|██        | 241/1195 [01:18<05:14,  3.04it/s]

Step 240, Loss: 6.7440266609191895


 21%|██        | 251/1195 [01:21<04:53,  3.22it/s]

Step 250, Loss: 5.2213029861450195


 22%|██▏       | 261/1195 [01:24<04:41,  3.32it/s]

Step 260, Loss: 5.257656097412109


 23%|██▎       | 271/1195 [01:27<05:12,  2.96it/s]

Step 270, Loss: 5.759271621704102


 24%|██▎       | 281/1195 [01:31<05:17,  2.87it/s]

Step 280, Loss: 6.606866836547852


 24%|██▍       | 291/1195 [01:34<04:59,  3.02it/s]

Step 290, Loss: 5.266965389251709


 25%|██▌       | 301/1195 [01:37<04:37,  3.22it/s]

Step 300, Loss: 5.558274269104004


 26%|██▌       | 311/1195 [01:40<04:51,  3.03it/s]

Step 310, Loss: 6.760072708129883


 27%|██▋       | 321/1195 [01:43<04:26,  3.28it/s]

Step 320, Loss: 4.074254035949707


 28%|██▊       | 331/1195 [01:46<04:14,  3.40it/s]

Step 330, Loss: 6.230975151062012


 29%|██▊       | 341/1195 [01:50<04:59,  2.85it/s]

Step 340, Loss: 4.9073076248168945


 29%|██▉       | 351/1195 [01:53<04:02,  3.48it/s]

Step 350, Loss: 4.447729110717773


 30%|███       | 361/1195 [01:56<04:45,  2.92it/s]

Step 360, Loss: 4.445489406585693


 31%|███       | 371/1195 [01:59<04:01,  3.42it/s]

Step 370, Loss: 5.010861396789551


 32%|███▏      | 381/1195 [02:02<04:21,  3.11it/s]

Step 380, Loss: 6.178777694702148


 33%|███▎      | 391/1195 [02:05<04:14,  3.16it/s]

Step 390, Loss: 4.356114387512207


 34%|███▎      | 401/1195 [02:08<04:20,  3.05it/s]

Step 400, Loss: 5.3696417808532715


 34%|███▍      | 411/1195 [02:12<03:56,  3.31it/s]

Step 410, Loss: 5.116121768951416


 35%|███▌      | 421/1195 [02:15<04:21,  2.96it/s]

Step 420, Loss: 5.151555061340332


 36%|███▌      | 431/1195 [02:18<04:06,  3.10it/s]

Step 430, Loss: 4.914307594299316


 37%|███▋      | 441/1195 [02:21<04:08,  3.04it/s]

Step 440, Loss: 4.23335075378418


 38%|███▊      | 451/1195 [02:24<03:51,  3.22it/s]

Step 450, Loss: 5.222774982452393


 39%|███▊      | 461/1195 [02:28<03:58,  3.08it/s]

Step 460, Loss: 5.697205543518066


 39%|███▉      | 471/1195 [02:31<04:04,  2.96it/s]

Step 470, Loss: 5.2659807205200195


 40%|████      | 481/1195 [02:34<03:26,  3.46it/s]

Step 480, Loss: 5.533018589019775


 41%|████      | 491/1195 [02:37<03:16,  3.59it/s]

Step 490, Loss: 4.122077465057373


 42%|████▏     | 500/1195 [02:40<03:47,  3.06it/s]

Step 500, Loss: 4.643707752227783


 43%|████▎     | 511/1195 [03:05<05:45,  1.98it/s]

Step 510, Loss: 5.422162055969238


 44%|████▎     | 521/1195 [03:08<03:38,  3.08it/s]

Step 520, Loss: 6.155267238616943


 44%|████▍     | 531/1195 [03:11<03:17,  3.36it/s]

Step 530, Loss: 5.657868385314941


 45%|████▌     | 541/1195 [03:14<03:25,  3.19it/s]

Step 540, Loss: 5.951122283935547


 46%|████▌     | 551/1195 [03:18<03:37,  2.96it/s]

Step 550, Loss: 5.424754619598389


 47%|████▋     | 561/1195 [03:21<03:22,  3.14it/s]

Step 560, Loss: 6.137777805328369


 48%|████▊     | 571/1195 [03:24<03:36,  2.88it/s]

Step 570, Loss: 5.736979007720947


 49%|████▊     | 581/1195 [03:27<03:24,  3.01it/s]

Step 580, Loss: 4.066653251647949


 49%|████▉     | 591/1195 [03:30<02:59,  3.36it/s]

Step 590, Loss: 6.045615196228027


 50%|█████     | 601/1195 [03:34<03:27,  2.86it/s]

Step 600, Loss: 5.554749011993408


 51%|█████     | 611/1195 [03:37<03:01,  3.22it/s]

Step 610, Loss: 4.984900951385498


 52%|█████▏    | 621/1195 [03:40<03:13,  2.96it/s]

Step 620, Loss: 5.357428073883057


 53%|█████▎    | 631/1195 [03:43<02:51,  3.29it/s]

Step 630, Loss: 5.408138751983643


 54%|█████▎    | 641/1195 [03:46<03:00,  3.08it/s]

Step 640, Loss: 5.056497573852539


 54%|█████▍    | 651/1195 [03:49<02:31,  3.59it/s]

Step 650, Loss: 6.179388046264648


 55%|█████▌    | 661/1195 [03:52<03:09,  2.82it/s]

Step 660, Loss: 5.797098159790039


 56%|█████▌    | 671/1195 [03:56<02:33,  3.41it/s]

Step 670, Loss: 5.283552646636963


 57%|█████▋    | 681/1195 [03:59<02:52,  2.98it/s]

Step 680, Loss: 3.4165215492248535


 58%|█████▊    | 691/1195 [04:02<02:46,  3.03it/s]

Step 690, Loss: 6.355926036834717


 59%|█████▊    | 701/1195 [04:05<02:44,  3.00it/s]

Step 700, Loss: 5.146256446838379


 59%|█████▉    | 711/1195 [04:08<02:41,  3.00it/s]

Step 710, Loss: 6.295371055603027


 60%|██████    | 721/1195 [04:12<02:36,  3.02it/s]

Step 720, Loss: 4.950864791870117


 61%|██████    | 731/1195 [04:15<02:24,  3.21it/s]

Step 730, Loss: 4.365095615386963


 62%|██████▏   | 741/1195 [04:18<02:17,  3.29it/s]

Step 740, Loss: 4.947088241577148


 63%|██████▎   | 751/1195 [04:21<02:22,  3.11it/s]

Step 750, Loss: 5.401905059814453


 64%|██████▎   | 761/1195 [04:24<02:24,  3.00it/s]

Step 760, Loss: 4.723438739776611


 65%|██████▍   | 771/1195 [04:27<02:09,  3.28it/s]

Step 770, Loss: 6.360671043395996


 65%|██████▌   | 781/1195 [04:31<02:18,  3.00it/s]

Step 780, Loss: 4.023954391479492


 66%|██████▌   | 791/1195 [04:34<02:11,  3.07it/s]

Step 790, Loss: 5.522309303283691


 67%|██████▋   | 801/1195 [04:37<02:03,  3.18it/s]

Step 800, Loss: 4.878626346588135


 68%|██████▊   | 811/1195 [04:40<02:01,  3.16it/s]

Step 810, Loss: 3.7168073654174805


 69%|██████▊   | 821/1195 [04:43<01:50,  3.39it/s]

Step 820, Loss: 1.0314295291900635


 70%|██████▉   | 831/1195 [04:46<01:54,  3.19it/s]

Step 830, Loss: 4.657836437225342


 70%|███████   | 841/1195 [04:50<02:01,  2.92it/s]

Step 840, Loss: 5.00470495223999


 71%|███████   | 851/1195 [04:52<01:36,  3.56it/s]

Step 850, Loss: 4.265064239501953


 72%|███████▏  | 861/1195 [04:55<01:41,  3.30it/s]

Step 860, Loss: 3.855806350708008


 73%|███████▎  | 871/1195 [04:59<01:35,  3.41it/s]

Step 870, Loss: 5.8250885009765625


 74%|███████▎  | 881/1195 [05:02<01:49,  2.86it/s]

Step 880, Loss: 6.355229377746582


 75%|███████▍  | 891/1195 [05:05<01:42,  2.96it/s]

Step 890, Loss: 5.612832069396973


 75%|███████▌  | 901/1195 [05:08<01:39,  2.96it/s]

Step 900, Loss: 4.463918209075928


 76%|███████▌  | 911/1195 [05:12<01:30,  3.15it/s]

Step 910, Loss: 5.325249671936035


 77%|███████▋  | 921/1195 [05:15<01:34,  2.91it/s]

Step 920, Loss: 5.687045097351074


 78%|███████▊  | 931/1195 [05:18<01:22,  3.19it/s]

Step 930, Loss: 4.594488143920898


 79%|███████▊  | 941/1195 [05:21<01:25,  2.99it/s]

Step 940, Loss: 4.9670515060424805


 80%|███████▉  | 951/1195 [05:24<01:14,  3.29it/s]

Step 950, Loss: 6.028736591339111


 80%|████████  | 961/1195 [05:28<01:20,  2.89it/s]

Step 960, Loss: 5.699470043182373


 81%|████████▏ | 971/1195 [05:31<01:12,  3.07it/s]

Step 970, Loss: 5.190751075744629


 82%|████████▏ | 981/1195 [05:34<01:08,  3.11it/s]

Step 980, Loss: 4.977304458618164


 83%|████████▎ | 991/1195 [05:37<01:00,  3.39it/s]

Step 990, Loss: 5.664828777313232


 84%|████████▎ | 1000/1195 [05:40<00:57,  3.37it/s]

Step 1000, Loss: 4.50126838684082


 85%|████████▍ | 1011/1195 [06:04<01:28,  2.07it/s]

Step 1010, Loss: 4.0476908683776855


 85%|████████▌ | 1021/1195 [06:08<00:58,  2.97it/s]

Step 1020, Loss: 5.009944915771484


 86%|████████▋ | 1031/1195 [06:11<00:51,  3.19it/s]

Step 1030, Loss: 5.026532173156738


 87%|████████▋ | 1041/1195 [06:14<00:50,  3.05it/s]

Step 1040, Loss: 5.218618392944336


 88%|████████▊ | 1051/1195 [06:17<00:43,  3.35it/s]

Step 1050, Loss: 4.838877201080322


 89%|████████▉ | 1061/1195 [06:20<00:48,  2.78it/s]

Step 1060, Loss: 4.662326335906982


 90%|████████▉ | 1071/1195 [06:24<00:40,  3.04it/s]

Step 1070, Loss: 5.596148490905762


 90%|█████████ | 1081/1195 [06:27<00:33,  3.40it/s]

Step 1080, Loss: 4.987510681152344


 91%|█████████▏| 1091/1195 [06:30<00:33,  3.09it/s]

Step 1090, Loss: 5.285407066345215


 92%|█████████▏| 1101/1195 [06:33<00:31,  2.94it/s]

Step 1100, Loss: 5.919890403747559


 93%|█████████▎| 1111/1195 [06:36<00:26,  3.16it/s]

Step 1110, Loss: 5.416629314422607


 94%|█████████▍| 1121/1195 [06:39<00:24,  3.06it/s]

Step 1120, Loss: 5.459616661071777


 95%|█████████▍| 1131/1195 [06:42<00:19,  3.28it/s]

Step 1130, Loss: 6.063162326812744


 95%|█████████▌| 1141/1195 [06:45<00:17,  3.07it/s]

Step 1140, Loss: 3.987377643585205


 96%|█████████▋| 1151/1195 [06:48<00:12,  3.60it/s]

Step 1150, Loss: 6.180079460144043


 97%|█████████▋| 1161/1195 [06:51<00:10,  3.15it/s]

Step 1160, Loss: 6.163342475891113


 98%|█████████▊| 1171/1195 [06:54<00:07,  3.38it/s]

Step 1170, Loss: 2.678589344024658


 99%|█████████▉| 1181/1195 [06:57<00:04,  3.19it/s]

Step 1180, Loss: 4.728840351104736


100%|█████████▉| 1191/1195 [07:01<00:01,  3.16it/s]

Step 1190, Loss: 5.619165897369385


100%|██████████| 1195/1195 [07:02<00:00,  2.83it/s]


Now save the trained model and its processor to a local directory.

In [None]:
# Persist the fine-tuned weights and the matching processor side by side
final_dir = f"{output_dir}/final"
model.save_pretrained(final_dir)
processor.save_pretrained(final_dir)

[]

## Inference

Try the model with some sample inputs to see how it performs.

In [None]:
from IPython.display import Audio, display
import soundfile as sf

# Build a single-turn, text-only prompt for speaker "0"
conversation = [
    {"role": "0", "content": [{"type": "text", "text": "The past is just a story we tell ourselves."}]},
]
inputs = processor.apply_chat_template(
    conversation,
    tokenize=True,
    return_dict=True,
).to(device)

# The fine-tuning loop leaves the model in training mode; switch to eval mode
# before generating.
model.eval()

# infer the model
audio = model.generate(**inputs, output_audio=True)

# Move tensor from GPU to CPU before converting to numpy
audio_cpu = audio[0].to(torch.float32).cpu().numpy()  # This fixes the TypeError

# Write to file and play (24000 Hz matches the sampling rate used for the training audio)
output_file = "output.wav"
sf.write(output_file, audio_cpu, 24000)
display(Audio(output_file))

In [None]:
# Second sample: a prompt that includes the dataset's inline expression tags
conversation = [
    {"role": "0", "content": [{"type": "text", "text": "That's so sweet <moans> I know <breathes deeply>. It feels so good."}]},
]
inputs = processor.apply_chat_template(
    conversation,
    tokenize=True,
    return_dict=True,
).to(device)

# Make sure the model is in eval mode (the fine-tuning loop leaves it in
# training mode); calling this repeatedly is harmless.
model.eval()

# infer the model
audio = model.generate(**inputs, output_audio=True)

# Move tensor from GPU to CPU before converting to numpy
audio_cpu = audio[0].to(torch.float32).cpu().numpy()  # This fixes the TypeError

# Write to file and play (24000 Hz matches the sampling rate used for the training audio)
output_file = "output_2.wav"
sf.write(output_file, audio_cpu, 24000)
display(Audio(output_file))

## Save To Hugging Face Hub

Finally, save the model to the Hugging Face Hub for future use or sharing.

In [None]:
# Requires the `huggingface_hub` package; uncomment to install it:
# !pip install huggingface_hub

from huggingface_hub import HfApi

# Target repository on the Hub (create this on the HF website first)
model_name = "keanteng/sesame-csm-elise"  # Replace with your desired repo name

# Push the locally saved model directory to the repository
api = HfApi()
final_model_dir = f"{output_dir}/final"
api.upload_folder(
    repo_id=model_name,
    repo_type="model",
    folder_path=final_model_dir,
)

model-00002-of-00002.safetensors:   0%|          | 0.00/1.64G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.96G [00:00<?, ?B/s]

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

In [None]:
# Add a model card with description.
# The YAML front matter must be the very first thing in the file; the Hub warns
# about "empty or missing yaml metadata" when it is absent.
# Hyperparameters are read from the training configuration so the card cannot
# drift from the values actually used. The manual training loop makes a single
# pass over the training split, so the card reports one epoch.
model_card = f"""---
base_model: sesame/csm-1b
datasets:
- MrDragonFox/Elise
pipeline_tag: text-to-speech
---

# CSM Elise Voice Model

This model is a fine-tuned version of [sesame/csm-1b](https://huggingface.co/sesame/csm-1b) using the [Elise dataset](https://huggingface.co/datasets/MrDragonFox/Elise).

## Model Details
- **Base Model**: sesame/csm-1b
- **Training Data**: MrDragonFox/Elise dataset
- **Fine-tuning Approach**: Voice cloning through conditional speech generation
- **Voice Characteristics**: [Describe voice qualities]
- **Training Parameters**:
  - Learning Rate: {learning_rate}
  - Epochs: 1 (single pass over the training split)
  - Batch Size: {per_device_train_batch_size} with gradient accumulation steps of {gradient_accumulation_steps}

"""

with open(f"{output_dir}/README.md", "w", encoding="utf-8") as f:
    f.write(model_card)

In [None]:
# Publish the model card to the root of the model repository
readme_path = f"{output_dir}/README.md"
api.upload_file(
    repo_id=model_name,
    repo_type="model",
    path_in_repo="README.md",
    path_or_fileobj=readme_path,
)

- empty or missing yaml metadata in repo card


CommitInfo(commit_url='https://huggingface.co/keanteng/sesame-csm-elise/commit/e41344358829eccf0fa8c2d2c8577f17625a81f1', commit_message='Upload README.md with huggingface_hub', commit_description='', oid='e41344358829eccf0fa8c2d2c8577f17625a81f1', pr_url=None, repo_url=RepoUrl('https://huggingface.co/keanteng/sesame-csm-elise', endpoint='https://huggingface.co', repo_type='model', repo_id='keanteng/sesame-csm-elise'), pr_revision=None, pr_num=None)