In [1]:
from dotenv import load_dotenv
import os
load_dotenv()
print(os.environ["HF_HOME"])
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import soundfile as sf
import time

print("Starting TTS process with base 1B and German fine-tuned 1B models...")

# Load both models
llasa_1b_base = 'HKUSTAudio/Llasa-1B'
llasa_1b_german = '/media/bodza/Audio_Dataset/Llasa-Kartoffel-1B-v0.2'  # German fine-tuned model

print("Loading base 1B model...")
tokenizer_base = AutoTokenizer.from_pretrained(llasa_1b_base)
model_base = AutoModelForCausalLM.from_pretrained(llasa_1b_base, cache_dir="/media/bodza/Audio_Dataset/hf_cache/hub/models--HKUSTAudio--Llasa-1B")
model_base.eval()
model_base.to('cuda')
print("Base 1B model loaded")

print("Loading German fine-tuned 1B model...")
tokenizer_german = AutoTokenizer.from_pretrained(llasa_1b_german)
model_german = AutoModelForCausalLM.from_pretrained(llasa_1b_german)
model_german.eval()
model_german.to('cuda')
print("German fine-tuned model loaded")

# Load XCodec2 model
from xcodec2.modeling_xcodec2 import XCodec2Model
model_path = "HKUST-Audio/xcodec2"
Codec_model = XCodec2Model.from_pretrained(model_path)
Codec_model.eval().cuda()
print("XCodec2 model loaded")

input_text = 'Über sieben Brücken musst du gehen. Sieben dunkle Jahre überstehen. Sieben Mal wirst du die Asche sein. Aber einmal auch der helle Schein. Über sieben Brücken musst du gehen. Sieben dunkle Jahre überstehen. Sieben Mal wirst du die Asche sein. Aber einmal auch der helle Schein. Über sieben Brücken musst du'

def ids_to_speech_tokens(speech_ids):
    speech_tokens_str = []
    for speech_id in speech_ids:
        speech_tokens_str.append(f"<|s_{speech_id}|>")
    return speech_tokens_str

def extract_speech_ids(speech_tokens_str):
    speech_ids = []
    for token_str in speech_tokens_str:
        if token_str.startswith('<|s_') and token_str.endswith('|>'):
            num_str = token_str[4:-2]
            num = int(num_str)
            speech_ids.append(num)
        else:
            print(f"Unexpected token: {token_str}")
    return speech_ids

def generate_speech(model, tokenizer, input_text, model_name):
    print(f"\nGenerating speech using {model_name}...")
    start_time = time.time()
    
    with torch.no_grad():
        formatted_text = f"<|TEXT_UNDERSTANDING_START|>{input_text}<|TEXT_UNDERSTANDING_END|>"
        
        chat = [
            {"role": "user", "content": "Convert the text to speech:" + formatted_text},
            {"role": "assistant", "content": "<|SPEECH_GENERATION_START|>"}
        ]

        input_ids = tokenizer.apply_chat_template(
            chat,
            tokenize=True,
            return_tensors='pt',
            continue_final_message=True
        )
        input_ids = input_ids.to('cuda')
        speech_end_id = tokenizer.convert_tokens_to_ids('<|SPEECH_GENERATION_END|>')

        outputs = model.generate(
            input_ids,
            max_length=2048,
            eos_token_id=speech_end_id,
            do_sample=True,
            top_p=1,
            temperature=0.8,
        )

        generated_ids = outputs[0][input_ids.shape[1]:-1]
        speech_tokens = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
        print(f"Number of speech tokens generated: {len(speech_tokens)}")
        
        speech_tokens = extract_speech_ids(speech_tokens)
        print(f"Number of speech IDs extracted: {len(speech_tokens)}")
        
        speech_tokens = torch.tensor(speech_tokens).cuda().unsqueeze(0).unsqueeze(0)
        gen_wav = Codec_model.decode_code(speech_tokens)
        
        end_time = time.time()
        print(f"Generation time for {model_name}: {end_time - start_time:.2f} seconds")
        print(f"Generated waveform shape: {gen_wav.shape}")
        
        return gen_wav

# Generate speech using both models
gen_wav_base = generate_speech(model_base, tokenizer_base, input_text, "Base 1B model")
gen_wav_german = generate_speech(model_german, tokenizer_german, input_text, "German fine-tuned 1B model")

# Save both generated audio files
print("\nSaving generated audio files...")
sf.write("gen_base_1b.wav", gen_wav_base[0, 0, :].cpu().numpy(), 16000)
sf.write("gen_german_ft.wav", gen_wav_german[0, 0, :].cpu().numpy(), 16000)

print("Process completed! Generated files: gen_base_1b.wav and gen_german_ft.wav")
print("\nYou can now compare:")
print("1. gen_base_1b.wav - Generated by the base 1B model")
print("2. gen_german_ft.wav - Generated by the German fine-tuned model")

/media/bodza/Audio_Dataset/hf_cache/


  from .autonotebook import tqdm as notebook_tqdm


Starting TTS process with base 1B and German fine-tuned 1B models...
Loading base 1B model...
Base 1B model loaded
Loading German fine-tuned 1B model...
German fine-tuned model loaded


You are using a model of type xcodec2 to instantiate a model of type xcodec. This is not supported for all configurations of models and can yield errors.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128261 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


XCodec2 model loaded

Generating speech using Base 1B model...
Number of speech tokens generated: 1916
Number of speech IDs extracted: 1916


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128261 for open-end generation.


Generation time for Base 1B model: 31.55 seconds
Generated waveform shape: torch.Size([1, 1, 613120])

Generating speech using German fine-tuned 1B model...
Number of speech tokens generated: 1039
Number of speech IDs extracted: 1039
Generation time for German fine-tuned 1B model: 17.17 seconds
Generated waveform shape: torch.Size([1, 1, 332480])

Saving generated audio files...
Process completed! Generated files: gen_base_1b.wav and gen_german_ft.wav

You can now compare:
1. gen_base_1b.wav - Generated by the base 1B model
2. gen_german_ft.wav - Generated by the German fine-tuned model


In [2]:
# Follow-up cell for additional generations

# Different text samples
text_samples = [
    "Essigsäureethylester, auch Ethylacetat oder kurz Essigester, ist eine chemische Verbindung aus der Gruppe der Carbonsäureester. Es ist der Ester, der aus Essigsäure und Ethanol gebildet wird. Es handelt sich um eine farblose, flüchtige Flüssigkeit mit charakteristischem Geruch.",
    "DeepSeek ist ein chinesisches KI-Startup, das sich auf die Entwicklung fortschrittlicher Sprachmodelle und künstlicher Intelligenz spezialisiert hat. Das Unternehmen gewann internationale Aufmerksamkeit mit der Veröffentlichung seines im Januar 2025 vorgestellten Modells DeepSeek R1, das mit etablierten KI-Systemen wie ChatGPT von OpenAI und Claude von Anthropic konkurriert",
    "Guten Morgen! Wie geht es dir heute? Das Wetter ist wunderschön. Lass uns spazieren gehen."
]

# Generate audio for each text sample with both models
for i, text in enumerate(text_samples, 1):
    print(f"\n{'='*50}")
    print(f"Generating sample {i}:")
    print(f"Text: {text[:100]}...")
    
    # Generate with base model
    gen_wav_base = generate_speech(
        model_base, 
        tokenizer_base, 
        text, 
        f"Base 1B model - Sample {i}"
    )
    
    # Generate with German fine-tuned model
    gen_wav_german = generate_speech(
        model_german, 
        tokenizer_german, 
        text, 
        f"German fine-tuned model - Sample {i}"
    )
    
    # Save the audio files with numbered suffixes
    base_filename = f"sample_{i}_base_1b.wav"
    german_filename = f"sample_{i}_german_ft.wav"
    
    print(f"\nSaving audio files for sample {i}...")
    sf.write(base_filename, gen_wav_base[0, 0, :].cpu().numpy(), 16000)
    sf.write(german_filename, gen_wav_german[0, 0, :].cpu().numpy(), 16000)
    
    print(f"Generated files:")
    print(f"- {base_filename}")
    print(f"- {german_filename}")

print("\nAll samples generated successfully!")
print("\nGenerated files summary:")
for i in range(1, len(text_samples) + 1):
    print(f"\nSample {i}:")
    print(f"- sample_{i}_base_1b.wav (Base 1B model)")
    print(f"- sample_{i}_german_ft.wav (German fine-tuned model)")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128261 for open-end generation.



Generating sample 1:
Text: Essigsäureethylester, auch Ethylacetat oder kurz Essigester, ist eine chemische Verbindung aus der G...

Generating speech using Base 1B model - Sample 1...


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128261 for open-end generation.


Number of speech tokens generated: 968
Number of speech IDs extracted: 968
Generation time for Base 1B model - Sample 1: 15.71 seconds
Generated waveform shape: torch.Size([1, 1, 309760])

Generating speech using German fine-tuned model - Sample 1...


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128261 for open-end generation.


Number of speech tokens generated: 787
Number of speech IDs extracted: 787
Generation time for German fine-tuned model - Sample 1: 13.45 seconds
Generated waveform shape: torch.Size([1, 1, 251840])

Saving audio files for sample 1...
Generated files:
- sample_1_base_1b.wav
- sample_1_german_ft.wav

Generating sample 2:
Text: DeepSeek ist ein chinesisches KI-Startup, das sich auf die Entwicklung fortschrittlicher Sprachmodel...

Generating speech using Base 1B model - Sample 2...


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128261 for open-end generation.


Number of speech tokens generated: 1088
Number of speech IDs extracted: 1088
Generation time for Base 1B model - Sample 2: 18.60 seconds
Generated waveform shape: torch.Size([1, 1, 348160])

Generating speech using German fine-tuned model - Sample 2...


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128261 for open-end generation.


Number of speech tokens generated: 1430
Number of speech IDs extracted: 1430
Generation time for German fine-tuned model - Sample 2: 24.17 seconds
Generated waveform shape: torch.Size([1, 1, 457600])

Saving audio files for sample 2...
Generated files:
- sample_2_base_1b.wav
- sample_2_german_ft.wav

Generating sample 3:
Text: Guten Morgen! Wie geht es dir heute? Das Wetter ist wunderschön. Lass uns spazieren gehen....

Generating speech using Base 1B model - Sample 3...


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128261 for open-end generation.


Number of speech tokens generated: 367
Number of speech IDs extracted: 367
Generation time for Base 1B model - Sample 3: 6.37 seconds
Generated waveform shape: torch.Size([1, 1, 117440])

Generating speech using German fine-tuned model - Sample 3...
Number of speech tokens generated: 335
Number of speech IDs extracted: 335
Generation time for German fine-tuned model - Sample 3: 6.52 seconds
Generated waveform shape: torch.Size([1, 1, 107200])

Saving audio files for sample 3...
Generated files:
- sample_3_base_1b.wav
- sample_3_german_ft.wav

All samples generated successfully!

Generated files summary:

Sample 1:
- sample_1_base_1b.wav (Base 1B model)
- sample_1_german_ft.wav (German fine-tuned model)

Sample 2:
- sample_2_base_1b.wav (Base 1B model)
- sample_2_german_ft.wav (German fine-tuned model)

Sample 3:
- sample_3_base_1b.wav (Base 1B model)
- sample_3_german_ft.wav (German fine-tuned model)


In [5]:
# Follow-up cell for additional generations

# Different text samples
text_samples = [
    # "Als Anna abends aß, aß Anna abends Ananas. Brautkleid bleibt Brautkleid und Blaukraut bleibt Blaukraut. Fischers Fritze fischte frische Fische, frische Fische fischte Fischers Fritze. Im dichten Fichtendickicht nicken dicke Fichten tüchtig. Kleine Kinder können keine Kirschkerne knacken."
"Actros ist die Bezeichnung für eine Lkw-Baureihe der Marke Mercedes-Benz der Daimler Truck AG. Sie wurde als Nachfolger der Schweren Klasse (SK) auf der IAA Nutzfahrzeuge 1996 eingeführt und daher anfänglich auch als Schwere Klasse Neu, kurz SKN, bezeichnet. In den Jahren 2003 und 2008 gab es umfangreiche Modellpflegemaßnahmen, die auch als MP 2 bzw. MP 3 (Modellprojekt 2 bzw. 3) bekannt sind."
]

# Generate audio for each text sample with both models
for i, text in enumerate(text_samples, 1):
    print(f"\n{'='*50}")
    print(f"Generating sample {i}:")
    print(f"Text: {text[:100]}...")
    
    # Generate with base model
    gen_wav_base = generate_speech(
        model_base, 
        tokenizer_base, 
        text, 
        f"Base 1B model - Sample {i}"
    )
    
    # Generate with German fine-tuned model
    gen_wav_german = generate_speech(
        model_german, 
        tokenizer_german, 
        text, 
        f"German fine-tuned model - Sample {i}"
    )
    
    # Save the audio files with numbered suffixes
    base_filename = f"sample_{i}_base_1b.wav"
    german_filename = f"sample_{i}_german_ft.wav"
    
    print(f"\nSaving audio files for sample {i}...")
    sf.write(base_filename, gen_wav_base[0, 0, :].cpu().numpy(), 16000)
    sf.write(german_filename, gen_wav_german[0, 0, :].cpu().numpy(), 16000)
    
    print(f"Generated files:")
    print(f"- {base_filename}")
    print(f"- {german_filename}")

print("\nAll samples generated successfully!")
print("\nGenerated files summary:")
for i in range(1, len(text_samples) + 1):
    print(f"\nSample {i}:")
    print(f"- sample_{i}_base_1b.wav (Base 1B model)")
    print(f"- sample_{i}_german_ft.wav (German fine-tuned model)")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128261 for open-end generation.



Generating sample 1:
Text: Actros ist die Bezeichnung für eine Lkw-Baureihe der Marke Mercedes-Benz der Daimler Truck AG. Sie w...

Generating speech using Base 1B model - Sample 1...
Number of speech tokens generated: 1872
Number of speech IDs extracted: 1872


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128261 for open-end generation.


Generation time for Base 1B model - Sample 1: 31.06 seconds
Generated waveform shape: torch.Size([1, 1, 599040])

Generating speech using German fine-tuned model - Sample 1...
Number of speech tokens generated: 1253
Number of speech IDs extracted: 1253
Generation time for German fine-tuned model - Sample 1: 23.32 seconds
Generated waveform shape: torch.Size([1, 1, 400960])

Saving audio files for sample 1...
Generated files:
- sample_1_base_1b.wav
- sample_1_german_ft.wav

All samples generated successfully!

Generated files summary:

Sample 1:
- sample_1_base_1b.wav (Base 1B model)
- sample_1_german_ft.wav (German fine-tuned model)
