In [1]:
import torch
from TTS.api import TTS
import torch.serialization

# Override the torch.load function to use weights_only=False by default
# Only do this if you trust the source of the checkpoint
original_torch_load = torch.load
torch.load = lambda f, map_location=None, pickle_module=torch.serialization.pickle, **kwargs: original_torch_load(
    f, map_location=map_location, pickle_module=pickle_module, weights_only=False, **kwargs
)

# Initialize TTS model
tts = TTS("omogr/xtts-ru-ipa").to("cuda")

  from .autonotebook import tqdm as notebook_tqdm


 > Using model: xtts


GPT2InferenceModel has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.


In [2]:
import torch
from TTS.api import TTS
import json
import os
import numpy as np
from tqdm import tqdm
import logging
import soundfile as sf
import sys

# Configure minimal logging - more aggressive suppression
logging.basicConfig(level=logging.ERROR)

# Silence TTS-specific logging
logging.getLogger('TTS').setLevel(logging.ERROR)
logging.getLogger('TTS.utils.synthesizer').setLevel(logging.ERROR)

# Suppress stdout temporarily during TTS operations
class NullWriter:
    def write(self, s):
        pass
    def flush(self):
        pass

# Define paths
jsonl_file = "../dataset/text_cleaned/data.jsonl"
voices_path = "../dataset/voices"
output_path = "../dataset/sample_audio"

# Ensure output directory exists
os.makedirs(output_path, exist_ok=True)

# Read all conversations from the JSONL file
conversations = []
with open(jsonl_file, 'r', encoding='utf-8') as f:
    for line in f:
        conversations.append(json.loads(line))

# Select a specific conversation and message
conversation = conversations[9000]
selected_message = conversation[1]
sample_text = selected_message["content"]

print(f"Selected message: {sample_text[:100]}...")

for voice_filename in os.listdir(voices_path):
    voice_path = os.path.join(voices_path, voice_filename)
    print(f"Processing with voice: {voice_filename}")

    # Process the message with language segments if available
    if "language_segments" in selected_message and selected_message["language_segments"]:
        print("Processing with language segments...")
        output_file = os.path.join(output_path, f"mixed_lang_{voice_filename}.wav")
        
        # Generate speech for each segment directly without saving intermediate files
        audio_segments = []
        sample_rate = None
        
        for idx, segment in enumerate(tqdm(selected_message["language_segments"], desc="Processing segments")):
            segment_text = segment["text"].strip()
            segment_lang = segment["lang"]
            
            # Skip empty segments
            if not segment_text:
                continue
            
            # Capture and suppress stdout during TTS generation
            original_stdout = sys.stdout
            sys.stdout = NullWriter()
            
            try:
                # Generate speech directly as numpy array
                segment_audio = tts.tts(
                    text=segment_text,
                    speaker_wav=voice_path,
                    language=segment_lang
                )
                
                # Store the sample rate from the first segment
                if sample_rate is None:
                    sample_rate = tts.synthesizer.output_sample_rate
                
                audio_segments.append(segment_audio)
            except Exception as e:
                print(f"Error processing segment {idx}: {e}")
            finally:
                # Restore stdout
                sys.stdout = original_stdout
        
        # Concatenate all audio segments in memory
        if audio_segments:
            # Concatenate all segments using numpy
            concat_audio = np.concatenate(audio_segments)
            
            # Save the final audio file
            sf.write(output_file, concat_audio, sample_rate)
            
            print(f"Successfully generated mixed language audio: {output_file}")
        else:
            print("No segments were processed successfully.")

    else:
        # Process the whole message as a single language (Russian)
        print("Processing as a single language...")
        output_file = os.path.join(output_path, f"single_lang_{voice_filename}.wav")
        
        # Capture and suppress stdout during TTS generation
        original_stdout = sys.stdout
        sys.stdout = NullWriter()
        
        try:
            # Generate the audio directly
            audio = tts.tts(
                text=sample_text,
                speaker_wav=voice_path,
                language="ru"
            )
            
            # Restore stdout
            sys.stdout = original_stdout
            
            # Save the audio
            sf.write(output_file, audio, tts.synthesizer.output_sample_rate)
            
            print(f"Successfully generated audio: {output_file}")
        except Exception as e:
            # Restore stdout
            sys.stdout = original_stdout
            print(f"Error generating audio: {e}")

print("Audio generation complete.")

Selected message: В китайском языке приветствия могут варьироваться в зависимости от ситуации и времени суток\. Вот ос...
Processing with voice: voice6.wav
Processing with language segments...


Processing segments:   0%|          | 0/17 [00:00<?, ?it/s]The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Processing segments: 100%|██████████| 17/17 [00:13<00:00,  1.28it/s]


Successfully generated mixed language audio: ../dataset/sample_audio/mixed_lang_voice6.wav.wav
Processing with voice: voice1.wav
Processing with language segments...


Processing segments: 100%|██████████| 17/17 [00:11<00:00,  1.42it/s]


Successfully generated mixed language audio: ../dataset/sample_audio/mixed_lang_voice1.wav.wav
Processing with voice: voice0.mp3
Processing with language segments...


Processing segments: 100%|██████████| 17/17 [00:13<00:00,  1.27it/s]


Successfully generated mixed language audio: ../dataset/sample_audio/mixed_lang_voice0.mp3.wav
Processing with voice: voice4.wav
Processing with language segments...


Processing segments: 100%|██████████| 17/17 [00:11<00:00,  1.48it/s]


Successfully generated mixed language audio: ../dataset/sample_audio/mixed_lang_voice4.wav.wav
Processing with voice: voice5.wav
Processing with language segments...


Processing segments: 100%|██████████| 17/17 [00:11<00:00,  1.43it/s]


Successfully generated mixed language audio: ../dataset/sample_audio/mixed_lang_voice5.wav.wav
Processing with voice: voice8.wav
Processing with language segments...


Processing segments: 100%|██████████| 17/17 [00:14<00:00,  1.17it/s]


Successfully generated mixed language audio: ../dataset/sample_audio/mixed_lang_voice8.wav.wav
Processing with voice: voice7.wav
Processing with language segments...


Processing segments: 100%|██████████| 17/17 [00:15<00:00,  1.12it/s]


Successfully generated mixed language audio: ../dataset/sample_audio/mixed_lang_voice7.wav.wav
Processing with voice: voice3.wav
Processing with language segments...


Processing segments: 100%|██████████| 17/17 [00:12<00:00,  1.31it/s]


Successfully generated mixed language audio: ../dataset/sample_audio/mixed_lang_voice3.wav.wav
Processing with voice: voice2.wav
Processing with language segments...


Processing segments: 100%|██████████| 17/17 [00:12<00:00,  1.34it/s]


Successfully generated mixed language audio: ../dataset/sample_audio/mixed_lang_voice2.wav.wav
Processing with voice: voice9.wav
Processing with language segments...


Processing segments: 100%|██████████| 17/17 [00:12<00:00,  1.36it/s]

Successfully generated mixed language audio: ../dataset/sample_audio/mixed_lang_voice9.wav.wav
Audio generation complete.





In [None]:
import torch
from TTS.api import TTS
import json
import os
import numpy as np
from tqdm import tqdm
import logging
import soundfile as sf
import sys
import random

# Configure minimal logging - aggressive suppression
logging.basicConfig(level=logging.ERROR)
logging.getLogger('TTS').setLevel(logging.ERROR)
logging.getLogger('TTS.utils.synthesizer').setLevel(logging.ERROR)

# Suppress stdout temporarily during TTS operations
class NullWriter:
    def write(self, s):
        pass
    def flush(self):
        pass

# Define paths
jsonl_file = "../dataset/text_cleaned/data.jsonl"
voices_path = "../dataset/voices"
output_path = "../dataset/audio"

# Ensure output directory exists
os.makedirs(output_path, exist_ok=True)

# Read all conversations from the JSONL file
conversations = []
with open(jsonl_file, 'r', encoding='utf-8') as f:
    for line in f:
        conversations.append(json.loads(line))

# Get list of all available voice files
available_voices = os.listdir(voices_path)

# Process all conversations with minimal logging
for conversation_idx, conversation in enumerate(tqdm(conversations, desc="Processing conversations")):
    try:
        # Create a directory for this conversation
        conversation_dir = os.path.join(output_path, f"conversation_{conversation_idx}")
        os.makedirs(conversation_dir, exist_ok=True)
        
        # Select a random speaker pair for this conversation
        user_voice = random.choice(available_voices)
        assistant_voice = random.choice(available_voices)
        
        # Map roles to voice files
        role_to_voice = {
            'user': os.path.join(voices_path, user_voice),
            'assistant': os.path.join(voices_path, assistant_voice)
        }
        
        # Process each turn in the conversation
        for turn_idx, message in enumerate(conversation):
            role = message.get("role", "")
            
            # Skip if role is not defined or not in our mapping
            if not role or role not in role_to_voice:
                continue
                
            # Get the appropriate voice for this role
            voice_path = role_to_voice[role]
            
            # Define output filename (1-indexed turn number)
            output_file = os.path.join(conversation_dir, f"{turn_idx+1}_{role}.wav")
            
            # Skip if file already exists
            if os.path.exists(output_file):
                continue
                
            # Process with language segments if available
            if "language_segments" in message and message["language_segments"]:
                audio_segments = []
                sample_rate = None
                
                # Process each language segment
                for segment in message["language_segments"]:
                    segment_text = segment["text"].strip()
                    segment_lang = segment["lang"]
                    
                    # Skip empty segments
                    if not segment_text:
                        continue
                    
                    # Capture and suppress stdout during TTS generation
                    original_stdout = sys.stdout
                    sys.stdout = NullWriter()
                    
                    try:
                        # Generate speech directly as numpy array
                        segment_audio = tts.tts(
                            text=segment_text,
                            speaker_wav=voice_path,
                            language=segment_lang
                        )
                        
                        # Store the sample rate from the first segment
                        if sample_rate is None:
                            sample_rate = tts.synthesizer.output_sample_rate
                        
                        audio_segments.append(segment_audio)
                    except Exception:
                        # Silent exception handling
                        pass
                    finally:
                        # Restore stdout
                        sys.stdout = original_stdout
                
                # Concatenate all audio segments in memory
                if audio_segments:
                    # Concatenate all segments using numpy
                    concat_audio = np.concatenate(audio_segments)
                    
                    # Save the final audio file
                    sf.write(output_file, concat_audio, sample_rate)
            
            else:
                # Process the whole message as a single language (default to Russian)
                text = message.get("content", "").strip()
                
                if not text:
                    continue
                
                # Capture and suppress stdout during TTS generation
                original_stdout = sys.stdout
                sys.stdout = NullWriter()
                
                try:
                    # Generate the audio directly
                    audio = tts.tts(
                        text=text,
                        speaker_wav=voice_path,
                        language="ru"  # Default to Russian
                    )
                    
                    # Save the audio
                    sf.write(output_file, audio, tts.synthesizer.output_sample_rate)
                except Exception:
                    # Silent exception handling
                    pass
                finally:
                    # Restore stdout
                    sys.stdout = original_stdout
                    
    except Exception:
        # Silent exception handling for the entire conversation
        continue

print("Audio generation complete.")

Processing conversations:   0%|          | 3/10006 [03:56<205:34:31, 73.98s/it]