<a href="https://colab.research.google.com/github/SidAS-ai/video-sdk-asssignmet/blob/main/fine_tunning_off_tts.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py311-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading

In [1]:
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) n
Token is valid (permission: fineGrained).
The token `sdsdscpcipnvpsadasd` has been saved to /root/.cache/huggingface/stored_tokens
Your token has been saved to /root/.cache/huggingface/token
Login successful.
The current active token is:

In [8]:
import os
import torch
import torchaudio
import numpy as np
import matplotlib.pyplot as plt
from datasets import load_dataset, Audio
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan, Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers.trainer_utils import get_last_checkpoint
from dataclasses import dataclass
from typing import Dict, List, Optional, Union, Any
import gc
import librosa
import soundfile as sf
from tqdm.auto import tqdm
import pandas as pd
from torch.utils.data import Dataset
import random
import re
import IPython.display as ipd
from google.colab import drive

# Mount Google Drive to save checkpoints and models
drive.mount('/content/drive')
SAVE_DIR = "/content/drive/MyDrive/tts_indicvoices"
os.makedirs(SAVE_DIR, exist_ok=True)

# Set seeds for reproducibility
def set_seed(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

set_seed()

# Check for GPU availability
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load IndicVoices-R dataset using Hugging Face's streaming mode
def load_indicvoices_dataset(language="Hindi", split="train", streaming=True, max_samples=100):
    """
    Load the IndicVoices-R dataset for the specified language and split.
    Using much smaller default sample size (100) and streaming by default.
    """
    try:
        print(f"Loading {language} dataset, {split} split (max {max_samples} samples)...")
        dataset = load_dataset(
            "ai4bharat/indicvoices_r",
            f"{language}",
            split=split,
            streaming=streaming
        )

        # Resample audio to 16kHz and convert to mono if needed
        dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))

        # Take a subset of samples if specified
        if max_samples is not None:
            if streaming:
                dataset = dataset.take(max_samples)
            else:
                dataset = dataset.select(range(min(max_samples, len(dataset))))

        # Verify that we have data
        try:
            first_example = next(iter(dataset))
            print(f"Successfully loaded {language} dataset with first example: {first_example.keys()}")
        except StopIteration:
            print(f"Warning: Dataset loaded but appears to be empty!")
            return None

        return dataset

    except Exception as e:
        print(f"Error loading dataset: {e}")
        # Check if it's a language code issue
        if "Loading script" in str(e) and "failed" in str(e):
            print(f"The language code '{language}' might be incorrect. Please check available languages.")

        # Try alternative language codes
        language_variants = {
            "Hindi": ["hindi", "hi"],
            "tamil": ["Tamil", "ta"],
            "gujarati": ["Gujarati", "gu"],
            "bengali": ["Bengali", "bn"],
            "telugu": ["Telugu", "te"],
            "marathi": ["Marathi", "mr"]
        }

        if language in language_variants:
            for variant in language_variants[language]:
                if variant != language:
                    print(f"Trying alternative language code: {variant}")
                    try:
                        dataset = load_dataset(
                            "ai4bharat/indicvoices_r",
                            f"{variant}",
                            split=split,
                            streaming=streaming
                        )
                        dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))
                        print(f"Successfully loaded dataset with language code: {variant}")
                        return dataset
                    except Exception:
                        continue

        print("Failed to load dataset after trying alternatives. Please check the dataset availability.")
        return None

# Function to preprocess text data
def preprocess_text(text):
    """Clean and normalize text input"""
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    # Normalize punctuation
    text = re.sub(r'[^\w\s.,!?]', '', text)
    return text

# Audio preprocessing functions
def preprocess_audio(audio_array, sampling_rate=16000):
    """Preprocess audio: normalize, trim silence"""
    # Convert to float32 if needed
    if audio_array.dtype != np.float32:
        audio_array = audio_array.astype(np.float32)

    # Normalize audio
    audio_array = audio_array / (np.max(np.abs(audio_array)) + 1e-8)

    # Trim leading and trailing silence
    trimmed_audio, _ = librosa.effects.trim(audio_array, top_db=20)

    return trimmed_audio

# Feature extraction for SpeechT5
def extract_features(batch, processor):
    """Extract input features for SpeechT5 model"""
    try:
        # Process text input
        text = preprocess_text(batch["text"])
        inputs = processor(text=text, return_tensors="pt")

        # Store original text for later use in evaluation
        features = {
            "input_ids": inputs.input_ids[0],
            "attention_mask": inputs.attention_mask[0],
            "original_text": text  # Keep the original text
        }

        # Process audio for speaker embedding
        if "audio" in batch:
            audio_array = batch["audio"]["array"]
            sampling_rate = batch["audio"]["sampling_rate"]

            # Preprocess audio
            audio_array = preprocess_audio(audio_array, sampling_rate)

            # Extract speaker embedding (simplified for now)
            # In a full implementation, you would extract speaker embeddings properly
            speaker_embeddings = torch.randn(1, 512)  # Placeholder

            features.update({
                "labels": torch.tensor(audio_array),
                "speaker_embeddings": speaker_embeddings[0]
            })

        return features
    except Exception as e:
        print(f"Error in extract_features: {e}")
        # Return empty features to skip this sample
        return {}

# Custom dataset class for efficient processing
@dataclass
class TTSDataCollator:
    """Data collator for TTS fine-tuning"""
    processor: Any

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        try:
            # Filter out empty features
            features = [f for f in features if f and "input_ids" in f]

            if not features:
                raise ValueError("No valid features found after filtering")

            input_ids = [feature["input_ids"] for feature in features]
            attention_mask = [feature["attention_mask"] for feature in features]

            # Pad input sequences
            input_ids = torch.nn.utils.rnn.pad_sequence(input_ids, batch_first=True, padding_value=self.processor.tokenizer.pad_token_id)
            attention_mask = torch.nn.utils.rnn.pad_sequence(attention_mask, batch_first=True, padding_value=0)

            # Initialize batch
            batch = {
                "input_ids": input_ids,
                "attention_mask": attention_mask,
            }

            # Add original text if available (for reference only, not used in training)
            if "original_text" in features[0]:
                batch["original_text"] = [feature["original_text"] for feature in features]

            # Add labels if available
            if "labels" in features[0]:
                labels = [feature["labels"] for feature in features]
                # We'll use raw waveforms
                max_label_length = max(label.shape[0] for label in labels)
                # Pad labels
                padded_labels = []
                for label in labels:
                    padding_length = max_label_length - label.shape[0]
                    padded_label = torch.nn.functional.pad(label, (0, padding_length))
                    padded_labels.append(padded_label)
                labels = torch.stack(padded_labels)
                batch["labels"] = labels

            # Add speaker embeddings if available
            if "speaker_embeddings" in features[0]:
                speaker_embeddings = torch.stack([feature["speaker_embeddings"] for feature in features])
                batch["speaker_embeddings"] = speaker_embeddings

            return batch
        except Exception as e:
            print(f"Error in data collator: {e}")
            # Return empty batch as fallback
            return {"input_ids": torch.zeros(1, 1), "attention_mask": torch.zeros(1, 1)}

# Setup SpeechT5 model and processor
def setup_model():
    """Initialize and prepare the SpeechT5 model and processor"""
    try:
        print("Setting up SpeechT5 model and processor...")
        processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
        model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
        vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

        return processor, model, vocoder
    except Exception as e:
        print(f"Error setting up model: {e}")
        return None, None, None

# Function to generate speech samples for evaluation
def generate_speech_sample(text, model, processor, vocoder, speaker_embeddings=None):
    """Generate a speech sample using the model"""
    try:
        # Ensure model, processor, and vocoder are not None
        if model is None or processor is None or vocoder is None:
            print("Model, processor, or vocoder is None. Cannot generate speech.")
            # Return a small silence segment as fallback
            return np.zeros(16000, dtype=np.float32)  # 1 second of silence at 16kHz

        inputs = processor(text=text, return_tensors="pt").to(device)

        if speaker_embeddings is None:
            # Use random speaker embedding if not provided
            speaker_embeddings = torch.randn(1, 512).to(device)
        elif not isinstance(speaker_embeddings, torch.Tensor):
            # Convert to tensor if it's not already
            speaker_embeddings = torch.tensor(speaker_embeddings).unsqueeze(0).to(device)

        # Check input shape
        if speaker_embeddings.dim() == 1:
            speaker_embeddings = speaker_embeddings.unsqueeze(0)  # Add batch dimension

        # Ensure speaker embeddings are on the correct device
        speaker_embeddings = speaker_embeddings.to(device)

        # Generate speech
        speech = model.generate_speech(
            inputs["input_ids"],
            speaker_embeddings=speaker_embeddings,
            vocoder=vocoder
        )

        return speech.cpu().numpy()
    except Exception as e:
        print(f"Error generating speech: {e}")
        # Return silence as fallback
        return np.zeros(16000, dtype=np.float32)  # 1 second of silence at 16kHz

# Helper function to display and save audio samples
def display_and_save_audio(speech, sample_rate=16000, filename=None):
    """Display audio in notebook and save to file if filename is provided"""
    try:
        ipd.display(ipd.Audio(speech, rate=sample_rate))

        if filename:
            sf.write(filename, speech, sample_rate)
            print(f"Saved audio to {filename}")
    except Exception as e:
        print(f"Error displaying/saving audio: {e}")

# Function to prepare the dataset for training
def prepare_dataset(dataset, processor, max_samples=None):
    """Prepare the dataset for training"""
    # Check if the dataset is None
    if dataset is None:
        print("Error: Dataset is None. Cannot prepare dataset.")
        return None

    try:
        # If max_samples is specified, limit the dataset first to save processing time
        if max_samples is not None and not isinstance(dataset, list):
            if hasattr(dataset, 'take'):  # For streaming datasets
                dataset = dataset.take(max_samples)
            else:  # For non-streaming datasets
                dataset = dataset.select(range(min(max_samples, len(dataset))))

        # Process the dataset - works for both streaming and non-streaming
        processed_dataset = dataset.map(
            lambda batch: extract_features(batch, processor),
            batched=False
        )

        # Convert streaming dataset to list for easier handling
        if hasattr(processed_dataset, '_iter_dataset'):  # Check if it's a streaming dataset
            processed_list = list(processed_dataset)
            # Filter out empty entries
            processed_list = [item for item in processed_list if item and "input_ids" in item]
            return processed_list

        return processed_dataset
    except Exception as e:
        print(f"Error preparing dataset: {e}")
        return []  # Return empty list as fallback

# Setup training arguments
def setup_training_args(output_dir, max_steps=1000):
    """Configure training arguments with reduced steps and smaller batch size"""
    training_args = Seq2SeqTrainingArguments(
        output_dir=output_dir,
        per_device_train_batch_size=1,  # Reduced batch size to fit in memory
        gradient_accumulation_steps=2,  # Reduced to work with smaller datasets
        learning_rate=5e-5,
        max_steps=max_steps,  # Reduced from 5000 to 1000
        save_steps=100,        # Save more frequently with smaller dataset
        eval_steps=100,        # Evaluate more frequently
        logging_steps=20,      # Log more frequently
        evaluation_strategy="steps",
        predict_with_generate=True,
        generation_max_length=200,
        save_total_limit=2,    # Limit checkpoints to save space
        fp16=True,             # Use mixed precision
        dataloader_num_workers=1,  # Reduced workers
        load_best_model_at_end=True,
        remove_unused_columns=False,  # Keep our custom columns
    )
    return training_args

# Main training function
def train_tts_model(language="Hindi", max_samples=100, max_steps=1000):
    """Main function to train the TTS model with reduced dataset size"""
    try:
        # Setup model and processor
        processor, model, vocoder = setup_model()

        # Check if model setup was successful
        if model is None or processor is None or vocoder is None:
            print("Failed to set up model, processor, or vocoder. Aborting training.")
            return None, None, None

        model.to(device)
        vocoder.to(device)

        # Load dataset - using smaller samples by default
        train_dataset = load_indicvoices_dataset(
            language=language,
            split="train",
            streaming=True,  # Use streaming for memory efficiency
            max_samples=max_samples
        )

        # Check if dataset loading failed
        if train_dataset is None:
            print("Failed to load training dataset. Aborting training.")
            return model, processor, vocoder  # Return the pretrained model without training

        val_dataset = load_indicvoices_dataset(
            language=language,
            split="validation",
            streaming=True,
            max_samples=max(10, max_samples//10)  # Even smaller validation set
        )

        # If validation dataset loading failed, continue without validation
        if val_dataset is None:
            print("Failed to load validation dataset. Will train without validation.")
            val_dataset = None

        # Prepare datasets for training
        print(f"Preparing datasets for training (max {max_samples} samples)...")
        train_dataset = prepare_dataset(train_dataset, processor, max_samples)
        if not train_dataset:  # Check if empty
            print("Failed to prepare training dataset. Aborting training.")
            return model, processor, vocoder  # Return the pretrained model without training

        if val_dataset is not None:
            val_dataset = prepare_dataset(val_dataset, processor, max(10, max_samples//10))
            if not val_dataset:  # Check if empty
                print("Failed to prepare validation dataset. Will train without validation.")
                val_dataset = None

        # Setup data collator
        data_collator = TTSDataCollator(processor=processor)

        # Setup output directory
        output_dir = os.path.join(SAVE_DIR, f"speecht5_finetuned_{language}_small")
        os.makedirs(output_dir, exist_ok=True)

        # Check for existing checkpoint to resume training
        last_checkpoint = get_last_checkpoint(output_dir) if os.path.exists(output_dir) else None
        if last_checkpoint:
            print(f"Resuming training from checkpoint: {last_checkpoint}")

        # Setup training arguments with reduced steps
        training_args = setup_training_args(output_dir, max_steps)

        # Initialize the Trainer
        trainer = Seq2SeqTrainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            tokenizer=processor.tokenizer,
            data_collator=data_collator,
        )

        # Training loop
        print("Starting training with reduced dataset...")
        try:
            train_result = trainer.train(resume_from_checkpoint=last_checkpoint)
            print("Training completed successfully!")
        except Exception as e:
            print(f"Error during training: {e}")
            print("Training failed, but returning the pretrained model.")
            return model, processor, vocoder

        # Save the final model
        try:
            trainer.save_model(os.path.join(output_dir, "final_model"))
            processor.save_pretrained(os.path.join(output_dir, "final_model"))
            print("Model saved successfully!")
        except Exception as e:
            print(f"Error saving model: {e}")

        # Generate and save a few samples for comparison
        generate_samples(model, processor, vocoder, language, output_dir)

        return model, processor, vocoder
    except Exception as e:
        print(f"Unexpected error in train_tts_model: {e}")
        # Initialize a new model to return
        backup_processor, backup_model, backup_vocoder = setup_model()
        return backup_model, backup_processor, backup_vocoder

# Function to generate samples before and after fine-tuning
def generate_samples(model, processor, vocoder, language, output_dir):
    """Generate and save speech samples before and after fine-tuning"""
    try:
        # Check if model components are available
        if model is None or processor is None or vocoder is None:
            print("Model components are None. Cannot generate samples.")
            return

        # Test phrases for different languages
        test_phrases = {
            "Hindi": "नमस्ते, आप कैसे हैं?",
            "hindi": "नमस्ते, आप कैसे हैं?",
            "tamil": "வணக்கம், எப்படி இருக்கிறீர்கள்?",
            "gujarati": "નમસ્તે, તમે કેમ છો?",
            "bengali": "নমস্কার, আপনি কেমন আছেন?",
            "telugu": "నమస్కారం, మీరు ఎలా ఉన్నారు?",
            "marathi": "नमस्कार, तुम्ही कसे आहात?",
        }

        # Create samples directory
        samples_dir = os.path.join(output_dir, "samples")
        os.makedirs(samples_dir, exist_ok=True)

        # Get test phrase for current language
        test_phrase = test_phrases.get(language, "Hello, how are you?")

        print(f"Generating sample for: '{test_phrase}' ({language})")

        # Generate speech
        speech = generate_speech_sample(test_phrase, model, processor, vocoder)

        # Save the sample
        sample_path = os.path.join(samples_dir, f"{language}_sample.wav")
        display_and_save_audio(speech, filename=sample_path)

        # Save metadata
        with open(os.path.join(samples_dir, f"{language}_info.txt"), "w") as f:
            f.write(f"Language: {language}\n")
            f.write(f"Text: {test_phrase}\n")
            f.write(f"Date: {pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
    except Exception as e:
        print(f"Error generating samples: {e}")

# Evaluation function
def evaluate_model(model, processor, vocoder, eval_dataset, language):
    """Evaluate the fine-tuned model"""
    print("Evaluating model...")

    # Check if model components are available
    if model is None or processor is None or vocoder is None:
        print("Model, processor, or vocoder is None. Cannot evaluate model.")
        print("Loading pretrained models as fallback...")
        processor, model, vocoder = setup_model()
        if model is None:
            print("Failed to load fallback model. Cannot continue evaluation.")
            return []

    try:
        # Ensure model and vocoder are on the correct device
        model.to(device)
        vocoder.to(device)

        # Generate just 3 samples from the evaluation set instead of 5
        samples = []

        # Default test phrases
        test_phrases = {
            "Hindi": "नमस्ते, आप कैसे हैं?",
            "hindi": "नमस्ते, आप कैसे हैं?",
            "tamil": "வணக்கம், எப்படி இருக்கிறீர்கள்?",
            "gujarati": "નમસ્તે, તમે કેમ છો?",
            "bengali": "নমস্কার, আপনি কেমন আছেন?",
            "telugu": "నమస్కారం, మీరు ఎలా ఉన్నారు?",
            "marathi": "नमस्कार, तुम्ही कसे आहात?",
        }

        # Check if eval_dataset is valid
        if eval_dataset and isinstance(eval_dataset, (list, torch.utils.data.Dataset)) and len(eval_dataset) > 0:
            # Use examples from dataset
            for i, example in enumerate(eval_dataset):
                if i >= 3:  # Reduced from 5 to 3
                    break

                # First try to access original_text, if not present use test phrases
                if example and "original_text" in example:
                    text = example["original_text"]
                else:
                    # Fallback to default test phrases
                    text = test_phrases.get(language, "Hello, how are you?")
                    print(f"Original text not found, using default phrase for {language}")

                print(f"Generating speech for: '{text}'")

                # Get speaker embeddings if available
                speaker_embeddings = example.get("speaker_embeddings", None) if example else None

                # Generate speech
                speech = generate_speech_sample(text, model, processor, vocoder, speaker_embeddings)
                samples.append((text, speech))

                # Display the sample
                print(f"Sample {i+1}:")
                display_and_save_audio(speech)
        else:
            # If no evaluation dataset, generate samples with default phrases
            print("No valid evaluation dataset. Using default phrases.")
            for i in range(3):
                text = test_phrases.get(language, "Hello, how are you?")
                print(f"Generating speech for: '{text}'")

                # Generate speech
                speech = generate_speech_sample(text, model, processor, vocoder)
                samples.append((text, speech))

                # Display the sample
                print(f"Sample {i+1}:")
                display_and_save_audio(speech)

        return samples
    except Exception as e:
        print(f"Error during evaluation: {e}")
        # Generate one fallback sample with a default phrase
        try:
            text = test_phrases.get(language, "Hello, how are you?")
            print(f"Error occurred. Falling back to default phrase: '{text}'")

            speech = generate_speech_sample(text, model, processor, vocoder)
            display_and_save_audio(speech)
            return [(text, speech)]
        except:
            print("Failed to generate even the fallback sample.")
            return []

# Main execution
if __name__ == "__main__":
    try:
        # Parameters - significantly reduced from original
        LANGUAGE = "Hindi"    # Choose your language
        MAX_SAMPLES = 100     # Reduced from 1000 to 100
        MAX_STEPS = 1000      # Reduced from 5000 to 1000

        # Test dataset loading with very small sample
        print("Testing dataset loading...")
        test_dataset = load_indicvoices_dataset(LANGUAGE, "train", streaming=True, max_samples=3)

        if test_dataset:
            try:
                # Convert streaming dataset to list to access items
                test_examples = list(test_dataset.take(3))
                print(f"Dataset structure:")
                print(test_examples[0])

                # Display a sample audio
                print("Sample audio from dataset:")
                sample_audio = test_examples[0]["audio"]["array"]
                sample_rate = test_examples[0]["audio"]["sampling_rate"]
                display_and_save_audio(sample_audio, sample_rate)

                # Display sample text
                print(f"Sample text: {test_examples[0]['text']}")
            except Exception as e:
                print(f"Error while testing dataset: {e}")

        # Clear memory before training
        gc.collect()
        torch.cuda.empty_cache() if torch.cuda.is_available() else None

        # Run the training process
        print("\nStarting training process with reduced dataset...")
        model, processor, vocoder = train_tts_model(
            language=LANGUAGE,
            max_samples=MAX_SAMPLES,
            max_steps=MAX_STEPS
        )

        # Verify model was loaded or trained correctly
        if model is None or processor is None or vocoder is None:
            print("Model training failed. Loading pretrained model...")
            processor, model, vocoder = setup_model()

        if model is None:
            print("Failed to load even the pretrained model. Exiting.")
            exit(1)

        print("Training completed!")

        # Load a small test set for final evaluation
        test_dataset = load_indicvoices_dataset(
            language=LANGUAGE,
            split="test",
            streaming=True,
            max_samples=5  # Reduced from 10 to 5
        )
        test_dataset = prepare_dataset(test_dataset, processor, 5)

        # Evaluate the model
        evaluate_model(model, processor, vocoder, test_dataset, LANGUAGE)

        print("Evaluation completed. Model and samples saved to:", SAVE_DIR)
    except Exception as e:
        print(f"Unexpected error in main execution: {e}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Using device: cuda
Testing dataset loading...
Loading Hindi dataset, train split (max 3 samples)...


Resolving data files:   0%|          | 0/246 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/99 [00:00<?, ?it/s]

Successfully loaded Hindi dataset with first example: dict_keys(['text', 'lang', 'samples', 'verbatim', 'normalized', 'speaker_id', 'scenario', 'task_name', 'gender', 'age_group', 'job_type', 'qualification', 'area', 'district', 'state', 'occupation', 'audio', 'utterance_pitch_mean', 'utterance_pitch_std', 'snr', 'c50', 'speaking_rate', 'cer', 'duration'])
Dataset structure:
{'text': 'हमें कृषि के लिए उपयोग में आने वाली विभिन्न विभिन्न यात्रा औजारों का प्रयोग किया जाता है जैसे कि हम', 'lang': 'hi', 'samples': 356064, 'verbatim': 'हमें कृषि के लिए उपयोग में आने वाली विभिन्न विभिन्न यात्रा औजारों का प्रयोग किया जाता है जैसे कि हम', 'normalized': 'हमें कृषि के लिए उपयोग में आने वाली विभिन्न विभिन्न यात्रा औजारों का प्रयोग किया जाता है जैसे कि हम', 'speaker_id': 'S4259138800367594', 'scenario': 'Extempore', 'task_name': 'DOI - Agriculture', 'gender': 'Female', 'age_group': '18-30', 'job_type': 'Student', 'qualification': 'Upto 12th', 'area': 'Rural', 'district': 'Katni', 'state': 'Madhya P

Sample text: हमें कृषि के लिए उपयोग में आने वाली विभिन्न विभिन्न यात्रा औजारों का प्रयोग किया जाता है जैसे कि हम

Starting training process with reduced dataset...
Setting up SpeechT5 model and processor...
Loading Hindi dataset, train split (max 100 samples)...


Resolving data files:   0%|          | 0/246 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/99 [00:00<?, ?it/s]

Successfully loaded Hindi dataset with first example: dict_keys(['text', 'lang', 'samples', 'verbatim', 'normalized', 'speaker_id', 'scenario', 'task_name', 'gender', 'age_group', 'job_type', 'qualification', 'area', 'district', 'state', 'occupation', 'audio', 'utterance_pitch_mean', 'utterance_pitch_std', 'snr', 'c50', 'speaking_rate', 'cer', 'duration'])
Loading Hindi dataset, validation split (max 10 samples)...


Resolving data files:   0%|          | 0/246 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/99 [00:00<?, ?it/s]

Error loading dataset: Bad split: validation. Available splits: ['train', 'test']
Trying alternative language code: hindi


Resolving data files:   0%|          | 0/246 [00:00<?, ?it/s]

Trying alternative language code: hi


Resolving data files:   0%|          | 0/246 [00:00<?, ?it/s]

Failed to load dataset after trying alternatives. Please check the dataset availability.
Failed to load validation dataset. Will train without validation.
Preparing datasets for training (max 100 samples)...
Unexpected error in train_tts_model: You have set `args.eval_strategy` to IntervalStrategy.STEPS but you didn't pass an `eval_dataset` to `Trainer`. Either set `args.eval_strategy` to `no` or pass an `eval_dataset`. 
Setting up SpeechT5 model and processor...


  trainer = Seq2SeqTrainer(


Training completed!
Loading Hindi dataset, test split (max 5 samples)...


Resolving data files:   0%|          | 0/246 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/99 [00:00<?, ?it/s]

Successfully loaded Hindi dataset with first example: dict_keys(['text', 'lang', 'samples', 'verbatim', 'normalized', 'speaker_id', 'scenario', 'task_name', 'gender', 'age_group', 'job_type', 'qualification', 'area', 'district', 'state', 'occupation', 'audio', 'utterance_pitch_mean', 'utterance_pitch_std', 'snr', 'c50', 'speaking_rate', 'cer', 'duration'])
Evaluating model...
Error during evaluation: object of type 'IterableDataset' has no len()
Error occurred. Falling back to default phrase: 'नमस्ते, आप कैसे हैं?'


Evaluation completed. Model and samples saved to: /content/drive/MyDrive/tts_indicvoices
