<a href="https://colab.research.google.com/github/UtpalikaAcharya/Integration-of-text-to-speech-TTS-generated-multilingual-data-for-LID/blob/main/tacotron2hindiTranscript.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# First, explicitly set the locale
!apt-get update && apt-get install -y locales
!locale-gen en_US.UTF-8
!export LANG=en_US.UTF-8
!export LANGUAGE=en_US:en
!export LC_ALL=en_US.UTF-8

# Now install the required packages
!pip install --upgrade pip
!pip install TTS
!pip install torch torchaudio

# Import required libraries
import os
import sys
import torch
import warnings
from TTS.api import TTS
from IPython.display import Audio
from google.colab import drive
import pandas as pd
from tqdm.notebook import tqdm

# Export UTF-8 settings through the process environment
os.environ.update({
    'PYTHONIOENCODING': 'utf-8',
    'LANG': 'en_US.UTF-8',
    'LC_ALL': 'en_US.UTF-8',
})

# Silence every warning category for the rest of the session
warnings.filterwarnings('ignore')

# Make Google Drive reachable under /content/drive
drive.mount('/content/drive')

# Drive locations: transcripts to read, folder for generated audio, and the
# reference recording passed to the model as `speaker_wav`
INPUT_FOLDER = '/content/drive/MyDrive/Audio_and_Transcripts/06_Transcriptions/hindi transcriptions-20250226T174533Z-001/hindi transcriptions/transcript hindi'
OUTPUT_FOLDER = '/content/drive/MyDrive/Audio_and_Transcripts/Machine Generated Audios/machineGeneratedAudiosInHindi'
SPEAKER_WAV = "/content/drive/MyDrive/Audio_and_Transcripts/downloaded audio/downloaded audio/hindi audio/51.wav"

# The output folder must exist before any audio is written to it
os.makedirs(OUTPUT_FOLDER, exist_ok=True)

# Prefer the GPU when one is visible to torch, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

# Load the multilingual XTTS v2 model onto the chosen device; stop the cell
# if loading fails, because nothing below can run without `tts`
try:
    tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device)
except Exception as load_error:
    print(f"Error loading model: {load_error}")
    sys.exit(1)
else:
    print("Model loaded successfully!")

def generate_hindi_speech(text, output_file, speaker_file):
    """Synthesize ``text`` as Hindi speech and write it to ``output_file``.

    Args:
        text: The text to speak.
        output_file: Path of the audio file to write.
        speaker_file: Path of the reference recording handed to the model
            as ``speaker_wav``.

    Returns:
        True when the synthesis call completes, False when it raises. The
        exception is printed rather than propagated, so a batch run can
        carry on with the next item.
    """
    synthesis_args = {
        "text": text,
        "file_path": output_file,
        "speaker_wav": speaker_file,
        "language": "hi",  # Hindi language code
    }
    try:
        tts.tts_to_file(**synthesis_args)
    except Exception as err:
        print(f"Error generating speech: {err}")
        return False
    else:
        return True

def process_files():
    """Synthesize a WAV file for every ``.txt`` transcript in ``INPUT_FOLDER``.

    Each transcript is read as UTF-8, stripped, and passed to
    ``generate_hindi_speech`` together with ``SPEAKER_WAV``. The audio is
    written to ``OUTPUT_FOLDER`` under the transcript's base name with a
    ``.wav`` extension. A ``processing_report.csv`` summarising every file is
    saved in ``OUTPUT_FOLDER``.

    Returns:
        A ``pandas.DataFrame`` with one row per transcript and the columns
        ``file``, ``status`` (``'success'`` or ``'failed'``), ``output``
        (path of the generated audio, or None) and ``error`` (failure
        reason, or None). Returns None when the speaker reference file is
        missing, the input folder cannot be listed, or it holds no ``.txt``
        files.
    """
    # Verify speaker file exists
    if not os.path.exists(SPEAKER_WAV):
        print(f"Speaker reference file not found at: {SPEAKER_WAV}")
        return None

    # Get list of text files; sorted so the run order and report are stable
    try:
        files = sorted(f for f in os.listdir(INPUT_FOLDER) if f.endswith('.txt'))
    except OSError as e:
        print(f"Error accessing input folder: {e}")
        return None

    if not files:
        print("No .txt files found in input folder")
        return None

    results = []

    for file in tqdm(files, desc="Processing files"):
        # Every row carries the same keys so the report always has an
        # 'error' column, whichever way a file failed.
        record = {'file': file, 'status': 'failed', 'output': None, 'error': None}
        try:
            # Read text file with explicit UTF-8 encoding
            with open(os.path.join(INPUT_FOLDER, file), 'r', encoding='utf-8') as f:
                text = f.read().strip()

            if not text:
                # Nothing to synthesize; record it instead of calling the model
                record['error'] = 'Transcript is empty'
            else:
                output_file = os.path.join(OUTPUT_FOLDER, f"{os.path.splitext(file)[0]}.wav")
                if generate_hindi_speech(text, output_file, SPEAKER_WAV):
                    record['status'] = 'success'
                    record['output'] = output_file
                else:
                    # generate_hindi_speech prints the exception itself and
                    # only reports a boolean back
                    record['error'] = 'Speech generation failed'

        except Exception as e:
            print(f"Error processing {file}: {e}")
            record['error'] = str(e)

        results.append(record)

    # Save processing report
    df = pd.DataFrame(results, columns=['file', 'status', 'output', 'error'])
    report_path = os.path.join(OUTPUT_FOLDER, 'processing_report.csv')
    df.to_csv(report_path, index=False)

    return df

# Test with a single Hindi text first
def run_test():
    """Synthesize one short Hindi greeting as a smoke test.

    The audio is written to ``test_output.wav`` inside ``OUTPUT_FOLDER``,
    using ``SPEAKER_WAV`` as the reference recording.

    Returns:
        An ``Audio`` object for the generated file when synthesis succeeds,
        otherwise None.
    """
    print("Running test...")
    sample_text = "नमस्ते"  # "Hello" in Hindi
    sample_path = os.path.join(OUTPUT_FOLDER, "test_output.wav")

    succeeded = generate_hindi_speech(sample_text, sample_path, SPEAKER_WAV)
    if not succeeded:
        print("Test failed!")
        return None

    print("Test successful!")
    return Audio(sample_path)

# Execute the code: run the single-sentence smoke test first and only start
# the batch when it succeeds.
print("Starting test...")
test_result = run_test()
# run_test() returns None on failure; compare explicitly rather than relying
# on the truthiness of the returned Audio object.
if test_result is not None:
    display(test_result)

    print("\nStarting main processing...")
    results = process_files()

    # process_files() returns None when it could not start at all
    if results is not None:
        status = results['status']
        failed = results[status == 'failed']

        print("\nProcessing Summary:")
        print(f"Total files processed: {len(results)}")
        print(f"Successful conversions: {int((status == 'success').sum())}")
        print(f"Failed conversions: {len(failed)}")

        # Show failed files if any
        if not failed.empty:
            print("\nFailed files:")
            # reindex() tolerates a report with no 'error' column (it shows
            # NaN), whereas failed[['file', 'error']] would raise KeyError.
            print(failed.reindex(columns=['file', 'error']))

0% [Working]            Hit:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease
0% [Connecting to archive.ubuntu.com (91.189.91.83)] [Connecting to security.ubuntu.com (91.189.91.8                                                                                                    Hit:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
0% [Waiting for headers] [Waiting for headers] [Connected to r2u.stat.illinois.edu (192.17.190.167)]                                                                                                    Hit:3 http://security.ubuntu.com/ubuntu jammy-security InRelease
0% [Waiting for headers] [Connected to r2u.stat.illinois.edu (192.17.190.167)] [Connected to ppa.lau                                                                                                    Hit:4 http://archive.ubuntu.com/ubuntu jammy InRelease
Hit:5 http://archive.ubuntu.com/ubuntu jammy-updates InRelease
Hit:6 http://a

100%|█████████▉| 1.86G/1.87G [00:26<00:00, 93.5MiB/s]
100%|██████████| 1.87G/1.87G [00:26<00:00, 69.9MiB/s]
100%|██████████| 4.37k/4.37k [00:00<00:00, 17.9kiB/s]
 96%|█████████▌| 345k/361k [00:00<00:00, 2.60MiB/s]
100%|██████████| 361k/361k [00:00<00:00, 1.00MiB/s]
100%|██████████| 32.0/32.0 [00:00<00:00, 112iB/s]
 48%|████▊     | 3.73M/7.75M [00:00<00:00, 37.3MiB/s]

 > Model's license - CPML
 > Check https://coqui.ai/cpml.txt for more info.


100%|██████████| 7.75M/7.75M [00:12<00:00, 37.3MiB/s]

 > Using model: xtts


GPT2InferenceModel has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.


Model loaded successfully!
Starting test...
Running test...
 > Text splitted to sentences.
['नमस्ते']


The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


 > Processing time: 5.43300461769104
 > Real-time factor: 1.296063612386267
Test successful!



Starting main processing...


Processing files:   0%|          | 0/50 [00:00<?, ?it/s]

 > Text splitted to sentences.
['इनाट गरहों में से चार गैस जायंट से और चार रॉकी प्लेनेट्स लेकिन मैंने अपनी स्कूल के दिनों में कुछ और ही पड़ा था']
 > Processing time: 4.620997667312622
 > Real-time factor: 0.4812447978738915
 > Text splitted to sentences.
['जिसे आगे चलकर हमारे सौरमंडल का 9 प्लानेट गौसित कर दिया घया था उस समय खगोलवीदिया मानते थे कि प्लूटो आकार में मर्करी से भी बढ़ा होगा']
 > Processing time: 4.5704710483551025
 > Real-time factor: 0.38820834597931436
 > Text splitted to sentences.
['जी हा, जॉपिटर के फ्लाइबाई ने इस इस इस्पेसक्राफ्ट को एक ग्राविटेशनल स्लिंग सॉर्ट प्रवाइड किया जिस्छे इसकी गती बीस प्रतिसत और तेज हो गई और 15 जन्वरी 2015 को निव हौराइजन्स आखिरकार अपनी मंजिल लूडो तक पहुछने में काम्याब हो गया']
 > Processing time: 8.975735425949097
 > Real-time factor: 0.43020503543581884
 > Text splitted to sentences.
['साल नुजिसो सततर में वाजर इस्पेस्क्राफ्ट को लाँज के आ घया था पहले वाजर टू को']
 > Processing time: 3.935530424118042
 > Real-time factor: 0.37619843696592054
 > T

In [None]:
# drive.mount is a Python call; a leading `!` would hand the line to bash,
# which rejects it with a syntax error.
from google.colab import drive
drive.mount('/content/drive')


/bin/bash: -c: line 1: syntax error near unexpected token `'/content/drive''
/bin/bash: -c: line 1: `drive.mount('/content/drive')'


In [None]:
# drive.mount is a Python call; a leading `!` would hand the line to bash,
# which rejects it with a syntax error.
from google.colab import drive
drive.mount('/content/drive')