<a href="https://colab.research.google.com/github/Somie12/Speech-Synthesis/blob/main/Generating%20Speech%20from%20TTS/XTTS_Audio_Generation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the input, output and speaker paths
# used later in this notebook all live under /content/drive/MyDrive.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# First, explicitly set the locale: install the `locales` package and
# generate the en_US.UTF-8 locale on the Colab VM.
!apt-get update && apt-get install -y locales
!locale-gen en_US.UTF-8
# NOTE(review): each `!` line is executed in its own shell, so these three
# `export`s presumably do not persist to later commands or to the Python
# kernel. The os.environ assignments further down in this cell are what set
# LANG / LC_ALL for this process - confirm and consider dropping these lines.
!export LANG=en_US.UTF-8
!export LANGUAGE=en_US:en
!export LC_ALL=en_US.UTF-8

# Now install the required packages: upgrade pip, then install the TTS
# package (imported below as TTS.api) and torch / torchaudio.
!pip install --upgrade pip
!pip install TTS
!pip install torch torchaudio
# Import required libraries
import os
import sys
import torch
import warnings
from TTS.api import TTS
from IPython.display import Audio
from google.colab import drive
import pandas as pd
from tqdm.notebook import tqdm
import re

# Force UTF-8 for Python I/O and for the process locale settings
os.environ.update({
    'PYTHONIOENCODING': 'utf-8',
    'LANG': 'en_US.UTF-8',
    'LC_ALL': 'en_US.UTF-8',
})

# Silence every Python warning for the rest of the session
warnings.filterwarnings('ignore')

# Make Google Drive available under /content/drive
drive.mount('/content/drive')

# Drive locations used by this notebook - point these at your own folders.
# SPEAKER_WAV may be either a single audio file or a directory of them.
SPEAKER_WAV = "/content/drive/MyDrive/Samsung 03/eng human generated"
INPUT_FOLDER = '/content/drive/MyDrive/Samsung 03/english transcriptions'
OUTPUT_FOLDER = '/content/drive/MyDrive/Samsung 03/english ouput xtts'

# The output directory must exist before any audio is written
os.makedirs(OUTPUT_FOLDER, exist_ok=True)

# Prefer the GPU whenever PyTorch can see one
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

# Load the multilingual XTTS v2 model onto the chosen device; nothing below
# can work without it, so stop the run if loading fails.
try:
    tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device)
    print("Model loaded successfully!")
except Exception as load_error:
    print(f"Error loading model: {load_error}")
    sys.exit(1)

def generate_english_speech(text, output_file, speaker_file):
    """Synthesize English speech for ``text`` and write it to ``output_file``.

    Args:
        text: The text to speak.
        output_file: Path the generated audio is written to.
        speaker_file: Path of the reference audio passed as ``speaker_wav``.

    Returns:
        True when synthesis completed, False when any exception was raised
        (the exception message is printed, not re-raised).
    """
    synthesis_args = {
        "text": text,
        "file_path": output_file,
        "speaker_wav": speaker_file,
        "language": "en",  # always synthesize as English
    }
    try:
        tts.tts_to_file(**synthesis_args)
    except Exception as synth_error:
        # Best-effort: report the problem and let the caller carry on
        print(f"Error generating speech: {synth_error}")
        return False
    return True

def extract_number_from_filename(filename):
    """Return the first run of digits in ``filename`` as an int.

    For example ``'101.txt'`` gives ``101``. Returns None when the name
    contains no digits at all.
    """
    found = re.search(r'(\d+)', filename)
    return None if found is None else int(found.group(1))

def process_files():
    """Synthesize speech for every ``.txt`` file in ``INPUT_FOLDER``.

    The speaker reference is ``SPEAKER_WAV`` itself when it is a file, or the
    first ``.wav`` / ``.mp3`` found under it when it is a directory. Each text
    file is read as UTF-8, converted with ``generate_english_speech`` and the
    audio is written into ``OUTPUT_FOLDER``. A ``processing_report.csv`` with
    one row per input file is saved there as well.

    Returns:
        A pandas DataFrame with the columns ``file``, ``output_number``,
        ``status`` (``'success'`` or ``'failed'``), ``output`` and ``error``,
        or None when the speaker reference, the input folder or the input
        files cannot be found.
    """
    # Verify the speaker reference exists
    if not os.path.exists(SPEAKER_WAV):
        print(f"Speaker reference file not found at: {SPEAKER_WAV}")
        return

    if os.path.isdir(SPEAKER_WAV):
        # A directory was given: use the first audio file found inside it
        speaker_file = _first_audio_file_in(SPEAKER_WAV)
        if speaker_file is None:
            print(f"No audio files found in speaker directory: {SPEAKER_WAV}")
            return
        print(f"Using speaker file: {speaker_file}")
    else:
        # Use the specified file directly
        speaker_file = SPEAKER_WAV

    # Only the directory listing can fail here, so only that is guarded
    try:
        files = [f for f in os.listdir(INPUT_FOLDER) if f.endswith('.txt')]
    except OSError as e:
        print(f"Error accessing input folder: {e}")
        return

    if not files:
        print("No .txt files found in input folder")
        return

    # Numeric order; names without digits sort last instead of raising
    files.sort(key=_numeric_sort_key)

    results = []

    for file in tqdm(files, desc="Processing files"):
        # Extract the number first so it is reported even if reading fails
        file_number = extract_number_from_filename(file)

        # Every row carries the same keys so the report always has the same
        # columns, whatever mix of successes and failures occurred.
        row = {
            'file': file,
            'output_number': file_number,
            'status': 'failed',
            'output': None,
            'error': None,
        }

        try:
            # Read text file with explicit UTF-8 encoding
            with open(os.path.join(INPUT_FOLDER, file), 'r', encoding='utf-8') as f:
                text = f.read().strip()

            if file_number is not None and 101 <= file_number <= 150:
                # Numbers in the 101-150 range name the output directly
                output_file = os.path.join(OUTPUT_FOLDER, f"{file_number}.wav")
            else:
                # Otherwise keep the original file name, minus its extension
                output_file = os.path.join(OUTPUT_FOLDER, f"{os.path.splitext(file)[0]}.wav")

            if generate_english_speech(text, output_file, speaker_file):
                row['status'] = 'success'
                row['output'] = output_file
            else:
                # The helper already printed the underlying exception
                row['error'] = 'speech generation failed (see log output)'

        except Exception as e:
            # Per-file boundary: record the failure and keep going
            print(f"Error processing {file}: {e}")
            row['error'] = str(e)

        results.append(row)

    # Save processing report
    df = pd.DataFrame(results)
    report_path = os.path.join(OUTPUT_FOLDER, 'processing_report.csv')
    df.to_csv(report_path, index=False)

    return df


def _first_audio_file_in(directory):
    """Return the first .wav/.mp3 path found by os.walk(directory), or None."""
    for root, _dirs, names in os.walk(directory):
        for name in names:
            if name.endswith(('.wav', '.mp3')):
                return os.path.join(root, name)
    return None


def _numeric_sort_key(name):
    """Sort key: numbered names by number, names without digits afterwards."""
    number = extract_number_from_filename(name)
    if number is None:
        return (1, 0, name)
    return (0, number, name)

# Test with a single English text first
def run_test():
    """Synthesize one fixed English sentence as a smoke test.

    The speaker reference is resolved the same way as in the main run:
    ``SPEAKER_WAV`` itself when it is a file, otherwise the first ``.wav`` /
    ``.mp3`` found under it. The audio is written to ``test_output.wav``
    inside ``OUTPUT_FOLDER``.

    Returns:
        An ``Audio`` object for the generated file on success, or None when
        no speaker reference could be found or synthesis failed.
    """
    print("Running test...")

    # Fail early with a clear message instead of handing a missing path to
    # the model (same check the main processing function performs).
    if not os.path.exists(SPEAKER_WAV):
        print(f"Speaker reference file not found at: {SPEAKER_WAV}")
        return None

    if os.path.isdir(SPEAKER_WAV):
        # A directory was given: take the first audio file os.walk yields.
        # The generator is lazy, so the walk stops at the first match.
        speaker_file = next(
            (
                os.path.join(root, name)
                for root, _dirs, names in os.walk(SPEAKER_WAV)
                for name in names
                if name.endswith(('.wav', '.mp3'))
            ),
            None,
        )
        if speaker_file is None:
            print(f"No audio files found in speaker directory: {SPEAKER_WAV}")
            return None
        print(f"Using speaker file for test: {speaker_file}")
    else:
        # Use the specified file directly
        speaker_file = SPEAKER_WAV

    test_text = "This is a test of the English text-to-speech system."
    test_output = os.path.join(OUTPUT_FOLDER, "test_output.wav")

    if generate_english_speech(test_text, test_output, speaker_file):
        print("Test successful!")
        return Audio(test_output)

    print("Test failed!")
    return None

# Execute the code
# Execute the code: run the single-sentence smoke test first and only start
# the full batch when it produced audio.
print("Starting test...")
test_result = run_test()
if test_result:
    display(test_result)

    print("\nStarting main processing...")
    results = process_files()

    # process_files returns None when it could not start at all
    if results is not None:
        # Split the report once and reuse the two views below
        successful = results[results['status'] == 'success']
        failed = results[results['status'] == 'failed']

        print("\nProcessing Summary:")
        print(f"Total files processed: {len(results)}")
        print(f"Successful conversions: {len(successful)}")
        print(f"Failed conversions: {len(failed)}")

        # Show files in numerical order
        if not successful.empty:
            print("\nSuccessfully processed files:")
            print(successful[['file', 'output_number', 'output']].sort_values('output_number'))

        # Show failed files if any
        if not failed.empty:
            print("\nFailed files:")
            # reindex tolerates a report without an 'error' column (it is
            # filled with NaN), where plain column selection raises KeyError.
            print(failed.reindex(columns=['file', 'error']))

0% [Working]            Get:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]
Get:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
Get:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  Packages [1,319 kB]
Get:4 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Hit:5 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:6 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Hit:7 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:8 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Get:9 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]
Hit:10 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Get:11 https://r2u.stat.illinois.edu/ubuntu jammy/main amd64 Packages [2,661 kB]
Get:12 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [127 kB]
Get:13 http://securit

100%|█████████▉| 1.87G/1.87G [00:22<00:00, 90.9MiB/s]
100%|██████████| 1.87G/1.87G [00:23<00:00, 80.2MiB/s]
100%|██████████| 4.37k/4.37k [00:00<00:00, 15.9kiB/s]
 55%|█████▍    | 198k/361k [00:00<00:00, 487kiB/s] 
100%|██████████| 361k/361k [00:00<00:00, 480kiB/s]
100%|██████████| 32.0/32.0 [00:00<00:00, 105iB/s]
 72%|███████▏  | 5.58M/7.75M [00:00<00:00, 55.8MiB/s]

 > Model's license - CPML
 > Check https://coqui.ai/cpml.txt for more info.


100%|██████████| 7.75M/7.75M [00:15<00:00, 55.8MiB/s]

 > Using model: xtts


GPT2InferenceModel has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.


Model loaded successfully!
Starting test...
Running test...
Using speaker file for test: /content/drive/MyDrive/Samsung 03/eng human generated/101.wav
 > Text splitted to sentences.
['This is a test of the English text-to-speech system.']


The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


 > Processing time: 8.78627872467041
 > Real-time factor: 0.7404507043011318
Test successful!



Starting main processing...
Using speaker file: /content/drive/MyDrive/Samsung 03/eng human generated/101.wav


Processing files:   0%|          | 0/50 [00:00<?, ?it/s]

 > Text splitted to sentences.
['Good evening and welcome to Tucker Carlson tonight.', 'Looking back, January of 2017 seems like another age.', 'So much has happened in the years since then, but in other ways, not that much has changed at all.', 'Donald Trump had not even taken the oath of office yet.']
 > Processing time: 10.427647352218628
 > Real-time factor: 0.4268326454578916
 > Text splitted to sentences.
["But by the first week of the new year of 2017, in case you don't remember, permanent Washington had already committed to destroying his presidency, and Trump seemed to know it."]
 > Processing time: 4.577110528945923
 > Real-time factor: 0.413655351019975
 > Text splitted to sentences.
['A little after 8 p.m. on the night of January 3rd, 2017, the president-elect wrote a tweet.', 'He took a veiled dig at U.S. intelligence agencies for their handling of the then-newly initiated Russia investigation.']
 > Processing time: 6.679820775985718
 > Real-time factor: 0.4211560072698928