In [1]:
import os
import torch
from pathlib import Path

os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = '1'
torch.serialization.add_safe_globals(['TTS.tts.models.xtts.XttsAudioConfig'])

BASE_DIR = r"C:\Users\ASUS\Desktop\Andes\Maestria\ANTI-SPOOF\Latin_America_Spanish_anti_spoofing_dataset\FinalDataset_16khz\Real"
OUTPUT_DIR = "data/coqui"

COUNTRIES = ["Argentina", "Chile", "Colombia", "Peru", "Venezuela"]

print(f"GPU Available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU Name: {torch.cuda.get_device_name(0)}")


def get_first_audio_from_country(country_path):
    try:
        for root, dirs, files in os.walk(country_path):
            wav_files = [f for f in files if f.endswith('.wav')]
            if wav_files:
                return os.path.join(root, wav_files[0])
        
        print(f"No wav files found in {country_path}")
        return None
    except Exception as e:
        print(f"Error searching in {country_path}: {str(e)}")
        return None


def clone_with_coqui(audio_path, output_path):
    try:
        from TTS.api import TTS
        import whisper
        
        print(f"Loading Coqui TTS model...")
        tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to("cuda" if torch.cuda.is_available() else "cpu")
        
        audio_path = os.path.normpath(audio_path)
        output_path = os.path.normpath(output_path)
        
        print(f"Transcribing audio: {audio_path}")
        model = whisper.load_model("base")
        result = model.transcribe(audio_path, language="es")
        text = result["text"].strip()
        print(f"Text: {text}")
        
        print(f"Cloning voice...")
        os.makedirs(os.path.dirname(output_path), exist_ok=True)
        
        tts.tts_to_file(
            text=text,
            speaker_wav=audio_path,
            language="es",
            file_path=output_path
        )
        
        print(f"Saved: {output_path}")
        return True
        
    except Exception as e:
        print(f"Error: {str(e)}")
        import traceback
        print(traceback.format_exc())
        return False


def main():
    print("=" * 70)
    print("COQUI XTTS-V2 VOICE CLONING TEST")
    print("=" * 70)
    
    results = []
    
    for country in COUNTRIES:
        print(f"\nProcessing country: {country}")
        print("-" * 70)
        
        country_path = os.path.join(BASE_DIR, country)
        
        if not os.path.exists(country_path):
            print(f"Country folder not found: {country_path}")
            results.append((country, "Not found", None))
            continue
        
        audio_path = get_first_audio_from_country(country_path)
        
        if audio_path is None:
            results.append((country, "No audio", None))
            continue
        
        filename = os.path.basename(audio_path)
        output_path = os.path.join(OUTPUT_DIR, country.lower(), f"clone_{filename}")
        
        success = clone_with_coqui(audio_path, output_path)
        
        if success:
            results.append((country, "Success", output_path))
        else:
            results.append((country, "Failed", None))
    
    print("\n" + "=" * 70)
    print("RESULTS SUMMARY")
    print("=" * 70)
    
    for country, status, path in results:
        print(f"{country:15} | {status:15} | {path if path else 'N/A'}")
    
    successful = sum(1 for _, status, _ in results if status == "Success")
    print(f"\nTotal successful: {successful}/{len(results)}")
    print(f"Output directory: {OUTPUT_DIR}/")


if __name__ == "__main__":
    if not os.path.exists(BASE_DIR):
        print(f"Error: Base directory not found: {BASE_DIR}")
    else:
        main()

GPU Available: True
GPU Name: NVIDIA GeForce RTX 4060 Laptop GPU
COQUI XTTS-V2 VOICE CLONING TEST

Processing country: Argentina
----------------------------------------------------------------------
Loading Coqui TTS model...
 > tts_models/multilingual/multi-dataset/xtts_v2 is already downloaded.


  from .autonotebook import tqdm as notebook_tqdm
  torch.utils._pytree._register_pytree_node(


 > Using model: xtts


  self.speakers = torch.load(speaker_file_path)
  return torch.load(f, map_location=map_location, **kwargs)


Transcribing audio: C:\Users\ASUS\Desktop\Andes\Maestria\ANTI-SPOOF\Latin_America_Spanish_anti_spoofing_dataset\FinalDataset_16khz\Real\Argentina\arf_00295\arf_00295_00000740990.wav
Text: ¿Quieres que te mande el artículo?
Cloning voice...
 > Text splitted to sentences.
['¿Quieres que te mande el artículo?']
 > Processing time: 3.9433858394622803
 > Real-time factor: 1.1170562404951603
Saved: data\coqui\argentina\clone_arf_00295_00000740990.wav

Processing country: Chile
----------------------------------------------------------------------
Loading Coqui TTS model...
 > tts_models/multilingual/multi-dataset/xtts_v2 is already downloaded.
 > Using model: xtts
Transcribing audio: C:\Users\ASUS\Desktop\Andes\Maestria\ANTI-SPOOF\Latin_America_Spanish_anti_spoofing_dataset\FinalDataset_16khz\Real\Chile\clf_00610\clf_00610_00025628111.wav
Text: Según mis datos estésábados 2 de junio es el día de la madre.
Cloning voice...
 > Text splitted to sentences.
['Según mis datos estésábados 2 de juni