In [3]:
# Install the stable version of PyTorch first
!pip install torch==2.5.1 torchaudio --index-url https://download.pytorch.org/whl/cu118

# Install specific, compatible versions of transformers and TTS
!pip install transformers==4.39.3
!pip install TTS==0.22.0

Looking in indexes: https://download.pytorch.org/whl/cu118


In [None]:
from TTS.api import TTS

In [4]:
import json

# Input for the dubbing pipeline: a JSON file holding the list of segments.
# Later cells read the 'start', 'end', 'speaker' and 'translated_text' fields
# of every entry.
translated_json_path = "/kaggle/input/eng-to-hindi-text-art38/Article_38_text_translated_hi.json"

with open(translated_json_path, encoding="utf-8") as transcript_file:
    segments = json.load(transcript_file)

segment_count = len(segments)
print(f"Loaded {segment_count} transcribed segments")

Loaded 20 transcribed segments


# **Creating audio references**

In [5]:
from pydub import AudioSegment
import os

# Build one voice-reference clip per speaker: for every speaker label found in
# `segments`, cut that speaker's longest segment out of the original recording
# and save it as a WAV file. The resulting `speaker_references` dict maps
# speaker label -> path of the saved clip.

# Path to the original multi-speaker WAV file in the Kaggle input
audio_file = "/kaggle/input/ip-audio/Article_38.wav"

# Requires the `segments` list loaded in the previous cell
# (each entry provides 'speaker', 'start' and 'end'; times are in seconds).

print("Creating speaker reference clips...")

# 1. Load the main audio file (using .from_wav for a WAV file)
full_audio = AudioSegment.from_wav(audio_file)

# 2. Create a directory in Kaggle's writable output folder
ref_clips_dir = "/kaggle/working/speaker_references/"
os.makedirs(ref_clips_dir, exist_ok=True)

# 3. Find the longest segment of every speaker in a single pass over `segments`.
#    The strict '>' keeps the earliest segment on ties, matching max().
longest_by_speaker = {}
for seg in segments:
    current_best = longest_by_speaker.get(seg['speaker'])
    if current_best is None or (seg['end'] - seg['start']) > (current_best['end'] - current_best['start']):
        longest_by_speaker[seg['speaker']] = seg

unique_speakers = set(longest_by_speaker)

# This dictionary will hold the path to the reference clip for each speaker
speaker_references = {}

# 4. Export the clips. Iterating in sorted order keeps the log output and the
#    dictionary order identical from run to run (set order is not guaranteed).
for speaker_id in sorted(unique_speakers):
    longest_seg = longest_by_speaker[speaker_id]

    # pydub slices by milliseconds, the segment times are in seconds
    start_ms = int(longest_seg['start'] * 1000)
    end_ms = int(longest_seg['end'] * 1000)

    # Slice the audio to get the reference clip
    reference_clip = full_audio[start_ms:end_ms]

    # Export the clip as a WAV file and remember where it was written
    clip_path = os.path.join(ref_clips_dir, f"reference_{speaker_id}.wav")
    reference_clip.export(clip_path, format="wav")
    speaker_references[speaker_id] = clip_path

    print(f"  > Saved reference clip for {speaker_id} at {clip_path}")

print("\nFinished creating reference clips. The 'speaker_references' dictionary is now ready:")
print(speaker_references)

# --- Now you can proceed with the TTS synthesis using this dictionary ---

Creating speaker reference clips...
  > Saved reference clip for SPEAKER_00 at /kaggle/working/speaker_references/reference_SPEAKER_00.wav
  > Saved reference clip for SPEAKER_01 at /kaggle/working/speaker_references/reference_SPEAKER_01.wav

Finished creating reference clips. The 'speaker_references' dictionary is now ready:
{'SPEAKER_00': '/kaggle/working/speaker_references/reference_SPEAKER_00.wav', 'SPEAKER_01': '/kaggle/working/speaker_references/reference_SPEAKER_01.wav'}


# **Voice Cloning**

In [None]:
import torch
from TTS.api import TTS
from TTS.tts.configs.xtts_config import XttsConfig
import os

# Voice cloning: synthesize every translated segment with XTTS v2, conditioned
# on the reference clip of the segment's speaker, and record the path of the
# generated file on the segment under 'dubbed_audio_path'.
# Requires `segments` from the loading cell and the reference WAV files written
# by the reference-clip cell.

# Map every speaker that occurs in `segments` to its reference clip, following
# the reference_<speaker>.wav naming used when the clips were created. Deriving
# the mapping (instead of hard-coding two speakers) keeps the lookup in the
# synthesis loop from failing with a KeyError when a recording has other
# speaker labels.
ref_clips_dir = "/kaggle/working/speaker_references/"
speaker_references = {
    speaker_id: os.path.join(ref_clips_dir, f"reference_{speaker_id}.wav")
    for speaker_id in sorted({seg['speaker'] for seg in segments})
}

# Fail fast, before the multi-gigabyte model is loaded, if a clip is missing.
missing_refs = [path for path in speaker_references.values() if not os.path.isfile(path)]
if missing_refs:
    raise FileNotFoundError(f"Missing speaker reference clips: {missing_refs}")

# Register XttsConfig as a safe global for torch's restricted (weights_only)
# unpickler; the registration lasts for the whole session.
torch.serialization.add_safe_globals([XttsConfig])


device = "cuda" if torch.cuda.is_available() else "cpu"

# Now, load the model normally (without the 'with' block)
tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device)


# Output directory
output_dir = "/kaggle/working/dubbed_clips/"
os.makedirs(output_dir, exist_ok=True)

# Generate dubbed audio
print("Starting audio synthesis for all segments...")
for i, seg in enumerate(segments):
    # File name encodes the segment index and speaker: segment_<i>_<speaker>.wav
    output_path = os.path.join(output_dir, f"segment_{i}_{seg['speaker']}.wav")

    print(f"  > Generating clip for {seg['speaker']} saying: \"{seg['translated_text'][:30]}...\"")

    tts.tts_to_file(
        text=seg['translated_text'],
        speaker_wav=speaker_references[seg['speaker']],
        language="hi",
        file_path=output_path
    )
    # Remember where this segment's dubbed audio was written
    seg['dubbed_audio_path'] = output_path

print("\n All dubbed clips have been generated successfully!")
print("They are saved in:", output_dir)

In [6]:
from TTS.api import TTS
import torch
import os

# Voice cloning: synthesize every translated segment with XTTS v2, conditioned
# on the reference clip of the segment's speaker, and record the path of the
# generated file on the segment under 'dubbed_audio_path'.
# Requires `segments` from the loading cell and the reference WAV files written
# by the reference-clip cell.

# Reference audio: one clip per speaker that occurs in `segments`, following
# the reference_<speaker>.wav naming used when the clips were created. Deriving
# the mapping (instead of hard-coding two speakers) keeps the lookup in the
# synthesis loop from failing with a KeyError when a recording has other
# speaker labels.
ref_clips_dir = "/kaggle/working/speaker_references/"
speaker_references = {
    speaker_id: os.path.join(ref_clips_dir, f"reference_{speaker_id}.wav")
    for speaker_id in sorted({seg['speaker'] for seg in segments})
}

# Fail fast, before the multi-gigabyte model is loaded, if a clip is missing.
missing_refs = [path for path in speaker_references.values() if not os.path.isfile(path)]
if missing_refs:
    raise FileNotFoundError(f"Missing speaker reference clips: {missing_refs}")

device = "cuda" if torch.cuda.is_available() else "cpu"

# Loading XTTS v2 is expensive, so reuse the model if an earlier cell in this
# session already created `tts`. Run `del tts` first to force a fresh load.
if "tts" not in globals():
    tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device)


# Output directory
output_dir = "/kaggle/working/dubbed_clips/"
os.makedirs(output_dir, exist_ok=True)

# Generate dubbed audio
print("Starting audio synthesis for all segments...")
for i, seg in enumerate(segments):
    # File name encodes the segment index and speaker: segment_<i>_<speaker>.wav
    output_path = os.path.join(output_dir, f"segment_{i}_{seg['speaker']}.wav")

    tts.tts_to_file(
        text=seg['translated_text'],
        speaker_wav=speaker_references[seg['speaker']],
        language="hi",
        file_path=output_path
    )
    # Remember where this segment's dubbed audio was written
    seg['dubbed_audio_path'] = output_path
    print(f"Generated dubbed clip: {output_path}")

print("All dubbed clips are saved in:", output_dir)

 > You must confirm the following:
 | > "I have purchased a commercial license from Coqui: licensing@coqui.ai"
 | > "Otherwise, I agree to the terms of the non-commercial CPML: https://coqui.ai/cpml" - [y/n]


 | | >  y


 > Downloading model to /root/.local/share/tts/tts_models--multilingual--multi-dataset--xtts_v2


100%|█████████▉| 1.86G/1.87G [00:18<00:00, 86.5MiB/s]
100%|██████████| 1.87G/1.87G [00:18<00:00, 101MiB/s] 
4.37kiB [00:00, 18.4kiB/s]

361kiB [00:00, 1.30MiB/s]0 [00:00<?, ?iB/s][A
100%|██████████| 32.0/32.0 [00:00<00:00, 88.2iB/s]
 62%|██████▏   | 4.82M/7.75M [00:00<00:00, 48.2MiB/s]

 > Model's license - CPML
 > Check https://coqui.ai/cpml.txt for more info.
 > Using model: xtts


  self.speakers = torch.load(speaker_file_path)
  return torch.load(f, map_location=map_location, **kwargs)


Starting audio synthesis for all segments...
 > Text splitted to sentences.
['यह उचित नहीं है कि आप बाजार कम कीमतों के साथ बाढ़ कर सकते हैं, क्योंकि आपके पास अधिक साधन हैं.']
 > Processing time: 5.702317953109741
 > Real-time factor: 0.5881675719728585
Generated dubbed clip: /kaggle/working/dubbed_clips/segment_0_SPEAKER_01.wav
 > Text splitted to sentences.
['मैं सिर्फ अपने व्यापार के लिए सबसे अच्छा क्या कर रहा हूँ.']
 > Processing time: 3.2470827102661133
 > Real-time factor: 0.44039817538485265
Generated dubbed clip: /kaggle/working/dubbed_clips/segment_1_SPEAKER_00.wav
 > Text splitted to sentences.
['अगर मैं कम कीमत दे सकते हैं, कि सिर्फ प्रतियोगिता है.', 'यह आप के लिए ऊपर रखने या कदम रखने के लिए ऊपर या कदम रखने के लिए है.']
 > Processing time: 6.493540525436401
 > Real-time factor: 0.4420961632554609
Generated dubbed clip: /kaggle/working/dubbed_clips/segment_2_SPEAKER_00.wav
 > Text splitted to sentences.
['लेकिन यह सिर्फ एक बुरी प्रतियोगिता नहीं है.']
 > Processing time: 2.7241

In [7]:
# Preview only the first few segments; dumping the full list buries the
# notebook under long transcript text.
segments[:3]

[{'start': 2.2078437500000003,
  'end': 12.653468750000002,
  'speaker': 'SPEAKER_01',
  'text': " Ahmeed, your prices are driving all the small vendors like me out of business. It's not fair that you can just flood the market with lower prices, because you have more resources.",
  'translated_text': 'यह उचित नहीं है कि आप बाजार कम कीमतों के साथ बाढ़ कर सकते हैं, क्योंकि आपके पास अधिक साधन हैं.',
  'dubbed_audio_path': '/kaggle/working/dubbed_clips/segment_0_SPEAKER_01.wav'},
 {'start': 13.125968750000002,
  'end': 15.437843750000004,
  'speaker': 'SPEAKER_00',
  'text': " I'm just doing what's best for my business.",
  'translated_text': 'मैं सिर्फ अपने व्यापार के लिए सबसे अच्छा क्या कर रहा हूँ.',
  'dubbed_audio_path': '/kaggle/working/dubbed_clips/segment_1_SPEAKER_00.wav'},
 {'start': 15.67409375,
  'end': 21.51284375,
  'speaker': 'SPEAKER_00',
  'text': " If I can offer lower prices, that's just competition. It's up to you to keep up or step aside.",
  'translated_text': 'अगर मैं

In [9]:
from pydub import AudioSegment
import os

# Stitch the dubbed clips back together, in segment order, into one MP3 track.
# Uses the `segments` list from the earlier cells; each clip is looked up by
# the segment_<index>_<speaker>.wav name it was saved under.

# Directory holding the per-segment dubbed clips
clips_dir = "/kaggle/working/dubbed_clips/"

print("Assembling the final audio by concatenating clips sequentially...")

# Resolve the clip file of every segment, preserving the segment order
ordered_clip_paths = [
    os.path.join(clips_dir, f"segment_{idx}_{entry['speaker']}.wav")
    for idx, entry in enumerate(segments)
]

# Start from an empty track and append one clip after another
final_audio = AudioSegment.empty()
for wav_path in ordered_clip_paths:
    final_audio = final_audio + AudioSegment.from_wav(wav_path)

# Export under a dedicated name so earlier outputs are not overwritten
final_output_path = "/kaggle/working/final_dubbed_concatenated.mp3"
final_audio.export(final_output_path, format="mp3")

print(f"\n Success! Sequentially concatenated audio saved to: {final_output_path}")

Assembling the final audio by concatenating clips sequentially...

🎉 Success! Sequentially concatenated audio saved to: /kaggle/working/final_dubbed_concatenated.mp3
