# ***Voice Modelling using Tortoise-TTS***
### ***By Lt Col Ritesh Lamba***

# Install Required Libraries

In [2]:
!pip install "tortoise-tts>=3.0.0"

import gdown
import os
import re
import torch
import torchaudio
from tortoise.api import TextToSpeech
from tortoise.utils.audio import load_voice
import warnings
warnings.filterwarnings("ignore")
device = "Cuda" if torch.cuda.is_available() else "CPU"



# TTS - Generate Audio in own Voice (Transfer Learning / Speaker Embedding)

In [3]:
# @title FUNCTION TO SPLIT THE TEXT INTO SMALLER CHUNKS
def split_text(text, max_words=400):
    # Break into sentences/phrases based on punctuation
    sentences = re.split(r'(?<=[।.?!])\s+', text)
    chunks = []
    current_chunk = []
    current_len = 0

    for sentence in sentences:
        word_count = len(sentence.split())
        if current_len + word_count <= max_words:
            current_chunk.append(sentence)
            current_len += word_count
        else:
            chunks.append(' '.join(current_chunk))
            current_chunk = [sentence]
            current_len = word_count

    if current_chunk:
        chunks.append(' '.join(current_chunk))

    return chunks

In [4]:
# @title DOWNLOAD THE TRANSCRIPT & SAMPLE AUDIO FILE
# https://drive.google.com/file/d/1ahUfB4E4j3eJvZ5bR3G02ilm36lJwD5r/view?usp=sharing
# Download recorded voice sample from Google Drive using file ID
file_id = "1ahUfB4E4j3eJvZ5bR3G02ilm36lJwD5r"
url = f"https://drive.google.com/uc?id={file_id}"
original_sample_path = "original_sample.wav"
gdown.download(url, original_sample_path, quiet=False)
assert os.path.exists("original_sample.wav"), "WAV file not found!"

# https://drive.google.com/file/d/1x-AYvELvfmI_3h5bajLbECTKuPgBOBIB/view?usp=sharing
# Download hindi transcript generated from Google Drive using file ID
file_id = "1x-AYvELvfmI_3h5bajLbECTKuPgBOBIB"
url = f"https://drive.google.com/uc?id={file_id}"
hindi_transcript = "hindi_transcript.txt"
gdown.download(url, hindi_transcript, quiet=False)

Downloading...
From: https://drive.google.com/uc?id=1ahUfB4E4j3eJvZ5bR3G02ilm36lJwD5r
To: /content/original_sample.wav
100%|██████████| 1.30M/1.30M [00:00<00:00, 87.4MB/s]
Downloading...
From: https://drive.google.com/uc?id=1x-AYvELvfmI_3h5bajLbECTKuPgBOBIB
To: /content/hindi_transcript.txt
100%|██████████| 33.6k/33.6k [00:00<00:00, 8.13MB/s]


'hindi_transcript.txt'

In [5]:
# @title Step 0: Make the Audio compatible
waveform, sr = torchaudio.load("original_sample.wav")
# Resample to 24kHz if needed
resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=24000)
resampled_waveform = resampler(waveform)
# Convert to mono
if resampled_waveform.shape[0] > 1:
    resampled_waveform = resampled_waveform.mean(dim=0, keepdim=True)
# Save in correct format
torchaudio.save("voice_sample.wav", resampled_waveform, 24000, encoding="PCM_S", bits_per_sample=16)

In [6]:
# @title Step 1: Load Tortoise model
tts = TextToSpeech()

Some weights of the model checkpoint at jbetker/wav2vec2-large-robust-ft-libritts-voxpopuli were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at jbetker/wav2vec2-large-robust-ft-libritts-voxpopuli and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.o

In [7]:
# @title Step 2: Load own voice
# Load waveform as tensor
waveform, sr = torchaudio.load("voice_sample.wav")
if sr != 24000:
    resample = torchaudio.transforms.Resample(orig_freq=sr, new_freq=24000)
    waveform = resample(waveform)
if waveform.shape[0] > 1:
    waveform = waveform.mean(dim=0, keepdim=True)
waveform = waveform.to(tts.device)  # Move to model's device
voice_samples = [waveform]

# Call get_conditioning_latents() with waveforms
conditioning_latents = tts.get_conditioning_latents(voice_samples)

In [8]:
# @title Step 3: Hindi text to synthesize
# Load the translated Hindi text
with open("hindi_transcript.txt", "r", encoding="utf-8") as f:
    hindi_text = f.read()

# Step 2: Extract the first 5% of the text
cutoff = int(len(hindi_text) * 0.05)  # 5% of total characters
demo_text = hindi_text[:cutoff]

print(f"Using {cutoff} characters for demo:\n")
print(demo_text)


Using 674 characters for demo:

सब ठीक है, अंतिम वर्ग, हमने क्या चर्चा की? विंडिंग मैकेनिज्म। विंडिंग मैकेनिज्म, राइट? अब, विंडिंग तंत्र सभी प्रकार के कार्यों के लिए लागू होते हैं, है ना? क्योंकि आपके पास बड़े ऑडियो सिग्नल हैं, इसलिए आपको इसे संसाधित करने के लिए भागों, भागों, भागों में इसे तोड़ना होगा, है ना? इसलिए, विंडिंग, कर्लिंग, और अब आइए देखें कि वास्तविक कार्य क्या हैं जो हम प्रदर्शन कर सकते हैं। भाषण वृद्धि उन विषयों में से एक थी जो किसी को सूचीबद्ध करती है, है ना? तो, भाषण वृद्धि क्या है? बहुत सरल, है ना? आपके पास कोई भी ऑडियो सिग्नल विभिन्न प्रकार के शोर हो सकता है। इसलिए, भाषण वृद्धि भाषण को साफ करने, शोर को हटाने के बारे में बात करती है, जबकि यह सुनिश्चित करती है कि वास्तविक सामग्री सं


In [None]:
# @title Step 4: Generate audio in own voice
# Step 4.1: Split into smaller chunks
segments = split_text(demo_text, max_words=40)  # adjust as needed
# Step 4.2: Run TTS on each segment
all_audios = []
for i, segment in enumerate(segments):
    print(f"Generating segment {i+1}/{len(segments)}: {segment[:50]}...")
    audio = tts.tts_with_preset(
        segment,
        voice_samples=voice_samples,
        conditioning_latents=conditioning_latents,
        preset="fast"
    )
    all_audios.append(audio)
# Step 4.3: Concatenate all generated audio
final_audio = torch.cat(all_audios, dim=-1)

Generating segment 1/5: सब ठीक है, अंतिम वर्ग, हमने क्या चर्चा की? विंडिंग...
Generating autoregressive samples..


  0%|          | 0/96 [00:00<?, ?it/s]

In [None]:
# @title Step 5: Save output
torchaudio.save("M23CSA544_SU_Major-Q1_hindi_tts_own_voice.wav", final_audio.squeeze(0).cpu(), 24000)
print("TTS audio generated in own voice.")
# Download the transcript to Local Computer
from google.colab import files
files.download("M23CSA544_SU_Major-Q1_hindi_tts_own_voice.wav")