<a href="https://colab.research.google.com/github/Vishal8500/Text-to-Speech-DL-Project/blob/main/Text_to_Speech.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install the preprocessing dependencies. %pip (rather than !pip) installs into
# the environment of the running kernel; -q keeps the install log out of the notebook.
%pip install -q librosa numpy pandas matplotlib tqdm



In [2]:
# Root directory of the LJSpeech-1.1 dataset. The same value is assigned again in
# the preprocessing cell below.
# NOTE(review): the path is under /content/drive — presumably Google Drive must be
# mounted first; no mount call is visible in this notebook. Confirm.
DATASET_PATH = "/content/drive/MyDrive/LJSpeech-1.1"

In [5]:
import os
import librosa
import librosa.display
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
import json

# Set dataset paths
DATASET_PATH = "/content/drive/MyDrive/LJSpeech-1.1"  # Update this if needed
AUDIO_PATH = os.path.join(DATASET_PATH, "wavs")
METADATA_FILE = os.path.join(DATASET_PATH, "metadata.csv")
OUTPUT_PATH = "/content/processed_data"

# Create output directory
os.makedirs(OUTPUT_PATH, exist_ok=True)

# Load metadata: pipe-delimited, no header row, three fields per line.
# quoting=3 is csv.QUOTE_NONE: the double-quote character is treated as ordinary
# text. With pandas' default quoting, a transcript that begins with '"' is parsed
# as a quoted field, which can swallow the "|" delimiter and leave
# normalized_text missing for that row.
df = pd.read_csv(
    METADATA_FILE,
    sep="|",
    header=None,
    names=["file_id", "transcript", "normalized_text"],
    quoting=3,
)

# Handle missing or non-string values
df["normalized_text"] = df["normalized_text"].fillna("").astype(str)

# Function to clean text
def clean_text(text):
    """Lowercase a transcript and remove basic punctuation.

    The characters , . ; ? are deleted, then leading and trailing whitespace is
    trimmed. Trimming happens after the punctuation is removed so that input
    such as "the end ." does not come back with a dangling trailing space.

    Args:
        text: Transcript string.

    Returns:
        The cleaned string.
    """
    text = text.lower()
    for mark in (",", ".", ";", "?"):
        text = text.replace(mark, "")
    return text.strip()

# Build the cleaned-transcript column, one element at a time, from the normalized text
df["clean_text"] = df["normalized_text"].map(clean_text)

# Function to convert audio to Mel-spectrogram
def process_audio(file_id, sr=22050, n_mels=80):
    """Load one clip from AUDIO_PATH and return its Mel-spectrogram in decibels.

    Args:
        file_id: Clip name without extension; "<file_id>.wav" is read from AUDIO_PATH.
        sr: Sample rate passed to librosa.load and to the Mel-spectrogram call.
        n_mels: Number of Mel bands.

    Returns:
        The Mel-spectrogram converted with librosa.power_to_db, using the clip's
        own maximum as the reference level.
    """
    wav_file = os.path.join(AUDIO_PATH, f"{file_id}.wav")

    # librosa.load returns (samples, sample_rate); only the samples are used.
    waveform = librosa.load(wav_file, sr=sr)[0]

    mel_power = librosa.feature.melspectrogram(y=waveform, sr=sr, n_mels=n_mels)
    return librosa.power_to_db(mel_power, ref=np.max)

# Process all audio files and save.
# The full pass is slow, so it is resumable: a clip whose .npy already exists in
# OUTPUT_PATH is not recomputed. Delete OUTPUT_PATH to force a full rebuild
# (for example after changing the process_audio parameters).
mel_data = {}

for file_id, text in tqdm(zip(df["file_id"], df["clean_text"]), total=len(df)):
    npy_path = os.path.join(OUTPUT_PATH, f"{file_id}.npy")

    if os.path.exists(npy_path):
        # Memory-map the file so the shape is recovered without reading the array data.
        mel_shape = np.load(npy_path, mmap_mode="r").shape
    else:
        mel_spec_db = process_audio(file_id)
        # Write under a temporary name and rename afterwards, so an interrupted
        # run never leaves a truncated file under the final name. The temporary
        # name must itself end in ".npy", otherwise np.save appends the suffix.
        tmp_path = os.path.join(OUTPUT_PATH, f"{file_id}.tmp.npy")
        np.save(tmp_path, mel_spec_db)
        os.replace(tmp_path, npy_path)
        mel_shape = mel_spec_db.shape

    # Store metadata (text and spectrogram shape)
    mel_data[file_id] = {"text": text, "mel_shape": mel_shape}

# Save metadata to JSON
with open(os.path.join(OUTPUT_PATH, "metadata.json"), "w") as f:
    json.dump(mel_data, f, indent=4)

print("✅ Preprocessing complete! Mel-spectrograms and metadata saved.")


100%|██████████| 13100/13100 [2:53:43<00:00,  1.26it/s]

✅ Preprocessing complete! Mel-spectrograms and metadata saved.





In [6]:
import shutil

# Pack the contents of /content/processed_data into /content/processed_data.zip
shutil.make_archive(
    base_name='/content/processed_data',
    format='zip',
    root_dir='/content/processed_data',
)

# Hand the archive to Colab's file-download helper
from google.colab import files
files.download('/content/processed_data.zip')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [7]:
# Install required libraries. %pip (rather than !pip) installs into the
# environment of the running kernel; -q keeps the install log out of the notebook.
%pip install -q tensorflow librosa matplotlib




In [8]:
# Example: TensorFlow Dataset loading (you can use PyTorch similarly)
import tensorflow as tf

def load_data(metadata_path, audio_path, batch_size=32, mel_pad_value=-80.0):
    """Build a batched tf.data.Dataset of (text, mel-spectrogram) pairs.

    Spectrograms of different clips generally have different numbers of frames,
    so they cannot be stacked into one rectangular tensor. Examples are
    therefore streamed from disk one at a time and each batch is padded to the
    largest spectrogram in that batch.

    Args:
        metadata_path: Path to the JSON file mapping file_id -> {"text": ...}.
        audio_path: Directory containing one "<file_id>.npy" array per entry.
        batch_size: Number of examples per batch.
        mel_pad_value: Value used to pad shorter spectrograms. The default of
            -80.0 assumes dB spectrograms produced like process_audio in this
            notebook (power_to_db with ref=np.max and librosa's default 80 dB
            floor) — adjust if the features are produced differently.

    Returns:
        A tf.data.Dataset yielding (text, mel) batches: text is a tf.string
        vector and mel is a float32 tensor of shape (batch, dim0, dim1), padded
        to the largest dim0 and dim1 within the batch.
    """
    with open(metadata_path, "r") as f:
        metadata = json.load(f)

    def _examples():
        # Load lazily so the whole dataset is never held in memory at once.
        for file_id, entry in metadata.items():
            mel = np.load(os.path.join(audio_path, f"{file_id}.npy"))
            yield entry["text"], mel.astype(np.float32)

    dataset = tf.data.Dataset.from_generator(
        _examples,
        output_signature=(
            tf.TensorSpec(shape=(), dtype=tf.string),
            tf.TensorSpec(shape=(None, None), dtype=tf.float32),
        ),
    )
    dataset = dataset.padded_batch(
        batch_size,
        padded_shapes=([], [None, None]),
        padding_values=(
            tf.constant("", dtype=tf.string),
            tf.constant(mel_pad_value, dtype=tf.float32),
        ),
    )
    return dataset


In [18]:
!git clone https://github.com/Rayhane-mamah/Tacotron-2.git /content/taco


Cloning into '/content/taco'...
remote: Enumerating objects: 1082, done.[K
remote: Counting objects: 100% (6/6), done.[K
remote: Compressing objects: 100% (6/6), done.[K
remote: Total 1082 (delta 2), reused 0 (delta 0), pack-reused 1076 (from 1)[K
Receiving objects: 100% (1082/1082), 9.16 MiB | 18.46 MiB/s, done.
Resolving deltas: 100% (694/694), done.


In [16]:
import tensorflow as tf
print(tf.__version__)


2.18.0


In [20]:
# Move into the cloned directory
%cd /content/taco


/content/taco


In [22]:
!apt-get install -y libsndfile1

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
libsndfile1 is already the newest version (1.0.31-2ubuntu0.2).
0 upgraded, 0 newly installed, 0 to remove and 29 not upgraded.


In [23]:
# Install the requirements file of the repository cloned into /content/taco.
# NOTE(review): the recorded log shows old pins (e.g. librosa==0.5.1,
# matplotlib==2.0.2, numpy==1.14.0, scipy==1.0.0) being fetched as source
# archives. Presumably these will not build on this runtime or will conflict
# with the TensorFlow 2.18 reported earlier — confirm before relying on this cell.
!pip install -r requirements.txt

Collecting falcon==1.2.0 (from -r requirements.txt (line 1))
  Using cached falcon-1.2.0-py2.py3-none-any.whl.metadata (21 kB)
Collecting inflect==0.2.5 (from -r requirements.txt (line 2))
  Using cached inflect-0.2.5-py2.py3-none-any.whl.metadata (50 kB)
Collecting audioread==2.1.5 (from -r requirements.txt (line 3))
  Using cached audioread-2.1.5.tar.gz (15 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting librosa==0.5.1 (from -r requirements.txt (line 4))
  Using cached librosa-0.5.1.tar.gz (1.5 MB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting matplotlib==2.0.2 (from -r requirements.txt (line 5))
  Using cached matplotlib-2.0.2.tar.gz (53.9 MB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting numpy==1.14.0 (from -r requirements.txt (line 6))
  Using cached numpy-1.14.0.zip (4.9 MB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting scipy==1.0.0 (from -r requirements.txt (line 7))
  Using cached scipy-1.0.0.tar.gz (15.2

In [13]:
# Example using a pre-built Tacotron 2 model from a repository like GitHub
# NOTE(review): the recorded output of this cell is
# "ModuleNotFoundError: No module named 'tacotron2_model'"; no cell visible in
# this notebook installs or defines a module with that name.
from tacotron2_model import Tacotron2  # Import your model

model = Tacotron2()  # Initialize model

# Compile the model with optimizer and loss functions
model.compile(optimizer='adam', loss='mean_squared_error')

# Train the model on your dataset
# NOTE(review): `train_data` is not assigned in any cell visible here —
# presumably it should come from load_data(...); confirm. If it is an already
# batched tf.data.Dataset, check whether `batch_size` may also be passed to fit().
model.fit(train_data, epochs=100, batch_size=32)


ModuleNotFoundError: No module named 'tacotron2_model'

In [40]:
!git config --global user.email "vishalmahender8@gmail.com"
!git config --global user.name "Vishal8500"


In [42]:
!git clone https://github.com/Vishal8500/Text-to-Speech-DL-Project.git


Cloning into 'Text-to-Speech-DL-Project'...
remote: Enumerating objects: 3, done.[K
remote: Counting objects: 100% (3/3), done.[K
remote: Compressing objects: 100% (2/2), done.[K
remote: Total 3 (delta 0), reused 0 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (3/3), done.


In [43]:
%cd Text-to-Speech-DL-Project


/content/git/Text-to-Speech-DL-Project


In [44]:
# Copy the notebook into the cloned repository and stage it.
# NOTE(review): the recorded output shows the copy failing ("cannot stat
# '/content/Text-to-Speech.ipynb'"), after which `git add` matches no file.
# The Colab badge at the top of this notebook names the file
# Text_to_Speech.ipynb (underscores) — confirm the intended filename and where
# the notebook is actually saved.
!cp /content/Text-to-Speech.ipynb .  # Copy notebook to repo folder
!git add Text-to-Speech.ipynb


cp: cannot stat '/content/Text-to-Speech.ipynb': No such file or directory
fatal: pathspec 'Text-to-Speech.ipynb' did not match any files
