<a href="https://colab.research.google.com/github/Vishal8500/Text-to-Speech-DL-Project/blob/main/Text_to_Speech.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install librosa numpy pandas matplotlib tqdm



In [2]:
# Folder that holds the LJSpeech-1.1 dataset. This is a Google Drive path, so
# presumably Drive must already be mounted at /content/drive (the mount step is
# not shown in this part of the notebook — TODO confirm).
DATASET_PATH = "/content/drive/MyDrive/LJSpeech-1.1"

In [5]:
import csv
import json
import os

import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from tqdm import tqdm

# Set dataset paths
DATASET_PATH = "/content/drive/MyDrive/LJSpeech-1.1"  # Update this if needed
AUDIO_PATH = os.path.join(DATASET_PATH, "wavs")
METADATA_FILE = os.path.join(DATASET_PATH, "metadata.csv")
OUTPUT_PATH = "/content/processed_data"

# Create output directory
os.makedirs(OUTPUT_PATH, exist_ok=True)

# Load metadata: pipe-separated, no header row, three columns per clip.
# quoting=csv.QUOTE_NONE makes pandas treat '"' as an ordinary character.
# With the default quoting, a transcript that opens with a double quote is
# parsed as a quoted field, which can swallow the following "|" separator:
# the rest of the line then lands in "transcript" and "normalized_text"
# comes back as NaN (and would be silently replaced by "" below).
df = pd.read_csv(
    METADATA_FILE,
    sep="|",
    header=None,
    names=["file_id", "transcript", "normalized_text"],
    quoting=csv.QUOTE_NONE,
)

# Defensive: guarantee every normalized_text value is a string, so the
# text-cleaning step can call str methods on it without failing on NaN.
df["normalized_text"] = df["normalized_text"].fillna("").astype(str)

# Function to clean text
def clean_text(text):
    """Lower-case and trim ``text``, then delete every ``,``, ``.``, ``;`` and ``?``.

    Whitespace is trimmed *before* the punctuation is removed, so a space that
    sat in front of a trailing punctuation mark is kept in the result.
    """
    normalized = text.lower().strip()
    # Translation table whose third argument lists the characters to delete.
    drop_punctuation = str.maketrans("", "", ",.;?")
    return normalized.translate(drop_punctuation)

# Derive the cleaned transcript column by running clean_text on each normalized text
df["clean_text"] = df["normalized_text"].map(clean_text)

# Function to convert audio to Mel-spectrogram
def process_audio(file_id, sr=22050, n_mels=80):
    """Load one clip and return its Mel-spectrogram on a decibel scale.

    Args:
        file_id: Clip name without extension; the file read is
            ``<AUDIO_PATH>/<file_id>.wav``.
        sr: Sample rate passed both to ``librosa.load`` and to the Mel transform.
        n_mels: Number of Mel bands requested from ``melspectrogram``.

    Returns:
        The result of ``librosa.power_to_db`` applied to the Mel power
        spectrogram with ``ref=np.max``.
    """
    wav_file = os.path.join(AUDIO_PATH, f"{file_id}.wav")
    # librosa.load returns (samples, sample_rate); only the samples are needed.
    samples = librosa.load(wav_file, sr=sr)[0]

    mel_power = librosa.feature.melspectrogram(y=samples, sr=sr, n_mels=n_mels)
    return librosa.power_to_db(mel_power, ref=np.max)

# Process all audio files and save
mel_data = {}

# Walk the two needed columns directly. DataFrame.iterrows() would build a
# fresh Series object for every row just to read these two values.
for file_id, clean_transcript in tqdm(
    zip(df["file_id"], df["clean_text"]), total=len(df)
):
    mel_spec_db = process_audio(file_id)

    # Save spectrogram as numpy file
    np.save(os.path.join(OUTPUT_PATH, f"{file_id}.npy"), mel_spec_db)

    # Store metadata (text and spectrogram shape); json.dump writes the shape
    # tuple as a JSON array.
    mel_data[file_id] = {"text": clean_transcript, "mel_shape": mel_spec_db.shape}

# Save metadata to JSON (explicit encoding so the file does not depend on the
# platform's default text encoding)
with open(os.path.join(OUTPUT_PATH, "metadata.json"), "w", encoding="utf-8") as f:
    json.dump(mel_data, f, indent=4)

print("✅ Preprocessing complete! Mel-spectrograms and metadata saved.")


100%|██████████| 13100/13100 [2:53:43<00:00,  1.26it/s]

✅ Preprocessing complete! Mel-spectrograms and metadata saved.





In [6]:
import shutil
from google.colab import files

# Bundle the processed_data folder into a zip archive next to it
shutil.make_archive(
    base_name='/content/processed_data',
    format='zip',
    root_dir='/content/processed_data',
)

# Hand the archive to the browser for download
files.download('/content/processed_data.zip')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [7]:
# Install required libraries
!pip install tensorflow librosa matplotlib




In [8]:
# Example: TensorFlow Dataset loading (you can use PyTorch similarly)
import tensorflow as tf

def load_data(metadata_path, audio_path, batch_size=32, pad_value=-80.0):
    """Build a batched ``tf.data.Dataset`` of ``(text, mel_spectrogram)`` pairs.

    Spectrograms are read from disk lazily, one ``.npy`` file per example, and
    each batch is padded to the largest spectrogram it contains.
    ``tf.data.Dataset.from_tensor_slices`` cannot be used here: it needs all
    arrays to share one shape, and it would require every spectrogram to be
    held in memory at once.

    Args:
        metadata_path: Path to a JSON file mapping ``file_id`` to a dict that
            has at least a ``"text"`` entry.
        audio_path: Directory containing one ``<file_id>.npy`` per metadata key.
            Each file is assumed to hold a 2-D array — TODO confirm for data
            not produced by this notebook.
        batch_size: Number of examples per batch.
        pad_value: Value used to pad shorter spectrograms. The default of -80.0
            fits spectrograms made with ``librosa.power_to_db(..., ref=np.max)``
            and librosa's default ``top_db=80``, where 0 dB is the peak and
            -80 dB the floor; padding with 0 would look like peak energy.

    Returns:
        A ``tf.data.Dataset`` yielding ``(texts, mels)`` batches: ``texts`` is a
        1-D string tensor and ``mels`` a float32 tensor of shape
        ``(batch, dim0, dim1)`` padded with ``pad_value``.
    """
    with open(metadata_path, "r") as f:
        metadata = json.load(f)

    # Freeze the order once so texts and spectrogram files stay aligned.
    file_ids = list(metadata)
    texts = [metadata[file_id]["text"] for file_id in file_ids]

    def _examples():
        # Yields one (text, spectrogram) pair at a time, loading the array on demand.
        for file_id, text in zip(file_ids, texts):
            mel = np.load(os.path.join(audio_path, f"{file_id}.npy"))
            yield text, mel.astype(np.float32, copy=False)

    dataset = tf.data.Dataset.from_generator(
        _examples,
        output_signature=(
            tf.TensorSpec(shape=(), dtype=tf.string),
            tf.TensorSpec(shape=(None, None), dtype=tf.float32),
        ),
    )
    # A generator-backed dataset has unknown length; declare it so that
    # len(dataset) and progress reporting keep working.
    dataset = dataset.apply(tf.data.experimental.assert_cardinality(len(file_ids)))

    dataset = dataset.padded_batch(
        batch_size,
        padded_shapes=([], [None, None]),
        padding_values=(
            tf.constant("", dtype=tf.string),
            tf.constant(pad_value, dtype=tf.float32),
        ),
    )
    return dataset
