In [3]:
from elevenlabs.client import ElevenLabs
from elevenlabs import play, save
import os
from dotenv import load_dotenv
import requests
from bs4 import BeautifulSoup
import pandas as pd
import random
import glob
from pydub import AudioSegment

In [None]:
# Load variables from a .env file (if one is present) into the environment
load_dotenv()

# Build the ElevenLabs client; the key is read from ELEVEN_LABS_KEY and is
# None when that variable is not set.
client = ElevenLabs(api_key=os.environ.get("ELEVEN_LABS_KEY"))

## Database Generation

## Get All the Voices from the official website
The voice list is scraped from the documentation page because the library itself only returns the default voices and leaves out the legacy ones.

In [4]:
# Scrape the voice tables from the ElevenLabs documentation page and combine
# them into a single DataFrame (`final_df`).

# Step 1: Send a GET request to the page
url = "https://elevenlabs.io/docs/voices/default-voices"
# Without a timeout, requests waits indefinitely on an unresponsive server.
response = requests.get(url, timeout=30)
# Stop on HTTP errors instead of parsing an error page as if it were the docs.
response.raise_for_status()

# Step 2: Parse the HTML content with BeautifulSoup
soup = BeautifulSoup(response.text, 'html.parser')

# Step 3: Find the specific tables (selected by their width/height attributes)
tables = soup.find_all('table', {'width': '100%', 'height': '500'})

# One DataFrame per table that has both headers and data rows
all_data = []

# Step 4: Process each table found
for table in tables:
    # Step 4.1: Extract headers if they exist
    headers = [th.get_text(strip=True) for th in table.find_all('th')]

    # A table without headers cannot be given column names, so skip it
    if not headers:
        continue

    # Step 4.2: Extract data rows; rows without any <td> cell are ignored
    rows_data = []
    rows = table.find_all('tr')
    for row in rows:
        cells = [cell.get_text(strip=True) for cell in row.find_all('td')]
        if cells:
            rows_data.append(cells)

    # Step 4.3: Create a DataFrame for the current table
    if rows_data:
        df = pd.DataFrame(rows_data, columns=headers)  # Assign headers as column names
        all_data.append(df)

# Step 5: Combine all extracted tables into one DataFrame
if not all_data:
    # pd.concat([]) raises a ValueError that does not say what went wrong;
    # raise the same exception type with an actionable message instead.
    raise ValueError(
        f"No voice tables with headers and data rows were found at {url}; "
        "the page layout may have changed."
    )
final_df = pd.concat(all_data, ignore_index=True)
final_df.head()

Unnamed: 0,name,voice_id,gender,age,accent,description,use_case,preview_url
0,Alice,Xb7hH8MSUJpSbSDYk0k2,female,middle-aged,British,confident,news,Sample
1,Aria,9BWtsMINqrJLrRacOk9x,female,middle-aged,American,expressive,social media,Sample
2,Bill,pqHfZKP75CvOlQylNhV4,male,old,American,trustworthy,narration,Sample
3,Brian,nPczCjzI2devNBz1zQrb,male,middle-aged,American,deep,narration,Sample
4,Callum,N2lVS1w4EtoT3dr4eOWO,male,middle-aged,Transatlantic,intense,characters,Sample


In [5]:
# Persist the scraped voice table to CSV, without writing the index column
final_df.to_csv('ElevenLabs_All_Voices.csv', index=False)

In [53]:
# Sanity check: dimensions of the voice table as (rows, columns)
final_df.shape

(51, 8)

In [6]:
# Load the voice table from the CSV file into `df`
df = pd.read_csv('ElevenLabs_All_Voices.csv')

## Generate the "Hey Shadow" Database

In [4]:
# Casing and punctuation variants of the "Hey shadow" wake phrase
text_variations = [
    "Hey Shadow!",
    "Hey shadow",
    "Hey Shadow",
    "hey shadow",
    "hey shadow!",
    "hey shadow ?",
    "Hey shadow ?",
]

def GenerateDataset(df, num_variations=5):
    """Generate "hey shadow" clips for every voice listed in `df`.

    For each entry of ``df['voice_id']`` the function requests
    `num_variations` clips from the module-level `client`, each speaking a
    phrase picked at random from `text_variations`, and saves them under the
    ``hey_shadow_dataset`` directory as ``hey_shadow_<voice_id>_<i>.mp3``.

    Args:
        df: Table whose 'voice_id' column holds the voice IDs to use.
        num_variations: Number of clips to generate per voice.
    """
    output_dir = "hey_shadow_dataset"
    os.makedirs(output_dir, exist_ok=True)

    for voice_id in df['voice_id']:
        for take in range(num_variations):
            # A fresh random phrase is drawn for every single clip
            phrase = random.choice(text_variations)
            clip = client.generate(text=phrase, voice=voice_id)

            # voice_id + take index keeps file names unique within one run
            destination = os.path.join(output_dir, f"hey_shadow_{voice_id}_{take}.mp3")
            save(clip, destination)

# Generating the dataset with "Hey Shadow"
# GenerateDataset(final_df)

In [1]:
# Count the .mp3 files in the hey_shadow_dataset directory
path = 'hey_shadow_dataset'
# glob.glob1 is an undocumented helper that is deprecated since Python 3.13;
# the documented glob.glob returns the same matches. glob.escape keeps the
# directory name literal, as glob1 treated it.
num_files = len(glob.glob(os.path.join(glob.escape(path), "*.mp3")))
print(num_files)

262


## Generate a Dataset of Random Words and Phrases Unrelated to "Hey Shadow"

In [7]:
# Negative-class phrases: short everyday utterances that do not contain
# the "hey shadow" wake phrase.
not_hey_shadow_texts = [
    "Good morning!",
    "Hello",
    "Hello there",
    "Good afternoon!",
    "Good evening!",
    "What time is it?",
    "What's up?",
    "how's the weather?",
    "Thank you!",
    "can you help me?",
    "can you hear me?"
]

# Number of samples to generate for the "not_hey_shadow" dataset.
# NOTE(review): 262 presumably mirrors the number of clips counted in
# hey_shadow_dataset so both classes are balanced - confirm.
target_samples = 262

def GenerateNotHeyShadowDataset(df, target_samples):
    """Generate `target_samples` clips of phrases unrelated to "hey shadow".

    Voices from ``df['voice_id']`` are used in round-robin order until the
    requested number of clips exists. Every clip speaks a phrase picked at
    random from `not_hey_shadow_texts` and is saved under the
    ``not_hey_shadow_dataset`` directory as
    ``not_hey_shadow_<voice_id>_<sample index>.mp3``.

    Args:
        df: Table whose 'voice_id' column holds the voice IDs to cycle through.
        target_samples: Total number of clips to generate.

    Raises:
        ValueError: If clips are requested but `df` contains no voice IDs.
    """
    output_dir = "not_hey_shadow_dataset"
    os.makedirs(output_dir, exist_ok=True)

    # Nothing to generate: leave before touching the voice list at all
    if target_samples <= 0:
        return

    voice_ids = list(df['voice_id'])
    if not voice_ids:
        # With no voices the sample counter can never advance, so looping
        # until the target is reached would never terminate.
        raise ValueError("df['voice_id'] is empty; cannot generate any samples.")

    samples_created = 0
    while samples_created < target_samples:
        # Round-robin over the voices: sample k uses voice k mod len(voice_ids)
        voice_id = voice_ids[samples_created % len(voice_ids)]

        # Select a random phrase from the unrelated text list
        text = random.choice(not_hey_shadow_texts)

        # Generate the audio with the selected voice ID and text
        audio = client.generate(
            text=text,
            voice=voice_id
        )

        # The running sample index keeps every file name unique
        file_path = os.path.join(output_dir, f"not_hey_shadow_{voice_id}_{samples_created}.mp3")
        save(audio, file_path)

        samples_created += 1


# Build the "not hey shadow" dataset. NOTE: this call runs every time the cell
# is executed and requests one generated clip per sample from the client.
GenerateNotHeyShadowDataset(df, target_samples)


### Checking Whether FFmpeg Works

In [None]:
#Checking if FFmpeg Framework works or not


# import os
# import subprocess

# # Set the ffmpeg path in the environment variable for the duration of the script
# os.environ["PATH"] += os.pathsep + r"C:\ffmpeg\ffmpeg-2024-10-27-git-bb57b78013-full_build\bin"

# from pydub import AudioSegment

# # Load a single audio file as a test
# try:
#     audio = AudioSegment.from_file(r"C:\Users\rouka\Desktop\hey_shadow\hey_shadow_dataset\hey_shadow_5Q0t7uMcjvnagumLfvZi_1.mp3")
#     print(f"Loaded file duration: {len(audio)} milliseconds")
# except Exception as e:
#     print(f"Error: {e}")


In [18]:
# Make the ffmpeg binaries discoverable for the duration of this kernel session.
# The directory is appended only when it is not already a PATH entry, so
# re-running the cell does not keep growing PATH, and an unset PATH is tolerated.
ffmpeg_bin_dir = r"C:\ffmpeg\ffmpeg-2024-10-27-git-bb57b78013-full_build\bin"
current_path = os.environ.get("PATH", "")
if ffmpeg_bin_dir not in current_path.split(os.pathsep):
    os.environ["PATH"] = (
        current_path + os.pathsep + ffmpeg_bin_dir if current_path else ffmpeg_bin_dir
    )

## Data Preparation (Adjusting all the voices to exactly 1 second)

In [19]:
def AdjustDuration(input_directory, output_directory, target_duration):
    """Trim or pad every .mp3 file in a directory to a fixed duration.

    Each file in `input_directory` whose name ends with ".mp3" is loaded,
    cut to `target_duration` when it is longer, or extended with trailing
    silence otherwise, and then exported as mp3 under the same file name in
    `output_directory` (created if missing).

    Args:
        input_directory: Directory containing the source .mp3 files.
        output_directory: Directory that receives the adjusted files.
        target_duration: Desired clip length in milliseconds.
    """
    os.makedirs(output_directory, exist_ok=True)
    for filename in os.listdir(input_directory):
        if not filename.endswith(".mp3"):
            continue

        # Load the audio file
        audio = AudioSegment.from_file(os.path.join(input_directory, filename))

        if len(audio) > target_duration:
            # Trim if the audio is longer than the target
            audio = audio[:target_duration]
        else:
            # Pad with trailing silence up to the target length
            audio = audio + AudioSegment.silent(duration=target_duration - len(audio))

        # export() returns the open output file handle; close it explicitly
        # so handles do not pile up while processing many files.
        output_path = os.path.join(output_directory, filename)
        audio.export(output_path, format="mp3").close()

    # Report the duration that was actually requested rather than a fixed "1 second"
    seconds = target_duration / 1000
    unit = "second" if seconds == 1 else "seconds"
    print(f"All audio files have been adjusted to {seconds:g} {unit}.")

In [17]:
# Trim/pad the "hey shadow" clips to a fixed length.
# The directories are relative to the notebook's working directory, matching
# the relative dataset directory used by the generation and counting cells,
# instead of an absolute path that only exists on one machine.
input_directory = "hey_shadow_dataset"
output_directory = "hey_shadow_dataset_1s"
# Desired duration in milliseconds
target_duration = 1000
AdjustDuration(input_directory, output_directory, target_duration)

All audio files have been adjusted to 1 second.


In [20]:
# Trim/pad the "not hey shadow" clips to a fixed length.
# The directories are relative to the notebook's working directory, matching
# the relative output directory used by the generation cell, instead of an
# absolute path that only exists on one machine.
input_directory = "not_hey_shadow_dataset"
output_directory = "not_hey_shadow_dataset_1s"
# Desired duration in milliseconds
target_duration = 1000
AdjustDuration(input_directory, output_directory, target_duration)

All audio files have been adjusted to 1 second.


## Data Augmentation

## Data Preprocessing

## Modeling