# Scrape Youtube for audio data of celebs

In [4]:
!pip install yt-dlp youtube-search-python SpeechRecognition pydub moviepy

Collecting yt-dlp
  Downloading yt_dlp-2024.10.22-py3-none-any.whl.metadata (171 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m171.6/171.6 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting youtube-search-python
  Downloading youtube_search_python-1.6.6-py3-none-any.whl.metadata (99 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.5/99.5 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting SpeechRecognition
  Downloading SpeechRecognition-3.11.0-py2.py3-none-any.whl.metadata (28 kB)
Collecting brotli (from yt-dlp)
  Downloading Brotli-1.1.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl.metadata (5.5 kB)
Collecting mutagen (from yt-dlp)
  Downloading mutagen-1.47.0-py3-none-any.whl.metadata (1.7 kB)
Collecting pycryptodomex (from yt-dlp)
  Downloading pycryptodomex-3.21.0-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.4 kB)
Collecting websockets>=13

In [None]:
import os
import yt_dlp
import speech_recognition as sr
from youtubesearchpython import VideosSearch
import moviepy.editor as mp
import subprocess

# Install the dependencies
#!pip install yt-dlp youtube-search-python SpeechRecognition pydub moviepy

# Function to search YouTube and get URLs for given celebrity names
def get_youtube_urls(celeb_names, max_results=10):
    celeb_videos = {}
    for celeb in celeb_names:
        search = VideosSearch(celeb, limit=max_results)
        results = search.result()['result']
        urls = [video['link'] for video in results]
        celeb_videos[celeb] = urls
    return celeb_videos

# Function to download audio from YouTube URL and save it
def download_audio_from_url(url, save_path):
    ydl_opts = {
        'format': 'bestaudio/best',
        'outtmpl': save_path,
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'mp3',
            'preferredquality': '192',
        }],
        'no_check_certificate': True,  # Bypass SSL certificate issues
        'cookies': 'cookies.txt'  # Load cookies to handle age-restricted or logged-in content
    }
    try:
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            ydl.download([url])
    except yt_dlp.utils.DownloadError as e:
        print(f"Download error: {e}")


# Main function to get the audio dataset for each celebrity
def get_audio_datasets(celebs):
    celeb_videos = get_youtube_urls(celebs)
    for celeb, urls in celeb_videos.items():
        celeb_folder = f"audio_dataset/{celeb.replace(' ', '_')}"
        os.makedirs(celeb_folder, exist_ok=True)

        for index, url in enumerate(urls):
            audio_path = f"{celeb_folder}/{celeb.replace(' ', '_')}_raw_{index}.mp3"
            filtered_path = f"{celeb_folder}/{celeb.replace(' ', '_')}_filtered_{index}.mp3"

            print(f"Downloading audio for {celeb} from URL: {url}")
            download_audio_from_url(url, audio_path)

# Define celebrities for dataset collection
celebrities = ["Barack Obama", "Taylor Swift", "Donald Trump", "Elon Musk", "Jensen Huang"]
get_audio_datasets(celebrities)

print("Audio dataset collection completed.")

# Separate audio of separate people

In [5]:
# Install the dependencies
!pip install yt-dlp youtube-search-python SpeechRecognition pydub moviepy
!pip install pyannote.audio torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu116
!pip install speechbrain

Looking in indexes: https://pypi.org/simple, https://download.pytorch.org/whl/cu116
Collecting pyannote.audio
  Downloading pyannote.audio-3.3.2-py2.py3-none-any.whl.metadata (11 kB)
Collecting asteroid-filterbanks>=0.4 (from pyannote.audio)
  Downloading asteroid_filterbanks-0.4.0-py3-none-any.whl.metadata (3.3 kB)
Collecting lightning>=2.0.1 (from pyannote.audio)
  Downloading lightning-2.4.0-py3-none-any.whl.metadata (38 kB)
Collecting omegaconf<3.0,>=2.1 (from pyannote.audio)
  Downloading omegaconf-2.3.0-py3-none-any.whl.metadata (3.9 kB)
Collecting pyannote.core>=5.0.0 (from pyannote.audio)
  Downloading pyannote.core-5.0.0-py3-none-any.whl.metadata (1.4 kB)
Collecting pyannote.database>=5.0.1 (from pyannote.audio)
  Downloading pyannote.database-5.1.0-py3-none-any.whl.metadata (1.2 kB)
Collecting pyannote.metrics>=3.2 (from pyannote.audio)
  Downloading pyannote.metrics-3.2.1-py3-none-any.whl.metadata (1.3 kB)
Collecting pyannote.pipeline>=3.0.1 (from pyannote.audio)
  Downloadi



In [7]:
import os
import yt_dlp
import speech_recognition as sr
from youtubesearchpython import VideosSearch
import moviepy.editor as mp
import subprocess

from pyannote.audio import Pipeline

# Put your Hugging face account token at 'HuggingFaceToken' below
pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization@2.1",
                                    use_auth_token="HuggingFaceToken")

# Initialize the pipeline
try:
    diarization_pipeline = pipeline
except Exception as e:
    print(f"Error initializing diarization pipeline: {e}")
    diarization_pipeline = None  # Explicitly set to None if initialization fails

def separate_speakers(audio_path, save_folder):
    if diarization_pipeline is None:
        print("Diarization pipeline is not initialized. Skipping speaker separation.")
        return

    diarization = diarization_pipeline(audio_path)
    audio = mp.AudioFileClip(audio_path)
    audio_duration = audio.duration  # Get the duration of the audio clip

    for turn, _, speaker in diarization.itertracks(yield_label=True):
        start_time = max(turn.start, 0)
        end_time = min(turn.end, audio_duration)

        # Skip segments where start time is equal to or greater than end time
        if start_time >= end_time:
            continue

        segment_audio = audio.subclip(start_time, end_time)
        speaker_folder = os.path.join(save_folder, speaker)
        os.makedirs(speaker_folder, exist_ok=True)
        segment_path = os.path.join(speaker_folder, f"{speaker}_{start_time:.2f}_{end_time:.2f}.wav")
        segment_audio.write_audiofile(segment_path)

def iterate_audio_files(folder_path):
    for root, _, files in os.walk(folder_path):
        for file in files:
            if file.endswith((".wav", ".mp3")):
                audio_file_path = os.path.join(root, file)
                print(f"Processing {audio_file_path}")
                save_folder = "/content/separated_voices/Taylor"  # Update this path
                separate_speakers(audio_file_path, save_folder)

# Example usage: Replace with your folder path
folder_path = "/content/audio_dataset/Taylor_Swift"
iterate_audio_files(folder_path)



Could not download 'pyannote/speaker-diarization' pipeline.
It might be because the pipeline is private or gated so make
sure to authenticate. Visit https://hf.co/settings/tokens to
create your access token and retry with:

   >>> Pipeline.from_pretrained('pyannote/speaker-diarization',
   ...                          use_auth_token=YOUR_AUTH_TOKEN)

If this still does not work, it might be because the pipeline is gated:
visit https://hf.co/pyannote/speaker-diarization to accept the user conditions.
Processing /content/audio_dataset/Taylor_Swift/Taylor_Swift_raw_0.mp3.mp3
Diarization pipeline is not initialized. Skipping speaker separation.
Processing /content/audio_dataset/Taylor_Swift/Taylor_Swift_raw_1.mp3.mp3
Diarization pipeline is not initialized. Skipping speaker separation.
Processing /content/audio_dataset/Taylor_Swift/Taylor_Swift_raw_2.mp3.mp3
Diarization pipeline is not initialized. Skipping speaker separation.
Processing /content/audio_dataset/Taylor_Swift/Taylor_Swift_ra

In [1]:
import os
import yt_dlp
from youtubesearchpython import VideosSearch

# Function to search YouTube and get URLs for given celebrity names
def get_youtube_urls(celeb_names, max_results=10, search_suffix="interview"):
    celeb_videos = {}
    for celeb in celeb_names:
        search_query = f"{celeb} {search_suffix}"
        search = VideosSearch(search_query, limit=max_results)
        results = search.result()['result']
        urls = [video['link'] for video in results]
        celeb_videos[celeb] = urls
    return celeb_videos

# Function to download audio from YouTube URL and save it
def download_audio_from_url(url, save_path):
    ydl_opts = {
        'format': 'bestaudio/best',
        'outtmpl': save_path,
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'mp3',
            'preferredquality': '192',
        }],
        'no_check_certificate': True,  # Bypass SSL certificate issues
        'cookies': 'cookies.txt'  # Load cookies to handle age-restricted or logged-in content
    }
    try:
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            ydl.download([url])
    except yt_dlp.utils.DownloadError as e:
        print(f"Download error: {e}")

# Main function to get the audio dataset for each celebrity
def get_audio_datasets(celebs, search_suffix="interview"):
    celeb_videos = get_youtube_urls(celebs, search_suffix=search_suffix)
    for celeb, urls in celeb_videos.items():
        celeb_folder = f"audio_dataset/{celeb.replace(' ', '_')}"
        os.makedirs(celeb_folder, exist_ok=True)

        for index, url in enumerate(urls):
            audio_path = f"{celeb_folder}/{celeb.replace(' ', '_')}_raw_{index}.mp3"

            print(f"Downloading audio for {celeb} from URL: {url}")
            download_audio_from_url(url, audio_path)

# Define celebrities for dataset collection
celebrities = [ "Taylor Swift"]
get_audio_datasets(celebrities, search_suffix="speech")

print("Audio dataset collection completed.")


Downloading audio for Taylor Swift from URL: https://www.youtube.com/watch?v=OBG50aoUwlI
[youtube] Extracting URL: https://www.youtube.com/watch?v=OBG50aoUwlI
[youtube] OBG50aoUwlI: Downloading webpage
[youtube] OBG50aoUwlI: Downloading ios player API JSON
[youtube] OBG50aoUwlI: Downloading mweb player API JSON
[youtube] OBG50aoUwlI: Downloading player fb725ac8
[youtube] OBG50aoUwlI: Downloading m3u8 information
[info] OBG50aoUwlI: Downloading 1 format(s): 251
[download] Destination: audio_dataset/Taylor_Swift/Taylor_Swift_raw_0.mp3
[download] 100% of   21.20MiB in 00:00:00 at 26.28MiB/s  
[ExtractAudio] Destination: audio_dataset/Taylor_Swift/Taylor_Swift_raw_0.mp3.mp3
Deleting original file audio_dataset/Taylor_Swift/Taylor_Swift_raw_0.mp3 (pass -k to keep)
Downloading audio for Taylor Swift from URL: https://www.youtube.com/watch?v=ZVpkFb9-fts
[youtube] Extracting URL: https://www.youtube.com/watch?v=ZVpkFb9-fts
[youtube] ZVpkFb9-fts: Downloading webpage
[youtube] ZVpkFb9-fts: Downl

In [3]:
!zip -r /content/clips.zip /content/output_clips

  adding: content/output_clips/ (stored 0%)
  adding: content/output_clips/clip_75.mp3 (deflated 3%)
  adding: content/output_clips/clip_56.mp3 (deflated 2%)
  adding: content/output_clips/clip_57.mp3 (deflated 2%)
  adding: content/output_clips/clip_78.mp3 (deflated 4%)
  adding: content/output_clips/clip_86.mp3 (deflated 3%)
  adding: content/output_clips/clip_79.mp3 (deflated 3%)
  adding: content/output_clips/clip_10.mp3 (deflated 1%)
  adding: content/output_clips/clip_55.mp3 (deflated 2%)
  adding: content/output_clips/clip_76.mp3 (deflated 3%)
  adding: content/output_clips/clip_50.mp3 (deflated 2%)
  adding: content/output_clips/clip_3.mp3 (deflated 1%)
  adding: content/output_clips/clip_11.mp3 (deflated 1%)
  adding: content/output_clips/clip_64.mp3 (deflated 1%)
  adding: content/output_clips/clip_53.mp3 (deflated 2%)
  adding: content/output_clips/clip_85.mp3 (deflated 2%)
  adding: content/output_clips/clip_77.mp3 (deflated 3%)
  adding: content/output_clips/clip_23.mp3 (d

In [2]:
pip install pydub ffmpeg

Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting ffmpeg
  Downloading ffmpeg-1.4.tar.gz (5.1 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Building wheels for collected packages: ffmpeg
  Building wheel for ffmpeg (setup.py) ... [?25l[?25hdone
  Created wheel for ffmpeg: filename=ffmpeg-1.4-py3-none-any.whl size=6082 sha256=1924f4aa657e074d22582f30cc186eaca96d0700b9f644fb816c47878d50e52d
  Stored in directory: /root/.cache/pip/wheels/8e/7a/69/cd6aeb83b126a7f04cbe7c9d929028dc52a6e7d525ff56003a
Successfully built ffmpeg
Installing collected packages: pydub, ffmpeg
Successfully installed ffmpeg-1.4 pydub-0.25.1


In [36]:
from pydub import AudioSegment
import math
import os

def split_audio(file_path, output_folder, clip_length=2100):
    """
    Splits the audio file into clips of specified length.

    :param file_path: Path to the input audio file.
    :param output_folder: Directory to save the output clips.
    :param clip_length: Length of each clip in milliseconds.
    """
    # Load the audio file
    audio = AudioSegment.from_file(file_path)
    total_length = len(audio)
    num_clips = math.ceil(total_length / clip_length)

    # Create the output folder if it doesn't exist
    os.makedirs(output_folder, exist_ok=True)

    for i in range(num_clips):
        start_time = i * clip_length
        end_time = min((i + 1) * clip_length, total_length)
        clip = audio[start_time:end_time]
        output_file = os.path.join(output_folder, f"clip_{i+1}.mp3")
        clip.export(output_file, format="mp3")
        print(f"Exported {output_file}")

# Example usage
if __name__ == "__main__":
    input_file = "/content/raw/clip_91.mp3"  # Replace with your audio file path
    output_dir = "output"
    split_audio(input_file, output_dir)


Exported output/clip_1.mp3
Exported output/clip_2.mp3
Exported output/clip_3.mp3
Exported output/clip_4.mp3
Exported output/clip_5.mp3


In [37]:
from pydub import AudioSegment
import os

def join_audio(input_folder, output_file):
    """
    Joins multiple audio files from a folder into a single audio file.

    :param input_folder: Directory containing the audio files to join.
    :param output_file: Path to the output audio file.
    """
    # List all audio files in the input folder
    audio_files = [f for f in os.listdir(input_folder) if f.endswith('.mp3') or f.endswith('.wav')]
    audio_files.sort()  # Ensure files are in order

    # Initialize an empty AudioSegment
    combined = AudioSegment.empty()

    for file_name in audio_files:
        file_path = os.path.join(input_folder, file_name)
        print(f"Adding {file_path}")
        audio = AudioSegment.from_file(file_path)
        combined += audio

    # Export the combined audio
    combined.export(output_file, format="mp3")
    print(f"Exported combined audio to {output_file}")

# Example usage
if __name__ == "__main__":
    input_dir = "/content/output"  # Replace with your input directory path
    output_audio = "clip_91.mp3"  # Replace with your desired output file path
    join_audio(input_dir, output_audio)


Adding /content/output/clip_1.mp3
Adding /content/output/clip_2.mp3
Adding /content/output/clip_3.mp3
Adding /content/output/clip_4.mp3
Exported combined audio to clip_91.mp3
