# Download Audio Segments

In [3]:
import pandas as pd
from pytube import YouTube
from pytube.exceptions import AgeRestrictedError, VideoUnavailable
from pydub import AudioSegment
import os
import subprocess

# Load the dataset from CSV. The download loop further down reads the
# 'ytid', 'start_s' and 'end_s' columns of every row.
df = pd.read_csv('./data/musiccaps-public.csv')

# Function to download and extract audio segments using pytube
def download_audio_pytube(video_id, start_time, end_time, output_path):
    """Download a video's audio with pytube and export one segment as WAV.

    Args:
        video_id: YouTube video id, inserted into the watch URL.
        start_time: Segment start in seconds (converted to milliseconds
            for slicing).
        end_time: Segment end in seconds.
        output_path: Destination path of the exported WAV file.

    Returns:
        True when the segment was exported, and also when the video is age
        restricted or unavailable (those are reported and deliberately
        skipped). False on any other error, so the caller can try a
        fallback downloader.
    """
    # Relative name until the download succeeds; pytube then hands back the
    # path it actually wrote to.
    audio_path = "temp_audio.mp4"
    try:
        url = f"https://www.youtube.com/watch?v={video_id}"
        yt = YouTube(url)
        stream = yt.streams.filter(only_audio=True).first()
        if stream is None:
            # first() yields None when no stream matches the filter.
            print(f"An error occurred for video {video_id} with pytube: "
                  "no audio-only stream available")
            return False
        audio_path = stream.download(filename="temp_audio.mp4")
        audio = AudioSegment.from_file(audio_path)
        start_ms = start_time * 1000
        end_ms = end_time * 1000
        segment = audio[start_ms:end_ms]
        segment.export(output_path, format="wav")
    except AgeRestrictedError:
        print(f"Video {video_id} is age restricted and will be skipped.")
    except VideoUnavailable:
        print(f"Video {video_id} is unavailable and will be skipped.")
    except Exception as e:
        print(f"An error occurred for video {video_id} with pytube: {str(e)}")
        return False
    finally:
        # Always drop the temporary download, including after a failed
        # decode/export, so a stale file is never reused for another video.
        try:
            os.remove(audio_path)
        except FileNotFoundError:
            pass
        except OSError as e:
            print(f"Could not remove temporary file {audio_path}: {e}")
    return True

# Function to download and extract audio segments using yt-dlp
def download_audio_ytdlp(video_id, start_time, end_time, output_path):
    """Download a video's audio with the yt-dlp CLI and export one segment as WAV.

    Args:
        video_id: YouTube video id, inserted into the watch URL.
        start_time: Segment start in seconds (converted to milliseconds
            for slicing).
        end_time: Segment end in seconds.
        output_path: Destination path of the exported WAV file.

    Returns:
        True when the segment was exported, False when yt-dlp or the
        audio processing failed (the failure is printed, not raised).
    """
    temp_audio_path = "temp_audio.mp4"
    try:
        url = f"https://www.youtube.com/watch?v={video_id}"
        # A leftover file from an earlier failed attempt would make yt-dlp
        # treat the download as already done and we would slice the wrong
        # audio, so clear it first.
        if os.path.exists(temp_audio_path):
            os.remove(temp_audio_path)
        command = [
            'yt-dlp',
            '-f', 'bestaudio',
            '--output', temp_audio_path,
            url
        ]
        subprocess.run(command, check=True)
        audio = AudioSegment.from_file(temp_audio_path)
        start_ms = start_time * 1000
        end_ms = end_time * 1000
        segment = audio[start_ms:end_ms]
        segment.export(output_path, format="wav")
    except subprocess.CalledProcessError as e:
        print(f"yt-dlp command failed for video {video_id}: {e}")
        return False
    except Exception as e:
        print(f"An error occurred for video {video_id} with yt-dlp: {str(e)}")
        return False
    finally:
        # Clean up the temporary download on every path, not only on success.
        try:
            os.remove(temp_audio_path)
        except FileNotFoundError:
            pass
        except OSError as e:
            print(f"Could not remove temporary file {temp_audio_path}: {e}")
    return True

# Create output directory if not exists
os.makedirs('audio_segments', exist_ok=True)

# Load the list of completed downloads (one dataset index per line)
if os.path.exists('completed_downloads.txt'):
    with open('completed_downloads.txt', 'r') as file:
        completed_downloads = file.read().splitlines()
else:
    completed_downloads = []

# Set copy for O(1) membership tests inside the loop; the list above is
# kept unchanged for anything else that reads it.
completed_ids = set(completed_downloads)

# Loop through the dataset and download segments
for index, row in df.iterrows():
    video_id = row['ytid']
    start_time = row['start_s']
    end_time = row['end_s']
    output_path = f"audio_segments/{index}.wav"

    # Skip already downloaded files
    if f"{index}" in completed_ids:
        print(f"Skipping already downloaded file {index}")
        continue

    # Try pytube first; fall back to yt-dlp only when pytube reports failure.
    success = download_audio_pytube(video_id, start_time, end_time, output_path)
    if not success:
        download_audio_ytdlp(video_id, start_time, end_time, output_path)

    # Only record the index when a segment file actually exists; otherwise a
    # failed or skipped download would be marked as done and never retried.
    if not os.path.exists(output_path):
        print(f"No audio segment was produced for {index}; not marking it as completed")
        continue

    with open('completed_downloads.txt', 'a') as file:
        file.write(f"{index}\n")


Skipping already downloaded file 0
Skipping already downloaded file 1
Skipping already downloaded file 2
Skipping already downloaded file 3
Skipping already downloaded file 4
Skipping already downloaded file 5
Skipping already downloaded file 6
Skipping already downloaded file 7
Skipping already downloaded file 8
Skipping already downloaded file 9
Skipping already downloaded file 10
Skipping already downloaded file 11
Skipping already downloaded file 12
Skipping already downloaded file 13
Skipping already downloaded file 14
Skipping already downloaded file 15
Skipping already downloaded file 16
Skipping already downloaded file 17
Skipping already downloaded file 18
Skipping already downloaded file 19
Skipping already downloaded file 20
Skipping already downloaded file 21
Skipping already downloaded file 22
Skipping already downloaded file 23
Skipping already downloaded file 24
Skipping already downloaded file 25
Skipping already downloaded file 26
Skipping already downloaded file 27
Sk