In [1]:
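# numpy==1.3.5  # FIXME: this looks copied from the pandas pin below and does not appear to be a real numpy release; confirm the version actually used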
# pandas==1.3.5
# youtube-transcript-api==0.6.2
# yt-dlp==2023.11.16

# pip install youtube-transcript-api
# python3 -m pip install -U yt-dlp

In [2]:
import json
import io
import os
import html
from youtube_transcript_api import YouTubeTranscriptApi
from youtube_transcript_api.formatters import TextFormatter
from youtube_transcript_api.formatters import JSONFormatter
from pathlib import Path
import subprocess
import shutil
import time
import numpy as np
import pandas as pd

# Data Collection & Filtering

In [3]:
def search_videos(keywords, ytdlp_path, region_code, max_results):
    """Search YouTube with yt-dlp and collect the ids of the matching videos.

    Args:
        keywords: Iterable of search phrases. A single string is also accepted
            and is split on commas, so "Gaeilge,Irish language" yields two
            searches instead of one search per character.
        ytdlp_path: Path (or command name) of the yt-dlp executable.
        region_code: Country code passed to ``--geo-bypass-country``.
        max_results: Maximum number of results requested per keyword.

    Returns:
        list[str]: Video ids in the order they were first seen, without blank
        entries and without duplicates (the same video is often returned for
        several keywords).
    """
    if isinstance(keywords, str):
        keywords = keywords.split(',')

    video_ids = []
    seen = set()

    for keyword in keywords:
        keyword = keyword.strip()
        if not keyword:
            continue

        command = [
            ytdlp_path,
            '--geo-bypass-country', region_code,
            '--get-id',
            f'ytsearch{max_results}:{keyword}',
        ]
        result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)

        # A non-zero exit code does not necessarily mean there is nothing on
        # stdout, so report the failure but still parse whatever was printed.
        if result.returncode != 0:
            print(f"yt-dlp search failed for keyword '{keyword}' (exit code {result.returncode})")

        for line in (result.stdout or '').splitlines():
            video_id = line.strip()
            if video_id and video_id not in seen:
                seen.add(video_id)
                video_ids.append(video_id)

    return video_ids

# Transcript Extraction and Filtering

In [4]:
def create_transcript_text_files(video_ids, main_output_directory, language):
    """Write a plain-text transcript for every video that has one in `language`.

    For each id the transcript is stored at
    ``<main_output_directory>/<video_id>/<video_id>.txt``. Videos without a
    transcript in the requested language are skipped and counted.

    Args:
        video_ids: Iterable of YouTube video ids. Blank and repeated ids are
            ignored.
        main_output_directory: Root directory of the dataset.
        language: Language code requested from the transcript API.

    Returns:
        list[str]: Ids that have a transcript file on disk after this call.
        This includes ids whose file already existed, so that a re-run of the
        pipeline can resume the later steps (audio, JSON) for them.
    """
    irish_video_ids = []
    seen = set()
    skip = 0

    for video_id in video_ids:
        # An empty id would resolve to the root directory itself; a repeated id
        # would be reported twice.
        if not video_id or video_id in seen:
            continue
        seen.add(video_id)

        output_directory = os.path.join(main_output_directory, video_id)
        output_file_path = os.path.join(output_directory, f"{video_id}.txt")

        # A non-empty file from an earlier run is reused instead of re-fetched.
        if os.path.exists(output_file_path) and os.path.getsize(output_file_path) > 0:
            print(f"Text Transcript file already exists and is not empty for {video_id}")
            irish_video_ids.append(video_id)
            continue

        try:
            transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=[language])
            if not transcript:
                skip += 1
                continue

            # Only create the directory once we know there is something to store.
            os.makedirs(output_directory, exist_ok=True)

            text_formatted = TextFormatter().format_transcript(transcript)
            with open(output_file_path, 'w', encoding='utf-8') as output_file:
                output_file.write(text_formatted)

            print(f"Text Transcript file created for video id: {video_id}")
            irish_video_ids.append(video_id)
        except Exception:
            # Deliberately best-effort: most videos have no transcript in the
            # requested language, so failures are counted rather than raised.
            skip += 1
            # Do not leave an empty directory behind for a video that was skipped.
            if os.path.isdir(output_directory) and not os.listdir(output_directory):
                os.rmdir(output_directory)

    print(f"No. of Video ids Skipped: {skip}")
    return irish_video_ids


# Audio Extraction

In [5]:
def download_audio_with_ytdlp(irish_video_ids, ytdlp_path, ffmeg_path, main_output_directory, output_format='wav', sample_rate=24000):
    """Download the audio of each video and convert it to mono at `sample_rate`.

    The result is stored at
    ``<main_output_directory>/<video_id>/<video_id>.<output_format>``. Videos
    whose audio file already exists are left untouched.

    Args:
        irish_video_ids: Iterable of YouTube video ids to download.
        ytdlp_path: Path (or command name) of the yt-dlp executable.
        ffmeg_path: Path (or command name) of the FFmpeg executable.
        main_output_directory: Root directory of the dataset, with or without a
            trailing path separator.
        output_format: Audio container/extension requested from yt-dlp.
        sample_rate: Target sample rate in Hz passed to FFmpeg's ``-ar``.

    Returns:
        None. A video whose download or conversion fails is reported and
        skipped; the remaining videos are still processed.
    """
    for video_id in irish_video_ids:
        # os.path.join keeps the path valid even when the root directory was
        # given without a trailing separator.
        output_directory = os.path.join(main_output_directory, video_id)
        output_filepath = os.path.join(output_directory, f'{video_id}.{output_format}')
        temp_filepath = os.path.join(output_directory, f'temp_{video_id}.{output_format}')

        # Avoid re-downloading audio that is already present.
        if os.path.exists(output_filepath):
            print(f"Audio file already exists: {output_filepath}")
            continue

        os.makedirs(output_directory, exist_ok=True)

        yt_dlp_command = [
            ytdlp_path,
            '--extract-audio',
            '--audio-format', output_format,
            '--audio-quality', '0',  # Best quality
            '--output', os.path.join(output_directory, '%(id)s.%(ext)s'),
            # Pass a full URL: a bare id that starts with '-' would otherwise be
            # parsed by yt-dlp as a command-line option.
            f'https://www.youtube.com/watch?v={video_id}',
        ]
        download = subprocess.run(yt_dlp_command, stdout=subprocess.DEVNULL, stderr=subprocess.STDOUT)
        if download.returncode != 0 or not os.path.exists(output_filepath):
            print(f"Audio download failed for video id: {video_id}")
            continue

        # Convert the downloaded audio to the desired sample rate and one channel.
        ffmpeg_command = [
            ffmeg_path,
            '-i', output_filepath,  # Input file
            '-ac', '1',  # Mono channel
            '-ar', str(sample_rate),  # Sample rate
            '-y',  # Overwrite output file if it exists
            temp_filepath,  # Temporary output file
        ]
        convert = subprocess.run(ffmpeg_command, stdout=subprocess.DEVNULL, stderr=subprocess.STDOUT)
        if convert.returncode != 0 or not os.path.exists(temp_filepath):
            print(f"Audio conversion failed for video id: {video_id}")
            # Remove both files so the unconverted download is not mistaken for
            # a finished result by the "already exists" check on the next run.
            for leftover in (temp_filepath, output_filepath):
                if os.path.exists(leftover):
                    os.remove(leftover)
            continue

        # Replace the raw download with the converted file.
        shutil.move(temp_filepath, output_filepath)
        print(f"Audio saved as {output_filepath}")


# Data Processing and Formatting

In [6]:
def create_transcript_json_files(irish_video_ids, main_output_directory, language):
    """Store the transcript of each video as JSON next to its other files.

    The file is written to
    ``<main_output_directory>/<video_id>/<video_id>.json``. A non-empty file
    from an earlier run is kept as is.

    Args:
        irish_video_ids: Iterable of YouTube video ids.
        main_output_directory: Root directory of the dataset, with or without a
            trailing path separator.
        language: Language code requested from the transcript API.

    Returns:
        None. A video whose transcript cannot be fetched or written is reported
        and skipped.
    """
    for video_id in irish_video_ids:
        # os.path.join keeps the path valid even when the root directory was
        # given without a trailing separator.
        output_directory = os.path.join(main_output_directory, video_id)
        output_file_path = os.path.join(output_directory, video_id + '.json')

        # Reuse a transcript that is already on disk.
        if os.path.exists(output_file_path) and os.path.getsize(output_file_path) > 0:
            print(f"JSON Transcript file already exists and is not empty for {video_id}")
            continue

        os.makedirs(output_directory, exist_ok=True)

        try:
            transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=[language])

            # format_transcript turns the transcript entries into a JSON string.
            json_formatted = JSONFormatter().format_transcript(transcript, indent=2)

            with open(output_file_path, 'w', encoding='utf-8') as output_file:
                output_file.write(json_formatted)
            print(f"JSON Transcript file created for video id: {video_id}")

        except Exception as e:
            # Best-effort: report the failure and continue with the next video.
            print(f"An error occurred for video_id {video_id}: {e}")
            # Skip file if there's an error (you can decide to delete the audio file or keep it for manual review)

In [7]:
def create_text_files(main_directory, speaker_id='0000'):
    """Split each video's JSON transcript into one text file per entry.

    Every immediate subdirectory of `main_directory` is treated as a video id.
    If ``<video_id>/<video_id>.json`` exists, each of its entries is written to
    ``<video_id>/segments/<speaker_id>_<video_id>_<start_ms>.txt``, where
    ``start_ms`` is the entry's ``start`` value in milliseconds, zero-padded to
    at least five digits.

    Args:
        main_directory: Root directory of the dataset.
        speaker_id: Prefix used in every segment file name.

    Returns:
        None.
    """
    root = Path(main_directory)
    if not root.is_dir():
        # Nothing to do; avoids the StopIteration that os.walk would trigger.
        print(f"Directory not found: {main_directory}")
        return

    for video_dir in sorted(entry for entry in root.iterdir() if entry.is_dir()):
        video_id = video_dir.name
        json_path = video_dir / f"{video_id}.json"

        if not json_path.exists():
            print(f"No JSON found for video ID: {video_id}")
            continue

        with open(json_path, 'r', encoding='utf-8') as file:
            transcripts = json.load(file)

        # Created only when there is a transcript, so folders without a JSON
        # file do not end up with an empty 'segments' directory.
        output_dir = video_dir / 'segments'
        output_dir.mkdir(parents=True, exist_ok=True)

        for entry in transcripts:
            # The start time in milliseconds makes the file name unique per entry.
            start_time_str = "{:05d}".format(int(entry['start'] * 1000))
            file_path = output_dir / f"{speaker_id}_{video_id}_{start_time_str}.txt"

            with open(file_path, 'w', encoding='utf-8') as text_file:
                text_file.write(entry['text'])

        print(f"Text files created for video ID: {video_id}")


In [8]:
import os
# Module-level placeholder; it is rebound to the value returned by main() in the
# __main__ guard at the end of this cell.
video_ids = []
def main():
    """Interactively configure and run the whole dataset-building pipeline.

    Prompts for the output directory, search keywords, transcript language,
    region code, results per keyword and the yt-dlp / FFmpeg executables; an
    empty answer selects the default shown in the prompt. Then runs, in order:
    video search, text-transcript extraction, audio download, JSON-transcript
    extraction and per-entry segment creation.

    Returns:
        The list of video ids returned by create_transcript_text_files().
    """
    # Parameters - setting default values
    default_output_directory = 'Libri-YoutubeTTS/'
    default_search_keywords = [
        'Gaeilge le haghaidh',  'Foghlaim Gaeilge', 'Cúrsa Gaeilge', 'Nuacht Gaeilge', 'Stair Éireann',
        'Gaeilge Labhartha', 'Bunús na Gaeilge', 'Gaeilge Bheo', 'Cultúr na hÉireann', 'Gaeilge Coláiste',
        'Gaeilge Scoile', 'Éireannach', 'TG4 Gaeilge', 'RTÉ Gaeilge', 'Pobal Gaeilge', 'Gaeilge Aclaí',
        'Gaeilge Físeán', 'Ceachtanna Gaeilge', 'Gaeilge Amhrán'  # 'Gaeilge Amhrán' cautiously included, can skip if music is often returned
    ]
    default_region_code = 'IE'
    default_language = 'ga'  # ISO 639-1 code for Irish
    default_max_results = 70
    default_ytdlp_path = 'yt-dlp'
    default_ffmpeg_path = 'ffmpeg'

    output_directory = input(f"Enter output directory [{default_output_directory}]: ") or default_output_directory

    # The answer is one comma-separated string; split it so that each phrase is
    # searched as a whole instead of the string being iterated per character.
    raw_keywords = input(f"Enter search keywords separated by comma (e.g. Gaeilge,Ireland,Irish language), or use the default one")
    search_keywords = [kw.strip() for kw in raw_keywords.split(',') if kw.strip()] or default_search_keywords

    language = input(f"Enter the language code (ISO 639-1) for transcripts (default is 'ga' for Irish): ") or default_language
    region_code = input(f"Enter the Region code for extracting channels for specific region (default is 'IE' for Ireland): ") or default_region_code

    # The value ends up inside the 'ytsearchN:' query, so it must be a positive integer.
    raw_max_results = input(f"Enter the max results per keyword (default is 70): ").strip()
    try:
        max_results = int(raw_max_results) if raw_max_results else default_max_results
        if max_results <= 0:
            raise ValueError(raw_max_results)
    except ValueError:
        print(f"Invalid max results '{raw_max_results}', using default {default_max_results}")
        max_results = default_max_results

    print('--------------------------------------------------------------------')

    print('output_directory:', output_directory)
    print('search_keywords:', search_keywords)
    print('language:', language)
    print('region_code:', region_code)
    print('max results per keyword:', max_results)

    print('--------------------------------------------------------------------')

    # Instructions for yt-dlp path
    print("YT-DLP is a command-line program to download videos from YouTube and other video sites.")
    print("If you haven't installed yt-dlp, you can install it via pip: pip install yt-dlp")
    print("After installation, you can find the path by typing 'which yt-dlp' in your terminal (Unix) or 'where yt-dlp' in your command prompt (Windows).")
    # A blank answer must fall back to the command name; an empty string cannot
    # be executed by subprocess.
    ytdlp_path = input("Enter your yt-dlp path (leave blank to use default 'yt-dlp'): ").strip() or default_ytdlp_path
    print(f"YT-DLP path set to: {ytdlp_path}")
    print('--------------------------------------------------------------------')

    # Instructions for FFmpeg path
    print("\nFFmpeg is a complete, cross-platform solution to record, convert and stream audio and video.")
    print("If you haven't installed FFmpeg, download it from https://ffmpeg.org/download.html and follow the installation instructions for your operating system.")
    print("After installation, you can find the path by typing 'which ffmpeg' in your terminal (Unix) or 'where ffmpeg' in your command prompt (Windows).")
    ffmpeg_path = input("Enter your FFmpeg path (leave blank to use default 'ffmpeg'): ").strip() or default_ffmpeg_path
    print(f"FFmpeg path set to: {ffmpeg_path}")
    print('--------------------------------------------------------------------')

    # Ensure the output directory exists
    if not os.path.exists(output_directory):
        os.makedirs(output_directory)
        print(f"Created directory: {output_directory}")
    else:
        print(f"Directory already exists: {output_directory}")

    start_time = time.time()

    # A. Data Collection
    print("Collecting Video IDs...")
    video_ids = search_videos(search_keywords, ytdlp_path, region_code, max_results)
    print("Number of overall video_ids: ", len(video_ids))

    # B. Filter Video_ids and Extract Text Transcript Files
    irish_video_ids = create_transcript_text_files(video_ids, output_directory, language)
    print("Number of irish_video_ids: ", len(irish_video_ids))

    # C. Audio Extraction
    print("\n--- Audio Extraction ---")
    download_audio_with_ytdlp(irish_video_ids, ytdlp_path, ffmpeg_path, output_directory)

    # D. JSON transcripts
    print("Extracting JSON transcripts...")
    create_transcript_json_files(irish_video_ids, output_directory, language)

    # E. One text file per transcript entry
    print("Preparing Segments...")
    create_text_files(output_directory)

    print("Dataset Created")
    end_time = time.time()

    print("Time taken in minutes: ", (end_time - start_time)//60)
    return irish_video_ids

# Execute the pipeline
# main() prompts for its settings via input() and returns a list of video ids;
# the result is kept in the module-level name `video_ids` for later inspection.
if __name__ == '__main__':
    video_ids = main()


Enter output directory [Libri-YoutubeTTS/]: 
Enter search keywords separated by comma (e.g. Gaeilge,Ireland,Irish language), or use the default one
Enter the language code (ISO 639-1) for transcripts (default is 'ga' for Irish): 
Enter the Region code for extracting channels for specific region (default is 'IE' for Ireland): 
Enter the max results per keyword (default is 70): 
--------------------------------------------------------------------
output_directory: Libri-YoutubeTTS/
search_keywords: ['Gaeilge le haghaidh', 'Foghlaim Gaeilge', 'Cúrsa Gaeilge', 'Nuacht Gaeilge', 'Stair Éireann', 'Gaeilge Labhartha', 'Bunús na Gaeilge', 'Gaeilge Bheo', 'Cultúr na hÉireann', 'Gaeilge Coláiste', 'Gaeilge Scoile', 'Éireannach', 'TG4 Gaeilge', 'RTÉ Gaeilge', 'Pobal Gaeilge', 'Gaeilge Aclaí', 'Gaeilge Físeán', 'Ceachtanna Gaeilge', 'Gaeilge Amhrán']
language: ga
region_code: IE
max results per keyword: 70
--------------------------------------------------------------------
YT-DLP is a command-lin

Audio saved as Libri-YoutubeTTS/d7eXvjo__lA/d7eXvjo__lA.wav
Audio saved as Libri-YoutubeTTS/7UaeLWmxM7U/7UaeLWmxM7U.wav
Audio saved as Libri-YoutubeTTS/D9-LXij2EGI/D9-LXij2EGI.wav
Audio saved as Libri-YoutubeTTS/EALnfaatfig/EALnfaatfig.wav
Audio saved as Libri-YoutubeTTS/bXQBzva2YCE/bXQBzva2YCE.wav
Audio saved as Libri-YoutubeTTS/n-4LLi0u-yg/n-4LLi0u-yg.wav
Audio saved as Libri-YoutubeTTS/_8ZqvtgCqKQ/_8ZqvtgCqKQ.wav
Audio saved as Libri-YoutubeTTS/b2_ROobLaE0/b2_ROobLaE0.wav
Audio saved as Libri-YoutubeTTS/yx2_iBpVVig/yx2_iBpVVig.wav
Audio saved as Libri-YoutubeTTS/Q3iJVwtdDa4/Q3iJVwtdDa4.wav
Audio saved as Libri-YoutubeTTS/gS-FdpqMu1g/gS-FdpqMu1g.wav
Audio saved as Libri-YoutubeTTS/4paK1U8r9c4/4paK1U8r9c4.wav
Audio saved as Libri-YoutubeTTS/TlemJSJOcL4/TlemJSJOcL4.wav
Audio saved as Libri-YoutubeTTS/IiFawsZVrRY/IiFawsZVrRY.wav
Audio saved as Libri-YoutubeTTS/opuF6zZ2Xes/opuF6zZ2Xes.wav
Audio saved as Libri-YoutubeTTS/2Cna3dvjeF0/2Cna3dvjeF0.wav
Audio saved as Libri-YoutubeTTS/UMV8ydmH

In [9]:
# Number of video ids held in `video_ids` (the value returned by main() above).
len(video_ids)

57