In [16]:
import os
import time
from datasets import load_dataset
import yt_dlp
import imageio_ffmpeg

In [5]:
# --- CONFIGURATION ---
# How many videos to fetch during this trial run.
NUM_SAMPLES = 5
# Shard folder name, following the directory layout OLMoASR expects.
SHARD_NAME = "shard_00000"
# Destination for downloads: <current working directory>/olmo_data/<shard>.
OUTPUT_DIR = os.path.join(os.getcwd(), "olmo_data", SHARD_NAME)

In [17]:
def download_and_organize(video_id, base_output_dir):
    """Download one YouTube video's audio and English subtitles into its own folder.

    Everything is written under ``<base_output_dir>/<video_id>/``. The audio is
    saved as ``audio_<video_id>.<ext>`` (the post-processor below requests MP3),
    and a downloaded WebVTT subtitle track, if any, is renamed to
    ``transcript_<video_id>.vtt``.

    Args:
        video_id: YouTube video ID, used both in the URL and in file names.
        base_output_dir: Directory under which the per-video folder is created.

    Returns:
        None. Download or rename failures are printed rather than raised, so a
        batch loop calling this function keeps going after a bad video.
    """
    pair_dir = os.path.join(base_output_dir, video_id)
    os.makedirs(pair_dir, exist_ok=True)

    url = f"https://www.youtube.com/watch?v={video_id}"

    # Use the ffmpeg binary bundled with the imageio_ffmpeg package rather than
    # relying on one being installed system-wide.
    ffmpeg_path = imageio_ffmpeg.get_ffmpeg_exe()

    # yt-dlp treats '%' in the output template as the start of a field, so any
    # literal '%' coming from the directory or the video ID must be doubled.
    audio_stem = os.path.join(pair_dir, f"audio_{video_id}").replace('%', '%%')

    ydl_opts = {
        'format': 'bestaudio/best',
        'outtmpl': audio_stem + '.%(ext)s',

        # --- CRITICAL FIX FOR PACE-ICE ---
        'ffmpeg_location': ffmpeg_path,
        # ---------------------------------

        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'mp3',
            'preferredquality': '192',
        }],
        'writesubtitles': True,
        'writeautomaticsub': True,
        'subtitleslangs': ['en'],
        'skip_download': False,
        'quiet': True,
        'no_warnings': True,
    }

    print(f"Processing ID: {video_id}...")

    try:
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            ydl.download([url])

        # Post-processing rename (vtt -> transcript).
        transcript_name = f"transcript_{video_id}.vtt"
        new_path = os.path.join(pair_dir, transcript_name)

        # Ignore a transcript left over from a previous run: renaming it onto
        # itself would wrongly report a fresh save.
        fresh_tracks = [
            name for name in os.listdir(pair_dir)
            if name.endswith(".vtt") and name != transcript_name
        ]
        if fresh_tracks:
            # If several tracks were written, promote exactly one instead of
            # letting them overwrite each other in directory-listing order.
            # The shortest name is taken (heuristic: the plain language code,
            # e.g. ".en.vtt"); the remaining tracks are left untouched.
            chosen = min(fresh_tracks, key=lambda name: (len(name), name))
            # os.replace overwrites an existing target on every platform,
            # whereas os.rename raises on Windows when the target exists.
            os.replace(os.path.join(pair_dir, chosen), new_path)
            print(f"  -> Saved transcript: {new_path}")

    except Exception as e:
        # Deliberately best-effort: report and let the caller move on.
        print(f"  [ERROR] Failed to download {video_id}: {e}")


In [18]:
print("--- 1. Loading Hugging Face Dataset (Streaming Mode) ---")
# streaming=True iterates rows lazily instead of first downloading the
# metadata for all 16M rows.
try:
    dataset = load_dataset("allenai/OLMoASR-Pool", split="train", streaming=True)
except Exception as e:
    print("Error loading dataset. Note: You may need to run 'huggingface-cli login' in your terminal first.")
    print(f"Error details: {e}")
    # Re-raise after printing the hint: swallowing the error would leave
    # `dataset` undefined (or stale from an earlier kernel run), and the
    # download cell would then fail confusingly or use old state.
    raise

 

--- 1. Loading Hugging Face Dataset (Streaming Mode) ---


In [19]:
# Announce the run, then walk the streamed dataset until NUM_SAMPLES videos
# have been attempted.
print(f"--- 2. Starting Download of {NUM_SAMPLES} samples ---")
print(f"Output Directory: {OUTPUT_DIR}")

count = 0
for sample in dataset:
    # Quota reached: stop pulling rows from the stream.
    if count >= NUM_SAMPLES:
        break

    video_id = sample.get('id')
    if not video_id:
        # Rows without an ID are skipped and do not count toward the quota.
        continue

    download_and_organize(video_id, OUTPUT_DIR)
    count = count + 1
    # Pause between requests to be polite to YouTube servers.
    time.sleep(1)

print("\n--- Done! ---")
print(f"Check your folder: {OUTPUT_DIR}")

--- 2. Starting Download of 5 samples ---
Output Directory: /storage/ice1/1/0/vchopra37/projects/edge_glass/code_base/data/olmo_data/shard_00000
Processing ID: VeVUL07UG-Q...
  -> Saved transcript: /storage/ice1/1/0/vchopra37/projects/edge_glass/code_base/data/olmo_data/shard_00000/VeVUL07UG-Q/transcript_VeVUL07UG-Q.vtt
Processing ID: uOnvnNAufeA...
  -> Saved transcript: /storage/ice1/1/0/vchopra37/projects/edge_glass/code_base/data/olmo_data/shard_00000/uOnvnNAufeA/transcript_uOnvnNAufeA.vtt
Processing ID: o9-IWbFxUtk...
  -> Saved transcript: /storage/ice1/1/0/vchopra37/projects/edge_glass/code_base/data/olmo_data/shard_00000/o9-IWbFxUtk/transcript_o9-IWbFxUtk.vtt
Processing ID: XmI260swiss...
  -> Saved transcript: /storage/ice1/1/0/vchopra37/projects/edge_glass/code_base/data/olmo_data/shard_00000/XmI260swiss/transcript_XmI260swiss.vtt
Processing ID: En2Gn5ViJjI...
  -> Saved transcript: /storage/ice1/1/0/vchopra37/projects/edge_glass/code_base/data/olmo_data/shard_00000/En2Gn5ViJ