In [1]:
#!/usr/bin/env python3
"""
step_generate_download_script.py

Reads disfluency_detections.csv and generates a shell script
to download all unique full audio files using gsutil.
"""

import re
import shlex
from pathlib import Path
from urllib.parse import unquote, urlsplit

import pandas as pd

# --- Configuration ---
INPUT_CSV = "disfluency_detections.csv"  # table with recording_id / audio_url columns
OUTPUT_DIR = "data/raw_audio"  # directory the generated script downloads into
OUTPUT_SCRIPT = "download_full_audio.sh"  # where the generated shell script is written

# Host of path-style Cloud Storage HTTP URLs: https://<host>/<bucket>/<object>
_GCS_HTTP_HOST = "storage.googleapis.com"


def _to_gs_uri(url: str) -> "str | None":
    """Return the ``gs://bucket/object`` URI for *url*, or ``None`` if it is not a GCS link.

    Accepts direct ``gs://`` URIs and path-style
    ``http(s)://storage.googleapis.com/<bucket>/<object>`` URLs. For HTTP URLs
    the query string and fragment are dropped and percent-escapes are decoded,
    because ``gs://`` URIs carry the raw object name.
    """
    url = url.strip()
    if url.startswith("gs://"):
        return url if len(url) > len("gs://") else None

    try:
        parts = urlsplit(url)
    except ValueError:  # e.g. malformed IPv6 literal
        return None
    if parts.scheme not in ("http", "https"):
        return None
    if (parts.hostname or "") != _GCS_HTTP_HOST:
        return None

    object_path = unquote(parts.path.lstrip("/"))
    return f"gs://{object_path}" if object_path else None


def generate_download_script(input_csv=None, output_dir=None, output_script=None) -> None:
    """Read the detections CSV and write a gsutil download shell script.

    One ``gsutil cp`` command is emitted per unique (recording_id, audio_url)
    pair; each file is saved as ``<recording_id>.wav`` inside the output
    directory. Every value interpolated into the script is shell-quoted, so
    unusual IDs or URLs cannot break or inject commands into it.

    Args:
        input_csv: Path of the CSV to read. Defaults to ``INPUT_CSV``.
        output_dir: Directory the script downloads into. Defaults to
            ``OUTPUT_DIR``.
        output_script: Path of the shell script to write. Defaults to
            ``OUTPUT_SCRIPT``.

    Returns:
        None. If the CSV does not exist an error is printed and nothing is
        written.

    Raises:
        KeyError: If the CSV lacks a ``recording_id`` or ``audio_url`` column.
    """
    # Resolve defaults at call time so later changes to the module constants
    # are still honoured.
    csv_path = Path(INPUT_CSV if input_csv is None else input_csv)
    target_dir = str(OUTPUT_DIR if output_dir is None else output_dir)
    script_path = Path(OUTPUT_SCRIPT if output_script is None else output_script)

    if not csv_path.exists():
        print(f"[ERROR] Input CSV not found: {csv_path}. Run step3_detect_disfluencies.py first.")
        return

    df = pd.read_csv(csv_path)

    # Select only the relevant columns and drop duplicates
    unique_recordings = df[["recording_id", "audio_url"]].drop_duplicates()

    print(f"Found {len(unique_recordings)} unique full recordings to download.")

    quoted_dir = shlex.quote(target_dir)
    lines = [
        "#!/bin/bash",
        "# --- Script generated by step_generate_download_script.py ---",
        "# Ensure you have gsutil installed and authenticated (gcloud auth login)",
        "",
        f"mkdir -p -- {quoted_dir}",
        # Abort when the directory is unusable so files never land elsewhere.
        f"cd -- {quoted_dir} || exit 1",
        "",
        "failed=0",
        "",
    ]

    seen_ids = set()
    # itertuples keeps each column's own dtype (iterrows would coerce the row).
    for raw_id, raw_url in unique_recordings.itertuples(index=False, name=None):
        if pd.isna(raw_id) or pd.isna(raw_url):
            print(
                "[WARN] Skipping row with missing recording_id or audio_url: "
                f"{raw_id!r}, {raw_url!r}"
            )
            continue

        rec_id = str(raw_id)
        full_url = str(raw_url)

        gcs_path = _to_gs_uri(full_url)
        if gcs_path is None:
            print(f"[WARN] Skipping {rec_id}: URL does not look like a standard GCS link: {full_url}")
            continue

        if rec_id in seen_ids:
            print(
                f"[WARN] {rec_id} is listed with more than one URL; "
                "the later download overwrites the earlier one."
            )
        seen_ids.add(rec_id)

        # The destination filename is the recording ID plus .wav extension.
        # The "./" prefix keeps an ID starting with "-" from being parsed as
        # a gsutil option.
        dest_filename = f"./{rec_id}.wav"

        # Collapse whitespace so a newline in the ID cannot end the comment.
        comment_id = " ".join(rec_id.split())
        start_msg = shlex.quote(f"Downloading {rec_id} from GCS...")
        error_msg = shlex.quote(
            f"[ERROR] gsutil failed for {rec_id}. Check gsutil setup/permissions."
        )

        lines += [
            f"# Downloading {comment_id}",
            f"echo {start_msg}",
            f"if ! gsutil cp {shlex.quote(gcs_path)} {shlex.quote(dest_filename)}; then",
            f"  echo {error_msg}",
            "  failed=$((failed + 1))",
            "fi",
            "",
        ]

    lines += [
        'echo "All download commands executed."',
        # Surface failures through the exit status instead of always succeeding.
        'if [ "$failed" -ne 0 ]; then',
        '  echo "[ERROR] $failed download(s) failed." >&2',
        "  exit 1",
        "fi",
    ]

    # Write the script file; force LF line endings so the shell can parse it
    # even when it is generated on Windows.
    with open(script_path, "w", encoding="utf-8", newline="\n") as f:
        f.write("\n".join(lines) + "\n")

    run_target = str(script_path) if script_path.is_absolute() else f"./{script_path}"
    print(f"Successfully generated download script: {script_path}")
    print(f"\n--- NEXT ACTION: You must run the script in your terminal: sh {run_target} ---")

if __name__ == "__main__":
    # Entry point: generate the download script only when this file is run
    # directly, not when it is imported as a module.
    # Ensure you have pandas installed: pip install pandas
    generate_download_script()

Found 104 unique full recordings to download.
Successfully generated download script: download_full_audio.sh

--- NEXT ACTION: You must run the script in your terminal: sh ./download_full_audio.sh ---
