In [None]:
# Voice Extractor
import os

# First, clone the repository if it doesn't exist
# (the existence check makes re-running this cell skip the clone)
if not os.path.exists('Voice_Extractor'):
    !git clone -q https://github.com/ReisCook/Voice_Extractor.git
    print("Repository cloned successfully")
else:
    print("Repository already exists, skipping clone")

# --- guarantee fixed versions for key dependencies ---
# Remove any copies already present so the pinned installs below start clean.
!pip uninstall -y -q datasets fsspec numpy  # Add numpy to the uninstall list

# Install NumPy < 2.0 first (pinned below 2.0 for compatibility with Numba)
!pip install -q "numpy<2.0.0" 

# Install other general dependencies
!pip install -q ipywidgets pandas matplotlib huggingface_hub # 'datasets' removed from here

# Install requirements from the repo (which might include its own datasets/fsspec, but we'll override next)
!pip install -q -r Voice_Extractor/requirements.txt

# Force install the known good versions LAST
# (--force-reinstall overrides whatever the requirements file pulled in above)
!pip install -q --force-reinstall "datasets==2.16.1" "fsspec==2023.9.2" "numpy<2.0.0"
# IMPORTANT: After this cell runs, RESTART THE COLAB RUNTIME (Runtime -> Restart runtime)
# ------------------------------------------------
# Point the Hugging Face datasets cache at a fixed local directory.
os.environ["HF_DATASETS_CACHE"] = "/content/voice_extractor_cache"

# Mount Google Drive
# (the UI placeholders below suggest paths under /content/drive/MyDrive)
from google.colab import drive
drive.mount('/content/drive')

import ipywidgets as widgets
import subprocess
import shutil
import pandas as pd
from IPython.display import display, HTML
from google.colab import files as colab_files
from huggingface_hub import login
from pathlib import Path

# CSS for UI
# Injects a <style> block into the cell output. It defines the
# `section-header` and `warning-box` classes used by the HTML widgets in this
# notebook and overrides the widget label / checkbox label sizing rules.
display(HTML("""
<style>
    .widget-label {max-width: none !important; width: auto !important; overflow: visible !important; white-space: normal !important;}
    .widget-checkbox > .widget-label {min-width: 250px !important;}
    .section-header {font-size: 1.3em; font-weight: bold; color: #1A73E8; margin-top: 20px; 
                    margin-bottom: 15px; padding-bottom: 5px; border-bottom: 2px solid #1A73E8;}
    .warning-box {background-color: #fff3cd; color: #856404; padding: 10px; border: 1px solid #ffeeba; 
                 border-radius: 5px; margin: 10px 0; font-weight: bold;}
</style>
"""))

# Create UI components
def create_section(title):
    """Return an HTML widget rendering ``title`` as a styled section heading.

    The heading is wrapped in a ``<div>`` carrying the ``section-header`` CSS
    class defined in the notebook's injected stylesheet.
    """
    heading_markup = "<div class='section-header'>{}</div>".format(title)
    return widgets.HTML(heading_markup)

def create_text_input(description, placeholder="", required=False, password=False):
    """Build a full-width single-line text widget.

    Args:
        description: Label text; a trailing colon is appended automatically.
        placeholder: Hint text shown while the field is empty.
        required: When true, the label is prefixed with ``*``.
        password: When true, a masked ``Password`` widget is used instead of
            a plain ``Text`` widget.

    Returns:
        The constructed ``widgets.Text`` or ``widgets.Password`` instance.
    """
    if password:
        field_factory = widgets.Password
    else:
        field_factory = widgets.Text

    required_marker = '*' if required else ''
    label = f"{required_marker}{description}:"
    full_width = widgets.Layout(width='100%')
    return field_factory(description=label, placeholder=placeholder, layout=full_width)

def create_dropdown(description, options, default_value=None):
    """Build a full-width dropdown widget.

    Args:
        description: Label text; a trailing colon is appended automatically.
        options: Sequence of selectable values.
        default_value: Initially selected value. When ``None``, the first
            entry of ``options`` is selected (or nothing, if ``options`` is
            empty).

    Returns:
        The constructed ``widgets.Dropdown`` instance.
    """
    # Compare against None explicitly: a falsy default such as 0, "" or False
    # is a legitimate selection and must not be replaced by options[0].
    if default_value is None:
        initial_value = options[0] if options else None
    else:
        initial_value = default_value

    return widgets.Dropdown(
        description=f"{description}:",
        options=options,
        value=initial_value,
        layout=widgets.Layout(width='100%'),
    )

def create_slider(description, min_val, max_val, step, default):
    """Build a full-width float slider.

    Args:
        description: Label text; a trailing colon is appended automatically.
        min_val: Lower bound of the slider.
        max_val: Upper bound of the slider.
        step: Increment between selectable values.
        default: Initial slider value.

    Returns:
        The constructed ``widgets.FloatSlider`` instance.
    """
    slider_options = dict(
        description=f"{description}:",
        min=min_val,
        max=max_val,
        step=step,
        value=default,
        layout=widgets.Layout(width='100%'),
    )
    return widgets.FloatSlider(**slider_options)

def create_checkbox(description, initial_value=False):
    """Build an auto-width checkbox.

    Args:
        description: Text shown next to the checkbox (used verbatim).
        initial_value: Whether the box starts checked.

    Returns:
        The constructed ``widgets.Checkbox`` instance.
    """
    auto_width = widgets.Layout(width='auto')
    checkbox = widgets.Checkbox(description=description, value=initial_value, layout=auto_width)
    return checkbox

# Create sections
# One heading widget per UI group, created in display order.
(
    auth_section,
    input_section,
    processing_section,
    performance_section,
    advanced_section,
    output_section,
) = [
    create_section(section_title)
    for section_title in (
        "Authentication & Setup",
        "Input Files & Target Name",
        "Basic Processing Options",
        "Performance & Memory Options",
        "Advanced Settings",
        "Output Handling & Export",
    )
]

# Create warning about memory usage
# Banner explaining why Demucs is skipped by default. The leading warning
# emoji was previously stored as mis-decoded bytes and rendered as garbage.
memory_warning = widgets.HTML(
    "<div class='warning-box'>⚠️ 'Skip Demucs' is enabled by default to prevent memory errors in Colab. Disable only if processing small files.</div>"
)

# Create inputs
# Required inputs: validate_inputs() keeps the start button disabled until all
# five of these hold non-blank text.
hf_token = create_text_input("HF Token", "hf_...", required=True, password=True)
audio_dir = create_text_input("Audio Directory", "/content/drive/MyDrive/your_audio_folder", required=True)
reference_file = create_text_input("Reference Audio", "/content/drive/MyDrive/your_reference.wav", required=True)
target_name = create_text_input("Target Name", "e.g., JohnDoe", required=True)
output_dir = create_text_input("Output Directory", "/content/drive/MyDrive/VoiceExtractor_Runs", required=True)

# Processing options
# Each value is forwarded to run_extractor.py as a command-line argument.
output_sr = create_dropdown("Output Sample Rate", [16000, 22050, 24000, 44100, 48000], 24000)
whisper_model = create_dropdown(
    "Whisper Model", 
    ['tiny.en', 'tiny', 'base.en', 'base', 'small.en', 'small', 'medium.en', 'medium', 'large-v1', 'large-v2', 'large-v3'],
    'base.en'
)
# Optional: --language is only passed when this field is non-blank.
language = create_text_input("Language Code", "en")

# Performance options - make Skip Demucs enabled by default
skip_demucs = create_checkbox("Skip Demucs Vocal Separation", initial_value=True)
skip_demucs_description = widgets.HTML(
    "<span style='color:#856404;'>Recommended for Colab. If your audio already has isolated vocals, keep this checked.</span>"
)

# Advanced options
# Slider arguments are (label, min, max, step, default).
min_duration = create_slider("Min Segment Duration", 0.5, 10.0, 0.1, 1.0)
merge_gap = create_slider("Merge Gap", 0.0, 2.0, 0.05, 0.25)
verification_threshold = create_slider("Verification Threshold", 0.0, 1.0, 0.01, 0.69)
concat_silence = create_slider("Concatenation Silence", 0.0, 5.0, 0.1, 0.5)
disable_speechbrain = create_checkbox("Disable SpeechBrain Verification")
skip_rejected_transcripts = create_checkbox("Skip Transcribing Rejected Segments")
diar_model = create_dropdown(
    "Diarization Model", 
    ["pyannote/speaker-diarization-3.1", "pyannote/speaker-diarization-3.0"],
    "pyannote/speaker-diarization-3.1"
)
osd_model = create_dropdown(
    "OSD Model", 
    ["pyannote/overlapped-speech-detection", "pyannote/segmentation-3.0"],
    "pyannote/overlapped-speech-detection"
)
dry_run = create_checkbox("Dry Run (Process first 60s only)")
debug_log = create_checkbox("Enable Verbose Debug Logging")
keep_temp_files = create_checkbox("Keep Temporary Processing Files")

# Output options
# run_extraction() triggers a browser download when the selected label
# contains the text "Download to Computer".
output_method = create_dropdown(
    "Output Methods", 
    [
        "Save ZIP to GDrive & Download to Computer", 
        "Download ZIP to Computer (No GDrive save of .zip)", 
        "Save ZIP to GDrive Only"
    ]
)
push_to_hf = create_checkbox("Push Final Dataset to Hugging Face Hub")
hf_dataset_repo = create_text_input("HF Dataset Repo", "your_username/dataset_name")
hf_dataset_private = create_checkbox("Make HF Dataset Private", initial_value=True)
# The two dataset fields start disabled; toggle_hf_fields() enables them when
# the "Push Final Dataset" box is ticked.
hf_dataset_repo.disabled = True
hf_dataset_private.disabled = True

# Status elements
# status_message: banner rewritten by run_extraction() as the run progresses.
# validation_message: text set by validate_inputs() next to the start button.
# log_output / results_output: Output widgets that capture the run's printed
# log and the final results summary respectively.
status_message = widgets.HTML("<div style='margin-top:15px; text-align:center; padding:10px; background:#e0e0e0; border-radius:5px;'>Status: Ready. Configure and click Start.</div>")
validation_message = widgets.HTML()
log_output = widgets.Output(layout={'height': '400px', 'overflow_y': 'scroll', 'border': '1px solid #ccc', 'margin-top': '10px'})
results_output = widgets.Output(layout={'margin-top': '10px'})

# Toggle HF dataset fields
def toggle_hf_fields(change):
    """Enable or disable the Hugging Face dataset fields, then re-validate.

    Args:
        change: Trait-change payload from the "push to Hub" checkbox; its
            ``'new'`` entry is the checkbox's new value.
    """
    push_enabled = change['new']
    for dependent_field in (hf_dataset_repo, hf_dataset_private):
        dependent_field.disabled = not push_enabled
    # The repo field only counts as required while pushing is enabled.
    validate_inputs()

push_to_hf.observe(toggle_hf_fields, names='value')

# Run button
# Starts disabled; validate_inputs() enables it once the required fields are
# filled. The rocket emoji in the label was previously stored as mis-decoded
# bytes and rendered as garbage.
start_btn = widgets.Button(
    description="🚀 Start Extraction",
    button_style='success',
    icon='play',
    disabled=True,
    layout={'width': '250px', 'height': '40px', 'margin': '10px 0'}
)

# Validation function
def validate_inputs(*args):
    """Enable the start button only when every required field is filled.

    The five starred inputs must contain non-blank text; the dataset repo
    field is additionally required while "push to Hub" is ticked. Any
    positional arguments (for example an observer's change payload) are
    ignored.
    """
    mandatory_fields = (hf_token, audio_dir, reference_file, target_name, output_dir)
    all_valid = all(field.value.strip() for field in mandatory_fields)

    if push_to_hf.value:
        all_valid = all_valid and hf_dataset_repo.value.strip()

    start_btn.disabled = not all_valid
    if all_valid:
        validation_message.value = "<span style='color: green;'>All required fields are filled.</span>"
    else:
        validation_message.value = "<span style='color: red;'>Please fill all required fields marked with *.</span>"

# Add observers
# validate_inputs accepts and ignores the change payload, so it can be
# registered directly as the observer callback.
for w in (hf_token, audio_dir, reference_file, target_name, output_dir, hf_dataset_repo):
    w.observe(validate_inputs, names='value')

# Main function to run extraction
# Colour schemes for the status banner, keyed by state.
_STATUS_STYLES = {
    'progress': "background:#fff3cd; color:#856404; border:1px solid #ffeeba;",
    'success': "background:#d4edda; color:#155724; border:1px solid #c3e6cb;",
    'error': "background:#f8d7da; color:#721c24; border:1px solid #f5c6cb;",
}

# Log line that signals Demucs was killed (RC -9), treated as out-of-memory.
_DEMUCS_OOM_MARKER = "Demucs failed! (RC: -9)"

# Extensions scanned for the input recording, in priority order.
_AUDIO_EXTENSIONS = ('.wav', '.mp3', '.m4a', '.flac')


def _set_status(kind, message):
    """Show ``message`` in the status banner using the colours for ``kind``."""
    status_message.value = (
        "<div style='margin-top:15px; text-align:center; padding:10px; "
        f"{_STATUS_STYLES[kind]} border-radius:5px;'>{message}</div>"
    )


def _find_input_audio(audio_dir_path):
    """Return the first supported audio file in ``audio_dir_path``.

    Files are ordered by extension priority, then by name, so the choice is
    reproducible. Raises ``FileNotFoundError`` if the directory or a
    supported file is missing.
    """
    if not audio_dir_path.is_dir():
        raise FileNotFoundError(f"Audio directory not found: {audio_dir_path}")

    candidates = []
    for ext in _AUDIO_EXTENSIONS:
        candidates.extend(sorted(audio_dir_path.glob(f"*{ext}")))

    if not candidates:
        raise FileNotFoundError(f"No audio files found in {audio_dir_path}")
    if len(candidates) > 1:
        print(f"Note: {len(candidates)} audio files found; only the first one will be processed.")
    return candidates[0]


def _build_extractor_command(input_audio_file, reference_path, speaker_name, output_base, token):
    """Assemble the run_extractor.py argument list from the current UI state."""
    cmd = [
        "python", "Voice_Extractor/run_extractor.py",
        "--input-audio", str(input_audio_file),
        "--reference-audio", reference_path,
        "--target-name", speaker_name,
        "--output-base-dir", output_base,
        "--token", token,
        "--output-sr", str(output_sr.value),
        "--whisper-model", whisper_model.value,
        "--min-duration", str(min_duration.value),
        "--merge-gap", str(merge_gap.value),
        "--verification-threshold", str(verification_threshold.value),
        "--concat-silence", str(concat_silence.value),
        "--diar-model", diar_model.value,
        "--osd-model", osd_model.value,
    ]

    language_code = language.value.strip()
    if language_code:
        cmd.extend(["--language", language_code])

    optional_flags = (
        (skip_demucs, "--skip-demucs"),
        (disable_speechbrain, "--disable-speechbrain"),
        (skip_rejected_transcripts, "--skip-rejected-transcripts"),
        (dry_run, "--dry-run"),
        (debug_log, "--debug"),
        (keep_temp_files, "--keep-temp-files"),
    )
    cmd.extend(flag for checkbox, flag in optional_flags if checkbox.value)
    return cmd


def _format_command_for_log(cmd):
    """Return ``cmd`` as a shell-style string with the token value masked."""
    import shlex

    masked = list(cmd)
    for position, arg in enumerate(cmd[:-1]):
        if arg == "--token":
            masked[position + 1] = "***"
    return shlex.join(masked)


def _run_and_stream(cmd):
    """Run ``cmd``, echoing its output; return ``(exit_code, demucs_oom)``."""
    demucs_oom = False
    # An argument list with the default shell=False keeps user-entered text
    # (paths, speaker name) from being interpreted by a shell.
    with subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
                          text=True, bufsize=1) as process:
        for line in process.stdout:
            print(line, end='')
            if _DEMUCS_OOM_MARKER in line:
                demucs_oom = True
    # Leaving the ``with`` block waits for the process, so returncode is set.
    return process.returncode, demucs_oom


def _push_dataset_to_hub(run_dir, safe_name, token):
    """Stage verified segments in a temp directory and push them to the Hub.

    Rows whose audio file cannot be located are reported and left out of the
    pushed dataset. Raises ``FileNotFoundError`` if there is no verified
    transcript CSV or none of its audio files can be found.
    """
    import tempfile
    from datasets import Dataset, Audio as HFAudio

    repo_id = hf_dataset_repo.value.strip()
    print(f"\nPreparing to push dataset to Hugging Face: {repo_id}")

    csv_candidates = sorted(run_dir.glob("transcripts_solo_verified/*.csv"))
    if not csv_candidates:
        raise FileNotFoundError(f"No verified transcript CSV found in {run_dir / 'transcripts_solo_verified'}")
    verified_csv_path = csv_candidates[0]
    print(f"Reading transcript CSV: {verified_csv_path}")
    df = pd.read_csv(verified_csv_path)

    segments_dir = run_dir / "target_segments_solo" / f"{safe_name}_solo_verified"
    if not segments_dir.exists():
        print(f"Warning: Could not find expected segments directory: {segments_dir}")
        # Fall back to the first directory under the run output holding audio.
        for root, _dirs, names in os.walk(run_dir):
            if any(name.endswith(('.wav', '.mp3', '.flac')) for name in names):
                segments_dir = Path(root)
                print(f"Found audio directory at: {segments_dir}")
                break

    # The context manager removes the staging directory even if the push fails.
    with tempfile.TemporaryDirectory() as temp_dir:
        print(f"Created temporary directory: {temp_dir}")
        local_audio_dir = Path(temp_dir) / "audio"
        local_audio_dir.mkdir()

        staged_paths = []
        for listed_path in df['filename']:
            listed_path = str(listed_path)
            filename = os.path.basename(listed_path)
            src_path = segments_dir / filename
            if not src_path.is_file():
                print(f"Warning: Could not find {src_path}, trying direct path from actual_run_output_dir / row['filename']")
                src_path = run_dir / listed_path

            if src_path.is_file():
                dst_path = local_audio_dir / filename
                shutil.copy(str(src_path), str(dst_path))
                staged_paths.append(str(dst_path))
            else:
                print(f"Error: Could not find audio file for row: {listed_path} at {src_path} or fallback.")
                staged_paths.append(None)

        # The Audio feature needs readable files, so rows without staged
        # audio are dropped instead of failing the whole push.
        df['filename'] = staged_paths
        missing_count = int(df['filename'].isna().sum())
        df = df.dropna(subset=['filename']).reset_index(drop=True)
        if df.empty:
            raise FileNotFoundError(f"None of the audio files listed in {verified_csv_path} could be found")
        if missing_count:
            print(f"Warning: {missing_count} row(s) without audio were left out of the dataset")
        print(f"Copied {len(df)} audio files into {local_audio_dir}")

        print("Creating Hugging Face Dataset from pandas DataFrame...")
        ds = Dataset.from_pandas(df, preserve_index=False)

        print("Casting 'filename' column to Audio object...")
        ds = ds.cast_column('filename', HFAudio())

        print(f"Pushing dataset to Hugging Face Hub: {repo_id}")
        ds.push_to_hub(
            repo_id,
            private=hf_dataset_private.value,
            token=token,
            embed_external_files=True
        )
        print(f"✅ Dataset pushed successfully to https://huggingface.co/datasets/{repo_id}")

    print("Cleaned up temporary directory")


def _show_results(run_dir, audio_player):
    """Render the concatenated audio and a transcript preview (best effort)."""
    with results_output:
        print("## Extraction Results Summary\n")
        try:
            concat_files = sorted(run_dir.glob("concatenated_audio_solo/*.wav"))
            if concat_files:
                concat_file = concat_files[0]
                print(f"### Concatenated audio: {concat_file.name}\n")
                display(audio_player(str(concat_file)))
            else:
                print("### No concatenated audio file found\n")
        except Exception as e:
            # A display problem must not mark a successful run as failed.
            print(f"### Error displaying audio: {str(e)}\n")
        try:
            transcript_csvs = sorted(run_dir.glob("transcripts_solo_verified/*.csv"))
            if transcript_csvs:
                transcript_csv = transcript_csvs[0]
                df_display = pd.read_csv(transcript_csv)
                print(f"\n### Transcript sample (from {transcript_csv.name}):\n")
                display(df_display.head())
                print(f"\nTotal segments: {len(df_display)}")
            else:
                print("\n### No transcript CSV found for display")
        except Exception as e:
            print(f"\n### Error displaying transcript: {str(e)}")


def run_extraction(b):
    """Button callback: run the Voice Extractor and handle its outputs.

    Steps: authenticate with Hugging Face, pick the input recording, run
    ``Voice_Extractor/run_extractor.py`` while streaming its log, zip the run
    directory, optionally download the zip and push the dataset to the Hub,
    then show a results summary. Errors are reported in the log and status
    banner rather than raised, and the start button is always restored.

    Args:
        b: The clicked button (unused; required by ``on_click``).
    """
    from IPython.display import Audio
    import datetime
    import html
    import tempfile

    log_output.clear_output()
    results_output.clear_output()

    start_btn.disabled = True
    start_btn.description = "🔄 Processing..."
    start_btn.icon = "spinner"
    _set_status('progress', "Status: Initializing... Authenticating with Hugging Face...")

    try:
        with log_output:
            try:
                # Report the versions in use: the setup cell pins them and a
                # stale runtime would show up here.
                import datasets
                print(f"INFO: Using datasets version: {datasets.__version__}")
                import fsspec
                print(f"INFO: Using fsspec version: {fsspec.__version__}")
                import numpy
                print(f"INFO: Using numpy version: {numpy.__version__}")

                # Strip pasted whitespace so it cannot corrupt the token or paths.
                token = hf_token.value.strip()
                speaker_name = target_name.value.strip()
                safe_name = speaker_name.replace(' ', '_')
                output_base = output_dir.value.strip()
                reference_path = reference_file.value.strip()

                print(f"Authenticating with Hugging Face using token starting with: {token[:4]}...")
                login(token=token, add_to_git_credential=False)
                print("✅ Authentication successful")

                input_audio_file = _find_input_audio(Path(audio_dir.value.strip()))
                print(f"Found audio file: {input_audio_file}")

                if not Path(reference_path).is_file():
                    raise FileNotFoundError(f"Reference audio not found: {reference_path}")

                cmd_list = _build_extractor_command(
                    input_audio_file, reference_path, speaker_name, output_base, token
                )

                _set_status('progress', "Status: Running Voice Extractor script...")
                # The logged command masks the token so it is not stored in
                # the notebook output.
                print(f"Executing command: {_format_command_for_log(cmd_list)}\n--- LOG START ---")

                exit_code, demucs_oom = _run_and_stream(cmd_list)

                if exit_code == 0:
                    _set_status('success', "Status: Voice extraction completed successfully!")

                    run_output_dir_name = f"{safe_name}_{input_audio_file.stem}_SOLO_Split"
                    actual_run_output_dir = Path(output_base) / run_output_dir_name
                    if not actual_run_output_dir.is_dir():
                        raise FileNotFoundError(f"Expected run output directory not found: {actual_run_output_dir}")

                    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
                    # Honour the "No GDrive save of .zip" option by writing
                    # the archive to local temp storage instead of next to
                    # the output directory.
                    if "No GDrive save" in output_method.value:
                        zip_parent = Path(tempfile.gettempdir())
                    else:
                        zip_parent = actual_run_output_dir.parent
                    base_name_for_zip = zip_parent / f"{safe_name}_dataset_{timestamp}"
                    zip_file_path = f"{base_name_for_zip}.zip"

                    print(f"\nCreating ZIP archive of results: {zip_file_path}")
                    shutil.make_archive(
                        str(base_name_for_zip), 'zip',
                        root_dir=str(actual_run_output_dir.parent),
                        base_dir=actual_run_output_dir.name,
                    )
                    print(f"✅ ZIP created successfully: {zip_file_path}")

                    if "Download to Computer" in output_method.value:
                        print("\nPreparing to download ZIP file...")
                        colab_files.download(zip_file_path)
                        print("✅ Download initiated. Check your browser downloads.")

                    if push_to_hf.value:
                        _push_dataset_to_hub(actual_run_output_dir, safe_name, token)

                    _show_results(actual_run_output_dir, Audio)
                elif demucs_oom:
                    _set_status('error', 'Error: Demucs ran out of memory. Please check "Skip Demucs" option and try again.')
                    print("\n❌ ERROR: Demucs ran out of memory (RC: -9). This is common in Colab with large files.")
                    print("SOLUTION: Check the 'Skip Demucs Vocal Separation' option and try again.")
                else:
                    _set_status('error', f"Error: Voice extraction failed with exit code {exit_code}.")
                    print(f"\n❌ Process failed with exit code: {exit_code}")
            except Exception as e:
                # Top-level UI boundary: report the failure instead of
                # letting the callback die silently. The message is escaped
                # because it is inserted into HTML.
                _set_status('error', f"Error: {html.escape(str(e))}")
                print(f"❌ Error: {str(e)}")
    finally:
        # Always hand control back to the user, whatever happened above.
        start_btn.disabled = False
        start_btn.description = "🚀 Start Extraction"
        start_btn.icon = "play"
        validate_inputs()

# Wire the start button to the extraction routine.
start_btn.on_click(run_extraction)

# Group the advanced controls into the three accordion panels.
segment_params = widgets.VBox([min_duration, merge_gap, verification_threshold, concat_silence, 
                             disable_speechbrain, skip_rejected_transcripts])
model_params = widgets.VBox([diar_model, osd_model])
debug_params = widgets.VBox([dry_run, debug_log, keep_temp_files])

# NOTE(review): the `titles=` constructor argument is presumably only honoured
# by ipywidgets 8+ — confirm against the version installed in the runtime.
advanced_accordion = widgets.Accordion(
    children=[segment_params, model_params, debug_params],
    titles=('Segment Parameters', 'Model Options', 'Debug & Temp Files')
)

# Full page layout, listed top to bottom in display order.
main_layout = widgets.VBox([
    widgets.HTML("<h1 style='text-align:center; color:#1A73E8;'>Voice Extractor - Google Colab Interface</h1>"),
    widgets.HTML("<p style='text-align:center;'>Extract solo voice segments of a target speaker from multi-speaker recordings</p>"),
    auth_section,
    hf_token,
    input_section,
    audio_dir,
    reference_file,
    target_name,
    output_dir,
    processing_section,
    output_sr,
    whisper_model,
    language,
    performance_section,
    memory_warning,
    widgets.HBox([skip_demucs, skip_demucs_description]),
    advanced_section,
    advanced_accordion,
    output_section,
    output_method,
    push_to_hf,
    hf_dataset_repo,
    hf_dataset_private,
    widgets.HBox([start_btn, validation_message]),
    status_message,
    create_section("Processing Log"),
    log_output,
    create_section("Results"),
    results_output
])

# Set the initial button state and validation text before showing the UI.
validate_inputs()
display(main_layout)
print("Voice Extractor Colab UI ready. IMPORTANT: If you've just run the setup cell for the first time or changed library versions, please RESTART THE RUNTIME now (Runtime -> Restart runtime) before proceeding.")

# Voice Extractor - Usage Instructions

This notebook provides a graphical interface for the [Voice Extractor](https://github.com/ReisCook/Voice_Extractor) tool, which identifies, isolates, and transcribes clean solo segments of a target speaker from multi-speaker audio recordings.

## How to Use

1. **Authentication**: Enter your Hugging Face User Access Token. It is required to access the PyAnnote models.
2. **Input Files**:
   - Specify the folder containing your audio (only the first compatible file found — `.wav`, `.mp3`, `.m4a`, or `.flac` — will be processed)
   - Enter the path to a clean reference audio file containing ONLY your target speaker (5-30 seconds)
   - Enter a name for your target speaker
   - Choose an output directory for results
3. **Processing Options**: Configure sample rate, transcription model, and other settings
4. **Advanced Options**: Fine-tune segment parameters, model selection, and debug settings
5. **Output Handling**: Choose how to save results and optionally push to Hugging Face
6. **Start Processing**: Click the "Start Extraction" button when all required fields are filled

## Important Notes

- You must accept the terms of use on Hugging Face for each gated PyAnnote model you use. The list below may be incomplete (links to the other gated models still need to be added):
  - [pyannote/speaker-diarization-3.1](https://huggingface.co/pyannote/speaker-diarization-3.1)
  - [pyannote/overlapped-speech-detection](https://huggingface.co/pyannote/overlapped-speech-detection)
  - [pyannote/segmentation-3.0](https://huggingface.co/pyannote/segmentation-3.0)
- For optimal results, provide a clean reference audio with only the target speaker's voice
- The "Dry Run" option is helpful for testing as it processes only the first 60 seconds
- GPU acceleration is automatically used when available

For more detailed documentation, visit the [Voice Extractor GitHub repository](https://github.com/ReisCook/Voice_Extractor).
