In [None]:
import os

# Environment bootstrap for the notebook: fetch the Voice_Extractor repo,
# install its dependencies, and mount Google Drive. The `!` lines are IPython
# shell escapes, so this cell only runs inside a notebook kernel.

# Skip clone if folder already exists
if not os.path.exists('Voice_Extractor'):
    !git clone -q https://github.com/ReisCook/Voice_Extractor.git

# Install requirements directly from repo
# NOTE(review): fastai is removed before the requirements are installed --
# presumably to avoid a dependency conflict with a preinstalled package;
# confirm this is still needed.
!pip uninstall -y fastai
!pip install -q -r Voice_Extractor/requirements.txt
# Extra packages imported by this notebook's UI code below.
!pip install -q ipywidgets pandas matplotlib huggingface_hub datasets

# Mount Google Drive
# The UI placeholders below suggest paths under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

import ipywidgets as widgets
import subprocess
import shutil
import pandas as pd
from IPython.display import display, HTML
from google.colab import files
from huggingface_hub import login
from pathlib import Path

# CSS for UI
# NOTE(review): the HTML payload below is empty in this copy of the notebook,
# so this call currently renders nothing -- the original <style> block appears
# to have been lost; restore it from the upstream notebook if styling matters.
display(HTML("""

"""))

# Create UI components
def create_section(title):
    """Return an HTML widget whose content is the given section title."""
    markup = f"{title}"
    return widgets.HTML(value=markup)

def create_text_input(description, placeholder="", required=False, password=False):
    """Build a full-width single-line text field.

    Args:
        description: Label text; a trailing colon is appended.
        placeholder: Hint shown while the field is empty.
        required: When true, the label is prefixed with ``*``.
        password: When true, a masked ``Password`` widget is used instead
            of a plain ``Text`` widget.
    """
    if password:
        field_type = widgets.Password
    else:
        field_type = widgets.Text

    required_marker = '*' if required else ''
    label = f"{required_marker}{description}:"
    full_width = widgets.Layout(width='100%')
    return field_type(description=label, placeholder=placeholder, layout=full_width)

def create_dropdown(description, options, default_value=None):
    """Build a full-width dropdown.

    Args:
        description: Label text; a trailing colon is appended.
        options: Non-empty sequence of selectable values.
        default_value: Initially selected value. ``None`` (the default)
            selects the first entry of ``options``.
    """
    # Compare against None explicitly: `default_value or options[0]` would
    # silently discard legitimate falsy defaults such as 0 or "".
    initial = options[0] if default_value is None else default_value
    return widgets.Dropdown(
        description=f"{description}:",
        options=options,
        value=initial,
        layout=widgets.Layout(width='100%')
    )

def create_slider(description, min_val, max_val, step, default):
    """Build a full-width float slider.

    Args:
        description: Label text; a trailing colon is appended.
        min_val: Lower bound of the slider.
        max_val: Upper bound of the slider.
        step: Increment between selectable values.
        default: Initial slider value.
    """
    slider_options = {
        'description': f"{description}:",
        'min': min_val,
        'max': max_val,
        'step': step,
        'value': default,
        'layout': widgets.Layout(width='100%'),
    }
    return widgets.FloatSlider(**slider_options)

def create_checkbox(description, initial_value=False):
    """Build an auto-width checkbox labelled ``description``.

    Args:
        description: Text shown next to the box (used verbatim).
        initial_value: Whether the box starts checked.
    """
    auto_width = widgets.Layout(width='auto')
    checkbox = widgets.Checkbox(description=description, value=initial_value, layout=auto_width)
    return checkbox

# Create sections
# One heading widget per group of controls; they are placed between the
# corresponding inputs when main_layout is assembled further down.
auth_section = create_section("Authentication & Setup")
input_section = create_section("Input Files & Target Name")
processing_section = create_section("Basic Processing Options")
performance_section = create_section("Performance & Memory Options")
advanced_section = create_section("Advanced Settings")
output_section = create_section("Output Handling & Export")

# Create warning about memory usage
# Shown in the performance section, next to the skip_demucs checkbox.
memory_warning = widgets.HTML(
    "⚠️ 'Skip Demucs' is enabled by default to prevent memory errors in Colab. Disable only if processing small files."
)

# Create inputs
# These five fields are the ones validate_inputs() treats as required; the
# Start button stays disabled until each holds non-whitespace text.
hf_token = create_text_input("HF Token", "hf_...", required=True, password=True)
audio_dir = create_text_input("Audio Directory", "/content/drive/MyDrive/your_audio_folder", required=True)
reference_file = create_text_input("Reference Audio", "/content/drive/MyDrive/your_reference.wav", required=True)
target_name = create_text_input("Target Name", "e.g., JohnDoe", required=True)
output_dir = create_text_input("Output Directory", "/content/drive/MyDrive/VoiceExtractor_Runs", required=True)

# Processing options
# Dropdown values are forwarded to the extractor as --output-sr and
# --whisper-model by run_extraction().
output_sr = create_dropdown("Output Sample Rate", [16000, 22050, 24000, 44100, 48000], 24000)
whisper_model = create_dropdown(
    "Whisper Model", 
    ['tiny.en', 'tiny', 'base.en', 'base', 'small.en', 'small', 'medium.en', 'medium', 'large-v1', 'large-v2', 'large-v3'],
    'large-v3'
)
# "en" is only a placeholder hint: the field starts empty, and
# run_extraction() omits --language entirely when it is left blank.
language = create_text_input("Language Code", "en")

# Performance options - make Skip Demucs enabled by default
skip_demucs = create_checkbox("Skip Demucs Vocal Separation", initial_value=True)
skip_demucs_description = widgets.HTML(
    "Recommended for Colab. If your audio already has isolated vocals, keep this checked."
)

# Advanced options
# Slider arguments are (label, min, max, step, default). Each value is passed
# to the extractor as the matching command-line option by run_extraction();
# units are not stated here -- see the Voice_Extractor documentation.
min_duration = create_slider("Min Segment Duration", 0.5, 10.0, 0.1, 1.0)
merge_gap = create_slider("Merge Gap", 0.0, 2.0, 0.05, 0.25)
verification_threshold = create_slider("Verification Threshold", 0.0, 1.0, 0.01, 0.69)
concat_silence = create_slider("Concatenation Silence", 0.0, 5.0, 0.1, 0.5)
# Checkboxes default to unchecked; a checked box adds the corresponding
# boolean flag to the extractor command.
disable_speechbrain = create_checkbox("Disable SpeechBrain Verification")
skip_rejected_transcripts = create_checkbox("Skip Transcribing Rejected Segments")
diar_model = create_dropdown(
    "Diarization Model", 
    ["pyannote/speaker-diarization-3.1", "pyannote/speaker-diarization-3.0"],
    "pyannote/speaker-diarization-3.1"
)
# Only one OSD model is offered, so this dropdown is effectively fixed.
osd_model = create_dropdown(
    "OSD Model", 
    ["pyannote/overlapped-speech-detection"],
    "pyannote/overlapped-speech-detection"
)
dry_run = create_checkbox("Dry Run (Process first 60s only)")
debug_log = create_checkbox("Enable Verbose Debug Logging")
keep_temp_files = create_checkbox("Keep Temporary Processing Files")

# Output options
# No default is passed, so create_dropdown() selects the first entry.
# run_extraction() inspects the selected label text to decide whether to
# trigger a browser download, so renaming these labels changes behavior.
output_method = create_dropdown(
    "Output Methods", 
    [
        "Save ZIP to GDrive & Download to Computer", 
        "Download ZIP to Computer (No GDrive save of .zip)", 
        "Save ZIP to GDrive Only"
    ]
)
push_to_hf = create_checkbox("Push Final Dataset to Hugging Face Hub")
hf_dataset_repo = create_text_input("HF Dataset Repo", "your_username/dataset_name")
hf_dataset_private = create_checkbox("Make HF Dataset Private", initial_value=True)
# The Hub fields start greyed out; toggle_hf_fields() enables them once
# push_to_hf is checked.
hf_dataset_repo.disabled = True
hf_dataset_private.disabled = True

# Status elements
status_message = widgets.HTML("Status: Ready. Configure and click Start.")
# Filled in by validate_inputs() with the current validation result.
validation_message = widgets.HTML()
# log_output receives the extractor's streamed output; results_output shows
# the audio preview and transcript sample after a successful run.
log_output = widgets.Output(layout={'height': '400px', 'overflow_y': 'scroll', 'border': '1px solid #ccc', 'margin-top': '10px'})
results_output = widgets.Output(layout={'margin-top': '10px'})

# Toggle HF dataset fields
def toggle_hf_fields(change):
    """Enable the Hub repo/privacy fields only while "push to HF" is checked.

    Args:
        change: Traitlets change dict; ``change['new']`` is the checkbox's
            new boolean value.
    """
    locked = not change['new']
    for hub_field in (hf_dataset_repo, hf_dataset_private):
        hub_field.disabled = locked
    # The repo name becomes required (or optional again), so re-validate.
    validate_inputs()

# React to the "push to HF" checkbox being toggled.
push_to_hf.observe(toggle_hf_fields, names='value')

# Run button
# Created disabled; validate_inputs() enables it once every required field
# has a value.
start_btn = widgets.Button(
    description="🚀 Start Extraction",
    button_style='success',
    icon='play',
    disabled=True,
    layout={'width': '250px', 'height': '40px', 'margin': '10px 0'}
)

# Validation function
def validate_inputs(*args):
    """Enable the Start button only when every required field is filled.

    The five starred fields must contain non-whitespace text; the dataset
    repo field is additionally required while "push to HF" is checked.
    Positional arguments are accepted and ignored so the function can be
    used directly as a widget observer.
    """
    ready = True
    for field in (hf_token, audio_dir, reference_file, target_name, output_dir):
        if not field.value.strip():
            ready = False
            break

    if ready and push_to_hf.value and not hf_dataset_repo.value.strip():
        ready = False

    start_btn.disabled = not ready
    if ready:
        validation_message.value = "All required fields are filled."
    else:
        validation_message.value = "Please fill all required fields marked with *."

# Re-validate whenever any field that affects the Start button changes.
# validate_inputs() ignores its positional arguments, so it can be
# registered as the observer directly.
for w in (hf_token, audio_dir, reference_file, target_name, output_dir, hf_dataset_repo):
    w.observe(validate_inputs, names='value')

# Main function to run extraction

# Log line the UI looks for to report a Demucs failure as out-of-memory.
_DEMUCS_OOM_MARKER = "Demucs failed! (RC: -9)"

# Supported input extensions, in selection priority order.
_AUDIO_EXTENSIONS = ('.wav', '.mp3', '.m4a', '.flac')


def _find_first_audio_file(directory):
    """Return the first supported audio file directly inside ``directory``.

    Extensions are tried in ``_AUDIO_EXTENSIONS`` order and matched
    case-insensitively; within one extension the alphabetically first file
    wins, so the choice is deterministic.

    Raises:
        FileNotFoundError: If ``directory`` is not a directory or holds no
            supported audio file.
    """
    if not directory.is_dir():
        raise FileNotFoundError(f"Audio directory not found: {directory}")

    for ext in _AUDIO_EXTENSIONS:
        matches = sorted(
            p for p in directory.iterdir()
            if p.is_file() and p.suffix.lower() == ext
        )
        if matches:
            return matches[0]

    raise FileNotFoundError(f"No audio files found in {directory}")


def _build_extractor_command(input_audio_file, reference_path, token):
    """Assemble the argv list for ``run_extractor.py`` from the UI widgets.

    Arguments are kept as separate list items (no manual quoting) because
    the command is executed without a shell.
    """
    cmd_list = [
        "python", "Voice_Extractor/run_extractor.py",
        "--input-audio", str(input_audio_file),
        "--reference-audio", str(reference_path),
        "--target-name", target_name.value.strip(),
        "--output-base-dir", output_dir.value.strip(),
        "--token", token,
        "--output-sr", str(output_sr.value),
        "--whisper-model", whisper_model.value,
        "--min-duration", str(min_duration.value),
        "--merge-gap", str(merge_gap.value),
        "--verification-threshold", str(verification_threshold.value),
        "--concat-silence", str(concat_silence.value),
        "--diar-model", diar_model.value,
        "--osd-model", osd_model.value,
    ]

    # Only add language parameter if it's not empty
    language_code = language.value.strip()
    if language_code:
        cmd_list.extend(["--language", language_code])

    # Add boolean flags
    flag_widgets = (
        (skip_demucs, "--skip-demucs"),
        (disable_speechbrain, "--disable-speechbrain"),
        (skip_rejected_transcripts, "--skip-rejected-transcripts"),
        (dry_run, "--dry-run"),
        (debug_log, "--debug"),
        (keep_temp_files, "--keep-temp-files"),
    )
    cmd_list.extend(flag for widget, flag in flag_widgets if widget.value)
    return cmd_list


def _format_command_for_log(cmd_list):
    """Render ``cmd_list`` as a shell-style string with the token masked."""
    import shlex

    shown = []
    hide_next = False
    for arg in cmd_list:
        # Never echo the secret that follows --token into the visible log.
        shown.append("***" if hide_next else shlex.quote(arg))
        hide_next = arg == "--token"
    return " ".join(shown)


def _run_and_stream(cmd_list):
    """Run ``cmd_list``, echoing its combined stdout/stderr line by line.

    Returns:
        ``(exit_code, demucs_oom)`` where ``demucs_oom`` is true when the
        Demucs failure marker appeared in the output.
    """
    demucs_oom = False
    # List argv + no shell: paths or names containing spaces, quotes or
    # shell metacharacters are passed through verbatim.
    with subprocess.Popen(cmd_list, stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
                          text=True, bufsize=1) as process:
        for line in process.stdout:
            print(line, end='')
            # The pipe can only be read once, so detect the marker while
            # streaming instead of inspecting process.stdout afterwards.
            if _DEMUCS_OOM_MARKER in line:
                demucs_oom = True
        exit_code = process.wait()
    return exit_code, demucs_oom


def _report_failure(exit_code, demucs_oom):
    """Update the status line and log for a non-zero extractor exit code."""
    if demucs_oom:
        # Leading newline/indent kept verbatim from the original message.
        status_message.value = (
            "\n" + " " * 20
            + 'Error: Demucs ran out of memory. Please check "Skip Demucs" option and try again.'
        )
        print("\n❌ ERROR: Demucs ran out of memory (RC: -9). This is common in Colab with large files.")
        print("SOLUTION: Check the 'Skip Demucs Vocal Separation' option and try again.")
    else:
        status_message.value = f"Error: Voice extraction failed with exit code {exit_code}."
        print(f"\n❌ Process failed with exit code: {exit_code}")


def _create_results_zip(run_dir, safe_target, keep_next_to_output):
    """Zip ``run_dir`` and return the archive path as a string.

    When ``keep_next_to_output`` is true the archive is written beside
    ``run_dir`` (the user's output directory); otherwise it goes to a fresh
    local temporary directory so no copy lands in the output directory.
    """
    if keep_next_to_output:
        zip_parent = run_dir.parent
    else:
        import tempfile
        # Deliberately not removed here: the browser download may still be
        # reading the file after this function returns.
        zip_parent = Path(tempfile.mkdtemp(prefix="voice_extractor_"))

    base_name_for_zip = zip_parent / f"{safe_target}_dataset"
    zip_file_path = f"{base_name_for_zip}.zip"

    print(f"\nCreating ZIP archive of results: {zip_file_path}")
    shutil.make_archive(str(base_name_for_zip), 'zip',
                        root_dir=str(run_dir.parent), base_dir=run_dir.name)
    print(f"✅ ZIP created successfully: {zip_file_path}")
    return zip_file_path


def _push_dataset_to_hub(run_dir, token):
    """Load the verified transcript CSV as a dataset and push it to the Hub.

    Raises:
        FileNotFoundError: If no CSV exists under
            ``transcripts_solo_verified`` in ``run_dir``.
    """
    repo_id = hf_dataset_repo.value.strip()
    print(f"\nPreparing to push dataset to Hugging Face: {repo_id}")
    from datasets import load_dataset, Audio as HFAudio

    csv_candidates = sorted(run_dir.glob("transcripts_solo_verified/*.csv"))
    if not csv_candidates:
        raise FileNotFoundError(
            f"No verified transcript CSV found in {run_dir / 'transcripts_solo_verified'}"
        )
    verified_csv_path = csv_candidates[0]
    print(f"Loading dataset from {verified_csv_path}")

    ds = load_dataset('csv', data_files={'train': str(verified_csv_path)},
                      data_dir=str(run_dir))
    # NOTE(review): assumes the CSV has a 'filename' column holding audio
    # paths -- confirm against the extractor's CSV schema.
    ds = ds.cast_column('filename', HFAudio())

    print(f"Pushing dataset to Hugging Face Hub: {repo_id}")
    ds.push_to_hub(
        repo_id,
        private=hf_dataset_private.value,
        token=token,
        embed_external_files=True
    )
    print(f"✅ Dataset pushed successfully to https://huggingface.co/datasets/{repo_id}")


def _show_results(run_dir):
    """Render the audio preview and a transcript sample in ``results_output``."""
    from IPython.display import Audio

    with results_output:
        print("## Extraction Results Summary\n")

        # Each preview is best-effort: a failure in one must not hide the other.
        try:
            concat_files = sorted(run_dir.glob("concatenated_audio_solo/*.wav"))
            if concat_files:
                concat_file = concat_files[0]
                print(f"### Concatenated audio: {concat_file.name}\n")
                display(Audio(str(concat_file)))
            else:
                print("### No concatenated audio file found\n")
        except Exception as e:
            print(f"### Error displaying audio: {str(e)}\n")

        try:
            transcript_csv_files = sorted(run_dir.glob("transcripts_solo_verified/*.csv"))
            if transcript_csv_files:
                transcript_csv = transcript_csv_files[0]
                df = pd.read_csv(transcript_csv)
                print(f"\n### Transcript sample (from {transcript_csv.name}):\n")
                display(df.head())
                print(f"\nTotal segments: {len(df)}")
            else:
                print("\n### No transcript CSV found")
        except Exception as e:
            print(f"\n### Error displaying transcript: {str(e)}")


def _run_extraction_pipeline():
    """Authenticate, run the extractor, then package and present the results.

    Any exception propagates to the caller, which reports it in the UI.
    """
    token = hf_token.value.strip()
    print(f"Authenticating with Hugging Face using token starting with: {token[:4]}...")
    login(token=token, add_to_git_credential=False)
    print("✅ Authentication successful")

    # Find audio files
    input_audio_file = _find_first_audio_file(Path(audio_dir.value.strip()))
    print(f"Found audio file: {input_audio_file}")

    # Fail early with a clear message instead of deep inside the extractor.
    reference_path = Path(reference_file.value.strip())
    if not reference_path.is_file():
        raise FileNotFoundError(f"Reference audio not found: {reference_path}")

    cmd_list = _build_extractor_command(input_audio_file, reference_path, token)

    # Execute command
    status_message.value = "Status: Running Voice Extractor script..."
    print(f"Executing command: {_format_command_for_log(cmd_list)}\n--- LOG START ---")
    exit_code, demucs_oom = _run_and_stream(cmd_list)

    if exit_code != 0:
        _report_failure(exit_code, demucs_oom)
        return

    status_message.value = "Status: Voice extraction completed successfully!"

    # NOTE(review): this directory naming scheme mirrors what the extractor
    # is expected to produce -- confirm against run_extractor.py.
    safe_target = target_name.value.strip().replace(' ', '_')
    run_dir = Path(output_dir.value.strip()) / f"{safe_target}_{input_audio_file.stem}_SOLO_Split"
    if not run_dir.is_dir():
        raise FileNotFoundError(f"Expected run output directory not found: {run_dir}")

    # Decide from the selected label. The previous substring test
    # ("Download to Computer") did not match the label
    # "Download ZIP to Computer (...)", so that option never downloaded.
    method = output_method.value
    zip_file_path = _create_results_zip(
        run_dir, safe_target, keep_next_to_output="No GDrive save" not in method
    )

    if "Download" in method:
        print("\nPreparing to download ZIP file...")
        files.download(zip_file_path)
        print("✅ Download initiated. Check your browser downloads.")

    if push_to_hf.value:
        _push_dataset_to_hub(run_dir, token)

    _show_results(run_dir)


def run_extraction(b):
    """Start-button callback: run the whole extraction workflow.

    Clears the previous log and results, disables the button while work is
    in progress, and reports any error in the status line and the log.
    The button is always restored afterwards, even if the run is aborted.

    Args:
        b: The clicked button (unused; required by the ``on_click`` signature).
    """
    log_output.clear_output()
    results_output.clear_output()

    # Update UI
    start_btn.disabled = True
    start_btn.description = "🔄 Processing..."
    start_btn.icon = "spinner"
    status_message.value = "Status: Initializing... Authenticating with Hugging Face..."

    try:
        with log_output:
            try:
                _run_extraction_pipeline()
            except Exception as e:
                status_message.value = f"Error: {str(e)}"
                print(f"❌ Error: {str(e)}")
    finally:
        # Re-enable UI
        start_btn.disabled = False
        start_btn.description = "🚀 Start Extraction"
        start_btn.icon = "play"
        validate_inputs()

# Attach event to button
start_btn.on_click(run_extraction)

# Group related settings
# Each VBox becomes one collapsible pane of the accordion below.
segment_params = widgets.VBox([min_duration, merge_gap, verification_threshold, concat_silence, 
                             disable_speechbrain, skip_rejected_transcripts])
model_params = widgets.VBox([diar_model, osd_model])
debug_params = widgets.VBox([dry_run, debug_log, keep_temp_files])

# Create accordion for advanced settings
advanced_accordion = widgets.Accordion(
    children=[segment_params, model_params, debug_params]
)
# Title indices follow the order of `children` above.
advanced_accordion.set_title(0, 'Segment Parameters')
advanced_accordion.set_title(1, 'Model Options')
advanced_accordion.set_title(2, 'Debug & Temp Files')

# Create final layout
# Single vertical stack: title, then each section heading followed by its
# controls, then the Start button, status line, log and results panes.
main_layout = widgets.VBox([
    widgets.HTML("Voice Extractor - Google Colab Interface"),
    widgets.HTML("Extract solo voice segments of a target speaker from multi-speaker recordings"),
    
    auth_section,
    hf_token,
    
    input_section,
    audio_dir,
    reference_file,
    target_name,
    output_dir,
    
    processing_section,
    output_sr,
    whisper_model,
    language,
    
    performance_section,
    memory_warning,
    widgets.HBox([skip_demucs, skip_demucs_description]),
    
    advanced_section,
    advanced_accordion,
    
    output_section,
    output_method,
    push_to_hf,
    hf_dataset_repo,
    hf_dataset_private,
    
    widgets.HBox([start_btn, validation_message]),
    status_message,
    create_section("Processing Log"),
    log_output,
    create_section("Results"),
    results_output
])

# Initialize validation
# Sets the initial Start-button state and validation message before the UI
# is shown (all required fields are empty at this point).
validate_inputs()

# Display the UI
display(main_layout)
print("Voice Extractor ready to use!")

# Voice Extractor - Usage Instructions

This notebook provides a graphical interface for the [Voice Extractor](https://github.com/ReisCook/Voice_Extractor) tool, which identifies, isolates, and transcribes clean solo segments of a target speaker from multi-speaker audio recordings.

## How to Use

1. **Authentication**: Enter your Hugging Face User Access Token. It is required to access the PyAnnote models.
2. **Input Files**:
   - Enter the path of the folder containing your audio (only the first compatible file found — `.wav`, `.mp3`, `.m4a`, or `.flac` — will be processed)
   - Enter the path of a clean reference audio clip containing ONLY your target speaker (5-30 seconds)
   - Enter a name for your target speaker
   - Choose an output directory for results
3. **Processing Options**: Configure sample rate, transcription model, and other settings
4. **Advanced Options**: Fine-tune segment parameters, model selection, and debug settings
5. **Output Handling**: Choose how to save results and optionally push to Hugging Face (note: the Hugging Face upload is currently broken)
6. **Start Processing**: Click the "Start Extraction" button when all required fields are filled

## Important Notes

- You need to accept the terms of use for the following PyAnnote models on Hugging Face:
  - [pyannote/speaker-diarization-3.1](https://huggingface.co/pyannote/speaker-diarization-3.1)
  - [pyannote/overlapped-speech-detection](https://huggingface.co/pyannote/overlapped-speech-detection)
  - [pyannote/segmentation-3.0](https://huggingface.co/pyannote/segmentation-3.0)
  - [pyannote/segmentation](https://huggingface.co/pyannote/segmentation)
- You'll need a Hugging Face access token which you can create at: [https://huggingface.co/settings/tokens](https://huggingface.co/settings/tokens)
- For optimal results, provide a clean reference audio with only the target speaker's voice
- The "Dry Run" option is helpful for testing as it processes only the first 60 seconds
- GPU acceleration is automatically used when available
- **Colab Users**: Select T4 GPU runtime for 10-20x faster processing (Runtime → Change runtime type → T4)
- **Need Help?** If you encounter any issues or have questions, feel free to contact me at: reiscook@gmail.com

For more detailed documentation, visit the [Voice Extractor GitHub repository](https://github.com/ReisCook/Voice_Extractor).
