In [None]:
import os

# Fetch the Voice_Extractor sources once; the existence check lets this cell be
# re-run without `git clone` failing on an already-present folder.
if not os.path.exists('Voice_Extractor'):
    !git clone -q https://github.com/ReisCook/Voice_Extractor.git

# Install the repository's requirements, then the extra packages used by this
# notebook (ipywidgets, pandas, huggingface_hub and datasets are imported below).
# NOTE(review): fastai is uninstalled first, presumably to avoid a dependency
# conflict with the requirements — confirm the reason.
!pip uninstall -y fastai
!pip install -q -r Voice_Extractor/requirements.txt
!pip install -q ipywidgets pandas matplotlib huggingface_hub datasets

# Mount Google Drive at /content/drive (the placeholder paths in the form point there).
from google.colab import drive
drive.mount('/content/drive')

import ipywidgets as widgets
import subprocess
import shutil
import pandas as pd
from IPython.display import display, HTML, Audio # Added Audio here for earlier availability if needed
from google.colab import files
from huggingface_hub import login
from pathlib import Path

# Removed empty HTML display call that was here

# Create UI components
def create_section(title):
    """Return an HTML widget that renders *title* as an ``<h3>`` heading."""
    heading_markup = "<h3>{}</h3>".format(title)
    return widgets.HTML(heading_markup)

def create_text_input(description, placeholder="", required=False, password=False):
    """Build a full-width, single-line text field.

    Args:
        description: Label text; a trailing colon is appended.
        placeholder: Hint shown while the field is empty.
        required: When true, the label is prefixed with ``*``.
        password: When true, a masked ``Password`` widget is returned
            instead of a plain ``Text`` widget.
    """
    required_marker = "*" if required else ""
    field_options = {
        "description": f"{required_marker}{description}:",
        "placeholder": placeholder,
        "layout": widgets.Layout(width='100%'),
    }
    if password:
        return widgets.Password(**field_options)
    return widgets.Text(**field_options)

def create_dropdown(description, options, default_value=None):
    """Build a full-width dropdown.

    The initial selection is *default_value* when it is one of *options*;
    otherwise the first option is used, or ``None`` if *options* is empty.
    """
    if default_value in options:
        initial_choice = default_value
    elif options:
        initial_choice = options[0]
    else:
        initial_choice = None

    full_width = widgets.Layout(width='100%')
    return widgets.Dropdown(
        description=f"{description}:",
        options=options,
        value=initial_choice,
        layout=full_width,
    )

def create_slider(description, min_val, max_val, step, default):
    """Build a full-width float slider over ``[min_val, max_val]`` starting at *default*."""
    slider_options = dict(
        description=f"{description}:",
        min=min_val,
        max=max_val,
        step=step,
        value=default,
        layout=widgets.Layout(width='100%'),
    )
    return widgets.FloatSlider(**slider_options)

def create_checkbox(description, initial_value=False):
    """Build an auto-width checkbox labelled *description*, initially set to *initial_value*."""
    auto_width = widgets.Layout(width='auto')
    return widgets.Checkbox(description=description, value=initial_value, layout=auto_width)

# Section headings for the form, created in one pass
(
    auth_section,
    input_section,
    processing_section,
    performance_section,
    advanced_section,
    output_section,
) = map(create_section, (
    "Authentication & Setup",
    "Input Files & Target Name",
    "Basic Processing Options",
    "Performance & Memory Options",
    "Advanced Settings",
    "Output Handling & Export",
))

# Memory warning displayed with the "Skip Demucs" checkbox.
# The previous wording advised disabling the skip when the audio is ALREADY
# vocally isolated, which is the opposite of the intended advice: separation is
# only worth its memory cost when the input is not yet isolated.
memory_warning = widgets.HTML(
    "<p style='color:orange;'>⚠️ 'Skip Demucs' is enabled by default to prevent memory errors in Colab. Disable only if you are processing small files whose audio is not already vocally isolated.</p>"
)

# Required text inputs: credentials, source/reference audio, speaker name, output location
hf_token = create_text_input(
    description="HF Token", placeholder="hf_...", required=True, password=True
)
audio_dir = create_text_input(
    description="Audio Directory",
    placeholder="/content/drive/MyDrive/your_audio_folder",
    required=True,
)
reference_file = create_text_input(
    description="Reference Audio",
    placeholder="/content/drive/MyDrive/your_reference.wav",
    required=True,
)
target_name = create_text_input(
    description="Target Name", placeholder="e.g., JohnDoe", required=True
)
output_dir = create_text_input(
    description="Output Directory",
    placeholder="/content/drive/MyDrive/VoiceExtractor_Runs",
    required=True,
)

# Basic processing choices
output_sr = create_dropdown(
    description="Output Sample Rate",
    options=[16000, 22050, 24000, 44100, 48000],
    default_value=24000,
)
whisper_model = create_dropdown(
    description="Whisper Model",
    options=[
        'tiny.en', 'tiny',
        'base.en', 'base',
        'small.en', 'small',
        'medium.en', 'medium',
        'large-v1', 'large-v2', 'large-v3',
    ],
    default_value='large-v3',
)
language = create_text_input(
    description="Language Code (e.g., en, es)", placeholder="en"
)

# Performance: Demucs separation is skipped unless the user opts in
skip_demucs = create_checkbox(
    description="Skip Demucs Vocal Separation", initial_value=True
)
skip_demucs_description = widgets.HTML(
    "<small>Recommended for Colab. If your audio already has isolated vocals, keep this checked.</small>"
)

# Segment tuning sliders: (label, min, max, step, initial value)
min_duration = create_slider(
    description="Min Segment Duration (s)", min_val=0.5, max_val=10.0, step=0.1, default=1.0
)
merge_gap = create_slider(
    description="Merge Gap (s)", min_val=0.0, max_val=2.0, step=0.05, default=0.25
)
verification_threshold = create_slider(
    description="Verification Threshold (0-1)", min_val=0.0, max_val=1.0, step=0.01, default=0.69
)
concat_silence = create_slider(
    description="Concatenation Silence (s)", min_val=0.0, max_val=5.0, step=0.1, default=0.5
)

# Verification / transcription switches (all start unchecked)
disable_speechbrain = create_checkbox(description="Disable SpeechBrain Verification")
skip_rejected_transcripts = create_checkbox(description="Skip Transcribing Rejected Segments")

# Model selection
diar_model = create_dropdown(
    description="Diarization Model",
    options=["pyannote/speaker-diarization-3.1", "pyannote/speaker-diarization-3.0"],
    default_value="pyannote/speaker-diarization-3.1",
)
osd_model = create_dropdown(
    description="OSD Model",
    options=["pyannote/overlapped-speech-detection"],
    default_value="pyannote/overlapped-speech-detection",
)

# Debugging helpers (all start unchecked)
dry_run = create_checkbox(description="Dry Run (Process first 60s only)")
debug_log = create_checkbox(description="Enable Verbose Debug Logging")
keep_temp_files = create_checkbox(description="Keep Temporary Processing Files")

# How the resulting ZIP is delivered
output_method = create_dropdown(
    description="Output Methods",
    options=[
        "Save ZIP to GDrive & Download to Computer",
        "Download ZIP to Computer (No GDrive save of .zip)",
        "Save ZIP to GDrive Only",
    ],
    default_value="Save ZIP to GDrive & Download to Computer",
)

# Optional Hugging Face upload; the two dataset fields start disabled and are
# only unlocked while the "push" checkbox is ticked
push_to_hf = create_checkbox(description="Push Final Dataset to Hugging Face Hub")
hf_dataset_repo = create_text_input(
    description="HF Dataset Repo", placeholder="your_username/dataset_name"
)
hf_dataset_private = create_checkbox(
    description="Make HF Dataset Private", initial_value=True
)
hf_dataset_repo.disabled = hf_dataset_private.disabled = True

# Status elements
status_message = widgets.HTML("Status: Ready. Configure and click Start.")
validation_message = widgets.HTML()

# Layout notes:
#  * `margin_top` is not a trait of ipywidgets' Layout, so it was silently
#    dropped; the top spacing is expressed through the supported `margin` trait.
#  * `overflow_y` is a deprecated Layout trait that newer ipywidgets releases no
#    longer provide; `overflow` is available across versions and keeps the
#    fixed-height log scrollable.
log_output = widgets.Output(layout={
    'height': '400px',
    'overflow': 'auto',
    'border': '1px solid #ccc',
    'margin': '10px 0 0 0',
})
results_output = widgets.Output(layout={'margin': '10px 0 0 0'})
# Toggle HF dataset fields
def toggle_hf_fields(change):
    """Enable the HF dataset fields only while the push checkbox is ticked.

    *change* is the observer notification; ``change.new`` holds the checkbox's
    new value. Validation is re-run afterwards because the repo field becomes
    mandatory when pushing is enabled.
    """
    fields_locked = not change.new
    for dataset_field in (hf_dataset_repo, hf_dataset_private):
        dataset_field.disabled = fields_locked
    validate_inputs()

# Keep the Hugging Face dataset fields in sync with the "push" checkbox
push_to_hf.observe(toggle_hf_fields, names='value')

# Start button: created disabled, validate_inputs() switches it on once the form is complete
start_btn = widgets.Button(
    description="🚀 Start Extraction",
    icon='play',
    button_style='success',
    disabled=True,
    layout=dict(width='250px', height='40px', margin='10px 0'),
)

# Validation function
def validate_inputs(*args):
    """Enable the Start button only when every mandatory field is non-blank.

    Accepts and ignores any positional arguments so it can be registered
    directly as a widget observer. The HF dataset repo field is mandatory only
    while the "push to Hugging Face" checkbox is ticked.
    """
    mandatory_fields = (hf_token, audio_dir, reference_file, target_name, output_dir)
    form_complete = all(field.value.strip() for field in mandatory_fields)

    if form_complete and push_to_hf.value:
        form_complete = bool(hf_dataset_repo.value.strip())

    start_btn.disabled = not form_complete
    validation_message.value = (
        "<p style='color:green;'>All required fields are filled.</p>"
        if form_complete
        else "<p style='color:red;'>Please fill all required fields marked with *.</p>"
    )

# Re-validate the form whenever a field that affects the Start button changes
for watched_field in (hf_token, audio_dir, reference_file, target_name, output_dir, hf_dataset_repo):
    watched_field.observe(validate_inputs, names='value')

# Main function to run extraction
# Audio extensions accepted as input, in order of preference when several match.
_AUDIO_EXTENSIONS = ('.wav', '.mp3', '.m4a', '.flac')


def _find_input_audio(audio_dir_path):
    """Pick the audio file to process from *audio_dir_path* (searched recursively).

    Extensions are matched case-insensitively. The choice is deterministic:
    files are ranked by extension preference (``_AUDIO_EXTENSIONS`` order) and
    then by path.

    Raises:
        FileNotFoundError: if the directory is missing or holds no compatible file.
    """
    if not audio_dir_path.is_dir():
        raise FileNotFoundError(f"Audio directory not found or not a directory: {audio_dir_path}")

    candidates = [
        path for path in audio_dir_path.rglob("*")
        if path.suffix.lower() in _AUDIO_EXTENSIONS and path.is_file()
    ]
    if not candidates:
        raise FileNotFoundError(f"No compatible audio files (.wav, .mp3, .m4a, .flac) found in {audio_dir_path} or its subdirectories.")

    return min(
        candidates,
        key=lambda path: (_AUDIO_EXTENSIONS.index(path.suffix.lower()), str(path)),
    )


def _build_command(input_audio_file, reference_audio, speaker_name, output_base_dir, token):
    """Assemble the argument list for ``run_extractor.py`` from the form widgets."""
    cmd_list = [
        "python", "Voice_Extractor/run_extractor.py",
        "--input-audio", str(input_audio_file),
        "--reference-audio", reference_audio,
        "--target-name", speaker_name,
        "--output-base-dir", output_base_dir,
        "--token", token,
        "--output-sr", str(output_sr.value),
        "--whisper-model", whisper_model.value,
        "--min-duration", str(min_duration.value),
        "--merge-gap", str(merge_gap.value),
        "--verification-threshold", str(verification_threshold.value),
        "--concat-silence", str(concat_silence.value),
        "--diar-model", diar_model.value,
        "--osd-model", osd_model.value,
    ]

    language_code = language.value.strip()
    if language_code:
        cmd_list.extend(["--language", language_code])

    optional_flags = (
        (skip_demucs, "--skip-demucs"),
        (disable_speechbrain, "--disable-speechbrain"),
        (skip_rejected_transcripts, "--skip-rejected-transcripts"),
        (dry_run, "--dry-run"),
        (debug_log, "--debug"),
        (keep_temp_files, "--keep-temp-files"),
    )
    cmd_list.extend(flag for checkbox, flag in optional_flags if checkbox.value)
    return cmd_list


def _redact_command(cmd_list, secret):
    """Join *cmd_list* for display, masking every argument equal to *secret*."""
    return ' '.join(
        '<hidden>' if secret and part == secret else part for part in cmd_list
    )


def _run_and_stream(cmd_list):
    """Run *cmd_list*, echoing its combined stdout/stderr line by line.

    Returns:
        ``(exit_code, full_output)`` where *full_output* is everything printed.
    """
    captured_lines = []
    # The context manager closes the pipe and reaps the child even on error.
    with subprocess.Popen(cmd_list, stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
                          text=True, bufsize=1, shell=False) as process:
        for line in process.stdout:
            print(line, end='')
            captured_lines.append(line)
        exit_code = process.wait()
    return exit_code, "".join(captured_lines)


def _zip_results(run_dir, zip_parent, base_zip_name):
    """Archive *run_dir* as ``<zip_parent>/<base_zip_name>.zip``; return its path or ``None`` on failure."""
    zip_path = zip_parent / f"{base_zip_name}.zip"
    print(f"\nCreating ZIP archive of results: {zip_path}")
    try:
        # make_archive wants the base name without ".zip"; root_dir/base_dir make
        # the archive contain the run folder itself rather than its bare contents.
        shutil.make_archive(str(zip_parent / base_zip_name), 'zip',
                            root_dir=run_dir.parent, base_dir=run_dir.name)
    except Exception as zip_e:
        print(f"❌ Error creating ZIP: {zip_e}")
        return None
    print(f"✅ ZIP created successfully: {zip_path}")
    return zip_path


def _push_dataset_to_hub(run_dir, repo_id, token):
    """Load the verified transcript CSV from *run_dir* and push it, with audio, to *repo_id*."""
    import traceback

    print(f"\nPreparing to push dataset to Hugging Face: {repo_id}")
    from datasets import load_dataset, Audio as HFAudio  # Keep import local

    verified_dir = run_dir / "transcripts_solo_verified"
    verified_csv_files = sorted(verified_dir.glob("*.csv"))
    if not verified_csv_files:
        print(f"❌ Error: No verified transcript CSV found in {verified_dir}. Cannot push to Hugging Face.")
        return

    verified_csv_path = verified_csv_files[0]
    print(f"Loading dataset from {verified_csv_path}")
    try:
        # NOTE(review): assumes the CSV's 'filename' column holds audio paths that
        # are absolute or resolvable from the run directory — confirm against
        # what run_extractor.py writes.
        ds = load_dataset('csv', data_files={'train': str(verified_csv_path)}, data_dir=str(run_dir))

        if 'filename' not in ds['train'].column_names:
            raise ValueError("CSV file must contain a 'filename' column with paths to audio files.")

        # Casting to Audio lets embed_external_files bundle the referenced clips.
        ds = ds.cast_column('filename', HFAudio(sampling_rate=int(output_sr.value)))

        print(f"Pushing dataset to Hugging Face Hub: {repo_id}")
        ds.push_to_hub(
            repo_id,
            private=hf_dataset_private.value,
            token=token,
            embed_external_files=True,
        )
        print(f"✅ Dataset pushed successfully to https://huggingface.co/datasets/{repo_id}")
    except Exception as hf_e:
        print(f"❌ Error during Hugging Face dataset processing or push: {hf_e}")
        print("Info: This might be due to incorrect paths in the CSV, issues with audio files, or Hugging Face API problems.")
        traceback.print_exc()


def _show_results(run_dir):
    """Render the concatenated audio player and a transcript preview in the results area."""
    import html

    with results_output:
        display(HTML("<h2>Extraction Results Summary</h2>"))
        try:
            concat_files = sorted(run_dir.glob("concatenated_audio_solo/*.wav"))
            if concat_files:
                concat_file = concat_files[0]
                display(HTML(f"<h3>Concatenated audio: {concat_file.name}</h3>"))
                display(Audio(str(concat_file)))
            else:
                display(HTML("<h3>No concatenated audio file found</h3>"))
        except Exception as e:
            display(HTML(f"<h3>Error displaying audio: {html.escape(str(e))}</h3>"))

        try:
            transcript_csv_files = sorted(run_dir.glob("transcripts_solo_verified/*.csv"))
            if transcript_csv_files:
                transcript_csv = transcript_csv_files[0]
                df = pd.read_csv(transcript_csv)
                display(HTML(f"<h3>Transcript sample (from {transcript_csv.name}):</h3>"))
                display(df.head())
                display(HTML(f"<p>Total segments: {len(df)}</p>"))
            else:
                display(HTML("<h3>No transcript CSV found</h3>"))
        except Exception as e:
            display(HTML(f"<h3>Error displaying transcript: {html.escape(str(e))}</h3>"))


def _export_results(speaker_name, input_audio_file, output_base_dir, token):
    """Zip, download, push and preview the output of a successful run."""
    import tempfile

    name_prefix = f"{speaker_name.replace(' ', '_')}_{input_audio_file.stem}"
    base_output_path = Path(output_base_dir)
    run_dir = base_output_path / f"{name_prefix}_SOLO_Split"
    run_dir_ok = run_dir.is_dir()

    if run_dir_ok:
        print(f"\nOutput generated in: {run_dir}")
    else:
        print(f"\n⚠️ Warning: Expected output directory {run_dir} not found. ZIP and HF push might fail.")
        print(f"Contents of base output directory ({output_base_dir}):")
        try:
            for item in base_output_path.iterdir():
                print(f"  - {item.name}")
        except OSError as list_e:
            print(f"    Could not list contents: {list_e}")

    # Decode the selected delivery option. Matching on "to Computer" (rather than
    # "Download to Computer") is required so that the option
    # "Download ZIP to Computer (No GDrive save of .zip)" actually downloads.
    method = output_method.value
    wants_download = "to Computer" in method
    keep_zip_in_output_dir = method.startswith("Save ZIP to GDrive")

    zip_path = None
    if run_dir_ok:
        # Download-only: build the archive in a scratch directory so no .zip is
        # left behind in the (Drive-backed) output directory.
        zip_parent = (
            base_output_path
            if keep_zip_in_output_dir
            else Path(tempfile.mkdtemp(prefix="voice_extractor_zip_"))
        )
        zip_path = _zip_results(run_dir, zip_parent, f"{name_prefix}_dataset")
    else:
        print(f"\nSkipping ZIP creation as output directory {run_dir} was not found or is not a directory.")

    if wants_download:
        if zip_path is not None:
            print("\nPreparing to download ZIP file...")
            files.download(str(zip_path))
            print("✅ Download initiated. Check your browser downloads.")
        else:
            print("\nSkipping download as ZIP file was not created successfully.")

    if push_to_hf.value:
        if run_dir_ok:
            _push_dataset_to_hub(run_dir, hf_dataset_repo.value.strip(), token)
        else:
            print("\nSkipping Hugging Face push as output directory was not found or is not a directory.")

    if run_dir_ok:
        _show_results(run_dir)


def _report_failure(exit_code, full_output_str):
    """Update the status line and log for a non-zero extractor exit code."""
    # RC -9 in the extractor's own output means Demucs was killed, which in
    # Colab is the out-of-memory case; give the user the specific remedy.
    if "Demucs failed! (RC: -9)" in full_output_str:
        error_html = "<p style='color:red;'><b>Error: Demucs ran out of memory (RC: -9).</b> This is common in Colab with large files.</p>"
        error_html += "<p style='color:orange;'><b>SOLUTION:</b> Ensure 'Skip Demucs Vocal Separation' option is checked and try again. If audio is not pre-separated, this step is vital but memory-intensive.</p>"
        status_message.value = error_html
        print("\n❌ ERROR: Demucs ran out of memory (RC: -9).")
        print("SOLUTION: Check the 'Skip Demucs Vocal Separation' option and try again.")
    else:
        status_message.value = f"<p style='color:red;'>Error: Voice extraction failed with exit code {exit_code}. Check logs for details.</p>"
        print(f"\n❌ Process failed with exit code: {exit_code}")


def run_extraction(b):
    """Click handler for the Start button: run the extractor and export its results.

    Authenticates with Hugging Face, selects the input audio file, runs
    ``Voice_Extractor/run_extractor.py`` while streaming its output into the log
    area, then (on success) zips/downloads/pushes/previews the results.
    All form values are whitespace-stripped before use, matching what
    ``validate_inputs`` checks. Errors are reported in the status line and log
    instead of being raised.

    Args:
        b: The button instance supplied by ``on_click`` (unused).
    """
    import html
    import traceback

    log_output.clear_output()
    results_output.clear_output()

    # Update UI
    start_btn.disabled = True
    start_btn.description = "🔄 Processing..."
    start_btn.icon = "spinner"
    status_message.value = "Status: Initializing... Authenticating with Hugging Face..."

    try:
        with log_output:
            try:
                token = hf_token.value.strip()
                speaker_name = target_name.value.strip()
                output_base_dir = output_dir.value.strip()
                reference_audio = reference_file.value.strip()

                print(f"Authenticating with Hugging Face using token starting with: {token[:4]}...")
                login(token=token, add_to_git_credential=False)
                print("✅ Authentication successful")

                input_audio_file = _find_input_audio(Path(audio_dir.value.strip()))
                print(f"Processing audio file: {input_audio_file}")

                # Fail fast instead of discovering a bad path deep into a long run.
                if not Path(reference_audio).is_file():
                    raise FileNotFoundError(f"Reference audio file not found: {reference_audio}")

                cmd_list = _build_command(
                    input_audio_file, reference_audio, speaker_name, output_base_dir, token
                )

                status_message.value = "Status: Running Voice Extractor script..."
                # The log is saved with the notebook, so never echo the token itself.
                print(f"Executing command: {_redact_command(cmd_list, token)}\n--- LOG START ---")
                exit_code, full_output_str = _run_and_stream(cmd_list)
                print("--- LOG END ---")

                if exit_code == 0:
                    status_message.value = "<p style='color:green;'>Status: Voice extraction completed successfully!</p>"
                    _export_results(speaker_name, input_audio_file, output_base_dir, token)
                else:
                    _report_failure(exit_code, full_output_str)

            except FileNotFoundError as e:
                status_message.value = f"<p style='color:red;'>File Not Found Error: {html.escape(str(e))}</p>"
                print(f"❌ File Not Found Error: {str(e)}")
            except Exception as e:
                status_message.value = f"<p style='color:red;'>An unexpected error occurred: {html.escape(str(e))}</p>"
                print(f"❌ An unexpected error occurred: {str(e)}")
                traceback.print_exc()
    finally:
        # Re-enable UI no matter how the run ended
        start_btn.disabled = False
        start_btn.description = "🚀 Start Extraction"
        start_btn.icon = "play"
        validate_inputs()  # Re-validate to ensure button state is correct

# Wire the Start button to the extraction routine
start_btn.on_click(run_extraction)

# Contents of the three "Advanced Settings" panels
segment_params = widgets.VBox(children=[
    min_duration,
    merge_gap,
    verification_threshold,
    concat_silence,
    disable_speechbrain,
    skip_rejected_transcripts,
])
model_params = widgets.VBox(children=[diar_model, osd_model])
debug_params = widgets.VBox(children=[dry_run, debug_log, keep_temp_files])

# Collapsible container holding the panels, titled in display order
advanced_accordion = widgets.Accordion(
    children=[segment_params, model_params, debug_params]
)
advanced_accordion.set_title(0, 'Segment Parameters')
advanced_accordion.set_title(1, 'Model Options')
advanced_accordion.set_title(2, 'Debug & Temp Files')

# Assemble the form top to bottom, one starred group per section
main_layout = widgets.VBox(children=[
    widgets.HTML("<h1>Voice Extractor - Google Colab Interface</h1>"),
    widgets.HTML("<p>Extract solo voice segments of a target speaker from multi-speaker recordings. Ensure you have accepted Hugging Face model terms (see instructions below).</p>"),
    *(auth_section, hf_token),
    *(input_section, audio_dir, reference_file, target_name, output_dir),
    *(processing_section, output_sr, whisper_model, language),
    *(
        performance_section,
        memory_warning,
        widgets.HBox([skip_demucs, skip_demucs_description]),
    ),
    *(advanced_section, advanced_accordion),
    *(output_section, output_method, push_to_hf, hf_dataset_repo, hf_dataset_private),
    widgets.HBox(
        [start_btn, validation_message],
        layout=widgets.Layout(align_items='center'),
    ),
    status_message,
    *(create_section("Processing Log"), log_output),
    *(create_section("Results"), results_output),
])

# Set the initial Start-button state and validation hint before showing the form
validate_inputs()

# Render the UI
display(main_layout)
print("Voice Extractor UI ready! Please fill the form and start extraction.")

# Voice Extractor - Usage Instructions

This notebook provides a graphical interface for the [Voice Extractor](https://github.com/ReisCook/Voice_Extractor) tool, which identifies, isolates, and transcribes clean solo segments of a target speaker from multi-speaker audio recordings.

## How to Use

1. **Authentication**: Enter your HuggingFace User Access Token (with `read` access, `write` if pushing datasets). This is required to download PyAnnote models.
2. **Input Files**:
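   - **Audio Directory**: Specify the GDrive folder containing your audio files (e.g., `/content/drive/MyDrive/my_podcast_episodes`). The tool searches this folder and its subfolders for compatible audio files (`.wav`, `.mp3`, `.m4a`, `.flac`) and processes only one file per run: the first compatible file it finds. To control which recording is used, keep a single audio file in the folder.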
   - **Reference Audio**: Provide a path to a clean audio file (5-30 seconds, `.wav` recommended) containing *only* your target speaker's voice. This is crucial for speaker identification.
   - **Target Name**: A descriptive name for your target speaker (e.g., `JohnDoe`, `Host_Alice`). This will be used in output filenames.
   - **Output Directory**: A GDrive folder where all results (extracted audio, transcripts, ZIP file) will be saved (e.g., `/content/drive/MyDrive/VoiceExtractor_Results`).
3. **Processing Options**: 
   - **Output Sample Rate**: Sample rate for the extracted audio files. `24000` Hz is a good balance for voice.
   - **Whisper Model**: Choose the ASR model for transcription. Larger models are more accurate but slower. `large-v3` is recommended for high accuracy.
   - **Language Code**: ISO code for the language spoken in the audio (e.g., `en` for English, `es` for Spanish). Leave blank for auto-detection by Whisper (can be less reliable).
4. **Performance & Memory Options**:
   - **Skip Demucs Vocal Separation**: **Highly recommended to keep checked in Colab**, especially for longer files, to avoid out-of-memory errors. Only uncheck if your input audio is *not* already vocally isolated (e.g., raw music with vocals) AND you are processing very short files or have ample RAM. If your audio is already clean speech (like a podcast recording or dialogue), Demucs is often unnecessary.
5. **Advanced Settings (Accordion)**:
   - **Segment Parameters**: Fine-tune how audio segments are defined and verified.
   - **Model Options**: Select specific PyAnnote models if needed (defaults are usually fine).
   - **Debug & Temp Files**: Useful for troubleshooting. `Dry Run` is great for testing your setup quickly.
6. **Output Handling & Export**:
   - **Output Methods**: Choose whether to save a ZIP of the results to GDrive, download it to your computer, or both.
   - **Push to Hugging Face Hub**: Optionally, upload the processed dataset (audio clips and transcripts) to a Hugging Face Dataset repository. Note: the tool's author has described this feature as currently unreliable, so it may not work in every case. If you use it, make sure the audio paths in the transcript CSV are correct and accessible to the `datasets` library.
7. **Start Processing**: Once all required fields (marked with `*`) are filled, the "🚀 Start Extraction" button will become active. Click it to begin.

## Important Notes

- **Google Drive**: This notebook requires Google Drive to be mounted for accessing input audio and saving results.
- **Hugging Face Account & Token**: You *must* have a Hugging Face account and a User Access Token. Create one at [https://huggingface.co/settings/tokens](https://huggingface.co/settings/tokens).
- **Accept Model Terms**: Before first use, you need to visit and accept the terms of use for the following PyAnnote models on Hugging Face (while logged in):
  - [pyannote/speaker-diarization-3.1](https://huggingface.co/pyannote/speaker-diarization-3.1)
  - [pyannote/overlapped-speech-detection](https://huggingface.co/pyannote/overlapped-speech-detection)
  - (Implicitly through diarization) [pyannote/segmentation-3.0](https://huggingface.co/pyannote/segmentation-3.0) or similar.
  Failure to do so will result in authentication errors when the script tries to download these models.
- **Reference Audio Quality**: The quality and specificity of your reference audio significantly impact the accuracy of target speaker extraction.
- **Input File Selection**: The script uses only the first compatible audio file found in the specified `Audio Directory` (and its subdirectories).
- **Colab Runtime**: For best performance, use a GPU runtime (Runtime → Change runtime type → T4 GPU). Processing can be very slow on CPU.
- **Long Audio Files**: Processing very long audio files can be time-consuming and memory-intensive. Consider splitting them into smaller chunks if you encounter issues.
- **First Run**: The first time you run the script, it will download models, which may take some time.

For more detailed documentation on the underlying tool, visit the [Voice Extractor GitHub repository](https://github.com/ReisCook/Voice_Extractor).
If you encounter issues specific to this Colab interface, check the Processing Log for error messages.
