# 🎙️ PrecisionVoice - Vietnamese Speech-to-Text

Notebook đơn giản để transcribe audio tiếng Việt sử dụng **faster-whisper** và **pyannote** (diarization).

### Hướng dẫn
1. **Chọn GPU**: `Runtime` → `Change runtime type` → **T4 GPU**
2. **Cài đặt Secrets**: Thêm `HF_TOKEN` vào Colab Secrets (Key icon bên trái) để dùng Pyannote.
3. **Chạy từng cell** theo thứ tự từ trên xuống
4. **Sử dụng Gradio link** ở cell cuối để truy cập UI

In [None]:
# @title 1. üîç Ki·ªÉm tra GPU
import torch

# Select the compute device: CUDA when PyTorch can see a GPU, CPU otherwise.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

if device != "cuda":
    # No GPU visible: point the user at the Colab runtime settings.
    print("‚ö†Ô∏è KH√îNG T√åM TH·∫§Y GPU!")
    print("üëâ V√†o Runtime ‚Üí Change runtime type ‚Üí T4 GPU")
else:
    # Report the name and total memory (scaled by 1e9, shown as GB) of GPU 0.
    gpu_name = torch.cuda.get_device_name(0)
    gpu_mem = torch.cuda.get_device_properties(0).total_memory / 1e9
    print(f"‚úÖ GPU Detected: {gpu_name}")
    print(f"   VRAM: {gpu_mem:.1f} GB")

In [None]:
# @title 2. üì¶ C√†i ƒë·∫∑t Dependencies
# Install the Python packages imported by the later cells. The `!` prefix is
# an IPython shell escape, so this cell only runs inside a notebook.
print("Installing dependencies...")
!pip install --upgrade torch torchvision torchaudio "pyannote.audio>=3.3.1" faster-whisper gradio librosa nest_asyncio lightning torchmetrics
# ffmpeg is the command-line tool invoked by convert_audio_to_wav; all of
# apt-get's output (stdout and stderr) is discarded.
!apt-get install -y -qq ffmpeg > /dev/null 2>&1
# NOTE(review): this message is printed unconditionally; the exit status of
# the two shell commands above is never checked.
print("‚úÖ Dependencies installed successfully!")

In [None]:
# @title 3. ü§ñ Load Models (Whisper & Pyannote)
import torch
import time
import os
import librosa
import numpy as np
from google.colab import userdata
from faster_whisper import WhisperModel
from pyannote.audio import Pipeline

# Register pyannote's task classes as safe globals for torch deserialization.
# Best-effort: any failure is reported and otherwise ignored.
try:
    from pyannote.audio.core.task import Specifications, Problem, Resolution
    torch.serialization.add_safe_globals(
        [Specifications, Problem, Resolution]
    )
except Exception as err:
    print(f"Could not add custom globals: {err}")

# Device and matching compute type: float16 on CUDA, int8 on CPU.
if torch.cuda.is_available():
    device = "cuda"
    compute_type = "float16"
else:
    device = "cpu"
    compute_type = "int8"

# Supported Whisper models: UI label -> model path handed to WhisperModel.
# Insertion order matters: the first entry is the UI's default selection.
AVAILABLE_MODELS = {
    "EraX-WoW-Turbo (Whisper Large V3 Turbo - Ti·∫øng Vi·ªát)":
        "erax-ai/EraX-WoW-Turbo-V1.1-CT2",
    "PhoWhisper Large (Ti·∫øng Vi·ªát)":
        "kiendt/PhoWhisper-large-ct2",
}

# Caches: loaded Whisper models keyed by "<label>_<compute type>", and the
# diarization pipeline (stays None until it is loaded successfully).
diarization_pipeline = None
loaded_whisper_models = {}

# Fetch HF_TOKEN: try Colab Secrets first, then the process environment.
hf_token = None
try:
    hf_token = userdata.get('HF_TOKEN')
except Exception:
    # Deliberately best-effort: whatever went wrong with the Colab lookup,
    # fall through to the environment variable. `except Exception` (rather
    # than a bare `except`) lets KeyboardInterrupt/SystemExit propagate.
    pass

# Also covers the case where the secret exists but is empty.
if not hf_token:
    hf_token = os.environ.get('HF_TOKEN')

# ==================== PRELOAD EVERY WHISPER MODEL ====================
# Each model that loads is stored in `loaded_whisper_models` under the key
# "<label>_<compute type>"; a failing model is reported and skipped.
print("=" * 50)
print("üîÑ Pre-downloading ALL Whisper Models...")
print("=" * 50)

total_start = time.time()
for model_name in AVAILABLE_MODELS:
    model_path = AVAILABLE_MODELS[model_name]
    print(f"\nüì• Loading: {model_name}")
    start = time.time()
    try:
        model = WhisperModel(model_path, device=device, compute_type=compute_type)
    except Exception as e:
        print(f"   ‚ùå Failed to load: {e}")
    else:
        loaded_whisper_models[f"{model_name}_{compute_type}"] = model
        print(f"   ‚úÖ Loaded in {time.time() - start:.1f}s")

# Summary: elapsed time, number of models actually cached, and settings used.
print(f"\n‚úÖ All models loaded in {time.time() - total_start:.1f}s")
print(f"   Total models: {len(loaded_whisper_models)}")
print(f"   Device: {device}, Compute: {compute_type}")

# ==================== LOAD PYANNOTE ====================
print("\n" + "="*50)
print("üîÑ Loading Pyannote Diarization...")
print("="*50)

if not hf_token:
    # Without a token the pipeline is not loaded; `diarization_pipeline`
    # keeps its previous value (None).
    print("‚ö†Ô∏è WARNING: HF_TOKEN not found!")
    print("   Diarization will be disabled.")
    print("   Please set HF_TOKEN in Colab Secrets.")
else:
    start = time.time()
    try:
        # Build and move the pipeline under a temporary name first, so the
        # global is only set once the pipeline is fully usable.
        _pipeline = Pipeline.from_pretrained(
            "pyannote/speaker-diarization-community-1",
            token=hf_token
        )
        _pipeline.to(torch.device(device))
        diarization_pipeline = _pipeline
        print(f"‚úÖ Pyannote loaded in {time.time() - start:.1f}s")
    except Exception as e:
        print(f"‚ùå Failed to load Pyannote: {e}")

# Only claim success when every Whisper model and the diarization pipeline
# are really available; otherwise send the user back to the messages above.
print("\n" + "="*50)
if diarization_pipeline is not None and len(loaded_whisper_models) == len(AVAILABLE_MODELS):
    print("üéâ All models loaded successfully!")
else:
    print("‚ö†Ô∏è WARNING: Some models failed to load - see the messages above.")
print("="*50)


In [None]:
# @title 4. üõ†Ô∏è Utilities & Helpers
import gradio as gr
import time
import nest_asyncio
import subprocess
import os

# Apply nest_asyncio's patch once, before any UI code runs.
# NOTE(review): presumably needed so Gradio can run inside the notebook's
# already-running event loop — confirm it is still required.
nest_asyncio.apply()

def convert_audio_to_wav(audio_path):
    """Convert *audio_path* to a 16 kHz mono WAV file with ffmpeg.

    The result is always written to ``temp_processed_audio.wav`` in the
    current working directory, replacing the file left by an earlier call.

    Args:
        audio_path: Path of the input audio file.

    Returns:
        The path of the converted WAV file. If the conversion fails for any
        reason, the error is printed and ``audio_path`` is returned unchanged
        (best-effort fallback: the caller then works on the original file).
    """
    output_path = "temp_processed_audio.wav"

    # ffmpeg flags:
    #   -nostdin   never read from stdin
    #   -y         overwrite the output file
    #   -i         input file
    #   -ar 16000  resample to 16 kHz
    #   -ac 1      downmix to a single (mono) channel
    command = [
        "ffmpeg",
        "-nostdin",
        "-y",
        "-i", audio_path,
        "-ar", "16000",
        "-ac", "1",
        output_path,
    ]

    try:
        # The input may itself be the fixed output file; deleting it below
        # would destroy the only copy, so hand it back untouched instead.
        if os.path.abspath(audio_path) == os.path.abspath(output_path):
            return audio_path

        # Remove the previous result so a stale file is never left behind.
        if os.path.exists(output_path):
            os.remove(output_path)

        subprocess.run(
            command,
            check=True,
            stdin=subprocess.DEVNULL,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.PIPE,  # kept so failures can be explained
        )
    except subprocess.CalledProcessError as e:
        print(f"Error converting audio: {e}")
        # The last stderr line usually carries ffmpeg's actual reason.
        detail = (e.stderr or b"").decode("utf-8", errors="replace").strip()
        if detail:
            print(detail.splitlines()[-1])
        return audio_path
    except Exception as e:
        # Deliberately broad: e.g. ffmpeg missing or an unusable path.
        print(f"Error converting audio: {e}")
        return audio_path

    return output_path

def load_whisper_model(model_name, comp_type):
    """Return the Whisper model for *model_name* and *comp_type*, loading on a cache miss.

    Models are cached in the module-level ``loaded_whisper_models`` dict under
    the key ``"<model_name>_<comp_type>"``.

    Args:
        model_name: A key of ``AVAILABLE_MODELS``.
        comp_type: Compute type handed to ``WhisperModel``.

    Returns:
        The cached or freshly created ``WhisperModel`` instance.

    Raises:
        KeyError: If *model_name* is not in ``AVAILABLE_MODELS`` and the
            model is not cached yet.
    """
    key = f"{model_name}_{comp_type}"

    # Fast path: this model/compute-type pair was loaded before.
    try:
        return loaded_whisper_models[key]
    except KeyError:
        pass

    repo_id = AVAILABLE_MODELS[model_name]
    print(f"Loading {model_name}...")
    began = time.time()

    loaded_whisper_models[key] = WhisperModel(
        repo_id, device=device, compute_type=comp_type
    )

    print(f"‚úÖ Loaded in {time.time() - began:.1f}s")
    return loaded_whisper_models[key]

def format_timestamp(seconds):
    """Format a duration in seconds as ``MM:SS.cc`` or ``HH:MM:SS.cc``.

    The value is rounded to hundredths of a second *before* it is split into
    fields, so a value such as 59.999 rolls over to ``01:00.00`` instead of
    rendering as ``00:60.00``. Negative input is clamped to zero.

    Args:
        seconds: Duration in seconds (int or float).

    Returns:
        ``"HH:MM:SS.cc"`` when the duration is at least one hour, otherwise
        ``"MM:SS.cc"``.
    """
    total_centis = int(round(max(0.0, float(seconds)) * 100))
    total_secs, centis = divmod(total_centis, 100)
    total_minutes, secs = divmod(total_secs, 60)
    hours, minutes = divmod(total_minutes, 60)

    if hours > 0:
        return f"{hours:02d}:{minutes:02d}:{secs:02d}.{centis:02d}"
    return f"{minutes:02d}:{secs:02d}.{centis:02d}"

def assign_speaker_to_segment(seg_start, seg_end, diarization_result):
    """Pick the speaker whose turns overlap a segment the most.

    The winning speaker is returned only when its accumulated overlap covers
    at least 30% of the segment; in every other case ``"SPEAKER_00"`` is
    returned.

    Args:
        seg_start: Segment start time in seconds.
        seg_end: Segment end time in seconds.
        diarization_result: Either an object exposing
            ``itertracks(yield_label=True)`` directly, or a wrapper holding
            such an object in its ``speaker_diarization`` attribute (both
            shapes are also accepted by ``process_audio``). May be ``None``.

    Returns:
        The speaker label, or ``"SPEAKER_00"`` when there is no diarization,
        the segment has no positive duration, nothing overlaps, or the best
        overlap is below the 30% threshold.
    """
    default_speaker = "SPEAKER_00"
    if diarization_result is None:
        return default_speaker

    seg_duration = seg_end - seg_start
    if seg_duration <= 0:
        return default_speaker

    # Accept both result shapes: unwrap `.speaker_diarization` when present.
    annotation = getattr(diarization_result, "speaker_diarization", diarization_result)

    # Accumulate, per speaker, the time shared with [seg_start, seg_end].
    speaker_overlaps = {}
    for turn, _, speaker in annotation.itertracks(yield_label=True):
        overlap = min(seg_end, turn.end) - max(seg_start, turn.start)
        if overlap > 0:
            speaker_overlaps[speaker] = speaker_overlaps.get(speaker, 0) + overlap

    if not speaker_overlaps:
        return default_speaker

    best_speaker = max(speaker_overlaps, key=speaker_overlaps.get)
    if speaker_overlaps[best_speaker] / seg_duration >= 0.3:
        return best_speaker
    return default_speaker

def merge_consecutive_segments(segments, max_gap=0.5):
    """Fuse neighbouring segments spoken by the same speaker.

    A segment is folded into the previous output entry when both share the
    same ``speaker`` and the gap between them (this ``start`` minus the
    previous ``end``) is at most *max_gap*. The merged entry takes the later
    segment's ``end`` and the two texts joined by one space.

    Args:
        segments: Sequence of dicts with ``start``, ``end``, ``speaker`` and
            ``text`` keys, in the order they should be merged.
        max_gap: Largest gap, in the same unit as ``start``/``end``, that is
            still merged.

    Returns:
        A new list of copied dicts; the input dicts are not modified.
    """
    if not segments:
        return []

    result = []
    for segment in segments:
        tail = result[-1] if result else None
        continues_tail = (
            tail is not None
            and segment['speaker'] == tail['speaker']
            and (segment['start'] - tail['end']) <= max_gap
        )
        if continues_tail:
            tail['end'] = segment['end']
            tail['text'] += ' ' + segment['text']
        else:
            # Copy so that later merges never touch the caller's dicts.
            result.append(segment.copy())
    return result

In [None]:
# @title 5. ‚öôÔ∏è Processing Logic
def _collect_speaker_turns(diarization):
    """Flatten a diarization result into start-sorted ``{start, end, speaker}`` dicts."""
    # The pipeline may return an object exposing `itertracks` directly, or a
    # wrapper that stores it in `.speaker_diarization`; support both.
    if hasattr(diarization, "itertracks"):
        annotation = diarization
    else:
        annotation = diarization.speaker_diarization

    turns = [
        {"start": turn.start, "end": turn.end, "speaker": speaker}
        for turn, _, speaker in annotation.itertracks(yield_label=True)
    ]
    turns.sort(key=lambda item: item['start'])
    return turns


def _merge_adjacent_turns(turns, max_gap=0.5):
    """Fuse back-to-back turns of one speaker that are at most *max_gap* seconds apart."""
    merged = []
    for turn in turns:
        last = merged[-1] if merged else None
        if (
            last is not None
            and turn['speaker'] == last['speaker']
            and (turn['start'] - last['end']) <= max_gap
        ):
            last['end'] = turn['end']
        else:
            merged.append(turn.copy())
    return merged


def _build_vad_parameters(min_silence, speech_pad, min_speech, threshold):
    """Build the ``vad_parameters`` dict, skipping unset fields and casting ms values to int."""
    params = {}
    millisecond_fields = (
        ("min_silence_duration_ms", min_silence),
        ("speech_pad_ms", speech_pad),
        ("min_speech_duration_ms", min_speech),
    )
    for key, value in millisecond_fields:
        if value is not None:
            params[key] = int(value)
    if threshold is not None:
        params["threshold"] = float(threshold)
    return params


def process_audio(audio_path, model_name, language, beam_size, vad_filter, vad_min_silence, vad_speech_pad, vad_min_speech, vad_threshold, temperature, best_of, patience, length_penalty, initial_prompt, prefix, condition_on_previous_text, no_speech_threshold, log_prob_threshold, compression_ratio_threshold, comp_type, merge_segs, p=gr.Progress()):
    """Diarize an audio file, transcribe every speaker turn and render two Markdown reports.

    Steps:
        0. Normalize the audio to 16 kHz mono WAV.
        1. Run speaker diarization to obtain per-speaker turns.
        2. Slice the waveform along those turns.
        3. Transcribe each slice with Whisper.
        4. Render the results.

    Args:
        audio_path: Path of the uploaded/recorded audio, or ``None``.
        model_name: Key of ``AVAILABLE_MODELS`` selecting the Whisper model.
        language: Language code, or ``"auto"`` to pass ``None`` to Whisper.
        beam_size, temperature, best_of, patience, length_penalty,
        condition_on_previous_text, no_speech_threshold, log_prob_threshold,
        compression_ratio_threshold: Forwarded to ``model.transcribe``.
        vad_filter: Whether voice-activity filtering is enabled.
        vad_min_silence, vad_speech_pad, vad_min_speech, vad_threshold:
            VAD settings, forwarded as ``vad_parameters`` when VAD is on.
        initial_prompt, prefix: Optional text; blank values become ``None``.
        comp_type: Compute type used to load the Whisper model.
        merge_segs: Merge consecutive turns of the same speaker
            (gap of at most 0.5 s) before transcribing.
        p: Gradio progress reporter.

    Returns:
        ``(transcription_markdown, diarization_markdown)``. When a step
        fails, both elements carry an error message instead.
    """
    if audio_path is None:
        msg = "‚ö†Ô∏è Vui l√≤ng upload ho·∫∑c ghi √¢m audio!"
        return msg, msg

    total_start_time = time.time()

    # Diarization drives the whole flow, so it must be available.
    if diarization_pipeline is None:
        return "‚ùå L·ªói: Ch∆∞a load ƒë∆∞·ª£c Pyannote (ki·ªÉm tra HF_TOKEN).", "‚ùå L·ªói: Ch∆∞a load ƒë∆∞·ª£c Pyannote."

    # 0. Normalize the audio so the diarizer and the slicing below agree on
    #    the sample rate.
    p(0.05, desc="ƒêang chu·∫©n h√≥a audio (16kHz WAV)...")
    try:
        clean_audio_path = convert_audio_to_wav(audio_path)
    except Exception as e:
        msg = f"‚ùå L·ªói convert audio: {e}"
        return msg, msg

    # 1. Load the normalized waveform; it is sliced per speaker turn later.
    p(0.08, desc="ƒêang ƒë·ªçc file audio...")
    try:
        y, sr = librosa.load(clean_audio_path, sr=16000)
    except Exception as e:
        msg = f"‚ùå L·ªói ƒë·ªçc audio: {e}"
        return msg, msg

    # 2. Diarization on the normalized file.
    p(0.1, desc="ƒêang ph√¢n t√°ch ng∆∞·ªùi n√≥i (Diarization)...")
    try:
        diarization = diarization_pipeline(clean_audio_path)
    except Exception as e:
        msg = f"‚ùå L·ªói Diarization: {e}"
        return msg, msg

    try:
        diarization_segments = _collect_speaker_turns(diarization)
    except Exception:
        # Neither known result shape matched.
        msg = "‚ùå L·ªói format result Diarization"
        return msg, msg

    if merge_segs and diarization_segments:
        p(0.3, desc="ƒêang g·ªôp segment li√™n ti·∫øp...")
        diarization_segments = _merge_adjacent_turns(diarization_segments)

    # 3. Transcription loop.
    p(0.4, desc="ƒêang t·∫£i model Whisper...")
    try:
        model = load_whisper_model(model_name, comp_type)
    except Exception as e:
        # e.g. a compute type the current device does not support.
        msg = f"‚ùå L·ªói load model Whisper: {e}"
        return msg, msg

    # faster-whisper takes a boolean `vad_filter` plus a separate
    # `vad_parameters` mapping; passing the dict as `vad_filter` would turn
    # VAD on with default settings and silently drop the user's values.
    use_vad = bool(vad_filter)
    vad_parameters = None
    if use_vad:
        vad_parameters = _build_vad_parameters(
            vad_min_silence, vad_speech_pad, vad_min_speech, vad_threshold
        )

    prompt = initial_prompt.strip() if (initial_prompt and initial_prompt.strip()) else None
    prefix_text = prefix.strip() if (prefix and prefix.strip()) else None
    whisper_language = language if language != "auto" else None

    processed_segments = []
    total_segs = len(diarization_segments)
    print(f"Processing {total_segs} segments...")

    for idx, seg in enumerate(diarization_segments):
        start_sec = seg['start']
        end_sec = seg['end']
        speaker = seg['speaker']

        # Transcription covers the 0.4 -> 0.9 range of the progress bar.
        progress_val = 0.4 + (0.5 * (idx / total_segs))
        p(progress_val, desc=f"Transcribing {idx+1}/{total_segs} ({speaker})...")

        start_sample = int(start_sec * sr)
        end_sample = int(end_sec * sr)
        if end_sample <= start_sample:
            continue

        y_seg = y[start_sample:end_sample]
        if len(y_seg) == 0:
            # The turn lies entirely past the end of the loaded waveform.
            continue

        try:
            # The numpy slice is passed to Whisper directly (no temp file).
            segments_gen, _ = model.transcribe(
                y_seg,
                language=whisper_language,
                beam_size=int(beam_size),
                vad_filter=use_vad,
                vad_parameters=vad_parameters,
                temperature=temperature,
                best_of=int(best_of),
                patience=patience,
                length_penalty=length_penalty,
                initial_prompt=prompt,
                prefix=prefix_text,
                condition_on_previous_text=condition_on_previous_text,
                no_speech_threshold=no_speech_threshold,
                log_prob_threshold=log_prob_threshold,
                compression_ratio_threshold=compression_ratio_threshold,
                word_timestamps=False
            )
            final_text = " ".join(s.text.strip() for s in segments_gen).strip()
        except Exception as e:
            # One bad slice must not abort the whole run.
            print(f"Error transcribing segment {idx}: {e}")
            continue

        if final_text:
            processed_segments.append({
                "start": start_sec,
                "end": end_sec,
                "speaker": speaker,
                "text": final_text
            })

    total_elapsed = time.time() - total_start_time

    p(0.95, desc="ƒêang xu·∫•t k·∫øt qu·∫£...")

    # ========== OUTPUT GENERATION ==========

    # Marker shown next to each speaker label; unknown speakers get a
    # neutral fallback marker.
    speaker_colors = {
        'SPEAKER_00': 'üîµ',
        'SPEAKER_01': 'üü¢',
        'SPEAKER_02': 'üü°',
        'SPEAKER_03': 'üü†',
        'SPEAKER_04': 'üî¥',
        'SPEAKER_05': 'üü£',
    }

    transcribe_lines = []
    diarize_lines = []
    for item in processed_segments:
        ts = f"[{format_timestamp(item['start'])} ‚Üí {format_timestamp(item['end'])}]"
        icon = speaker_colors.get(item['speaker'], '‚ö™')
        transcribe_lines.append(f"{ts} {item['text']}")
        diarize_lines.append(f"{ts} {icon} **{item['speaker']}**: {item['text']}")

    unique_speakers = {item['speaker'] for item in processed_segments}

    # 1. Plain transcription report.
    transcribe_header = f"""## üìù K·∫øt qu·∫£ Transcription

| Th√¥ng tin | Gi√° tr·ªã |
|-----------|----------|
| ‚è±Ô∏è T·ªïng th·ªùi gian x·ª≠ l√Ω | {total_elapsed:.1f}s |
| üìä T·ªïng s·ªë Segment | {len(processed_segments)} |

---

"""
    # Blank line between entries: Markdown folds single newlines into one
    # paragraph, which would run all segments together.
    transcribe_output = transcribe_header + "\n\n".join(transcribe_lines)

    # 2. Transcription + diarization report.
    diarize_header = f"""## üé≠ K·∫øt qu·∫£ Transcription + Diarization

| Th√¥ng tin | Gi√° tr·ªã |
|-----------|----------|
| üë• S·ªë ng∆∞·ªùi n√≥i | {len(unique_speakers)} |
| ‚è±Ô∏è T·ªïng th·ªùi gian x·ª≠ l√Ω | {total_elapsed:.1f}s |
| üìä T·ªïng s·ªë Segment | {len(processed_segments)} |

---

"""
    diarize_output = diarize_header + "\n\n".join(diarize_lines)

    return transcribe_output, diarize_output

In [None]:
# @title 6. üöÄ Gradio UI
# Custom CSS: cap the app width and use a monospace font for elements that
# carry the "output-markdown" class (the two result panes below).
css = """
.gradio-container { max-width: 1200px !important; }
.output-markdown { font-family: 'JetBrains Mono', monospace !important; }
"""

# Build the UI. Components are laid out in the order they are created, so
# the statement order below is significant.
with gr.Blocks(title="PrecisionVoice", theme=gr.themes.Soft(), css=css) as demo:
    gr.Markdown("""# üéôÔ∏è PrecisionVoice - Vietnamese Speech-to-Text
    
S·ª≠ d·ª•ng **Whisper** ƒë·ªÉ nh·∫≠n d·∫°ng vƒÉn b·∫£n v√† **Pyannote** ƒë·ªÉ ph√¢n bi·ªát ng∆∞·ªùi n√≥i.
""")
    
    with gr.Row():
        # Left column: audio input and every setting.
        with gr.Column(scale=1):
            # type="filepath": the handler receives a path, not raw samples.
            audio_input = gr.Audio(
                sources=["upload", "microphone"], 
                type="filepath", 
                label="üîä Audio Input"
            )
            
            gr.Markdown("### ‚öôÔ∏è C√†i ƒë·∫∑t Model")
            # Choices are the AVAILABLE_MODELS labels; the first is the default.
            model_select = gr.Dropdown(
                choices=list(AVAILABLE_MODELS.keys()),
                value=list(AVAILABLE_MODELS.keys())[0],
                label="ü§ñ Whisper Model"
            )
            
            # "auto" is translated to language=None inside process_audio.
            language = gr.Dropdown(
                choices=["auto", "vi", "en", "zh", "ja", "ko"],
                value="vi",
                label="üåê Ng√¥n ng·ªØ"
            )
            
            # Defaults to the compute type chosen when the models were loaded.
            comp_type_select = gr.Dropdown(
                choices=["float16", "float32", "int8", "int8_float16"],
                value=compute_type,
                label="‚ö° Compute Type"
            )
            
            # Advanced options: beam size and voice-activity-detection settings.
            with gr.Accordion("üîß T√πy ch·ªçn n√¢ng cao", open=False):
                beam_size = gr.Slider(
                    minimum=1, maximum=10, value=5, step=1,
                    label="Beam Size",
                    info="Cao h∆°n = ch√≠nh x√°c h∆°n nh∆∞ng ch·∫≠m h∆°n"
                )
                vad_filter = gr.Checkbox(
                    value=True, 
                    label="VAD Filter",
                    info="L·ªçc kho·∫£ng l·∫∑ng t·ª± ƒë·ªông"
                )
                with gr.Row():
                    vad_min_silence = gr.Number(value=1000, label="Min Silence (ms)", info="min_silence_duration_ms")
                    vad_speech_pad = gr.Number(value=400, label="Speech Pad (ms)", info="speech_pad_ms")
                with gr.Row():
                    vad_min_speech = gr.Number(value=250, label="Min Speech (ms)", info="min_speech_duration_ms")
                    vad_threshold = gr.Slider(minimum=0, maximum=1, value=0.5, step=0.05, label="VAD Threshold")
            
            # Whisper generation parameters, passed to model.transcribe.
            with gr.Accordion("üß† Tham s·ªë Generation (Whisper)", open=False):
                with gr.Row():
                    temperature = gr.Slider(0.0, 1.0, value=0.0, step=0.1, label="Temperature")
                    best_of = gr.Number(value=5, label="Best Of")
                with gr.Row():
                    patience = gr.Number(value=1.0, label="Patience", step=0.1)
                    length_penalty = gr.Number(value=1.0, label="Length Penalty", step=0.1)
                initial_prompt = gr.Textbox(label="Initial Prompt", placeholder="Ng·ªØ c·∫£nh ho·∫∑c t·ª´ v·ª±ng...")
                prefix = gr.Textbox(label="Prefix", placeholder="B·∫Øt ƒë·∫ßu c√¢u v·ªõi...")
                condition_on_previous_text = gr.Checkbox(value=True, label="Condition on previous text")
                
                gr.Markdown("**Filter Thresholds**")
                with gr.Row():
                    no_speech_threshold = gr.Slider(0.0, 1.0, value=0.6, step=0.05, label="No Speech Threshold")
                    log_prob_threshold = gr.Slider(-5.0, 0.0, value=-1.0, step=0.1, label="Log Prob Threshold")
                    compression_ratio_threshold = gr.Number(value=2.4, label="Compression Ratio Threshold")
            
            # Maps to process_audio's `merge_segs` argument.
            merge_segments = gr.Checkbox(
                value=True,
                label="G·ªôp Segment c√πng Speaker",
                info="G·ªôp c√°c c√¢u li√™n ti·∫øp c·ªßa c√πng ng∆∞·ªùi n√≥i"
            )
            
            btn_process = gr.Button("üöÄ X·ª≠ l√Ω Audio", variant="primary", size="lg")
        
        # Right column (twice as wide): one tab per output of process_audio.
        with gr.Column(scale=2):
            with gr.Tabs():
                with gr.Tab("üìù Transcription"):
                    output_transcribe = gr.Markdown(
                        value="*K·∫øt qu·∫£ transcription s·∫Ω hi·ªÉn th·ªã ·ªü ƒë√¢y...*",
                        elem_classes=["output-markdown"]
                    )
                with gr.Tab("üé≠ Transcription + Diarization"):
                    output_diarize = gr.Markdown(
                        value="*K·∫øt qu·∫£ transcription + diarization s·∫Ω hi·ªÉn th·ªã ·ªü ƒë√¢y...*",
                        elem_classes=["output-markdown"]
                    )
    
    # Wire the button to process_audio. The `inputs` list is positional: its
    # order must match process_audio's parameter order exactly. The two
    # returned strings fill the two Markdown panes.
    btn_process.click(
        process_audio,
        inputs=[
            audio_input, model_select, language, beam_size, vad_filter, 
            vad_min_silence, vad_speech_pad, vad_min_speech, vad_threshold,
            temperature, best_of, patience, length_penalty, 
            initial_prompt, prefix, condition_on_previous_text,
            no_speech_threshold, log_prob_threshold, compression_ratio_threshold,
            comp_type_select, merge_segments
        ],
        outputs=[output_transcribe, output_diarize]
    )
    
    # Static usage instructions shown below the main row.
    gr.Markdown("""---
    
### üìñ H∆∞·ªõng d·∫´n s·ª≠ d·ª•ng

1. **Upload audio** ho·∫∑c ghi √¢m tr·ª±c ti·∫øp
2. **Ch·ªçn Model**:
   - `EraX-WoW-Turbo`: Whisper Large V3 Turbo, t·ªëi ∆∞u cho ti·∫øng Vi·ªát
   - `PhoWhisper Large`: Model ƒë∆∞·ª£c hu·∫•n luy·ªán ri√™ng cho ti·∫øng Vi·ªát
3. **Setting n√¢ng cao**:
   - Ch·ªânh `temperature` n·∫øu mu·ªën model s√°ng t·∫°o h∆°n.
   - Th√™m `Initial Prompt` ƒë·ªÉ g·ª£i √Ω t·ª´ v·ª±ng chuy√™n ng√†nh.
4. **Nh·∫•n "üöÄ X·ª≠ l√Ω Audio"** ƒë·ªÉ nh·∫≠n k·∫øt qu·∫£ ·ªü c·∫£ 2 tab
""")

# Launch
import os

# `get_ipython` only exists inside IPython/Jupyter. Looking it up in a plain
# Python process raises NameError, which used to make the non-notebook
# branch below unreachable.
try:
    _shell_repr = str(get_ipython())
except NameError:
    _shell_repr = ""

if "COLAB_GPU" in os.environ or "google.colab" in _shell_repr:
    # On Colab: queue requests and expose a public share link.
    demo.queue().launch(share=True, debug=True)
else:
    # Anywhere else: serve locally without a share link.
    demo.launch(share=False)