# 🎬 UVG MAX - Premium Video Generator

**With Fish-Speech (OpenAudio S1) High-Quality TTS**

---

## ⚠️ IMPORTANT: HuggingFace Login Required

The Fish-Speech model requires you to:
1. Create a HuggingFace account at https://huggingface.co
2. Go to https://huggingface.co/fishaudio/openaudio-s1-mini and click 'Agree and access repository'
3. Create an access token at https://huggingface.co/settings/tokens
4. Run Cell 0 below and paste your token

---

In [None]:
#@title üîê Cell 0: Login to HuggingFace (Required Once)
#@markdown Get your token from https://huggingface.co/settings/tokens

# Authenticate this runtime with Hugging Face. Cell 1 downloads the
# fishaudio/openaudio-s1-mini checkpoint, whose repository requires the
# access terms to be accepted and a valid token to be present.
from huggingface_hub import login
# Called without arguments so the token is pasted in interactively rather
# than being stored in the notebook.
login()

In [None]:
#@title üîß Cell 1: Install Fish-Speech (~5-10 minutes)

import os
import sys

# Step 1: Install system dependencies
print('üì¶ Step 1/5: Installing system dependencies...')
!apt-get update -qq
!apt-get install -qq portaudio19-dev libsox-dev ffmpeg -y

# Step 2: Clone Fish-Speech
print('\nüì¶ Step 2/5: Cloning Fish-Speech...')
os.chdir('/content')
!rm -rf fish-speech
!git clone https://github.com/fishaudio/fish-speech.git
os.chdir('/content/fish-speech')

# Step 3: Install Python dependencies
print('\nüì¶ Step 3/5: Installing Python dependencies...')
!pip install -q loguru hydra-core omegaconf
!pip install -q torch torchaudio --index-url https://download.pytorch.org/whl/cu121
!pip install -q -e . --no-build-isolation

# Step 4: Download model
print('\nüì• Step 4/5: Downloading OpenAudio S1-mini model (~2GB)...')
!huggingface-cli download fishaudio/openaudio-s1-mini --local-dir checkpoints/openaudio-s1-mini

# Step 5: Clone UVG MAX
print('\nüì¶ Step 5/5: Setting up UVG MAX...')
os.chdir('/content')
!rm -rf uvg-max-engine
!git clone https://github.com/Shya-Bubu/uvg-max-engine.git
!pip install -q python-dotenv requests tqdm Pillow numpy

# Set API keys
os.environ['PEXELS_KEY'] = '7QAyreSxu44EAJYIRfYXstHpvj1tof7v3Baj8tv8fvO4DV6l16I2FzlY'
os.environ['PIXABAY_KEY'] = '53451868-98a1100fbad21a5918d9610dd'

print('\n' + '='*50)
print('‚úÖ SETUP COMPLETE!')
print('='*50)
print('\nüëâ Run Cell 2 to test Fish-Speech TTS')

In [None]:
#@title üß™ Cell 2: Test Fish-Speech TTS

import os
import sys
os.chdir('/content/fish-speech')
sys.path.insert(0, '/content/fish-speech')

TEST_TEXT = "Hello world! This is a test of Fish Speech. The quality should be amazing."

print('üéôÔ∏è Testing Fish-Speech TTS...')
print(f'Text: {TEST_TEXT}')

# Generate semantic tokens
!python -m fish_speech.models.text2semantic.inference \
    --text "{TEST_TEXT}" \
    --checkpoint-path checkpoints/openaudio-s1-mini \
    --num-samples 1

# Check if tokens were generated
import glob
token_files = glob.glob('codes_*.npy')
if token_files:
    print(f'\n‚úÖ Tokens generated: {token_files[0]}')
    
    # Convert to audio
    !python -m fish_speech.models.dac.inference \
        -i {token_files[0]} \
        --checkpoint-path checkpoints/openaudio-s1-mini/codec.pth \
        -o /content/test_audio.wav
    
    # Play audio
    if os.path.exists('/content/test_audio.wav'):
        print('\nüéß Playing generated audio:')
        from IPython.display import Audio
        display(Audio('/content/test_audio.wav'))
    else:
        print('‚ùå Audio file not created')
else:
    print('‚ùå Token generation failed')

In [None]:
#@title üéôÔ∏è Cell 3: Generate TTS for All Scenes

import os
import sys
os.chdir('/content/fish-speech')
sys.path.insert(0, '/content/fish-speech')

# Your script scenes
SCENES = [
    "In a world of endless distractions, focus is your superpower.",
    "Every great achievement started with a single moment of clarity.",
    "Small steps every day lead to massive results.",
    "Today, choose focus. Tomorrow, celebrate success."
]

# Create output directory
os.makedirs('/content/tts_audio', exist_ok=True)

print('üéôÔ∏è Generating TTS audio with Fish-Speech...')
print(f'   Scenes: {len(SCENES)}\n')

import glob

for i, text in enumerate(SCENES):
    print(f'üìù Scene {i+1}: "{text[:40]}..."')
    
    # Clean up previous tokens
    for f in glob.glob('codes_*.npy'):
        os.remove(f)
    
    # Generate semantic tokens
    !python -m fish_speech.models.text2semantic.inference \
        --text "{text}" \
        --checkpoint-path checkpoints/openaudio-s1-mini \
        --num-samples 1 2>/dev/null
    
    # Find generated tokens
    token_files = glob.glob('codes_*.npy')
    if token_files:
        # Convert to audio
        output_path = f'/content/tts_audio/scene_{i+1}.wav'
        !python -m fish_speech.models.dac.inference \
            -i {token_files[0]} \
            --checkpoint-path checkpoints/openaudio-s1-mini/codec.pth \
            -o {output_path} 2>/dev/null
        
        if os.path.exists(output_path):
            print(f'   ‚úÖ Saved: {output_path}')
        else:
            print(f'   ‚ùå Audio conversion failed')
    else:
        print(f'   ‚ùå Token generation failed')

# Free GPU memory
import gc
import torch
gc.collect()
torch.cuda.empty_cache()

print('\n' + '='*50)
print('‚úÖ TTS generation complete!')
print('='*50)
print('\nüëâ Run Cell 4 to generate the video')

In [None]:
#@title üé¨ Cell 4: Generate Video with Pre-made Audio

# Build the video by passing a scene script, which points at the WAV files
# written to /content/tts_audio, to UVG MAX's run_from_json.

import os
import sys
import glob

os.chdir('/content/uvg-max-engine')
sys.path.insert(0, '/content/uvg-max-engine')

# Check TTS audio exists
tts_files = sorted(glob.glob('/content/tts_audio/scene_*.wav'))
print(f'Found {len(tts_files)} TTS audio files')

if not tts_files:
    print('‚ùå No TTS audio found! Run Cell 3 first.')
else:
    SCRIPT = {
        "version": "2.1",
        "video_meta": {
            "title": "The Power of Focus",
            "orientation": "portrait",
            "resolution": {"width": 1080, "height": 1920}
        },
        "scenes": [
            {"scene_id": 1, "text": "In a world of distractions...", "search_keywords": "meditation peaceful", "audio_path": "/content/tts_audio/scene_1.wav"},
            {"scene_id": 2, "text": "Every great achievement...", "search_keywords": "mountain sunrise", "audio_path": "/content/tts_audio/scene_2.wav"},
            {"scene_id": 3, "text": "Small steps lead to results...", "search_keywords": "running fitness", "audio_path": "/content/tts_audio/scene_3.wav"},
            {"scene_id": 4, "text": "Choose focus, celebrate success.", "search_keywords": "celebration happy", "audio_path": "/content/tts_audio/scene_4.wav"}
        ]
    }

    # The guard above only proves that at least one WAV exists, while the
    # script references a fixed set of paths. List the ones that are missing
    # so a partial Cell 3 run is visible; the pipeline is still invoked so
    # existing behaviour is unchanged.
    missing_audio = [
        scene["audio_path"]
        for scene in SCRIPT["scenes"]
        if not os.path.exists(scene["audio_path"])
    ]
    if missing_audio:
        print(f'WARNING: audio missing for {len(missing_audio)} scene(s): {missing_audio}')

    # Imported here because the engine only becomes importable after the
    # sys.path change above.
    from uvg_core.uvg_pipeline import run_from_json

    print('\nüé¨ Generating video...')
    result = run_from_json(SCRIPT)

    if result.success:
        print(f'\n‚úÖ Video: {result.output_path}')
    else:
        print(f'\n‚ùå Failed: {result.errors}')

In [None]:
#@title üì∫ Cell 5: Play Video

# Show the most recently changed MP4 found anywhere under uvg_output/.
import glob
import os
from IPython.display import Video

os.chdir('/content/uvg-max-engine')

# Recursive search so videos in any sub-folder of uvg_output are considered.
videos = glob.glob('uvg_output/**/*.mp4', recursive=True)
if not videos:
    print('‚ùå No video found.')
else:
    # Pick the file with the newest ctime and embed it in the cell output.
    latest = max(videos, key=os.path.getctime)
    print(f'üé¨ {latest}')
    display(Video(latest, embed=True, width=400))

In [None]:
#@title üì• Cell 6: Download

# Offer every MP4 in uvg_output/final/ as a browser download.
from google.colab import files
import glob, os
os.chdir('/content/uvg-max-engine')

# Sorted so the download order is the same on every run.
final_videos = sorted(glob.glob('uvg_output/final/*.mp4'))
if not final_videos:
    # Previously the cell finished silently when there was nothing to
    # download, which looked the same as a successful run.
    print('No final video found in uvg_output/final/ - run Cell 4 first.')

for f in final_videos:
    print(f'üì• {f}')
    files.download(f)