# Fish Speech

## Prepare Model

In [None]:
# Optional (useful for users in China): route Hugging Face downloads through a mirror.
# Set the variable with the %env magic so it lives in the kernel environment and is
# inherited by the `!` command below. `!set ...` / `!export ...` each run in their own
# subshell, so a variable set that way would not reach the download command.
# %env HF_ENDPOINT=https://hf-mirror.com

# Download the fish-speech-1.5 checkpoint into the directory the later cells read from.
!huggingface-cli download fishaudio/fish-speech-1.5 --local-dir stores/checkpoints/fish-speech-1.5/

## WebUI Inference

> You can pass `--compile` to fuse CUDA kernels for faster inference (roughly 10x).

In [None]:
# Let PyTorch fall back to the CPU for operators the MPS backend does not implement.
%env PYTORCH_ENABLE_MPS_FALLBACK=1

# Launch the WebUI script with the downloaded checkpoint directory as the LLaMA
# checkpoint and the generator .pth inside it as the decoder checkpoint.
# NOTE(review): presumably this starts a long-running server that keeps the cell busy
# until the kernel is interrupted — confirm.
!python ../../fish-speech/tools/run_webui.py \
    --llama-checkpoint-path stores/checkpoints/fish-speech-1.5 \
    --decoder-checkpoint-path stores/checkpoints/fish-speech-1.5/firefly-gan-vq-fsq-8x1024-21hz-generator.pth \
    --compile

## CLI Inference

### 1. Encode reference audio: 

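> You should get a `fake.npy` file (the encoded reference, passed as `--prompt-tokens` in step 2) and a reconstructed `fake.wav`, which the cell plays back.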

In [None]:
%env PYTORCH_ENABLE_MPS_FALLBACK=1

src_audio = "datas/audios/小敏/小敏_01.mp3"
!python ../../fish-speech/fish_speech/models/vqgan/inference.py \
    -i {src_audio} \
    --checkpoint-path "stores/checkpoints/fish-speech-1.5/firefly-gan-vq-fsq-8x1024-21hz-generator.pth" \
    --device "mps"

from IPython.display import Audio, display
audio = Audio(filename="fake.wav")
display(audio)

env: PYTORCH_ENABLE_MPS_FALLBACK=1
[32m2025-02-13 14:32:20.030[0m | [1mINFO    [0m | [36m__main__[0m:[36mload_model[0m:[36m46[0m - [1mLoaded model: <All keys matched successfully>[0m
[32m2025-02-13 14:32:20.031[0m | [1mINFO    [0m | [36m__main__[0m:[36mmain[0m:[36m75[0m - [1mProcessing in-place reconstruction of /Users/WangHao/Sites/学习/LargeData/音频语素/小敏/小敏_01.mp3[0m
[32m2025-02-13 14:32:20.039[0m | [1mINFO    [0m | [36m__main__[0m:[36mmain[0m:[36m86[0m - [1mLoaded audio with 9.22 seconds[0m
[32m2025-02-13 14:32:20.409[0m | [1mINFO    [0m | [36m__main__[0m:[36mmain[0m:[36m94[0m - [1mGenerated indices of shape torch.Size([8, 199])[0m
[32m2025-02-13 14:32:20.950[0m | [1mINFO    [0m | [36m__main__[0m:[36mmain[0m:[36m113[0m - [1mGenerated audio of shape torch.Size([1, 1, 407552]), equivalent to 9.24 seconds from 199 features, features/second: 21.53[0m
[32m2025-02-13 14:32:20.954[0m | [1mINFO    [0m | [36m__main__[0m:[36mmain

### 2. Generate semantic tokens from text:

> This command creates `codes_N.npy` files under `temp/` (the location step 3 reads from), where N is an integer starting from 0.

> You may want to use `--compile` to fuse CUDA kernels for faster inference (~30 tokens/second -> ~300 tokens/second).

In [6]:
# Let PyTorch fall back to the CPU for operators the MPS backend does not implement.
%env PYTORCH_ENABLE_MPS_FALLBACK=1

# Generate semantic tokens for --text, conditioned on the reference from step 1:
# --prompt-tokens is the fake.npy produced there.
# NOTE(review): --prompt-text is presumably the transcript of the reference audio — confirm.
# NOTE(review): --num-samples 2 presumably yields codes_0 and codes_1 (step 3 reads
# codes_1) — confirm.
!python ../../fish-speech/fish_speech/models/text2semantic/inference.py \
    --text "你好呀我的朋友" \
    --prompt-text "昨天我出门你猜怎么着手机居然没电了这大热天的我连个导航都用不了你说气人不气人" \
    --prompt-tokens "fake.npy" \
    --checkpoint-path "stores/checkpoints/fish-speech-1.5" \
    --device "mps" \
    --num-samples 2 \
    --compile

env: PYTORCH_ENABLE_MPS_FALLBACK=1
[32m2025-02-13 14:35:11.197[0m | [1mINFO    [0m | [36m__main__[0m:[36mmain[0m:[36m1056[0m - [1mLoading model ...[0m
[32m2025-02-13 14:35:20.101[0m | [1mINFO    [0m | [36m__main__[0m:[36mload_model[0m:[36m681[0m - [1mRestored model from checkpoint[0m
[32m2025-02-13 14:35:20.102[0m | [1mINFO    [0m | [36m__main__[0m:[36mload_model[0m:[36m687[0m - [1mUsing DualARTransformer[0m
[32m2025-02-13 14:35:20.102[0m | [1mINFO    [0m | [36m__main__[0m:[36mload_model[0m:[36m695[0m - [1mCompiling function...[0m
[32m2025-02-13 14:35:20.339[0m | [1mINFO    [0m | [36m__main__[0m:[36mmain[0m:[36m1070[0m - [1mTime to load model: 9.14 seconds[0m
[32m2025-02-13 14:35:20.393[0m | [1mINFO    [0m | [36m__main__[0m:[36mgenerate_long[0m:[36m788[0m - [1mEncoded text: 你好呀我的朋友[0m
[32m2025-02-13 14:35:20.394[0m | [1mINFO    [0m | [36m__main__[0m:[36mgenerate_long[0m:[36m806[0m - [1mGenerating sentenc

### 3. Generate speech from semantic tokens:

In [8]:
%env PYTORCH_ENABLE_MPS_FALLBACK=1

!python ../../fish-speech/fish_speech/models/vqgan/inference.py \
    -i "temp/codes_1.npy" \
    --checkpoint-path "stores/checkpoints/fish-speech-1.5/firefly-gan-vq-fsq-8x1024-21hz-generator.pth" \
    --device "mps"

from IPython.display import Audio, display
audio = Audio(filename="fake.wav")
display(audio)

env: PYTORCH_ENABLE_MPS_FALLBACK=1
[32m2025-02-13 14:36:17.220[0m | [1mINFO    [0m | [36m__main__[0m:[36mload_model[0m:[36m46[0m - [1mLoaded model: <All keys matched successfully>[0m
[32m2025-02-13 14:36:17.221[0m | [1mINFO    [0m | [36m__main__[0m:[36mmain[0m:[36m99[0m - [1mProcessing precomputed indices from temp/codes_1.npy[0m
[32m2025-02-13 14:36:17.765[0m | [1mINFO    [0m | [36m__main__[0m:[36mmain[0m:[36m113[0m - [1mGenerated audio of shape torch.Size([1, 1, 75776]), equivalent to 1.72 seconds from 37 features, features/second: 21.53[0m
[32m2025-02-13 14:36:17.767[0m | [1mINFO    [0m | [36m__main__[0m:[36mmain[0m:[36m120[0m - [1mSaved audio to fake.wav[0m
