# Run both Whisper and Voicevox in a single Colab Session

To enable GPU in this notebook, select Runtime -> Change runtime type in the Menu bar. Under Hardware Accelerator, select GPU.

Then, scroll to the Configuration [cell](#scrollTo=8WIVDY-V-kVw&line=1&uniqifier=1) and update it with your ngrok authentication token.

To run, select Runtime -> Run all. Go to this [cell](#scrollTo=5M_2NlAXB89F&line=1&uniqifier=1) and read the instructions on how to update your `.env` file.

# Check GPU

In [None]:
# Show the GPU (if any) attached to this runtime.
!nvidia-smi

# Install Deps

In [None]:
# Python packages used by the server cell below: Flask (+ flask-cors) for the
# HTTP API, pyngrok for the public tunnel, Whisper for speech-to-text, and
# requests for calling the local VOICEVOX engine. `-q` keeps pip output short.
!pip install flask -q
!pip install pyngrok -q
!pip install git+https://github.com/openai/whisper.git -q
!pip install requests -q
!pip install flask-cors -q

In [None]:
# VoiceVox Engine
# Release version to download; it is interpolated into both the archive name
# and the GitHub release URL below.
ENGINE_VER = '0.14.2'
ZIP_FILENAME = f'voicevox_engine-linux-nvidia-{ENGINE_VER}.7z.001'
DOWNLOAD_LINK = f'https://github.com/VOICEVOX/voicevox_engine/releases/download/{ENGINE_VER}/{ZIP_FILENAME}'

# Download the release archive, extract it (answering "yes" to any prompt),
# then delete the archive.
!wget $DOWNLOAD_LINK
!7z x $ZIP_FILENAME -y
!rm $ZIP_FILENAME

# Clone the engine sources; the server cell launches voicevox_engine/run.py
# from this checkout.
!git clone https://github.com/VOICEVOX/voicevox_engine -q

# Install deps, ignoring python version
# The sed expression drops everything from the first ';' on each requirement
# line (i.e. the environment markers) before handing the list to pip.
!pip install -r <(sed 's/;.*//' voicevox_engine/requirements.txt) -q
# NOTE(review): numpy is pinned after the requirements install, presumably for
# compatibility with the engine -- confirm before changing the version.
!pip install numpy==1.22 -q

# Configuration

Please set the NGROK auth token to access the tunnel.

In [None]:
# Token passed to ngrok.set_auth_token() before the tunnel is opened.
NGROK_AUTH_TOKEN = '' #@param {type:'string'}
# Files the /asr endpoint writes incoming audio to for the "translate" and
# "transcribe" tasks respectively; each request overwrites the previous file.
TRANSLATE_FILENAME = 'translate.wav' #@param {type:'string'}
TRANSCRIBE_FILENAME = 'transcribe.wav' #@param {type:'string'}
# Model name handed to whisper.load_model().
WHISPER_MODEL = 'small' #@param ['tiny', 'base', 'small', 'medium', 'large']
# Base URL of the VOICEVOX engine that the Flask endpoints forward to.
VOICEVOX_URL = 'http://localhost:50021' #@param {type:'string'}
# Number of bytes read per iteration when /asr streams a raw request body to disk.
CHUNK_SIZE = 4096 #@param {type:'integer'}

# Main Code

In [None]:
from flask import Flask, request, Response
import json
import whisper
import subprocess
import requests
from pyngrok import ngrok
from flask_cors import CORS
import os
from urllib.parse import urlencode

# Load the configured Whisper model once; the /asr endpoint reuses this
# module-level instance for every request.
model = whisper.load_model(WHISPER_MODEL)

# Update .env file Instructions

After the cell below has started running, copy the public URL provided by ngrok and update both WHISPER_BASE_URL and VOICEVOX_BASE_URL in your `.env` file. Below is an example of the output you will see.

```
NgrokTunnel: "http://f9a8-34-73-238-198.ngrok.io" -> "http://localhost:5000"
 * Serving Flask app '__main__'
 * Debug mode: off
INFO:werkzeug:WARNING: This is a development server. Do not use it in a production deployment. Use a production WSGI server instead.
 * Running on http://127.0.0.1:5000
INFO:werkzeug:Press CTRL+C to quit
```

DO NOT use the URL from this example; use the URL shown in the actual output of the cell below. In this example, you would update your WHISPER_BASE_URL and VOICEVOX_BASE_URL variables with:

```
WHISPER_BASE_URL=http://f9a8-34-73-238-198.ngrok.io
VOICEVOX_BASE_URL=http://f9a8-34-73-238-198.ngrok.io
```

This URL will change every time you rerun this cell, so remember to update your `.env` file when that happens.

In [None]:
# Create the Flask application with CORS enabled, and register the ngrok
# auth token so the tunnel can be opened later in main().
app = Flask(__name__)
CORS(app)
ngrok.set_auth_token(NGROK_AUTH_TOKEN)

@app.route('/', methods=['GET'])
def test():
  """Health-check endpoint.

  Returns:
    A JSON response with the fixed payload
    ``{"status": "OK", "message": "Test"}``.
  """
  # Build the payload once and serialize that, instead of keeping an unused
  # local next to a duplicated literal.
  response = {'status': 'OK', 'message': 'Test'}
  return Response(json.dumps(response), mimetype='application/json')

@app.route('/speakers', methods=['GET'])
def speakers():
  """Forward a GET to the VOICEVOX ``/speakers`` endpoint.

  Returns:
    The decoded JSON body of the VOICEVOX response, handed to Flask as the
    view's return value.
  """
  speakers_url = f"{VOICEVOX_URL}/speakers"
  upstream = requests.get(speakers_url)
  return upstream.json()

# Raw
@app.route('/audio_query', methods=['POST'])
def audio_query():
  """Forward the request's query string to VOICEVOX ``/audio_query``.

  Returns:
    The decoded JSON body of the VOICEVOX response on success. If the
    upstream request fails or its body is not valid JSON, a 500 JSON response
    with ``message``, ``json`` (raw upstream body text, or ``None`` when no
    response was received), ``error`` and ``status`` keys.
  """
  params_encoded = urlencode(request.args)

  # Keep a handle that is always bound, so the error path never touches an
  # undefined name when the POST itself raised.
  r = None
  try:
    r = requests.post(f'{VOICEVOX_URL}/audio_query?{params_encoded}')
    return r.json()

  # requests' own JSON decode error also derives from ValueError.
  except (requests.RequestException, ValueError) as err:
    # Do not call r.json() here: either there is no response at all, or
    # decoding it is exactly what just failed.
    upstream_body = r.text if r is not None else None
    return Response(
      json.dumps({
        'message': 'Failed to request audio_query',
        'json': upstream_body,
        'error': str(err),
        'status': 'Server Error'
      }),
      mimetype='application/json',
      status=500
    )

@app.route('/synthesis', methods=['POST'])
def synthesis():
  """Forward the JSON body and query string to VOICEVOX ``/synthesis``.

  Returns:
    A response carrying the raw upstream body with an ``audio/wav`` mimetype.
  """
  # Named `payload` rather than `json` so the imported json module is not
  # shadowed inside this function.
  payload = request.get_json()
  query_string = urlencode(request.args)
  synthesis_url = f'{VOICEVOX_URL}/synthesis?{query_string}'
  upstream = requests.post(synthesis_url, json=payload)

  return Response(upstream.content, mimetype='audio/wav')

# All in one
@app.route('/tts', methods=['POST'])
def tts():
  """Synthesize speech for ``text`` in one call (audio_query + synthesis).

  Query parameters:
    text: Text to speak (required).
    speaker: Integer speaker id, default 5.
    speed_scale, volume_scale, intonation_scale, pre_phoneme_length,
    post_phoneme_length: Floats that override the matching fields of the
      audio query (defaults 1.7, 4.0, 1.5, 1.0, 1.0).

  Returns:
    The synthesized audio with an ``audio/wav`` mimetype on success; a 400
    JSON response when ``text`` is missing or a numeric parameter cannot be
    parsed; a 500 JSON response when VOICEVOX rejects either upstream call.
  """
  def json_error(payload, status):
    # All error replies share the JSON mimetype used by the other endpoints.
    return Response(json.dumps(payload), mimetype='application/json', status=status)

  def upstream_detail(upstream):
    # Error bodies are normally JSON, but fall back to raw text if not.
    try:
      return upstream.json()
    except ValueError:
      return upstream.text

  text = request.args.get('text')
  if text is None:
    return json_error({ 'message': 'No text', 'status': 'BAD_REQUEST' }, 400)

  # Parse every numeric parameter up front so a malformed value becomes a
  # 400 instead of an unhandled ValueError.
  try:
    speaker = int(request.args.get('speaker') or '5')
    speed_scale = float(request.args.get('speed_scale') or '1.7')
    volume_scale = float(request.args.get('volume_scale') or '4.0')
    intonation_scale = float(request.args.get('intonation_scale') or '1.5')
    pre_phoneme_length = float(request.args.get('pre_phoneme_length') or '1.0')
    post_phoneme_length = float(request.args.get('post_phoneme_length') or '1.0')
  except ValueError:
    return json_error({ 'message': 'Invalid numeric query parameter', 'status': 'BAD_REQUEST' }, 400)

  params_encoded = urlencode({'text': text, 'speaker': speaker})
  r = requests.post(f'{VOICEVOX_URL}/audio_query?{params_encoded}')

  # Any non-success status (not only 404) means the body is an error payload,
  # not an audio query, and must not be forwarded to /synthesis.
  if not r.ok:
    return json_error({ 'message': 'Failed to request audio_query', 'json': upstream_detail(r), 'status': 'Server Error' }, 500)

  query = r.json()
  query['speedScale'] = speed_scale
  query['volumeScale'] = volume_scale
  query['intonationScale'] = intonation_scale
  query['prePhonemeLength'] = pre_phoneme_length
  query['postPhonemeLength'] = post_phoneme_length

  params_encoded = urlencode({'speaker': speaker})
  r = requests.post(f'{VOICEVOX_URL}/synthesis?{params_encoded}', json=query)

  # Do not relabel an upstream error body as WAV audio.
  if not r.ok:
    return json_error({ 'message': 'Failed to request synthesis', 'json': upstream_detail(r), 'status': 'Server Error' }, 500)

  return Response(r.content, mimetype='audio/wav')

def _save_request_audio(filename):
  """Write the current request's audio to ``filename``.

  Multipart requests must carry the audio in the ``audio_file`` form field;
  any other request has its raw body streamed to disk in CHUNK_SIZE pieces.

  Returns:
    ``None`` on success, or a 422 JSON response when a multipart request has
    no ``audio_file`` field.
  """
  # content_type is None when the client sends no Content-Type header.
  content_type = request.content_type or ''

  if content_type.startswith('multipart/form-data'):
    # .get() returns None for a missing field; indexing would raise before
    # the explicit check below could run.
    audio_data = request.files.get('audio_file')

    if audio_data is None:
      return Response(json.dumps({ 'message': '"audio_file" is missing on form data' }), mimetype='application/json', status=422)

    audio_data.save(filename)
    return None

  # The with-block closes the file; read until the stream yields no bytes.
  with open(filename, 'wb') as f:
    for chunk in iter(lambda: request.stream.read(CHUNK_SIZE), b''):
      f.write(chunk)

  return None

# Whisper speech-to-text endpoints
@app.route('/asr', methods=['POST'])
def asr():
  """Run Whisper on the uploaded audio.

  Query parameters:
    task: ``transcribe`` (default) or ``translate``.
    language: Language passed to Whisper for the ``translate`` task
      (default ``ja``). It is not passed for ``transcribe``.

  Returns:
    The Whisper result serialized as JSON; a 422 JSON response when a
    multipart upload lacks ``audio_file``; a 400 JSON response for an
    unknown task.
  """
  task = request.args.get('task') or 'transcribe'
  language = request.args.get('language') or 'ja'

  # Pick the scratch file and Whisper options for the task; everything else
  # is shared between the two tasks.
  if task == 'transcribe':
    filename = TRANSCRIBE_FILENAME
    options = {}
  elif task == 'translate':
    filename = TRANSLATE_FILENAME
    options = {'language': language, 'task': 'translate'}
  else:
    return Response(json.dumps({ 'message': 'Unknown task', 'status': 'Bad Request' }), mimetype='application/json', status=400)

  error_response = _save_request_audio(filename)
  if error_response is not None:
    return error_response

  result = model.transcribe(filename, **options)
  return Response(json.dumps(result), mimetype='application/json')

def main():
  """Start the VOICEVOX engine, open the ngrok tunnel, and serve the Flask app.

  Blocks inside ``app.run()`` until the server stops. The engine subprocess
  is terminated on the way out, whether the server stops normally or an
  exception (including KeyboardInterrupt) propagates.
  """
  # Start voicevox
  # Pass an argument list rather than a shell string: with shell=True the
  # bare `*` after --allow_origin is glob-expanded by the shell into the
  # names of files in the working directory.
  # stdout goes to DEVNULL rather than an unread PIPE, which could fill up
  # and block the engine.
  voicevox = subprocess.Popen(
    [
      'python', '/content/voicevox_engine/run.py',
      '--voicevox_dir=linux-nvidia',
      '--use_gpu',
      '--allow_origin', '*',
      '--cors_policy_mode', 'all',
    ],
    stdout=subprocess.DEVNULL
  )

  try:
    # Open tunnel
    http_tunnel = ngrok.connect(5000)
    print(http_tunnel)

    # Run app
    app.run()
  finally:
    # Do not leave the engine running after the server has stopped.
    voicevox.terminate()

# Entry point: run the server, and print a short message instead of a
# traceback when it is interrupted manually.
if __name__ == "__main__":
  try:
    main()
  except KeyboardInterrupt:
    print("ded")

## Leave the above cell running and the tab open

This is to ensure the runtime does not disconnect and shut down the server. 

When you're done remember to disconnect the runtime.