In [2]:
!pip install pygrok



In [5]:
!pip install fastapi[all] transformers soundfile pyngrok -q
!pip install git+https://github.com/huggingface/parler-tts.git -q
!ngrok config add-authtoken 2q1HRNUeH5qgPjfT9xAulRslkas_82HHj811NUPfLacXnzMPe

Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml


In [23]:
import asyncio
import os
import tempfile
import uuid

import google.generativeai as genai
import nest_asyncio
import soundfile as sf
import torch
import uvicorn
from fastapi import BackgroundTasks, FastAPI, File, HTTPException, UploadFile
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import FileResponse
from parler_tts import ParlerTTSForConditionalGeneration
from pyngrok import ngrok
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, AutoTokenizer, pipeline

# Patch asyncio so an event loop can be started from inside the notebook's
# already-running loop (needed for uvicorn.run further down).
nest_asyncio.apply()


# Set up the Google Gemini API key.
# The key must come from the environment (or an interactive prompt), never from a
# literal in the notebook: literals leak through version control and shared copies.
# Any key that was previously committed here should be revoked and rotated.
if not os.environ.get("GEMINI_API_KEY"):
    from getpass import getpass

    os.environ["GEMINI_API_KEY"] = getpass("Enter GEMINI_API_KEY: ")
genai.configure(api_key=os.environ["GEMINI_API_KEY"])


In [24]:
# ASGI application that hosts the speech endpoints.
app = FastAPI()

# Allow browser clients from any origin to call the API.
# Credentials are disabled on purpose: wildcard origins must not be combined with
# allow_credentials=True, and this API relies on neither cookies nor auth headers.
# If credentials are ever needed, list the allowed origins explicitly instead of "*".
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Probe for CUDA once; half precision is only selected when a GPU is available.
has_cuda = torch.cuda.is_available()
device = "cuda:0" if has_cuda else "cpu"
torch_dtype = torch.float16 if has_cuda else torch.float32

# Speech-to-text: load the Whisper checkpoint and wrap it in an ASR pipeline.
stt_model_id = "openai/whisper-large-v3-turbo"
stt_model = AutoModelForSpeechSeq2Seq.from_pretrained(
    stt_model_id,
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=True,
    use_safetensors=True,
)
stt_model = stt_model.to(device)
stt_processor = AutoProcessor.from_pretrained(stt_model_id)
stt_pipe = pipeline(
    "automatic-speech-recognition",
    model=stt_model,
    tokenizer=stt_processor.tokenizer,
    feature_extractor=stt_processor.feature_extractor,
    torch_dtype=torch_dtype,
    device=device,
)

# Text-to-speech: Indic Parler TTS. The prompt text and the voice description use
# separate tokenizers; the description tokenizer is the one named by the model's
# text-encoder config.
tts_model_id = "ai4bharat/indic-parler-tts"
tts_model = ParlerTTSForConditionalGeneration.from_pretrained(tts_model_id).to(device)
tts_tokenizer = AutoTokenizer.from_pretrained(tts_model_id)
description_tokenizer = AutoTokenizer.from_pretrained(tts_model.config.text_encoder._name_or_path)


In [None]:
# NOTE: superseded draft of the handlers defined in the next cell; kept for reference only.
# @app.get("/")
# def home():
#   return "YAY GIRLIES ITS RUNNIN'!"

# @app.post("/process-speech/")
# async def process_speech(file: UploadFile = File(...)):
#     # Step 1: Save uploaded file as temporary WAV
#     with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio:
#         temp_audio.write(file.file.read())
#         audio_path = temp_audio.name

#     async def remove():
#         """
#         Asynchronously removes the temporary audio files after processing.
#         """
#     loop = asyncio.get_event_loop()
#     await loop.run_in_executor(None, lambda: os.remove(audio_path))
#     await loop.run_in_executor(None, lambda: os.remove(out))

#     # Step 2: Transcribe the audio using Whisper
#     transcription = stt_pipe(audio_path)["text"]
#     print(f"Transcription: {transcription}")

#     # Step 3: Generate a response using Google Gemini (or any other model)
#     response = genai.GenerativeModel("gemini-1.5-flash").generate_content(transcription)
#     generated_text = response.text
#     print(f"Generated Text: {generated_text}")

#     # Step 4: Convert the response text to speech using Indic Parler TTS
#     description = "A female speaker with a British accent delivers a slightly expressive and animated speech with a moderate speed and pitch. The recording is of very high quality, with the speaker's voice sounding clear and very close up."
    
#     description_input_ids = description_tokenizer(description, return_tensors="pt").to(device)
#     text_input_ids = tts_tokenizer(generated_text, return_tensors="pt").to(device)
    
#     generation = tts_model.generate(
#         input_ids=description_input_ids.input_ids,
#         attention_mask=description_input_ids.attention_mask,
#         prompt_input_ids=text_input_ids.input_ids,
#         prompt_attention_mask=text_input_ids.attention_mask,
#     )
#      # Save the TTS output as a WAV file
#     output_audio_path = f"{uuid.uuid4()}.wav"
#     sf.write(output_audio_path, generation.cpu().numpy().squeeze(), tts_model.config.sampling_rate)

#     return FileResponse(output_audio_path, media_type="audio/wav", filename="output.wav")



In [27]:
@app.get("/")
def home():
    """Liveness check: respond with a fixed JSON greeting."""
    greeting = {"message": "YAY GIRLIES ITS RUNNIN'!"}
    return greeting

@app.post("/process-speech/")
async def process_speech(file: UploadFile = File(...)):
    """Answer a spoken prompt with synthesized speech.

    Pipeline: uploaded audio -> Whisper transcription -> Gemini text reply ->
    Indic Parler TTS -> WAV file sent back to the client.

    Args:
        file: Multipart upload; its declared content type must start with "audio/".

    Returns:
        A FileResponse serving the generated WAV as "output.wav". The file is
        deleted from disk after the response has been sent.

    Raises:
        HTTPException: 400 when the upload is not declared as audio, or when the
            transcription comes back empty.
    """
    # content_type may be None when the client sends no Content-Type for the part;
    # treat that as "not audio" instead of crashing with AttributeError.
    if not (file.content_type or "").startswith("audio/"):
        raise HTTPException(status_code=400, detail="Uploaded file is not an audio file.")

    # Step 1: Save uploaded file as temporary WAV
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio:
        temp_audio.write(await file.read())
        audio_path = temp_audio.name

    try:
        # Step 2: Transcribe the audio using Whisper
        transcription = stt_pipe(audio_path)["text"]
        print(f"Transcription: {transcription}")

        # Nothing intelligible was recognised: fail fast rather than sending an
        # empty prompt to the language model.
        if not transcription.strip():
            raise HTTPException(
                status_code=400,
                detail="No speech could be transcribed from the uploaded audio.",
            )

        # Step 3: Generate a response using Google Gemini (or any other model)
        response = genai.GenerativeModel("gemini-1.5-flash").generate_content(transcription)
        generated_text = response.text
        print(f"Generated Text: {generated_text}")

        # Step 4: Convert the response text to speech using Indic Parler TTS
        description = "A female speaker with a Indian accent delivers a slightly expressive and animated speech with a moderate speed and pitch. The recording is of very high quality, with the speaker's voice sounding clear and very close up."

        description_input_ids = description_tokenizer(description, return_tensors="pt").to(device)
        text_input_ids = tts_tokenizer(generated_text, return_tensors="pt").to(device)

        generation = tts_model.generate(
            input_ids=description_input_ids.input_ids,
            attention_mask=description_input_ids.attention_mask,
            prompt_input_ids=text_input_ids.input_ids,
            prompt_attention_mask=text_input_ids.attention_mask,
        )

        # Save the TTS output as a WAV file in the system temp directory rather
        # than the working directory, so generated files do not pile up there.
        output_audio_path = os.path.join(tempfile.gettempdir(), f"{uuid.uuid4()}.wav")
        sf.write(output_audio_path, generation.cpu().numpy().squeeze(), tts_model.config.sampling_rate)

        # The response body is read from disk while it is being sent, so the
        # file can only be removed afterwards: schedule that as a background task.
        cleanup = BackgroundTasks()
        cleanup.add_task(os.remove, output_audio_path)
        return FileResponse(
            output_audio_path,
            media_type="audio/wav",
            filename="output.wav",
            background=cleanup,
        )

    finally:
        # Clean up the uploaded temporary file whether or not processing succeeded.
        os.remove(audio_path)

In [None]:
# Local port shared by the ngrok tunnel and the uvicorn server.
PORT = 8000

# Open a public ngrok tunnel to the local port and show its URL.
public_url = ngrok.connect(PORT).public_url
print(f"Public URL: {public_url}")
try:
    # Blocks until the server is stopped (e.g. by interrupting the kernel).
    uvicorn.run(app, port=PORT)
finally:
    # Close the tunnel on shutdown so re-running this cell does not leave stale
    # tunnels open.
    ngrok.disconnect(public_url)

Public URL: https://0095-35-185-207-105.ngrok-free.app


INFO:     Started server process [43]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://127.0.0.1:8000 (Press CTRL+C to quit)


INFO:     2409:40f2:3054:1379:7198:6302:733d:2bec:0 - "GET / HTTP/1.1" 200 OK
INFO:     2409:40f2:3054:1379:7198:6302:733d:2bec:0 - "GET /process-speech HTTP/1.1" 307 Temporary Redirect
INFO:     2409:40f2:3054:1379:7198:6302:733d:2bec:0 - "GET /process-speech/ HTTP/1.1" 405 Method Not Allowed
INFO:     2409:40f2:3054:1379:7198:6302:733d:2bec:0 - "GET /process-speech/ HTTP/1.1" 405 Method Not Allowed




Transcription:  Hey, how are you?
Generated Text: I'm doing well, thank you for asking!  How are you today?

INFO:     2401:4900:901b:2843:6433:aa68:8062:b825:0 - "POST /process-speech/ HTTP/1.1" 200 OK
INFO:     2401:4900:901b:2843:6433:aa68:8062:b825:0 - "POST /process-speech// HTTP/1.1" 404 Not Found




Transcription:  Fifth largest planet in the universe
Generated Text: There's no fifth-largest planet in the *universe*.  We only have a good understanding of planets in our own solar system and a few thousand exoplanets around other stars.  The universe is unimaginably vast, and we've only discovered a tiny fraction of the planets that likely exist.  Therefore, ranking planets by size across the entire universe is currently impossible.

INFO:     2401:4900:901b:2843:6433:aa68:8062:b825:0 - "POST /process-speech/ HTTP/1.1" 200 OK




Transcription:  what is the fifth largest planet in solar system
Generated Text: The fifth largest planet in the solar system is Earth.

INFO:     2401:4900:901b:2843:6433:aa68:8062:b825:0 - "POST /process-speech/ HTTP/1.1" 200 OK




Transcription:  which is the closest planet to the Sun.
Generated Text: Mercury is the closest planet to the Sun.

INFO:     2401:4900:901b:2843:6433:aa68:8062:b825:0 - "POST /process-speech/ HTTP/1.1" 200 OK




Transcription:  How many planets are there in the solar system?
Generated Text: There are eight planets in our solar system.

INFO:     2401:4900:901b:2843:6433:aa68:8062:b825:0 - "POST /process-speech/ HTTP/1.1" 200 OK




Transcription:  can continue with the chart name all the planets in the solar system
Generated Text: Here's a chart showing the planets in our solar system.  Note that the order is based on distance from the sun.  Pluto is included as a dwarf planet, as it's a significant celestial body, even if no longer classified as a planet.


| Planet      | Classification | Distance from Sun (AU) (approx.) |
|-------------|-----------------|-------------------------------|
| Mercury     | Inner, terrestrial | 0.39                          |
| Venus       | Inner, terrestrial | 0.72                          |
| Earth       | Inner, terrestrial | 1.00                          |
| Mars        | Inner, terrestrial | 1.52                          |
| Jupiter     | Outer, gas giant   | 5.20                          |
| Saturn      | Outer, gas giant   | 9.54                          |
| Uranus      | Outer, ice giant   | 19.20                         |
| Neptune     | Outer, ice giant   | 30.06        



Transcription:  Write the 7 planets of the solar system.
Generated Text: The seven planets in our solar system, ordered by distance from the Sun, are:

1. Mercury
2. Venus
3. Earth
4. Mars
5. Jupiter
6. Saturn
7. Uranus


Note that this excludes Neptune and Pluto.  Pluto is now classified as a dwarf planet.





Transcription:  List the seven planets of the solar system.
Generated Text: The seven planets in our solar system, in order from the Sun, are:

1. Mercury
2. Venus
3. Earth
4. Mars
5. Jupiter
6. Saturn
7. Uranus


(Note:  Pluto is no longer classified as a planet.)

INFO:     2401:4900:901b:2843:6433:aa68:8062:b825:0 - "POST /process-speech/ HTTP/1.1" 200 OK
