In [None]:
#pip install pyaudio
#brew install espeak
#pip install phonemizer
#pip install python-Levenshtein
#pip install requests ipywidgets sounddevice soundfile
#pip install sequence_align

In [None]:
import dotenv

import json
import os
import pyaudio
import wave
import base64
import requests
import io
import re

# Load environment variables (python-dotenv reads them from a local .env file,
# if one is present) before HF_API_TOKEN is looked up below.
dotenv.load_dotenv()

# --- Hugging Face Inference API configuration ---
# The access token is read from the HF_API_TOKEN environment variable.
# Tokens are managed at https://huggingface.co/settings/tokens
API_TOKEN = os.getenv("HF_API_TOKEN")
headers = dict(Authorization=f"Bearer {API_TOKEN}")

# Phoneme-recognition endpoint.
# Alternative model id: facebook/wav2vec2-xlsr-53-phon-cv-ft
PHONEME_API_URL = (
    "https://api-inference.huggingface.co/models/"
    "mrrubino/wav2vec2-large-xlsr-53-l2-arctic-phoneme"
)
# Speech-to-text endpoint.
STT_API_URL = (
    "https://api-inference.huggingface.co/models/"
    "openai/whisper-small"
)

def query(filename, API_URL, timeout=300):
    """Send an audio file to a Hugging Face Inference API endpoint.

    The file is uploaded as the raw (binary) request body, authenticated with
    the module-level ``headers``.

    Args:
        filename: Path of the audio file to upload.
        API_URL: Full URL of the inference endpoint (e.g. ``PHONEME_API_URL``
            or ``STT_API_URL``).
        timeout: Seconds to wait for the server before giving up. Generous by
            default because the request waits for the model to load.

    Returns:
        The decoded JSON body of the response. The HTTP status is not
        checked, so an error payload returned by the API is passed through
        to the caller unchanged.
    """
    with open(filename, "rb") as f:
        data = f.read()

    # requests ignores `json=` whenever a non-empty `data=` body is supplied,
    # so options placed in a JSON payload next to the audio bytes never reach
    # the server. With a binary body, "wait for model" is requested through
    # the `x-wait-for-model` header instead. A copy is used so the shared
    # module-level `headers` dict is not mutated.
    # NOTE(review): `return_timestamps` was silently dropped for the same
    # reason and is not sent here; CTC models may reject a plain boolean, so
    # enabling it needs a per-model decision — confirm before adding.
    request_headers = {**headers, "x-wait-for-model": "true"}

    response = requests.post(
        API_URL, headers=request_headers, data=data, timeout=timeout
    )
    return json.loads(response.content.decode("utf-8"))


In [5]:
import ipywidgets as widgets
from IPython.display import display, Audio, HTML, Markdown

import sounddevice as sd
import soundfile as sf

# Define a function to record audio when the button is clicked
def record_audio(b):
    """Click handler for the record button.

    When the button reads 'Start Recording', the label is switched to
    'Stop Recording', a fixed-length stereo clip is captured and written to
    'output.wav'. The call blocks until the capture is complete. On any other
    label the handler only restores the 'Start Recording' label.

    Messages are printed inside the module-level ``output`` widget, which must
    exist by the time the button is clicked.

    Args:
        b: The button widget that was clicked.
    """
    with output:
        if b.description != 'Start Recording':
            # Nothing is running at this point; just flip the label back.
            b.description = 'Start Recording'
            return

        b.description = 'Stop Recording'
        print('Recording...')

        sample_rate = 44100  # samples per second
        duration = 10  # length of the clip in seconds
        frame_count = int(duration * sample_rate)

        clip = sd.rec(frame_count, samplerate=sample_rate, channels=2)
        sd.wait()  # block until the whole clip has been captured
        sf.write('output.wav', clip, sample_rate)
        print('Recording saved as output.wav')

In [6]:
import random

# Pool of prompts; one is drawn at random for the user to read out loud.
sentences = [
    "The red boat sailed across the wide blue ocean.",
    "Mary had a little lamb, its fleece was white as snow.",
    "Can you imagine a world without books and stories?",
    "The early bird catches the worm, but the second mouse gets the cheese.",
    "The car swerved to avoid the squirrel running across the road.",
    "She sells seashells by the seashore.",
    "Peter Piper picked a peck of pickled peppers.",
    "A stitch in time saves nine.",
    "The boat sailed smoothly over the calm water yesterday.",
]

# Draw a random index into the pool and show the selected prompt.
reference_text = sentences[random.randrange(len(sentences))]
reference_text

'The early bird catches the worm, but the second mouse gets the cheese.'

In [7]:
# Tell the reader which sentence to read.
print(f'Read out loud - "{reference_text}"')

# Build the record toggle and the output area that record_audio prints into.
button = widgets.Button(description='Start Recording')
output = widgets.Output()

# Render the button first, then the output area underneath it.
display(button, output)

# Run record_audio on every click of the button.
button.on_click(record_audio)

Read out loud - "The early bird catches the worm, but the second mouse gets the cheese."


Button(description='Start Recording', style=ButtonStyle())

Output()

In [8]:
# Display an audio player widget for 'output.wav', the file written by
# record_audio, so the recording can be checked before it is analysed.
Audio('output.wav')

In [9]:
# Endpoint of the deployed phoneme recognizer. The "<>" placeholder must be
# replaced with the actual workspace prefix before use.
MODAL_URL = "https://<>--phoneme-recognizer-dev.modal.run"

def get_phones(filepath, token=None, timeout=300):
    """Transcribe an audio file to phonemes using the service at ``MODAL_URL``.

    The file is read, base64-encoded into a ``data:audio/wav`` URL and POSTed
    as JSON under the ``"audio"`` key with a bearer token.

    Args:
        filepath: Path of the WAV file to send.
        token: Bearer token for the endpoint. When omitted, a notebook-level
            ``token`` variable is used if one exists, otherwise the
            ``MODAL_API_TOKEN`` environment variable.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded JSON response on HTTP 200. On any other status the error
        is printed and ``None`` is returned.

    Raises:
        ValueError: If no token can be resolved.
    """
    if token is None:
        # Previously `token` was read as an undefined global (its assignment
        # was commented out), which raised NameError on a fresh kernel. A
        # notebook-level `token` is still honoured for backward compatibility.
        token = globals().get("token") or os.environ.get("MODAL_API_TOKEN")
    if not token:
        raise ValueError(
            "No API token available: pass token=... or set the "
            "MODAL_API_TOKEN environment variable."
        )

    # Load the audio file.
    with open(filepath, "rb") as f:
        audio_data = f.read()

    # Encode the WAV data as a base64 data URL.
    audio_base64 = base64.b64encode(audio_data).decode("utf-8")
    data_url = f"data:audio/wav;base64,{audio_base64}"

    # Prepare the request payload.
    payload = {
        "audio": data_url
    }
    # Named differently from the module-level `headers` (HF API) to avoid
    # shadowing it.
    auth_headers = {"Authorization": f"Bearer {token}"}
    response = requests.post(
        MODAL_URL, json=payload, headers=auth_headers, timeout=timeout
    )

    # Check if the request was successful.
    if response.status_code == 200:
        transcription = response.json()
        return transcription
    else:
        print(f"Error: {response.status_code} - {response.text}")
        return None

In [None]:
# get recorded speech phoneme using deployed model
phoneme_response = get_phones('output.wav')
# get_phones returns None when the request fails; stop with a clear message
# instead of failing on the subscript below.
if phoneme_response is None:
    raise RuntimeError(
        "Phoneme recognition request failed; see the error printed above."
    )
recorded_phoneme = phoneme_response['text']
recorded_phoneme

# or use HF hub model
# wait for the model to load
#print(query('output.wav', PHONEME_API_URL))
#recorded_phoneme = query('output.wav', PHONEME_API_URL)['text']

In [None]:
import phonemizer
from phonemizer.punctuation import Punctuation
from phonemizer.backend import EspeakBackend
from phonemizer.separator import Separator

def generate_reference_phoneme(reference_text):
    """Build a per-word phoneme lexicon for a reference sentence.

    Punctuation marks in ``;:,.!"?()`` are removed, the remaining text is
    split into lower-cased words, and each word is phonemized on its own with
    the espeak 'en-us' backend.

    Args:
        reference_text: The sentence the speaker is asked to read.

    Returns:
        A tuple ``(lexicon, ref_words)``. ``lexicon`` is a list of
        ``(word, phonemes)`` pairs in sentence order, and ``ref_words`` is the
        list of lower-cased words.
    """
    text = Punctuation(';:,.!"?()').remove(reference_text)
    # split() with no argument splits on any run of whitespace (spaces, tabs,
    # newlines) and never yields empty strings, so multi-line or tab-separated
    # reference text is tokenized into words as well.
    ref_words = [w.lower() for w in text.split()]

    # Initialize the espeak backend for US English.
    backend = EspeakBackend('en-us')

    # phone='' puts no delimiter between the phones of a word; word=None sets
    # no word separator (each call below phonemizes a single word).
    separator = Separator(phone='', word=None)

    # Build the lexicon by phonemizing each word one by one. backend.phonemize
    # takes a list as input and returns a list, hence the [word] / [0].
    lexicon = [
        (word, backend.phonemize([word], separator=separator, strip=True)[0])
        for word in ref_words
    ]

    return lexicon, ref_words