In [11]:
import io
import IPython.display as ipd
import grpc
import os, sys, contextlib
import riva.client
from txtai.embeddings import Embeddings
from txtai.pipeline import Extractor
from ast import literal_eval
from nemollm.api import NemoLLM
from riva.client import RecognitionConfig, StreamingRecognitionConfig, AudioEncoding
import riva.client.audio_io
from riva.client.audio_io import MicrophoneStream
from typing import Iterable
import riva.client.proto.riva_asr_pb2 as rasr
from dotenv import load_dotenv

In [None]:
load_dotenv()

URI_TTS = os.get_env('URI_TTS')
URI_ASR = os.get_env('URI_ASR')
AVATAR_INSTANCE_PATH = os.get_env('AVATAR_INSTANCE_PATH')
NGC_ORG_ID = os.get_env('NGC_ORG_ID')
NGC_API_KEY = os.get_env('NGC_API_KEY')

In [None]:
# Change to your knowledge base files here
K_B = ['wara.txt', 'wasp_1.txt', 'wasp_2.txt']
K_B = ['docs/' + file for file in K_B]

In [13]:
riva.client.audio_io.list_input_devices()

Input audio devices:
0: Microsoft Sound Mapper - Input
1: Remote Audio
4: Primary Sound Capture Driver
5: Remote Audio
9: Remote Audio


In [30]:
# Change to your input device here
ASR_INPUT_DEVICE = 1

In [31]:
# Audio to Speech Module utilizing the Riva SDK
config = riva.client.StreamingRecognitionConfig(
    config=riva.client.RecognitionConfig(
        encoding=riva.client.AudioEncoding.LINEAR_PCM,
        language_code='en-US',
        max_alternatives=1,
        profanity_filter=False,
        enable_automatic_punctuation=True,
        verbatim_transcripts=True,
        sample_rate_hertz=16000
    ),
    interim_results=False,
)


class ASRService:
    def __init__(self):
        """
        """
        self.auth = riva.client.Auth(uri=URI_ASR)
        self.service = riva.client.ASRService(self.auth)
        self.sample_rate_hz = 16000
        self.file_streaming_chunk = 1600
        self.transcript = ""
        self.default_device_info = riva.client.audio_io.get_default_input_device_info()
        self.default_device_index = None if self.default_device_info is None else self.default_device_info['index']

    def run(self) -> None:
        """
        :return: None
        """
        print("ASR service running")
        with riva.client.audio_io.MicrophoneStream(
                rate=self.sample_rate_hz,
                chunk=self.file_streaming_chunk,
                device=ASR_INPUT_DEVICE,
        ) as audio_chunk_iterator:
            print("mic working")
            self.print_response(responses=self.service.streaming_response_generator(
                audio_chunks=audio_chunk_iterator,
                streaming_config=config))

    def print_response(self, responses: Iterable[rasr.StreamingRecognizeResponse]) -> None:
        """
        :param responses: Streaming Response
        :return: None
        """
        self.transcript = ""
        for response in responses:
            if not response.results:
                continue

            for result in response.results:
                if not result.alternatives:
                    continue
                if result.is_final:
                    partial_transcript = result.alternatives[0].transcript
                    self.transcript += partial_transcript
                    # print(self.transcript)
                    return

In [32]:
uri = URI_TTS
auth = riva.client.Auth(uri=uri)

tts_service = riva.client.SpeechSynthesisService(auth)

sample_rate_hz = 44100
req = { 
        "language_code"  : "en-US",
        "encoding"       : riva.client.AudioEncoding.LINEAR_PCM ,   # Currently only LINEAR_PCM is supported
        "sample_rate_hz" : sample_rate_hz,                          # Generate 44.1KHz audio
        "voice_name"     : "English-US.Female-1"                    # The name of the voice to generate
}
nchannels = 1
sampwidth = 2

In [33]:
# speech to Audio2Face module utilizing the gRPC protocal from audio2face_streaming_utils
from audio2face_streaming_utils import push_audio_track
import riva.client
import io
from pydub import AudioSegment
from scipy.io.wavfile import read
import numpy as np


class Audio2FaceService:
    def __init__(self, sample_rate=44100):
        """
        :param sample_rate: sample rate
        """
        self.a2f_url = 'localhost:50051'   # Set it to the port of your local host 
        self.sample_rate = 44100
        self.avatar_instance = AVATAR_INSTANCE_PATH # Set it to the name of your Audio2Face Streaming Instance

    def tts_to_wav(self, tts_byte, framerate=22050) -> str:
        """
        :param tts_byte: tts data in byte
        :param framerate: framerate
        :return: wav byte
        """
        seg = AudioSegment.from_raw(io.BytesIO(tts_byte), sample_width=2, frame_rate=22050, channels=1)
        wavIO = io.BytesIO()
        seg.export(wavIO, format="wav")
        rate, wav = read(io.BytesIO(wavIO.getvalue()))
        return wav

    def wav_to_numpy_float32(self, wav_byte) -> float:
        """
        :param wav_byte: wav byte
        :return: float32
        """
        return wav_byte.astype(np.float32, order='C') / 32768.0

    def get_tts_numpy_audio(self, audio) -> float:
        """
        :param audio: audio from tts_to_wav
        :return: float32 of the audio
        """
        wav_byte = self.tts_to_wav(audio)
        return self.wav_to_numpy_float32(wav_byte)

    def make_avatar_speaks(self, audio) -> None:
        """
        :param audio: tts audio
        :return: None
        """
        push_audio_track(self.a2f_url, self.get_tts_numpy_audio(audio), self.sample_rate, self.avatar_instance)

In [18]:
# extra content reading and chunking 
# add the files you want to add to the K.B Model in the K_B list, below
from langchain.text_splitter import RecursiveCharacterTextSplitter 

text_splitter = RecursiveCharacterTextSplitter(
    # Set a really small chunk size, just to show.
    chunk_size = 500,
    chunk_overlap  = 100,
    length_function = len,
    is_separator_regex = False,
)

docs = []

for file in K_B:
    with open(file,  encoding="utf8") as f:
        doc = f.read()
        docs.append(doc)

texts = text_splitter.create_documents(docs)
data = [t.page_content for t in texts]

In [19]:
from pprint import pprint

connection = NemoLLM(api_host="https://api.llm.ngc.nvidia.com/v1", org_id=NGC_ORG_ID, api_key=NGC_API_KEY, )

response = connection.generate(
  prompt="context: The Intergovernmental Panel on Climate Change (IPCC) is a scientific intergovernmental body under the auspices of the United Nations, set up at the request of member governments. It was first established in 1988 by two United Nations organizations, the World Meteorological Organization (WMO) and the United Nations Environment Programme (UNEP), and later endorsed by the United Nations General Assembly through Resolution 43/53. Membership of the IPCC is open to all members of the WMO and UNEP. The IPCC produces reports that support the United Nations Framework Convention on Climate Change (UNFCCC), which is the main international treaty on climate change. The ultimate objective of the UNFCCC is to \"stabilize greenhouse gas concentrations in the atmosphere at a level that would prevent dangerous anthropogenic [i.e., human-induced] interference with the climate system\". IPCC reports cover \"the scientific, technical and socio-economic information relevant to understanding the scientific basis of risk of human-induced climate change, its potential impacts and options for adaptation and mitigation.\" question: What does the UN want to stabilize?",
  model="gpt-43b-002",
  stop=[],
  tokens_to_generate=20,
  temperature=1.0,
  top_k=1,
  top_p=0.0,
  random_seed=0,
  beam_search_diversity_rate=0.0,
  beam_width=1,
  repetition_penalty=1.0,
  length_penalty=1.0,
)
pprint(response)

{'completion_labels': [{'class_name': 'nontoxic', 'score': 0.96920747}],
 'cumlogprobs': -0.023525752,
 'prompt_labels': [{'class_name': 'nontoxic', 'score': 0.9968957}],
 'text': ' greenhouse gas concentrations concentrations'}


In [20]:
# A general function that considers the both cases of having and non having extra context together with the input prompt 

def prompt(question):
    return f'Q: {question}[|||||]'

def api(prompts):
        new_prompts = [prompt.split('[|||||]')[1] + '\n' + prompt.split('[|||||]')[0] + '\nA: ' for prompt in prompts]
        print("CONTEXT: ", new_prompts)
        responses = connection.generate_multiple(
          prompts=new_prompts,
          model="gpt-43b-002",
          stop=['\nQ', '\nA'],
          tokens_to_generate=200,
          temperature=0.9,
          top_k=1,
          top_p=0.0,
          random_seed=0,
          beam_search_diversity_rate=0.0,
          beam_width=1,
          repetition_penalty=1.0,
          length_penalty=1.0,
        )
        return [response['text'] for response in responses]


#Create embeddings model with content support
embeddings = Embeddings({"path": "sentence-transformers/all-MiniLM-L6-v2", "content": True})
#embeddings = Embeddings({"path": "intfloat/e5-base-v2", "content": True})

# Create extractor instance, submit prompts to the Hugging Face inference API
#https://medium.com/neuml/introducing-txtai-the-all-in-one-embeddings-database-c721f4ff91ad
# The extractor pipeline is txtai’s spin on retrieval augmented generation (RAG).
#This pipeline extracts knowledge from content by joining a prompt, context data store (which is the data we want to add to the model, for instance about Wara) and generative model together.
extractor = Extractor(embeddings, api, minscore=0.25)

#Ad-hoc questions
question = "Where is Åre event?"

print("----", question, "----")
print(extractor([(question, question, prompt(question), False)], data))


---- Where is Åre event? ----
CONTEXT:  [' We also arrange different workshops and event with our collaborative partners such as NVIDIA and Google.\n\n\nCollaborate with us\nWe welcome any party that is willing and able to drive progress in AI for Media and Language. We interact with both established and emerging enterprises, as well as international initiatives such as Mila in Montreal and the British DPP.\n\nFollow us on LinkedIn to be the first to know about upcoming events and opportunities. Follow us on LinkedIn to be the first to know about upcoming events and opportunities.\n\n\nThe Core Team\nJohanna Björklund, Project Manager, Umeå University & Codemill AB\n\nSandor Albrecht, Co-project Manager, KAW\n\nIvana von Proschwitz, Community Manager, WARA Media & Language\n\nAnastasia Varava, Data Scientist, SEB\n\nKonrad Tollmar, Research Director, EA Games / Associate Professor, KTH Royal Institute of Technology\n\nGustav Eje Henter, Assistant Professor, KTH Royal Institute of Techn

In [21]:
# function to filter out half sentences of the response of the LLM
def remove_half_seq(text) -> str:
    """
    :param text: a textual string
    :return: a textual string similar or shorter than the input
    """
    marks = [".", "?", "!",":",",",";"]
    if text.strip()[-1] in marks:
        return text
    len_1= []
    lens = []
    for mark in marks:
        splitted_text = text.strip().split(mark)
        if len(splitted_text)>1:
            len_mark= len(splitted_text[-1])
            lens.append(len_mark)
        elif len(splitted_text) == 1:
            len_1.append(1)
    if len(lens)>0:
        return text.strip()[:(len(text)-min(lens))]
    if len(len_1) == len(marks):
        return text
        
    

# function for tts modification for pronunciation:
def phonetic_modification(text) -> str:
    """
    :param text: a textual string
    :return: a textual in which some words are tagged with new phonetics
    """
    token_list = ["Johanna","Umeå", "AI","LLMs", "II", "VII", "VI", "åre" ]
    modified_phonetic = ["ˈjohana","ˈoomijo", "ˈeiˈai", "ˈellˈellˈemz", "sekˈend","ˈseven", "ˈsix", ""]
    for token, phon in zip(token_list,modified_phonetic) :
        customized_phoneme = '<phoneme ph='+'"'+ phon+ '"'+'>'+token+'</phoneme>'
        text = text.replace(token,customized_phoneme ) 
    return text

def splitting_text(text) -> list:
    """
    :param text: a textual string
    :return:  list of <speak> tag added substrings of the input text
    """
    ## splitting text considering .:
    output_chunks0 = text.split(".")
    # splitting text considering \n:
    output_chunks = []
    for opt_ch in output_chunks0:
        output_chunks.extend(opt_ch.split("\n"))
    # adding <speak> tags to the substrings
    output_ch_sk = []
    for chunk in output_chunks:
        if chunk.strip() =="":
            print("chunk", chunk)
        else:
            output_ch_sk.append(f'<speak>{chunk}</speak>')
    return output_ch_sk 

In [19]:
# Avatar + ASR block

# K.B model PLUS ASR block + tts modification for pronunciation
# Saving the ASR transcripts of the prompts
# Filtering out half sequences in the LLM responses
# resolving the issue of long input for tts block
# saving avatars answer (users prompts are in prompts.txt)

audio2face_service = Audio2FaceService()
asr_service = ASRService()

# writing the prompts in a file:
file_asr_transcripts = "./docs/prompts_transcripts_noisy.txt"
file_avatar_replies = "./docs/full_model_avatar_replies_noisy.txt"
    
with  open(file_asr_transcripts,"a", encoding="utf-8")  as f1, open(file_avatar_replies,"a", encoding="utf-8")  as f2:
    while True:
        print("Ask avatar")
        
        # speech recognition
        asr_service.run()
        transcript = asr_service.transcript
        print("ASR transcript:",transcript)
        
        # inference block
        text = extractor([(transcript,transcript, prompt(transcript), False)], data)[0][1]
        # Detecting removing empty replies from the LLM
        if len(text.strip()) == 0:
            text = "Sorry, I do not understand you!"
        
        # removing half sequences in the end of LLM responses
        output0 = remove_half_seq(text)

        # tts 
        # modification for pronunciation:
        output0 = phonetic_modification(output0) 
        
        # saving the ASR transcripts and avatars answer
        f1.write(transcript)
        f1.write("\n")
        f2.write(output0)
        f2.write("\n")
        
        # tranforming the text into chunks to avoid sending long string to tts
        output_ch_sk = splitting_text(output0)
            
        for text in output_ch_sk:
            print(text)
            audio = tts_service.synthesize(text, language_code="en-US", sample_rate_hz=sample_rate_hz,  voice_name="English-US.Male-1")
            audio_bytes = audio.audio
            audio2face_service.make_avatar_speaks(audio_bytes)


Ask avatar
ASR service running
mic working
ASR transcript: Hello. 
CONTEXT:  [' \nQ: Hello. \nA: ']


KeyboardInterrupt: 