In [1]:
import gradio as gr
import torch
from torchaudio.sox_effects import apply_effects_file
from transformers import AutoFeatureExtractor, AutoModelForAudioXVector
from pymilvus import connections, FieldSchema, CollectionSchema, DataType, Collection, utility
import pandas as pd
import numpy as numpy
import gradio
from speechbrain.pretrained import EncoderDecoderASR
from speechbrain.pretrained import EncoderClassifier
from sentence_transformers import SentenceTransformer
from transformers import pipeline


![Architecture](AudioArchitecture.png)

![Architecture](Speechbrain.png)

In [2]:
from speechbrain.pretrained import Tacotron2
from speechbrain.pretrained import HIFIGAN
import torchaudio

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [4]:
transformer = SentenceTransformer('all-MiniLM-L6-v2')
question_answerer = pipeline("question-answering")


No model was supplied, defaulted to distilbert-base-cased-distilled-squad and revision 626af31 (https://huggingface.co/distilbert-base-cased-distilled-squad).
Using a pipeline without specifying a model name and revision in production is not recommended.


In [5]:
EFFECTS = [
    ["remix", "-"],
    ["channels", "1"],
    ["rate", "16000"],
    ["gain", "-1.0"],
    ["silence", "1", "0.1", "0.1%", "-1", "0.1", "0.1%"],
    ["trim", "0", "10"],
]

THRESHOLD = 0.85

In [6]:
# All models are located in /home/jupyter/.cache/huggingface/hub
model_name = "microsoft/unispeech-sat-base-plus-sv"
feature_extractor = AutoFeatureExtractor.from_pretrained(model_name)
model = AutoModelForAudioXVector.from_pretrained(model_name).to(device)
cosine_sim = torch.nn.CosineSimilarity(dim=-1)

In [7]:
embedding_generator = EncoderClassifier.from_hparams(source="speechbrain/spkrec-ecapa-voxceleb", savedir="pretrained_models/spkrec-ecapa-voxceleb")

In [8]:
from speechbrain.pretrained.interfaces import foreign_class
classifier = foreign_class(source="speechbrain/emotion-recognition-wav2vec2-IEMOCAP", pymodule_file="custom_interface.py", classname="CustomEncoderWav2vec2Classifier")


speechbrain.lobes.models.huggingface_wav2vec - wav2vec 2.0 is frozen.


In [9]:
# Intialize TTS (tacotron2) and Vocoder   warnings.warn(
#(HiFIGAN)
tacotron2 = Tacotron2.from_hparams(source="speechbrain/tts-tacotron2-ljspeech", savedir="tmpdir_tts")
hifi_gan = HIFIGAN.from_hparams(source="speechbrain/tts-hifigan-ljspeech", savedir="tmpdir_vocoder")


In [10]:
asr_model = EncoderDecoderASR.from_hparams(source="speechbrain/asr-wav2vec2-commonvoice-en", savedir="pretrained_models/asr-wav2vec2-commonvoice-en")

speechbrain.lobes.models.huggingface_wav2vec - wav2vec 2.0 is frozen.


In [11]:
embedding_generator = EncoderClassifier.from_hparams(source="speechbrain/spkrec-ecapa-voxceleb", savedir="pretrained_models/spkrec-ecapa-voxceleb")

In [12]:
def getText(path):
    return asr_model.transcribe_file(path)

In [13]:
def get_embedding(audio):
    text=getText(audio)
    return text,transformer.encode(text)


In [14]:
def extract_embeddings(path1):
    wav1, _ = apply_effects_file(path1, EFFECTS)
    print(wav1.shape,)

    input1 = feature_extractor(wav1.squeeze(0), return_tensors="pt", sampling_rate=16000).input_values.to(device)

    with torch.no_grad():
        emb1 = model(input1).embeddings
        print("mb1 shape")
        print(emb1.shape)
    emb1 = torch.nn.functional.normalize(emb1, dim=-1).cpu()
    print("After normalize emb1 shape")
    print(emb1.shape)
    return emb1


In [15]:
import pymilvus
import librosa
import torch

In [16]:
HOST=""
PORT="19530"
COLLECTION_NAME = 'customerdata'
USER_COLLECTION_NAME='user'
INDEX_TYPE = 'IVF_SQ8'
METRIC_TYPE = 'L2'
DIMENSION = 384
TOPK = 4

In [17]:
connections.connect(host=HOST, port=PORT)


In [18]:
usercollection = Collection(USER_COLLECTION_NAME)
userresult = usercollection.query(expr="id >= 0",output_fields=["name", "embedding"],)


In [19]:
# Create Milvus collection
fields = [
    FieldSchema(name='id', dtype=DataType.INT64, description='embedding ids', is_primary=True, auto_id=True),
    FieldSchema(name='text', dtype=DataType.VARCHAR, description='user name', max_length=100),
    FieldSchema(name='embedding', dtype=DataType.FLOAT_VECTOR, description='text audio embeddings', dim=DIMENSION)
    ]
schema = CollectionSchema(fields=fields, description='Text audio embeddings')

# if utility.has_collection(COLLECTION_NAME):
#     collection = Collection(COLLECTION_NAME)
#     collection.drop() # drop collection if it exists
   
collection = Collection(name=COLLECTION_NAME, schema=schema)

# # Create index
# index_params = {
#     'metric_type': METRIC_TYPE,
#     'index_type': INDEX_TYPE,
#     'params':{"nlist":1536}
# }

# status = collection.create_index(field_name='embedding', index_params=index_params)

In [None]:
column_names=["text", "embedding"]
data = []
df=pd.DataFrame(data, columns=column_names)
lst=[]

In [None]:
path="/home/jupyter/demo/addresswavinputsindiafemale/wav/"
for i in range(18):
    if (i !=0 ):
        lst.append(path+str(i)+".wav")

In [None]:
lst

In [None]:
text, embed = get_embedding(lst[0])
print(text)
print(embed.shape)

In [None]:
for i in range(len(lst)):
    print(lst[i])
    text, embed=get_embedding(lst[i])
    df.loc[len(df.index)] = [text, embed] 

In [None]:
from towhee import ops, pipe, DataCollection


insert_pipe =  (pipe.input('df')
            .flat_map('df', 'data', lambda df: df.values.tolist())
            .map('data', 'res', ops.ann_insert.milvus_client(host=HOST, 
                                                            port=PORT,
                                                            collection_name=COLLECTION_NAME))
             .output('res')
)

In [None]:
insert_pipe(df)

In [None]:
collection.load()
collection.num_entities

In [20]:
import numpy as np
from towhee import ops, pipe, DataCollection
search_pipe = (pipe.input('vec')
                    .flat_map('vec', 'rows',  ops.ann_search.milvus_client(host=HOST, metric_type=METRIC_TYPE,
                                                                                   port=PORT, limit= 3,reverse=True,
                                                                                   collection_name=COLLECTION_NAME, **{'output_fields': ['text']}))
                    .map('rows', ('id', 'score', 'text'), lambda x: (x[0], x[1], x[2])) 
                    .output('id','score','text')
               )

In [21]:
def user_voice_extract_embeddings(path1):
    wav1, _ = apply_effects_file(path1, EFFECTS)
    print(wav1.shape,)

    input1 = feature_extractor(wav1.squeeze(0), return_tensors="pt", sampling_rate=16000).input_values.to(device)

    with torch.no_grad():
        emb1 = model(input1).embeddings
        print("mb1 shape")
        print(emb1.shape)
    emb1 = torch.nn.functional.normalize(emb1, dim=-1).cpu()
    print("After normalize emb1 shape")
    print(emb1.shape)
    return emb1

In [56]:
def match_user(audio):
    currentuserembed=user_voice_extract_embeddings(audio)
    print(currentuserembed.shape)

    for i in range(len(userresult)):
        print(userresult[i]['name']+" Comparing is it this person")
        emb=numpy.array([userresult[i]['embedding']])
        temb = torch.from_numpy(emb)
        similarity = cosine_sim(currentuserembed,temb).numpy()[0]
        if similarity >= 0.85:
            return userresult[i]['name']
    return "Anonymouse"

In [63]:
def read_audio(audio):
    current_user=match_user(audio)
    print("User who spoke is current_user ="+current_user)
    #Emotions
    out_prob, score, index, emotion = classifier.classify_file(audio)
    contextStr=""
    answer=""
    print("read_audio = "+audio)
    print("emotion = "+emotion[0])
    # Milvus search on the text embedding
    qtn, embed=get_embedding(audio)
    print("question = "+qtn)
    res = search_pipe.batch([embed])
    ans = DataCollection(res[0])
    for i in range(len(ans)):
        id=ans[i]['id']
        score=ans[i]['score']
        text=ans[i]['text']
        if (i==0):
            answer=text.lower()
        print(str(id)+", "+str(score))
        contextStr=contextStr+text.lower()+". "    
    print("answer="+answer)
    print("context="+contextStr)
    # Running the TTS
    mel_output, mel_length, alignment = tacotron2.encode_text(answer)

    # Running Vocoder (spectrogram-to-waveform)
    waveforms = hifi_gan.decode_batch(mel_output)

    # Save the waverform
    torchaudio.save(audio,waveforms.squeeze(1), 22050)

    
    query_response={"current_user": current_user, "id": id, "score": score, "question": qtn, "emotion":emotion[0], 
                    "answer": answer, "context": contextStr, "response_audio": audio }  

    return current_user,emotion[0],qtn,audio,score,answer,contextStr

In [64]:
inputs = [
    gradio.inputs.Audio(source="microphone", type="filepath", optional=True, label="Ask Question")
]
output = [
    gradio.Textbox(label="Current User"),
    gradio.Textbox(label="Emotion"),
    gradio.Textbox(label="Question"),
    gradio.Audio(label="Response Audio", type="filepath"),
    gradio.Textbox(label="Score"),
    gradio.Textbox(label="Answer Text"),
    gradio.Textbox(label="Context"),
]

  gradio.inputs.Audio(source="microphone", type="filepath", optional=True, label="Ask Question")
  gradio.inputs.Audio(source="microphone", type="filepath", optional=True, label="Ask Question")


In [65]:
interface = gradio.Interface(
    fn=read_audio,
    inputs=inputs,
    outputs=output,
    layout="horizontal",
    allow_flagging=False,
    live=False,
    cache_examples=False
)
interface.launch(share=True)


  interface = gradio.Interface(


Running on local URL:  http://127.0.0.1:7875
Running on public URL: https://c3b2f17214bec32735.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)






torch.Size([1, 42061])
mb1 shape
torch.Size([1, 512])
After normalize emb1 shape
torch.Size([1, 512])
torch.Size([1, 512])
user/Samantha Comparing is it this person
user/Sridhanya Comparing is it this person
User who spoke is current_user =user/Sridhanya
read_audio = /var/tmp/gradio/5d8610151d2962b50ac86593e2159ac802e44ca0/audio-0-100.wav
emotion = neu
question = WHAT IS DELIVERY POINT VALIDATION
441360469201835733, 0.45261478424072266
441360469201835750, 0.6864280104637146
441360469201835835, 1.4276584386825562
answer=delivery point validation that is dpv is a usps technology that validates address information down to the individual mailing address without dpv verification is individual address is within a range of valid addresses
context=delivery point validation that is dpv is a usps technology that validates address information down to the individual mailing address without dpv verification is individual address is within a range of valid addresses. delivery point validation that i

In [None]:
import IPython
IPython.display.Audio("/var/tmp/gradio/56b9b711429eb5668e18728741123aab1a3d915c/audio-0-100.wav")

In [None]:
for index, row in df.iterrows():
    print(row['text'])