In [None]:
import gc
import io
import os

import gradio
import librosa
import numpy as np
import soundfile
import torch
from datasets import load_dataset, load_from_disk, Audio
from IPython.display import Audio as IPythonAudio
from IPython.display import display
# from sentence_transformers import util, SentenceTransformer
from transformers import pipeline, Conversation
from transformers.utils import logging
# `logging` here is transformers.utils.logging (imported above), not the stdlib module;
# this lowers the transformers log verbosity to error level.
logging.set_verbosity_error()

##### Conversational Agent

In [None]:
# Conversational pipeline using the "facebook/blenderbot-400M-distill" checkpoint.
chatbot = pipeline(task="conversational",
                   model="facebook/blenderbot-400M-distill")

user_message = """
Define PI
"""

# First turn: wrap the prompt in a Conversation and let the model answer it.
conversation = Conversation(user_message)
conversation_response = chatbot(conversation)
print(conversation_response)

# Second turn: queue a follow-up user message on the same conversation ...
conversation.add_message({
    "role": "user",
    "content": """
Define square root?
"""
})

# ... and run the pipeline again; without this call the follow-up is never answered.
conversation_response = chatbot(conversation)
print(conversation_response)

In [None]:
# Drop every name created by the conversational cell, then run the garbage collector.
del chatbot
del user_message
del conversation
del conversation_response
gc.collect()

##### Translation and Summarization Agent

In [None]:
# Translation pipeline using the "facebook/nllb-200-distilled-600M" checkpoint,
# loaded with bfloat16 weights.
# NOTE: the variable name is kept as spelled because the cleanup cell deletes it by this name.
transalator = pipeline(task="translation",
                       model="facebook/nllb-200-distilled-600M",
                       torch_dtype=torch.bfloat16)

# The backslash joins the two source lines into one; the space before it keeps
# the two sentences from being fused together ("earth.We").
text = """
We are the children of planet earth. \
We are the most intelligent species on earth.
"""

# NLLB uses FLORES-200 language codes: English in Latin script is "eng_Latn"
# ("eng_Latin" is not a known code), Hindi in Devanagari is "hin_Deva".
translated_text = transalator(text,
                              src_lang='eng_Latn',
                              tgt_lang='hin_Deva')

print(translated_text)



In [None]:
# Drop every name created by the translation cell, then run the garbage collector.
del transalator
del translated_text
del text
gc.collect()

In [None]:
# Summarization pipeline using the "facebook/bart-large-cnn" checkpoint,
# loaded with bfloat16 weights.
summarizer = pipeline(
    "summarization",
    model="facebook/bart-large-cnn",
    torch_dtype=torch.bfloat16,
)

# Passage to summarize. The literal is passed to the model exactly as written,
# including the indentation at the start of each continuation line.
text = """Paris is the capital and most populous city of France, with
          an estimated population of 2,175,601 residents as of 2018,
          in an area of more than 105 square kilometres (41 square
          miles). The City of Paris is the centre and seat of
          government of the region and province of Île-de-France, or
          Paris Region, which has an estimated population of
          12,174,880, or about 18 percent of the population of France
          as of 2017."""

# Generate the summary with explicit lower and upper length limits.
summarized_text = summarizer(text, min_length=10, max_length=100)

print(summarized_text)

In [None]:
# Drop every name created by the summarization cell, then run the garbage collector.
# The summary lives in `summarized_text`; `translated_text` was already deleted by the
# translation cleanup cell, so deleting it again here would raise NameError.
del summarizer, summarized_text, text
gc.collect()

#### Sentence Embeddings

In [None]:
# The sentence_transformers import in the notebook's import cell is commented out,
# so bring the two names this cell needs into scope here. Importing locally means a
# missing package only breaks this cell, not the shared import cell.
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer("all-MiniLM-L6-v2")

# Encode the two sentence lists as tensors.
sentences1 = ['The cat sits outside',
              'A man is playing guitar',
              'The movies are awesome']
embeddings1 = model.encode(sentences1, convert_to_tensor=True)

sentences2 = ['The dog plays in the garden',
              'A woman watches TV',
              'The new movie is so great']
embeddings2 = model.encode(sentences2, convert_to_tensor=True)

# Cosine similarity between the embeddings of the first and second list.
cosine_scores = util.cos_sim(embeddings1, embeddings2)
print(cosine_scores)

In [None]:
# Drop every name created by the sentence-embedding cell, then run the garbage collector.
# Each embedding tensor is listed exactly once: deleting `embeddings1` twice raises
# NameError and would leave `embeddings2` alive.
del model, sentences1, sentences2, embeddings1, embeddings2, cosine_scores
gc.collect()

#### Zero-Shot Audio Classification

In [None]:
# Load only the first ten rows of the train split of "ashraq/esc50".
dataset = load_dataset("ashraq/esc50", split="train[0:10]")
# dataset = load_from_disk("./models/ashraq/esc50/train")

# Preview the first clip at its original sampling rate. A bare expression is only
# rendered when it is the last one in a cell, so display() is needed mid-cell.
audio_sample = dataset[0]
display(IPythonAudio(audio_sample["audio"]["array"],
                     rate=audio_sample["audio"]["sampling_rate"]))

zero_shot_classifier = pipeline(task="zero-shot-audio-classification",
                                model="laion/clap-htsat-unfused")

# Re-cast the audio column to 48 kHz (presumably the rate the CLAP model expects —
# TODO confirm) and re-read the sample so it is decoded at that rate.
dataset = dataset.cast_column("audio", Audio(sampling_rate=48_000))

audio_sample = dataset[0]

candidate_labels = [
    "sound of dog",
    "sound of vacuum cleaner"
]

# Last expression of the cell: the classification result is shown as the cell output.
zero_shot_classifier(audio_sample["audio"]["array"], candidate_labels=candidate_labels)

In [None]:
# Drop every name created by the zero-shot audio cell, then run the garbage collector.
del dataset
del audio_sample
del zero_shot_classifier
del candidate_labels
gc.collect()

#### Automatic Speech Recognition

In [None]:
# Stream the "train.clean.100" split of librispeech_asr instead of downloading it.
# The keyword must be spelled `trust_remote_code`; a misspelled keyword is not
# recognised by load_dataset.
dataset = load_dataset("librispeech_asr", split="train.clean.100",
                       streaming=True, trust_remote_code=True)

# Take the first streamed example and preview it. A bare expression is only rendered
# when it is the last one in a cell, so display() is needed mid-cell.
example = next(iter(dataset))
display(IPythonAudio(example["audio"]["array"],
                     rate=example["audio"]["sampling_rate"]))

# Speech-recognition pipeline used by transcribe_speech().
asr = pipeline(task="automatic-speech-recognition",
               model="distil-whisper/distil-small.en")

# Container app that will host the two transcription tabs.
demo = gradio.Blocks()


def transcribe_speech(filepath):
    """Transcribe an audio file to text using the module-level `asr` pipeline.

    Args:
        filepath: Path of the audio file to transcribe, or None when no audio
            was provided.

    Returns:
        The transcribed text, or an empty string when `filepath` is None.
    """
    if filepath is None:
        # The Gradio helper is `gradio.Warning` (capital W); `gradio.warning`
        # does not exist and would raise AttributeError instead of warning the user.
        gradio.Warning("No audio found, please try again!")
        return ""

    audio, sampling_rate = soundfile.read(filepath)

    # Transpose before down-mixing: assumes soundfile returns (frames, channels)
    # for multi-channel files while librosa.to_mono wants channels first —
    # TODO confirm against the library docs.
    audio_mono = librosa.to_mono(np.transpose(audio))

    # Resample to 16 kHz (presumably the rate the ASR model expects — TODO confirm).
    audio_resampled = librosa.resample(audio_mono,
                                       orig_sr=sampling_rate,
                                       target_sr=16000)

    output = asr(audio_resampled,
                 max_new_tokens=256,
                 chunk_length_s=30,
                 batch_size=8)
    return output["text"]

# Tab 1: transcribe audio recorded from the microphone.
# The keyword is `allow_flagging`; the misspelled `allow_flaggin` is rejected as an
# unexpected keyword argument.
mic_transcribe = gradio.Interface(fn=transcribe_speech,
                                  inputs=gradio.Audio(sources="microphone",
                                                      type="filepath"),
                                  outputs=gradio.Textbox(label="Transcription",
                                                         lines=3),
                                  allow_flagging="never")

# Tab 2: transcribe an uploaded audio file.
file_transcribe = gradio.Interface(fn=transcribe_speech,
                                   inputs=gradio.Audio(sources="upload",
                                                       type="filepath"),
                                   outputs=gradio.Textbox(label="Transcription",
                                                          lines=3),
                                   allow_flagging="never")

# Put both interfaces into the Blocks app as tabs.
with demo:
    gradio.TabbedInterface(
        [mic_transcribe, file_transcribe],
        ["Transcribe Microphone", "Transcribe Audio File"]
    )

# Serve on the port given by PORT1 when it is set; otherwise pass None so Gradio
# picks its default port instead of the cell failing with KeyError.
port = os.environ.get("PORT1")
demo.launch(share=True, server_port=int(port) if port else None)

In [None]:
# Stop the Gradio app that was started with demo.launch().
demo.close()

In [None]:
# Drop the names created by the speech-recognition cell, then run the garbage collector.
del dataset
del example
del asr
del demo
del mic_transcribe
del file_transcribe
gc.collect()

#### Text to Speech

In [None]:
# Text-to-speech pipeline using the "kakao-enterprise/vits-ljs" checkpoint.
narrator = pipeline("text-to-speech", model="kakao-enterprise/vits-ljs")

# Passage to narrate; the backslashes join the source lines into one paragraph.
text = """
Researchers at the Allen Institute for AI, \
HuggingFace, Microsoft, the University of Washington, \
Carnegie Mellon University, and the Hebrew University of \
Jerusalem developed a tool that measures atmospheric \
carbon emitted by cloud servers while training machine \
learning models. After a model’s size, the biggest variables \
were the server’s location and time of day it was active.
"""

narrated_text = narrator(text)

# Play the first entry of the returned audio at the sampling rate the pipeline reports.
waveform = narrated_text["audio"][0]
sample_rate = narrated_text["sampling_rate"]
IPythonAudio(waveform, rate=sample_rate)