In [None]:
import gc
import io
import os

import gradio
import librosa
import numpy as np
import requests
import soundfile
import torch
from datasets import load_dataset, load_from_disk, Audio
from IPython.display import Audio as IPythonAudio, display
from PIL import Image
from transformers import pipeline, Conversation, SamModel, SamProcessor, BlipForImageTextRetrieval, AutoProcessor, BlipForConditionalGeneration, BlipForQuestionAnswering, CLIPModel
from transformers.utils import logging
# from sentence_transformers import util, SentenceTransformer

from helper import load_image_from_url, render_results_in_image, ignore_warnings, summarize_predictions_natural_language, show_pipe_masks_on_image
logging.set_verbosity_error()
ignore_warnings()

##### Conversational Agent

In [None]:
# Two-turn chat demo on the distilled BlenderBot checkpoint.
# NOTE(review): the "conversational" task and the Conversation class are
# deprecated in newer transformers releases - confirm the installed version.
chatbot = pipeline(task="conversational",
                   model="facebook/blenderbot-400M-distill")

user_message = """
Define PI
"""

# First turn: wrap the prompt in a Conversation and let the model answer it.
conversation = Conversation(user_message)
conversation_response = chatbot(conversation)
print(conversation_response)

# Second turn: append a follow-up user message to the same conversation ...
conversation.add_message({
    "role": "user",
    "content": """
Define square root?
"""
})

# ... and actually send it. Previously the follow-up was queued but the
# pipeline was never called again, so it was never answered.
conversation_response = chatbot(conversation)
print(conversation_response)

In [None]:
# Drop every name created by the chatbot cell, then run a garbage-collection
# pass so the objects they referenced can be reclaimed before the next demo.
del chatbot, user_message, conversation, conversation_response
gc.collect()

##### Translation and Summarization Agent

In [None]:
# Translation pipeline on the distilled 600M NLLB-200 checkpoint, loaded with
# bfloat16 weights. The variable name is kept as-is because the cleanup cell
# below deletes it by this name.
transalator = pipeline(task="translation",
                       model="facebook/nllb-200-distilled-600M",
                       torch_dtype=torch.bfloat16)

# The trailing "\" joins the two sentences onto one line; the space before it
# keeps them from being fused into "earth.We".
text = """
We are the children of planet earth. \
We are the most intelligent species on earth.
"""

# NLLB language codes use a four-letter script tag: English in Latin script is
# "eng_Latn". The previous "eng_Latin" is not a known code, so the source
# language was not actually communicated to the model.
translated_text = transalator(text,
                              src_lang='eng_Latn',
                              tgt_lang='hin_Deva')

print(translated_text)



In [None]:
# Drop the translation pipeline, its output and the input text, then run a
# garbage-collection pass so the referenced objects can be reclaimed.
del transalator, translated_text, text
gc.collect()

In [None]:
# Summarization pipeline on the facebook/bart-large-cnn checkpoint, loaded
# with bfloat16 weights.
summarizer = pipeline(
    task="summarization",
    model="facebook/bart-large-cnn",
    torch_dtype=torch.bfloat16,
)

# Passage to condense (kept verbatim, including its internal indentation).
text = """Paris is the capital and most populous city of France, with
          an estimated population of 2,175,601 residents as of 2018,
          in an area of more than 105 square kilometres (41 square
          miles). The City of Paris is the centre and seat of
          government of the region and province of Île-de-France, or
          Paris Region, which has an estimated population of
          12,174,880, or about 18 percent of the population of France
          as of 2017."""

# min_length / max_length bound the length of the generated summary.
summarized_text = summarizer(text, min_length=10, max_length=100)

print(summarized_text)

In [None]:
# Release the summarizer and its output. The summarization cell produces
# `summarized_text`; `translated_text` was already deleted by the translation
# cleanup, so naming it here raised NameError and leaked the real result.
del summarizer, summarized_text, text
gc.collect()

#### Sentence Embeddings

In [None]:
# Imported here rather than in the top import cell, where this import is
# commented out: without it SentenceTransformer and util are undefined, and
# keeping it local confines any missing-package failure to this cell.
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer("all-MiniLM-L6-v2")

# Encode two small batches of sentences into tensors.
sentences1 = ['The cat sits outside',
              'A man is playing guitar',
              'The movies are awesome']
embeddings1 = model.encode(sentences1, convert_to_tensor=True)

sentences2 = ['The dog plays in the garden',
              'A woman watches TV',
              'The new movie is so great']
embeddings2 = model.encode(sentences2, convert_to_tensor=True)

# Cosine similarity between the two sets of embeddings.
cosine_scores = util.cos_sim(embeddings1, embeddings2)
print(cosine_scores)

In [None]:
# Release the embedding model and its tensors. `embeddings1` used to be listed
# twice, which raised NameError on the second delete and never freed
# `embeddings2`.
del model, sentences1, sentences2, embeddings1, embeddings2, cosine_scores
gc.collect()

#### Zero-Shot Audio Classification

In [None]:
# First ten training clips of ESC-50.
dataset = load_dataset("ashraq/esc50", split="train[0:10]")
# dataset = load_from_disk("./models/ashraq/esc50/train")

# Preview the first clip at its native sampling rate. display() is required:
# a bare expression in the middle of a cell is evaluated but never rendered.
audio_sample = dataset[0]
display(IPythonAudio(audio_sample["audio"]["array"],
                     rate=audio_sample["audio"]["sampling_rate"]))

zero_shot_classifier = pipeline(task="zero-shot-audio-classification",
                                model="laion/clap-htsat-unfused")

# Re-cast the audio column to 48 kHz and re-read the sample at that rate.
# NOTE(review): presumably the rate the CLAP feature extractor expects -
# confirm via zero_shot_classifier.feature_extractor.sampling_rate.
dataset = dataset.cast_column("audio", Audio(sampling_rate=48_000))

audio_sample = dataset[0]

candidate_labels = [
    "sound of dog",
    "sound of vacuum cleaner"
]

# Last expression of the cell, so the result is shown as the cell output.
zero_shot_classifier(audio_sample["audio"]["array"], candidate_labels=candidate_labels)

In [None]:
# Drop the dataset, sample, classifier and labels from the audio cell, then
# run a garbage-collection pass so the referenced objects can be reclaimed.
del dataset, audio_sample, zero_shot_classifier, candidate_labels
gc.collect()

#### Automatic Speech Recognition

In [None]:
# Stream LibriSpeech instead of downloading the whole split. The keyword was
# misspelled as `trsut_remote_code`, so the intended opt-in to the dataset's
# loading script was never passed.
dataset = load_dataset("librispeech_asr", split="train.clean.100",
                       streaming=True, trust_remote_code=True)

# Pull one example and preview it. display() is required: a bare expression
# in the middle of a cell is evaluated but never rendered.
example = next(iter(dataset))
display(IPythonAudio(example["audio"]["array"],
                     rate=example["audio"]["sampling_rate"]))

asr = pipeline(task="automatic-speech-recognition",
               model="distil-whisper/distil-small.en")

# Container that will host the tabbed transcription UI built below.
demo = gradio.Blocks()

def transcribe_speech(filepath):
    """Transcribe the audio file at ``filepath`` with the global ``asr`` pipeline.

    Args:
        filepath: Path to an audio file, or ``None`` when no audio was
            recorded or uploaded.

    Returns:
        The transcribed text, or ``""`` when ``filepath`` is ``None``.
    """
    if filepath is None:
        # `gradio.Warning` (capital W) is the UI warning; the previous
        # lowercase `gradio.warning` raised AttributeError instead.
        gradio.Warning("No audio found, please try again!")
        return ""

    audio, sampling_rate = soundfile.read(filepath)
    # Transpose before calling librosa.to_mono to down-mix the channels.
    audio_transposed = np.transpose(audio)
    audio_mono = librosa.to_mono(audio_transposed)
    # Resample to 16 kHz before handing the samples to the ASR pipeline.
    audio_resampled = librosa.resample(audio_mono,
                                       orig_sr=sampling_rate,
                                       target_sr=16000)
    output = asr(audio_resampled,
                 max_new_tokens=256,
                 chunk_length_s=30,
                 batch_size=8)
    return output["text"]

# Microphone tab. The keyword was misspelled as `allow_flaggin`, which
# gradio.Interface rejects as an unexpected argument.
mic_transcribe = gradio.Interface(fn=transcribe_speech,
                                  inputs=gradio.Audio(sources="microphone",
                                                      type="filepath"),
                                  outputs=gradio.Textbox(label="Transcription",
                                                         lines=3),
                                  allow_flagging="never")

# File-upload tab, configured the same way as the microphone tab.
file_transcribe = gradio.Interface(fn=transcribe_speech,
                                   inputs=gradio.Audio(sources="upload",
                                                       type="filepath"),
                                   outputs=gradio.Textbox(label="Transcription",
                                                          lines=3),
                                   allow_flagging="never")

# Mount both interfaces as tabs inside the Blocks container.
with demo:
    gradio.TabbedInterface(
        [mic_transcribe, file_transcribe],
        ["Transcribe Microphone", "Transcribe Audio File"]
    )

# The port is read from the PORT1 environment variable (KeyError if unset).
demo.launch(share=True, server_port=int(os.environ['PORT1']))

In [None]:
# Close the transcription app launched in the previous cell.
demo.close()

In [None]:
# Drop the dataset, sample, ASR pipeline and Gradio objects from the speech
# recognition demo, then run a garbage-collection pass.
del dataset, example, asr, demo, mic_transcribe, file_transcribe
gc.collect()

#### Text to Speech

In [None]:
# Text-to-speech pipeline on the kakao-enterprise/vits-ljs checkpoint.
narrator = pipeline(
    task="text-to-speech",
    model="kakao-enterprise/vits-ljs",
)

# Passage to narrate; the trailing backslashes join it into one long line.
text = """
Researchers at the Allen Institute for AI, \
HuggingFace, Microsoft, the University of Washington, \
Carnegie Mellon University, and the Hebrew University of \
Jerusalem developed a tool that measures atmospheric \
carbon emitted by cloud servers while training machine \
learning models. After a model’s size, the biggest variables \
were the server’s location and time of day it was active.
"""

narrated_text = narrator(text)

# Last expression of the cell: plays the first entry of the "audio" output at
# the sampling rate reported by the pipeline.
IPythonAudio(
    narrated_text["audio"][0],
    rate=narrated_text["sampling_rate"],
)

#### Object Detection

In [None]:
object_detector = pipeline(task="object-detection", model="facebook/detr-resnet-50")
# TODO: set this to the path of a local image before running; an empty path
# cannot be opened.
filepath = ""
raw_img = Image.open(filepath)
# Image.resize returns a new image and leaves the original untouched, so the
# result has to be assigned back; previously the resized copy was discarded.
raw_img = raw_img.resize((569, 491))
pipeline_output = object_detector(raw_img)
# Draw the detections onto the image using the project helper.
processed_img = render_results_in_image(raw_img, pipeline_output)

def get_pipeline_prediction(pil_img):
    """Run the global ``object_detector`` on ``pil_img`` and draw its results.

    Args:
        pil_img: Image handed over by the Gradio input component
            (configured with ``type="pil"``).

    Returns:
        The image produced by ``render_results_in_image`` for the detections.
        Only the image is returned: the Interface built on this function
        declares a single image output, and the text summary that used to be
        returned alongside it was never computed (its assignment is commented
        out), so ``text`` resolved to an unrelated global or raised NameError.
    """
    pipeline_output = object_detector(pil_img)
    processed_img = render_results_in_image(pil_img, pipeline_output)
    return processed_img

# One image in, one annotated image out, wrapped around the detector callback.
demo = gradio.Interface(
    fn=get_pipeline_prediction,
    inputs=gradio.Image(type="pil", label="Input Image"),
    outputs=gradio.Image(type="pil", label="Output Image with predicted instances"),
)

# The port is read from the PORT1 environment variable.
demo.launch(server_port=int(os.environ['PORT1']), share=True)

In [None]:
# Close the object-detection app launched in the previous cell.
demo.close()

In [None]:
# Drop the detector, the images and the Gradio app from the object-detection
# demo, then run a garbage-collection pass. Note that `raw_img` no longer
# exists after this cell.
del object_detector, filepath, raw_img, pipeline_output, processed_img, demo
gc.collect()

#### Image Segmentation

In [None]:
sam_pipeline = pipeline(task="mask-generation", model="Zigeng/SlimSAM-uniform-77")
# TODO: set this to the path of a local image before running; an empty path
# cannot be opened.
filepath = ""
raw_image = Image.open(filepath)
# Image.resize returns a new image and leaves the original untouched, so the
# result has to be assigned back; previously the resized copy was discarded
# and the full-size image was segmented.
raw_image = raw_image.resize((720, 375))
output = sam_pipeline(raw_image, points_per_batch=32)
# Overlay the generated masks on the image using the project helper.
show_pipe_masks_on_image(raw_image, output)

In [None]:
# Depth-estimation pipeline on the Intel/dpt-hybrid-midas checkpoint.
depth_estimator = pipeline(
    task="depth-estimation",
    model="Intel/dpt-hybrid-midas",
)

In [None]:
def launch(input_image):
    """Estimate depth for ``input_image`` and return it as a grayscale image.

    Args:
        input_image: Image whose ``size`` attribute is ``(width, height)``;
            it is passed unchanged to the global ``depth_estimator``.

    Returns:
        The ``Image.fromarray`` result for the depth map, resized to the
        input's dimensions and scaled so its maximum maps to about 255.
    """
    estimate = depth_estimator(input_image)

    # `torch.nn.functional` was misspelled as `fucntional`, which raised
    # AttributeError on every call. size[::-1] turns (width, height) into
    # the (height, width) order that interpolate expects.
    prediction = torch.nn.functional.interpolate(estimate["predicted_depth"].unsqueeze(1),
                                                 size=input_image.size[::-1],
                                                 mode="bicubic",
                                                 align_corners=False)

    depth_values = prediction.squeeze().numpy()
    # Rescale by the maximum, then truncate to 8-bit integers.
    formatted = (depth_values * 255 / np.max(depth_values)).astype("uint8")
    depth = Image.fromarray(formatted)
    return depth

# Image-in / image-out Gradio app around the depth callback.
iface = gradio.Interface(
    fn=launch,
    inputs=gradio.Image(type="pil"),
    outputs=gradio.Image(type="pil"),
)
# The port is read from the PORT1 environment variable.
iface.launch(server_port=int(os.environ["PORT1"]), share=True)

In [None]:
# Close the depth-estimation app launched in the previous cell.
iface.close()

In [None]:
# Release the segmentation and depth objects. `sam_pipline` was a typo for
# `sam_pipeline` and raised NameError, so nothing in this cell was freed.
# `depth_estimator` is released here too, since no other cell deletes it.
del sam_pipeline, depth_estimator, filepath, raw_image, output, iface
gc.collect()

In [None]:
# Demo picture used by the BLIP / CLIP cells below; adjacent string literals
# are concatenated into the single URL.
img_url = (
    'https://storage.googleapis.com/'
    'sfr-vision-language-research/BLIP/demo.jpg'
)
# Stream the HTTP response body into PIL and convert the result to RGB.
raw_image = Image.open(
    requests.get(img_url, stream=True).raw
).convert('RGB')

#### Image Retrieval

In [None]:
img_text_model = BlipForImageTextRetrieval.from_pretrained("Salesforce/blip-itm-base-coco")
processor = AutoProcessor.from_pretrained("Salesforce/blip-itm-base-coco")
img_url = 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/demo.jpg'
raw_image = Image.open(requests.get(img_url, stream=True).raw).convert('RGB')
text = "an image of a woman and a dog on the beach"

inputs = processor(images=raw_image,
                   text=text,
                   return_tensors='pt')
# Call the retrieval model loaded above. The previous `model(**inputs)`
# referred to a name that was deleted after the embeddings demo.
itm_scores = img_text_model(**inputs)[0]
# Softmax over dim 1 turns the two raw scores into probabilities; the value
# at index 1 is the one reported as the match score.
itm_scores = torch.nn.functional.softmax(itm_scores, dim=1)

print(f"The image and text match score is: {itm_scores[0][1] * 100}")

In [None]:
# Drop the retrieval model, processor and intermediate tensors, then run a
# garbage-collection pass. `raw_image` is intentionally not deleted here.
del img_text_model, processor, text, inputs, itm_scores
gc.collect()

#### Image Captioning

In [None]:
img_cap_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-base")

# Both calls below use `raw_image` and `img_cap_model`. The earlier `raw_img`
# was deleted after the object-detection demo and `model` after the
# embeddings demo, so the original lines raised NameError.

# Conditional captioning: the caption continues the given text prefix.
text = "a photograph of"
inputs = processor(raw_image, text, return_tensors="pt")
output = img_cap_model.generate(**inputs)
print(f"Conditional Captioning: {processor.decode(output[0], skip_special_tokens=True)}")

# Unconditional captioning: image only, no text prefix.
inputs = processor(raw_image, return_tensors="pt")
output = img_cap_model.generate(**inputs)
print(f"Unconditional Captioning: {processor.decode(output[0], skip_special_tokens=True)}")

In [None]:
# Drop the captioning model, processor and intermediate values, then run a
# garbage-collection pass.
del img_cap_model, processor, text, inputs, output
gc.collect()

#### Multimodal Visual Question Answering

In [None]:
model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base")
processor = AutoProcessor.from_pretrained("Salesforce/blip-vqa-base")

question = "How many dogs are in the picture?"

# Use `raw_image`, the demo picture downloaded earlier. `raw_img` was deleted
# after the object-detection demo, so referencing it raised NameError.
inputs = processor(raw_image, question, return_tensors="pt")
output = model.generate(**inputs)

print(f"Model Answer: {processor.decode(output[0], skip_special_tokens=True)}")

In [None]:
# Drop the VQA model, processor and intermediate values, then run a
# garbage-collection pass.
del model, processor, question, inputs, output
gc.collect()

#### Zero-Shot Image Classification

In [None]:
model = CLIPModel.from_pretrained("openai/clip-vit-large-patch14")
processor = AutoProcessor.from_pretrained("openai/clip-vit-large-patch14")

labels = ["a photo of a cat", "a photo of a dog"]
# Use `raw_image`, the demo picture downloaded earlier. `raw_img` was deleted
# after the object-detection demo, so referencing it raised NameError.
inputs = processor(text=labels,
                   images=raw_image,
                   return_tensors="pt",
                   padding=True)
output = model(**inputs)
# Softmax over dim 1 of logits_per_image, then take row 0 (the single image)
# to get one probability per candidate label.
probs = list(output.logits_per_image.softmax(dim=1)[0])

# Pair each label with its probability instead of indexing both lists.
for label, prob in zip(labels, probs):
    print(f"label: {label} - probability of {prob.item():.4f}")