In [32]:
from transformers import pipeline
import torch
from transformers import DetrForObjectDetection, DetrImageProcessor, pipeline
import torchvision.transforms as transforms
import cv2 as cv
from PIL import Image
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
import gradio as gr 
import requests
import numpy as np
from transformers import BlenderbotTokenizer, BlenderbotForConditionalGeneration

In [33]:
model_path = "./models/object-detection-resnet-50"
processor =  DetrImageProcessor.from_pretrained(model_path)
model = DetrForObjectDetection.from_pretrained(model_path)

In [None]:
cap = cv.VideoCapture(0)
while True:    
    ret, frame = cap.read()
    if not ret:
        break
    frame_tensor = torch.as_tensor(frame, dtype=torch.float32)
    image_frame = processor(images=frame_tensor, return_tensors="pt")
    outputs= model(**image_frame)
    
    target_sizes = torch.tensor([[frame_tensor.shape[1], frame_tensor.shape[2]]])

    detections = processor.post_process_object_detection(outputs, target_sizes=target_sizes, threshold=0.9)[0]
    for score, label, box in zip(detections["scores"], detections["labels"], detections["boxes"]):
        box = [int(i) for i in box.tolist()]
        cv.rectangle(frame, (box[0], box[1]), (box[2], box[3]), (0, 255, 0), 2) 
        label_text = f"{model.config.id2label[label.item()]}: {round(score.item(), 2)}"
        cv.putText(frame, label_text, (box[0], box[1] - 10), cv.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)
    if label_text.startswith ("person"):
        start_conversation()
        break
    cv.imshow("frame", frame)
    if cv.waitKey(1) == ord('q'):
         break
cap.release()
cv.destroyAllWindows()


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Running on local URL:  http://127.0.0.1:7860

Could not create share link. Please check your internet connection or our status page: https://status.gradio.app.


#### creating a conversation with someone
Reading the input from the microphone
detect the spoken langauge 
convert the microphone sampling rate to 16kz
convert the spoken words to text

In [29]:
def start_conversation():
    whisper_model_path= "./models/automatic-speech-recognition"
    processor = AutoProcessor.from_pretrained(whisper_model_path)
    model = AutoModelForSpeechSeq2Seq.from_pretrained(whisper_model_path, use_safetensors=True)
    output= ""
    torch_dtype = torch.float32

    whisper_pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    max_new_tokens=128,
    chunk_length_s=30,
    batch_size=16,
    torch_dtype=torch_dtype
    )

    demo = gr.Blocks()

    def transcribe_speech(file_path):
        if file_path is None:
            gr.Warning("No audio found please try again")
            return ""
        output = whisper_pipe(file_path)
        creating_conversation_with_blenderbot(output["text"])
        return output["text"]
        
    mic_transcribe = gr.Interface(
    fn =transcribe_speech,
    inputs = gr.Audio(sources="microphone",
                      type="filepath"),
    outputs = gr.Textbox(label="Transcription_results",
                         lines=3),
    allow_flagging="never")
    
    def launch_demo():
        with demo:
            gr.TabbedInterface(
                [mic_transcribe],
                ["Transcribe Microphone"]
            )
            demo.launch(debug=True, share=True)
    launch_demo()
    

In [31]:
start_conversation()

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Running on local URL:  http://127.0.0.1:7860

Could not create share link. Please check your internet connection or our status page: https://status.gradio.app.


ERROR:    Exception in ASGI application
Traceback (most recent call last):
  File "C:\Users\Design Dep\Desktop\Rhodrick\enviro\Lib\site-packages\uvicorn\protocols\http\httptools_impl.py", line 411, in run_asgi
    result = await app(  # type: ignore[func-returns-value]
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Design Dep\Desktop\Rhodrick\enviro\Lib\site-packages\uvicorn\middleware\proxy_headers.py", line 69, in __call__
    return await self.app(scope, receive, send)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Design Dep\Desktop\Rhodrick\enviro\Lib\site-packages\fastapi\applications.py", line 1054, in __call__
    await super().__call__(scope, receive, send)
  File "C:\Users\Design Dep\Desktop\Rhodrick\enviro\Lib\site-packages\starlette\applications.py", line 123, in __call__
    await self.middleware_stack(scope, receive, send)
  File "C:\Users\Design Dep\Desktop\Rhodrick\enviro\Lib\site-packages\starlette\middleware\errors.py", 

Keyboard interruption in main thread... closing server.


##### creating conversation context using blenderbot model
from the transcribed spoken words create a conversation chat with
create a context to follow up the text messages converted and spoken

In [30]:
def creating_conversation_with_blenderbot(transcribed_text, conversation_history=[]):
    from transformers import Conversation, BlenderbotTokenizer, BlenderbotForConditionalGeneration
    blender_model_path = "./models/facebook"
    
    tokenizer = BlenderbotTokenizer.from_pretrained(blender_model_path)
    model = BlenderbotForConditionalGeneration.from_pretrained(blender_model_path)

    if conversation_history:
        transcribed_text = '\n'.join(conversation_history + [transcribed_text])

    inputs = tokenizer([transcribed_text], return_tensors="pt", max_length=128, truncation=True)

    reply_ids = model.generate(**inputs)
    
    response = tokenizer.batch_decode(reply_ids, skip_special_tokens=True)[0]
    conversation_history.append(transcribed_text)
    conversation_history.append(response)

    produce_reply_sound(response)
    return conversation_history


['i love doing NLP do u know anything?',
 " I don't know much about it, but I do know that it's a non-profit organization.",
 "i love doing NLP do u know anything?\n I don't know much about it, but I do know that it's a non-profit organization.\nNope i meant natural language processing",
 ' National Language Academy of the United States is a nonprofit organization that aims to educate and develop native languages.',
 "i love doing NLP do u know anything?\n I don't know much about it, but I do know that it's a non-profit organization.\ni love doing NLP do u know anything?\n I don't know much about it, but I do know that it's a non-profit organization.\nNope i meant natural language processing\n National Language Academy of the United States is a nonprofit organization that aims to educate and develop native languages.\noh really i didnt know about that then? so what do know about financial crisis",
 ' Financial crisis is a serious issue that affects a lot of people globally.']

In [3]:
def produce_reply_sound(text):
    import pyttsx3
    engine = pyttsx3.init()
    rate = engine.getProperty('rate')
    engine.setProperty('rate', rate- 50)
    engine.say(text)
    engine.runAndWait()